summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r--arch/x86/kvm/vmx/vmx.c7797
1 files changed, 4318 insertions, 3479 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f6915f10e584..4cbe8c84b636 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel-based Virtual Machine driver for Linux
*
@@ -10,13 +11,9 @@
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/frame.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
@@ -25,7 +22,9 @@
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
+#include <linux/objtool.h>
#include <linux/sched.h>
+#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
@@ -33,72 +32,91 @@
#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fred.h>
+#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
-#include <asm/kexec.h>
+#include <asm/reboot.h>
#include <asm/perf_event.h>
-#include <asm/mce.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
+#include <asm/msr.h>
+#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
-#include <asm/virtext.h>
#include <asm/vmx.h>
+#include <trace/events/ipi.h>
+
#include "capabilities.h"
+#include "common.h"
#include "cpuid.h"
-#include "evmcs.h"
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
-#include "ops.h"
#include "pmu.h"
+#include "sgx.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
+#include "x86_ops.h"
+#include "smm.h"
+#include "vmx_onhyperv.h"
+#include "posted_intr.h"
+
+#include "mmu/spte.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
+#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_VMX),
+ X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+#endif
bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
static bool __read_mostly enable_vnmi = 1;
-module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+module_param_named(vnmi, enable_vnmi, bool, 0444);
bool __read_mostly flexpriority_enabled = 1;
-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+module_param_named(flexpriority, flexpriority_enabled, bool, 0444);
bool __read_mostly enable_ept = 1;
-module_param_named(ept, enable_ept, bool, S_IRUGO);
+module_param_named(ept, enable_ept, bool, 0444);
bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
- enable_unrestricted_guest, bool, S_IRUGO);
+ enable_unrestricted_guest, bool, 0444);
bool __read_mostly enable_ept_ad_bits = 1;
-module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+module_param_named(eptad, enable_ept_ad_bits, bool, 0444);
static bool __read_mostly emulate_invalid_guest_state = true;
-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+module_param(emulate_invalid_guest_state, bool, 0444);
static bool __read_mostly fasteoi = 1;
-module_param(fasteoi, bool, S_IRUGO);
+module_param(fasteoi, bool, 0444);
-static bool __read_mostly enable_apicv = 1;
-module_param(enable_apicv, bool, S_IRUGO);
+module_param(enable_apicv, bool, 0444);
+module_param(enable_ipiv, bool, 0444);
+
+module_param(enable_device_posted_irqs, bool, 0444);
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
@@ -106,12 +124,16 @@ module_param(enable_apicv, bool, S_IRUGO);
* use VMX instructions.
*/
static bool __read_mostly nested = 1;
-module_param(nested, bool, S_IRUGO);
-
-static u64 __read_mostly host_xss;
+module_param(nested, bool, 0444);
bool __read_mostly enable_pml = 1;
-module_param_named(pml, enable_pml, bool, S_IRUGO);
+module_param_named(pml, enable_pml, bool, 0444);
+
+static bool __read_mostly error_on_inconsistent_vmcs_config = true;
+module_param(error_on_inconsistent_vmcs_config, bool, 0444);
+
+static bool __read_mostly dump_invalid_vmcs = 0;
+module_param(dump_invalid_vmcs, bool, 0644);
#define MSR_BITMAP_MODE_X2APIC 1
#define MSR_BITMAP_MODE_X2APIC_APICV 2
@@ -125,14 +147,13 @@ static bool __read_mostly enable_preemption_timer = 1;
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif
+extern bool __read_mostly allow_smaller_maxphyaddr;
+module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
+
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON \
- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
- X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
-#define KVM_CR4_GUEST_OWNED_BITS \
- (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -145,9 +166,6 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
RTIT_STATUS_BYTECNT))
-#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
- (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
-
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
@@ -177,10 +195,15 @@ module_param(ple_window_shrink, uint, 0444);
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
-/* Default is SYSTEM mode, 1 for host-guest mode */
+/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
+#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
+#endif
+
+struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
+#ifdef CONFIG_CPU_MITIGATIONS
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);
@@ -203,24 +226,24 @@ static const struct {
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
-static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
unsigned int i;
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+
if (!enable_ept) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
return 0;
}
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
- u64 msr;
-
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
- if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
- return 0;
- }
+ if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
}
/* If set to auto use the default l1tf mitigation method */
@@ -229,6 +252,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
case L1TF_MITIGATION_OFF:
l1tf = VMENTER_L1D_FLUSH_NEVER;
break;
+ case L1TF_MITIGATION_AUTO:
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
@@ -245,6 +269,10 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ /*
+ * This allocation for vmx_l1d_flush_pages is not tied to a VM
+ * lifetime and so should not be charged to a memcg.
+ */
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
if (!page)
return -ENOMEM;
@@ -275,6 +303,26 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
+static int vmx_setup_l1d_flush(void)
+{
+ /*
+ * Hand the parameter mitigation value in which was stored in the pre
+ * module init parser. If no parameter was given, it will contain
+ * 'auto' which will be turned into the default 'cond' mitigation mode.
+ */
+ return __vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+}
+
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
static int vmentry_l1d_flush_parse(const char *s)
{
unsigned int i;
@@ -312,7 +360,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
}
mutex_lock(&vmx_l1d_flush_mutex);
- ret = vmx_setup_l1d_flush(l1tf);
+ ret = __vmx_setup_l1d_flush(l1tf);
mutex_unlock(&vmx_l1d_flush_mutex);
return ret;
}
@@ -320,10 +368,99 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
- return sprintf(s, "???\n");
+ return sysfs_emit(s, "???\n");
+
+ return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
+}
+
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
+ * flush it is required to read in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information but as all relevant affected CPUs have 32KiB L1D cache size
+ * there is no point in doing so.
+ */
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ if (!static_branch_unlikely(&vmx_l1d_should_flush))
+ return;
+
+ /*
+ * This code is only executed when the flush mode is 'cond' or
+ * 'always'
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ /*
+ * Clear the per-cpu flush bit, it gets set again if the vCPU
+ * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+ * exits to userspace, or if KVM reaches one of the unsafe
+ * VMEXIT handlers, e.g. if KVM calls into the emulator,
+ * or from the interrupt handlers.
+ */
+ if (!kvm_get_cpu_l1tf_flush_l1d())
+ return;
+ kvm_clear_cpu_l1tf_flush_l1d();
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
+#else /* CONFIG_CPU_MITIGATIONS*/
+static int vmx_setup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER;
+ return 0;
+}
+static void vmx_cleanup_l1d_flush(void)
+{
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
- return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n");
+ return 0;
+}
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ return sysfs_emit(s, "never\n");
+}
+#endif
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
.set = vmentry_l1d_flush_set,
@@ -331,13 +468,115 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = {
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
-static bool guest_state_valid(struct kvm_vcpu *vcpu);
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+{
+ u64 msr;
+
+ if (!vmx->disable_fb_clear)
+ return;
+
+ msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
+ msr |= FB_CLEAR_DIS;
+ native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr);
+ /* Cache the MSR value to avoid reading it later */
+ vmx->msr_ia32_mcu_opt_ctrl = msr;
+}
+
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+{
+ if (!vmx->disable_fb_clear)
+ return;
+
+ vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+ native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+}
+
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+{
+ /*
+ * Disable VERW's behavior of clearing CPU buffers for the guest if the
+ * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
+ * the mitigation. Disabling the clearing behavior provides a
+ * performance boost for guests that aren't aware that manually clearing
+ * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
+ * and VM-Exit.
+ */
+ vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
+ (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA);
+
+ /*
+ * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
+ * at VMEntry. Skip the MSR read/write when a guest has no use case to
+ * execute VERW.
+ */
+ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
+ ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
+ vmx->disable_fb_clear = false;
+}
+
static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type);
void vmx_vmexit(void);
+#define vmx_insn_failed(fmt...) \
+do { \
+ WARN_ONCE(1, fmt); \
+ pr_warn_ratelimited(fmt); \
+} while (0)
+
+noinline void vmread_error(unsigned long field)
+{
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
+}
+
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
+{
+ if (fault) {
+ kvm_spurious_fault();
+ } else {
+ instrumentation_begin();
+ vmread_error(field);
+ instrumentation_end();
+ }
+}
+#endif
+
+noinline void vmwrite_error(unsigned long field, unsigned long value)
+{
+ vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
+ field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
+{
+ vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
+{
+ vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
+ vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
+}
+
+noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
+{
+ vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+ ext, vpid, gva);
+}
+
+noinline void invept_error(unsigned long ext, u64 eptp)
+{
+ vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
+}
+
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
@@ -346,18 +585,11 @@ DEFINE_PER_CPU(struct vmcs *, current_vmcs);
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
-/*
- * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
- * can find which vCPU should be waken up.
- */
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
-
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
-struct vmcs_config vmcs_config;
-struct vmx_capability vmx_capability;
+struct vmcs_config vmcs_config __ro_after_init;
+struct vmx_capability vmx_capability __ro_after_init;
#define VMX_SEGMENT_FIELD(seg) \
[VCPU_SREG_##seg] = { \
@@ -383,103 +615,94 @@ static const struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
-u64 host_efer;
-/*
- * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
- * will emulate SYSCALL in legacy mode if the vendor string in guest
- * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
- * support this emulation, IA32_STAR must always be included in
- * vmx_msr_index[], even in i386 builds.
- */
-const u32 vmx_msr_index[] = {
-#ifdef CONFIG_X86_64
- MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
-#endif
- MSR_EFER, MSR_TSC_AUX, MSR_STAR,
-};
+static unsigned long host_idt_base;
#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
-/* check_ept_pointer() should be under protection of ept_pointer_lock. */
-static void check_ept_pointer_match(struct kvm *kvm)
+static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu;
- u64 tmp_eptp = INVALID_PAGE;
- int i;
+ struct hv_enlightened_vmcs *evmcs;
+ hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!VALID_PAGE(tmp_eptp)) {
- tmp_eptp = to_vmx(vcpu)->ept_pointer;
- } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
- to_kvm_vmx(kvm)->ept_pointers_match
- = EPT_POINTERS_MISMATCH;
- return;
- }
- }
+ if (partition_assist_page == INVALID_PAGE)
+ return -ENOMEM;
- to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
-}
+ evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
-int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
- void *data)
-{
- struct kvm_tlb_range *range = data;
+ evmcs->partition_assist_page = partition_assist_page;
+ evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
+ evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
- return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
- range->pages);
+ return 0;
}
-static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
+static __init void hv_init_evmcs(void)
{
- u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
+ int cpu;
+
+ if (!enlightened_vmcs)
+ return;
/*
- * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
- * of the base of EPT PML4 table, strip off EPT configuration
- * information.
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above.
*/
- if (range)
- return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
- kvm_fill_hv_flush_list_func, (void *)range);
- else
- return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
-}
-
-static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range)
-{
- struct kvm_vcpu *vcpu;
- int ret = 0, i;
-
- spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
- if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
- check_ept_pointer_match(kvm);
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
- if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
- kvm_for_each_vcpu(i, vcpu, kvm) {
- /* If ept_pointer is invalid pointer, bypass flush request. */
- if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
- ret |= __hv_remote_flush_tlb_with_range(
- kvm, vcpu, range);
+ if (enlightened_vmcs) {
+ pr_info("Using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&__kvm_is_using_evmcs);
}
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ vt_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
} else {
- ret = __hv_remote_flush_tlb_with_range(kvm,
- kvm_get_vcpu(kvm, 0), range);
+ enlightened_vmcs = false;
}
-
- spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
- return ret;
}
-static int hv_remote_flush_tlb(struct kvm *kvm)
+
+static void hv_reset_evmcs(void)
{
- return hv_remote_flush_tlb_with_range(kvm, NULL);
+ struct hv_vp_assist_page *vp_ap;
+
+ if (!kvm_is_using_evmcs())
+ return;
+
+ /*
+ * KVM should enable eVMCS if and only if all CPUs have a VP assist
+ * page, and should reject CPU onlining if eVMCS is enabled the CPU
+ * doesn't have a VP assist page allocated.
+ */
+ vp_ap = hv_get_vp_assist_page(smp_processor_id());
+ if (WARN_ON_ONCE(!vp_ap))
+ return;
+
+ /*
+ * Reset everything to support using non-enlightened VMCS access later
+ * (e.g. when we reload the module with enlightened_vmcs=0)
+ */
+ vp_ap->nested_control.features.directhypercall = 0;
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
}
+#else /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_init_evmcs(void) {}
+static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
/*
@@ -535,79 +758,80 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static inline bool report_flexpriority(void)
-{
- return flexpriority_enabled;
-}
-
-static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
-{
- int i;
-
- for (i = 0; i < vmx->nmsrs; ++i)
- if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
- return i;
- return -1;
-}
-
-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
- i = __find_msr_index(vmx, msr);
+ i = kvm_find_user_return_msr(msr);
if (i >= 0)
- return &vmx->guest_msrs[i];
+ return &vmx->guest_uret_msrs[i];
return NULL;
}
-void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
+ struct vmx_uret_msr *msr, u64 data)
{
- vmcs_clear(loaded_vmcs->vmcs);
- if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
- vmcs_clear(loaded_vmcs->shadow_vmcs);
- loaded_vmcs->cpu = -1;
- loaded_vmcs->launched = 0;
+ unsigned int slot = msr - vmx->guest_uret_msrs;
+ int ret = 0;
+
+ if (msr->load_into_hardware) {
+ preempt_disable();
+ ret = kvm_set_user_return_msr(slot, data, msr->mask);
+ preempt_enable();
+ }
+ if (!ret)
+ msr->data = data;
+ return ret;
}
-#ifdef CONFIG_KEXEC_CORE
/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
+ * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
+ * magically in RM, VM86, compat mode, or at CPL>0.
*/
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
+static int kvm_cpu_vmxoff(void)
{
- cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
+ asm goto("1: vmxoff\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "cc", "memory" : fault);
-static inline void crash_disable_local_vmclear(int cpu)
-{
- cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
+ cr4_clear_bits(X86_CR4_VMXE);
+ return 0;
-static inline int crash_local_vmclear_enabled(int cpu)
-{
- return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
+fault:
+ cr4_clear_bits(X86_CR4_VMXE);
+ return -EIO;
}
-static void crash_vmclear_local_loaded_vmcss(void)
+void vmx_emergency_disable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
- if (!crash_local_vmclear_enabled(cpu))
+ kvm_rebooting = true;
+
+ /*
+ * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
+ * set in task context. If this races with VMX is disabled by an NMI,
+ * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to
+ * kvm_rebooting set.
+ */
+ if (!(__read_cr4() & X86_CR4_VMXE))
return;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
+ loaded_vmcss_on_cpu_link) {
vmcs_clear(v->vmcs);
+ if (v->shadow_vmcs)
+ vmcs_clear(v->shadow_vmcs);
+ }
+
+ kvm_cpu_vmxoff();
}
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
{
@@ -618,22 +842,27 @@ static void __loaded_vmcs_clear(void *arg)
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- crash_disable_local_vmclear(cpu);
+
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
/*
- * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
- * is before setting loaded_vmcs->vcpu to -1 which is done in
- * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
- * then adds the vmcs into percpu list before it is deleted.
+ * Ensure all writes to loaded_vmcs, including deleting it from its
+ * current percpu list, complete before setting loaded_vmcs->cpu to
+ * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
+ * and add loaded_vmcs to its percpu list before it's deleted from this
+ * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
- loaded_vmcs_init(loaded_vmcs);
- crash_enable_local_vmclear(cpu);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
}
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
int cpu = loaded_vmcs->cpu;
@@ -648,8 +877,8 @@ static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
bool ret;
u32 mask = 1 << (seg * SEG_FIELD_NR + field);
- if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
- vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
+ if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
+ kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
vmx->segment_cache.bitmask = 0;
}
ret = vmx->segment_cache.bitmask & mask;
@@ -693,13 +922,19 @@ static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
return *p;
}
-void update_exception_bitmap(struct kvm_vcpu *vcpu)
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
/*
+ * #VE isn't used for VMX. To test against unexpected changes
+ * related to #VE for VMX, intercept unexpected #VE and warn on it.
+ */
+ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ eb |= 1u << VE_VECTOR;
+ /*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
@@ -713,8 +948,8 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
- if (enable_ept)
- eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+ if (!vmx_need_pf_intercept(vcpu))
+ eb &= ~(1u << PF_VECTOR);
/* When we are running a nested L2 guest and L1 specified for it a
* certain exception bitmap, we must trap the same exceptions and pass
@@ -723,6 +958,31 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
*/
if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap;
+ else {
+ int mask = 0, match = 0;
+
+ if (enable_ept && (eb & (1u << PF_VECTOR))) {
+ /*
+ * If EPT is enabled, #PF is currently only intercepted
+ * if MAXPHYADDR is smaller on the guest than on the
+ * host. In that case we only care about present,
+ * non-reserved faults. For vmcs02, however, PFEC_MASK
+ * and PFEC_MATCH are set in prepare_vmcs02_rare.
+ */
+ mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
+ match = PFERR_PRESENT_MASK;
+ }
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
+ }
+
+ /*
+ * Disabling xfd interception indicates that dynamic xfeatures
+ * might be used in the guest. Always trap #NM in this case
+ * to save guest xfd_err timely.
+ */
+ if (vcpu->arch.xfd_no_write_intercept)
+ eb |= (1u << NM_VECTOR);
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -730,34 +990,44 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
/*
* Check if MSR is intercepted for currently loaded MSR bitmap.
*/
-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
- unsigned long *msr_bitmap;
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
+ if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
return true;
- msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
+ return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
+}
- if (msr <= 0x1fff) {
- return !!test_bit(msr, msr_bitmap + 0x800 / f);
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- return !!test_bit(msr, msr_bitmap + 0xc00 / f);
- }
+unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
+{
+ unsigned int flags = 0;
- return true;
+ if (vmx->loaded_vmcs->launched)
+ flags |= VMX_RUN_VMRESUME;
+
+ /*
+ * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
+ * to change it directly without causing a vmexit. In that case read
+ * it after vmexit and store it in vmx->spec_ctrl.
+ */
+ if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
+ flags |= VMX_RUN_SAVE_SPEC_CTRL;
+
+ if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) &&
+ kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
+ flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
+
+ return flags;
}
-static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit)
{
vm_entry_controls_clearbit(vmx, entry);
vm_exit_controls_clearbit(vmx, exit);
}
-static int find_msr(struct vmx_msrs *m, unsigned int msr)
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
unsigned int i;
@@ -791,7 +1061,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
}
break;
}
- i = find_msr(&m->guest, msr);
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
if (i < 0)
goto skip_guest;
--m->guest.nr;
@@ -799,7 +1069,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
skip_guest:
- i = find_msr(&m->host, msr);
+ i = vmx_find_loadstore_msr_slot(&m->host, msr);
if (i < 0)
return;
@@ -808,7 +1078,7 @@ skip_guest:
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
-static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit,
unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
u64 guest_val, u64 host_val)
@@ -855,14 +1125,15 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
* provide that period, so a CPU could write host's record into
* guest's memory.
*/
- wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+ wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
}
- i = find_msr(&m->guest, msr);
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
if (!entry_only)
- j = find_msr(&m->host, msr);
+ j = vmx_find_loadstore_msr_slot(&m->host, msr);
- if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
+ if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
+ (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
@@ -885,22 +1156,15 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
m->host.val[j].value = host_val;
}
-static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
+static bool update_transition_efer(struct vcpu_vmx *vmx)
{
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
+ int i;
- if (!enable_ept) {
- /*
- * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
- * host CPUID is more efficient than testing guest CPUID
- * or CR4. Host SMEP is anyway a requirement for guest SMEP.
- */
- if (boot_cpu_has(X86_FEATURE_SMEP))
- guest_efer |= EFER_NX;
- else if (!(guest_efer & EFER_NX))
- ignore_bits |= EFER_NX;
- }
+ /* Shadow paging assumes NX to be available. */
+ if (!enable_ept)
+ guest_efer |= EFER_NX;
/*
* LMA and LME handled by hardware; SCE meaningless outside long mode.
@@ -919,26 +1183,30 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
* atomically, since it's faster than switching it manually.
*/
if (cpu_has_load_ia32_efer() ||
- (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
add_atomic_switch_msr(vmx, MSR_EFER,
- guest_efer, host_efer, false);
+ guest_efer, kvm_host.efer, false);
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
- } else {
- clear_atomic_switch_msr(vmx, MSR_EFER);
+ }
- guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
+ i = kvm_find_user_return_msr(MSR_EFER);
+ if (i < 0)
+ return false;
- vmx->guest_msrs[efer_offset].data = guest_efer;
- vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+ clear_atomic_switch_msr(vmx, MSR_EFER);
- return true;
- }
+ guest_efer &= ~ignore_bits;
+ guest_efer |= kvm_host.efer & ignore_bits;
+
+ vmx->guest_uret_msrs[i].data = guest_efer;
+ vmx->guest_uret_msrs[i].mask = ~ignore_bits;
+
+ return true;
}
#ifdef CONFIG_X86_32
@@ -970,17 +1238,29 @@ static unsigned long segment_base(u16 selector)
}
#endif
+static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
+{
+ return vmx_pt_mode_is_host_guest() &&
+ !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+}
+
+static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
+{
+ /* The base must be 128-byte aligned and a legal physical address. */
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
+}
+
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
- wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
- wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
- wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
- wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
- wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
- wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
@@ -988,50 +1268,82 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
- rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
- rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
- rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
- rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
- rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
- rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
return;
/*
* GUEST_IA32_RTIT_CTL is already set in the VMCS.
* Save host state before VM entry.
*/
- rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
- wrmsrl(MSR_IA32_RTIT_CTL, 0);
- pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
- pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+ wrmsrq(MSR_IA32_RTIT_CTL, 0);
+ pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
}
}
static void pt_guest_exit(struct vcpu_vmx *vmx)
{
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
return;
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
- pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
- pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
+ pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
}
- /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
- wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ /*
+ * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
+ * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
+ */
+ if (vmx->pt_desc.host.ctl)
+ wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+}
+
+void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base)
+{
+ if (unlikely(fs_sel != host->fs_sel)) {
+ if (!(fs_sel & 7))
+ vmcs_write16(HOST_FS_SELECTOR, fs_sel);
+ else
+ vmcs_write16(HOST_FS_SELECTOR, 0);
+ host->fs_sel = fs_sel;
+ }
+ if (unlikely(gs_sel != host->gs_sel)) {
+ if (!(gs_sel & 7))
+ vmcs_write16(HOST_GS_SELECTOR, gs_sel);
+ else
+ vmcs_write16(HOST_GS_SELECTOR, 0);
+ host->gs_sel = gs_sel;
+ }
+ if (unlikely(fs_base != host->fs_base)) {
+ vmcs_writel(HOST_FS_BASE, fs_base);
+ host->fs_base = fs_base;
+ }
+ if (unlikely(gs_base != host->gs_base)) {
+ vmcs_writel(HOST_GS_BASE, gs_base);
+ host->gs_base = gs_base;
+ }
}
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
int cpu = raw_smp_processor_id();
@@ -1040,27 +1352,30 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
u16 fs_sel, gs_sel;
int i;
- vmx->req_immediate_exit = false;
-
/*
* Note that guest MSRs to be saved/restored can also be changed
* when guest state is loaded. This happens when guest transitions
* to/from long-mode by setting MSR_EFER.LMA.
*/
- if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
- vmx->guest_msrs_dirty = false;
- for (i = 0; i < vmx->save_nmsrs; ++i)
- kvm_set_shared_msr(vmx->guest_msrs[i].index,
- vmx->guest_msrs[i].data,
- vmx->guest_msrs[i].mask);
+ if (!vmx->guest_uret_msrs_loaded) {
+ vmx->guest_uret_msrs_loaded = true;
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (!vmx->guest_uret_msrs[i].load_into_hardware)
+ continue;
+ kvm_set_user_return_msr(i,
+ vmx->guest_uret_msrs[i].data,
+ vmx->guest_uret_msrs[i].mask);
+ }
}
- if (vmx->loaded_cpu_state)
+ if (vmx->nested.need_vmcs12_to_shadow_sync)
+ nested_sync_vmcs12_to_shadow(vcpu);
+
+ if (vt->guest_state_loaded)
return;
- vmx->loaded_cpu_state = vmx->loaded_vmcs;
- host_state = &vmx->loaded_cpu_state->host_state;
+ host_state = &vmx->loaded_vmcs->host_state;
/*
* Set host fs and gs selectors. Unfortunately, 22.2.3 does not
@@ -1074,19 +1389,19 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
gs_base = cpu_kernelmode_gs_base(cpu);
if (likely(is_64bit_mm(current->mm))) {
- save_fsgs_for_kvm();
+ current_save_fsgs();
fs_sel = current->thread.fsindex;
gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
- vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
} else {
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
fs_base = read_msr(MSR_FS_BASE);
- vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
@@ -1094,45 +1409,23 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
gs_base = segment_base(gs_sel);
#endif
- if (unlikely(fs_sel != host_state->fs_sel)) {
- if (!(fs_sel & 7))
- vmcs_write16(HOST_FS_SELECTOR, fs_sel);
- else
- vmcs_write16(HOST_FS_SELECTOR, 0);
- host_state->fs_sel = fs_sel;
- }
- if (unlikely(gs_sel != host_state->gs_sel)) {
- if (!(gs_sel & 7))
- vmcs_write16(HOST_GS_SELECTOR, gs_sel);
- else
- vmcs_write16(HOST_GS_SELECTOR, 0);
- host_state->gs_sel = gs_sel;
- }
- if (unlikely(fs_base != host_state->fs_base)) {
- vmcs_writel(HOST_FS_BASE, fs_base);
- host_state->fs_base = fs_base;
- }
- if (unlikely(gs_base != host_state->gs_base)) {
- vmcs_writel(HOST_GS_BASE, gs_base);
- host_state->gs_base = gs_base;
- }
+ vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
+ vt->guest_state_loaded = true;
}
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
struct vmcs_host_state *host_state;
- if (!vmx->loaded_cpu_state)
+ if (!vmx->vt.guest_state_loaded)
return;
- WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
- host_state = &vmx->loaded_cpu_state->host_state;
+ host_state = &vmx->loaded_vmcs->host_state;
++vmx->vcpu.stat.host_state_reload;
- vmx->loaded_cpu_state = NULL;
#ifdef CONFIG_X86_64
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
kvm_load_ldt(host_state->ldt_sel);
@@ -1152,114 +1445,114 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
#endif
invalidate_tss_limit();
#ifdef CONFIG_X86_64
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
#endif
load_fixmap_gdt(raw_smp_processor_id());
+ vmx->vt.guest_state_loaded = false;
+ vmx->guest_uret_msrs_loaded = false;
}
#ifdef CONFIG_X86_64
-static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
+static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache)
{
preempt_disable();
- if (vmx->loaded_cpu_state)
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ if (vmx->vt.guest_state_loaded)
+ *cache = read_msr(msr);
preempt_enable();
- return vmx->msr_guest_kernel_gs_base;
+ return *cache;
}
-static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+static void vmx_write_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 data,
+ u64 *cache)
{
preempt_disable();
- if (vmx->loaded_cpu_state)
- wrmsrl(MSR_KERNEL_GS_BASE, data);
+ if (vmx->vt.guest_state_loaded)
+ wrmsrns(msr, data);
preempt_enable();
- vmx->msr_guest_kernel_gs_base = data;
+ *cache = data;
+}
+
+static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
+{
+ return vmx_read_guest_host_msr(vmx, MSR_KERNEL_GS_BASE,
+ &vmx->msr_guest_kernel_gs_base);
+}
+
+static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+{
+ vmx_write_guest_host_msr(vmx, MSR_KERNEL_GS_BASE, data,
+ &vmx->msr_guest_kernel_gs_base);
}
#endif
-static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+static void grow_ple_window(struct kvm_vcpu *vcpu)
{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
- /*
- * In case of hot-plug or hot-unplug, we may have to undo
- * vmx_vcpu_pi_put even if there is no assigned device. And we
- * always keep PI.NDST up to date for simplicity: it makes the
- * code easier, and CPU migration is not a fast path.
- */
- if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
- return;
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
- /*
- * First handle the simple case where no cmpxchg is necessary; just
- * allow posting non-urgent interrupts.
- *
- * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
- * PI.NDST: pi_post_block will do it for us and the wakeup_handler
- * expects the VCPU to be on the blocked_vcpu_list that matches
- * PI.NDST.
- */
- if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
- vcpu->cpu == cpu) {
- pi_clear_sn(pi_desc);
- return;
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
}
+}
- /* The full case. */
- do {
- old.control = new.control = pi_desc->control;
-
- dest = cpu_physical_id(cpu);
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
- new.sn = 0;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
}
-/*
- * Switches to specified vcpu, until a matching vcpu_put(), but assumes
- * vcpu mutex is already taken.
- */
-void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
+ struct vmcs *prev;
if (!already_loaded) {
loaded_vmcs_clear(vmx->loaded_vmcs);
local_irq_disable();
- crash_disable_local_vmclear(cpu);
/*
- * Read loaded_vmcs->cpu should be before fetching
- * loaded_vmcs->loaded_vmcss_on_cpu_link.
- * See the comments in __loaded_vmcs_clear().
+ * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+ * this cpu's percpu list, otherwise it may not yet be deleted
+ * from its previous cpu's percpu list. Pairs with the
+ * smb_wmb() in __loaded_vmcs_clear().
*/
smp_rmb();
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
&per_cpu(loaded_vmcss_on_cpu, cpu));
- crash_enable_local_vmclear(cpu);
local_irq_enable();
}
- if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ prev = per_cpu(current_vmcs, cpu);
+ if (prev != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
- indirect_branch_prediction_barrier();
}
if (!already_loaded) {
void *gdt = get_current_gdt_ro();
- unsigned long sysenter_esp;
+ /*
+ * Flush all EPTP/VPID contexts, the new pCPU may have stale
+ * TLB entries from its previous association with the vCPU.
+ */
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
/*
@@ -1270,42 +1563,28 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
- /*
- * VM exits change the host TR limit to 0x67 after a VM
- * exit. This is okay, since 0x67 covers everything except
- * the IO bitmap and have have code to handle the IO bitmap
- * being lost after a VM exit.
- */
- BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
-
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
+ /* 22.2.3 */
+ vmcs_writel(HOST_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(cpu) + 1));
+ }
vmx->loaded_vmcs->cpu = cpu;
}
-
- /* Setup TSC multiplier */
- if (kvm_has_tsc_control &&
- vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
- decache_tsc_multiplier(vmx);
-
- vmx_vcpu_pi_load(vcpu, cpu);
- vmx->host_pkru = read_pkru();
- vmx->host_debugctlmsr = get_debugctlmsr();
}
-static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+/*
+ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
+ * vcpu mutex is already taken.
+ */
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return;
+ vmx_vcpu_load_vmcs(vcpu, cpu);
- /* Set SN when the vCPU is preempted */
- if (vcpu->preempted)
- pi_set_sn(pi_desc);
+ vmx_vcpu_pi_load(vcpu, cpu);
}
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1315,44 +1594,61 @@ void vmx_vcpu_put(struct kvm_vcpu *vcpu)
vmx_prepare_switch_to_host(to_vmx(vcpu));
}
-static bool emulation_required(struct kvm_vcpu *vcpu)
+bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
- return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+ return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
-
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long rflags, save_rflags;
- if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
- __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
rflags = vmcs_readl(GUEST_RFLAGS);
- if (to_vmx(vcpu)->rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
- save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ save_rflags = vmx->rmode.save_rflags;
rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
}
- to_vmx(vcpu)->rflags = rflags;
+ vmx->rflags = rflags;
}
- return to_vmx(vcpu)->rflags;
+ return vmx->rflags;
}
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- unsigned long old_rflags = vmx_get_rflags(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long old_rflags;
- __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
- to_vmx(vcpu)->rflags = rflags;
- if (to_vmx(vcpu)->rmode.vm86_active) {
- to_vmx(vcpu)->rmode.save_rflags = rflags;
+ /*
+ * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
+ * is an unrestricted guest in order to mark L2 as needing emulation
+ * if L1 runs L2 as a restricted guest.
+ */
+ if (is_unrestricted_guest(vcpu)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
+ vmx->rflags = rflags;
+ vmcs_writel(GUEST_RFLAGS, rflags);
+ return;
+ }
+
+ old_rflags = vmx_get_rflags(vcpu);
+ vmx->rflags = rflags;
+ if (vmx->rmode.vm86_active) {
+ vmx->rmode.save_rflags = rflags;
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
}
vmcs_writel(GUEST_RFLAGS, rflags);
- if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
- to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
+ if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
+}
+
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
@@ -1401,7 +1697,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
* result in a #GP unless the same write also clears TraceEn.
*/
if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
- ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
+ (data & RTIT_CTL_TRACEEN) &&
+ data != vmx->pt_desc.guest.ctl)
return 1;
/*
@@ -1417,7 +1714,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
/*
* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
- * utilize encodings marked reserved will casue a #GP fault.
+ * utilize encodings marked reserved will cause a #GP fault.
*/
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
@@ -1441,32 +1738,147 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
* cause a #GP fault.
*/
value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
return 1;
return 0;
}
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ /*
+ * Emulation of instructions in SGX enclaves is impossible as RIP does
+ * not point at the failing instruction, and even if it did, the code
+ * stream is inaccessible. Inject #UD instead of exiting to userspace
+ * so that guest userspace can't DoS the guest simply by triggering
+ * emulation (enclaves are CPL3 only).
+ */
+ if (vmx_get_exit_reason(vcpu).enclave_mode) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ /* Check that emulation is possible during event vectoring */
+ if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ !kvm_can_emulate_event_vectoring(emul_type))
+ return X86EMUL_UNHANDLEABLE_VECTORING;
+
+ return X86EMUL_CONTINUE;
+}
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- unsigned long rip;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
+ unsigned long rip, orig_rip;
+ u32 instr_len;
+
+ /*
+ * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
+ * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
+ * set when EPT misconfig occurs. In practice, real hardware updates
+ * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
+ * (namely Hyper-V) don't set it due to it being undefined behavior,
+ * i.e. we end up advancing IP with some random value.
+ */
+ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
+ instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- rip = kvm_rip_read(vcpu);
- rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- kvm_rip_write(vcpu, rip);
+ /*
+ * Emulating an enclave's instructions isn't supported as KVM
+ * cannot access the enclave's memory or its true RIP, e.g. the
+ * vmcs.GUEST_RIP points at the exit point of the enclave, not
+ * the RIP that actually triggered the VM-Exit. But, because
+ * most instructions that cause VM-Exit will #UD in an enclave,
+ * most instruction-based VM-Exits simply do not occur.
+ *
+ * There are a few exceptions, notably the debug instructions
+ * INT1ICEBRK and INT3, as they are allowed in debug enclaves
+ * and generate #DB/#BP as expected, which KVM might intercept.
+ * But again, the CPU does the dirty work and saves an instr
+ * length of zero so VMMs don't shoot themselves in the foot.
+ * WARN if KVM tries to skip a non-zero length instruction on
+ * a VM-Exit from an enclave.
+ */
+ if (!instr_len)
+ goto rip_updated;
+
+ WARN_ONCE(exit_reason.enclave_mode,
+ "skipping instruction after SGX enclave VM-Exit");
+
+ orig_rip = kvm_rip_read(vcpu);
+ rip = orig_rip + instr_len;
+#ifdef CONFIG_X86_64
+ /*
+ * We need to mask out the high 32 bits of RIP if not in 64-bit
+ * mode, but just finding out that we are in 64-bit mode is
+ * quite expensive. Only do it if there was a carry.
+ */
+ if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
+ rip = (u32)rip;
+#endif
+ kvm_rip_write(vcpu, rip);
+ } else {
+ if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+ return 0;
+ }
+rip_updated:
/* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);
+
+ return 1;
+}
+
+/*
+ * Recognizes a pending MTF VM-exit and records the nested state for later
+ * delivery.
+ */
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!is_guest_mode(vcpu))
+ return;
+
+ /*
+ * Per the SDM, MTF takes priority over debug-trap exceptions besides
+ * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
+ * or ICEBP (in the emulator proper), and skipping of ICEBP after an
+ * intercepted #DB deliberately avoids single-step #DB and MTF updates
+ * as ICEBP is higher priority than both. As instruction emulation is
+ * completed at this point (i.e. KVM is at the instruction boundary),
+ * any #DB exception pending delivery must be a debug-trap of lower
+ * priority than MTF. Record the pending MTF state to be delivered in
+ * vmx_check_nested_events().
+ */
+ if (nested_cpu_has_mtf(vmcs12) &&
+ (!vcpu->arch.exception.pending ||
+ vcpu->arch.exception.vector == DB_VECTOR) &&
+ (!vcpu->arch.exception_vmexit.pending ||
+ vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
+ vmx->nested.mtf_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ } else {
+ vmx->nested.mtf_pending = false;
+ }
+}
+
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ vmx_update_emulated_instruction(vcpu);
+ return skip_emulated_instruction(vcpu);
}
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
@@ -1482,33 +1894,40 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
-static void vmx_queue_exception(struct kvm_vcpu *vcpu)
+void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
+ u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned nr = vcpu->arch.exception.nr;
- bool has_error_code = vcpu->arch.exception.has_error_code;
- u32 error_code = vcpu->arch.exception.error_code;
- u32 intr_info = nr | INTR_INFO_VALID_MASK;
- kvm_deliver_exception_payload(vcpu);
+ kvm_deliver_exception_payload(vcpu, ex);
- if (has_error_code) {
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+ if (ex->has_error_code) {
+ /*
+ * Despite the error code being architecturally defined as 32
+ * bits, and the VMCS field being 32 bits, Intel CPUs and thus
+ * VMX don't actually supporting setting bits 31:16. Hardware
+ * will (should) never provide a bogus error code, but AMD CPUs
+ * do generate error codes with bits 31:16 set, and so KVM's
+ * ABI lets userspace shove in arbitrary 32-bit values. Drop
+ * the upper bits to avoid VM-Fail, losing information that
+ * doesn't really exist is preferable to killing the VM.
+ */
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
if (vmx->rmode.vm86_active) {
int inc_eip = 0;
- if (kvm_exception_is_soft(nr))
+ if (kvm_exception_is_soft(ex->vector))
inc_eip = vcpu->arch.event_exit_inst_len;
- if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
return;
}
- WARN_ON_ONCE(vmx->emulation_required);
+ WARN_ON_ONCE(vmx->vt.emulation_required);
- if (kvm_exception_is_soft(nr)) {
+ if (kvm_exception_is_soft(ex->vector)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
@@ -1520,144 +1939,151 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
vmx_clear_hlt(vcpu);
}
-static bool vmx_rdtscp_supported(void)
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
+ bool load_into_hardware)
{
- return cpu_has_vmx_rdtscp();
-}
+ struct vmx_uret_msr *uret_msr;
-static bool vmx_invpcid_supported(void)
-{
- return cpu_has_vmx_invpcid();
-}
-
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
-{
- struct shared_msr_entry tmp;
+ uret_msr = vmx_find_uret_msr(vmx, msr);
+ if (!uret_msr)
+ return;
- tmp = vmx->guest_msrs[to];
- vmx->guest_msrs[to] = vmx->guest_msrs[from];
- vmx->guest_msrs[from] = tmp;
+ uret_msr->load_into_hardware = load_into_hardware;
}
/*
- * Set up the vmcs to automatically save and restore system
- * msrs. Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
+ * Configuring user return MSRs to automatically save, load, and restore MSRs
+ * that need to be shoved into hardware when running the guest. Note, omitting
+ * an MSR here does _NOT_ mean it's not emulated, only that it will not be
+ * loaded into hardware when running the guest.
*/
-static void setup_msrs(struct vcpu_vmx *vmx)
+static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{
- int save_nmsrs, index;
-
- save_nmsrs = 0;
#ifdef CONFIG_X86_64
+ bool load_syscall_msrs;
+
/*
* The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set.
*/
- if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
- index = __find_msr_index(vmx, MSR_STAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_LSTAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- }
+ load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
+ (vmx->vcpu.arch.efer & EFER_SCE);
+
+ vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
+ vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif
- index = __find_msr_index(vmx, MSR_EFER);
- if (index >= 0 && update_transition_efer(vmx, index))
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_TSC_AUX);
- if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
- move_msr_up(vmx, index, save_nmsrs++);
+ vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
- vmx->save_nmsrs = save_nmsrs;
- vmx->guest_msrs_dirty = true;
+ vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(&vmx->vcpu);
+ /*
+ * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
+ * kernel and old userspace. If those guests run on a tsx=off host, do
+ * allow guests to use TSX_CTRL, but don't change the value in hardware
+ * so that TSX remains always disabled.
+ */
+ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
+
+ /*
+ * The set of MSRs to load may have changed, reload MSRs before the
+ * next VM-Enter.
+ */
+ vmx->guest_uret_msrs_loaded = false;
}
-static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
- return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
+ return vmcs12->tsc_offset;
- return vcpu->arch.tsc_offset;
+ return 0;
}
-static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- u64 g_tsc_offset = 0;
- /*
- * We're here if L1 chose not to trap WRMSR to TSC. According
- * to the spec, this should set L1's TSC; The offset that L1
- * set for L2 remains unchanged, and still needs to be added
- * to the newly set TSC to get L2's TSC.
- */
- if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
- g_tsc_offset = vmcs12->tsc_offset;
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
+ return vmcs12->tsc_multiplier;
- trace_kvm_write_tsc_offset(vcpu->vcpu_id,
- vcpu->arch.tsc_offset - g_tsc_offset,
- offset);
- vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
- return offset + g_tsc_offset;
+ return kvm_caps.default_tsc_scaling_ratio;
}
-/*
- * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
- * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
- * all guests if the "nested" module option is off, and can also be disabled
- * for a single guest by disabling its VMX cpuid bit.
- */
-bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
- return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}
-static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
- uint64_t val)
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
+}
+
+/*
+ * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
+ * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
+ * backwards compatibility even though KVM doesn't support emulating SMX. And
+ * because userspace set "VMX in SMX", the guest must also be allowed to set it,
+ * e.g. if the MSR is left unlocked and the guest does a RMW operation.
+ */
+#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
+ FEAT_CTL_SGX_LC_ENABLED | \
+ FEAT_CTL_SGX_ENABLED | \
+ FEAT_CTL_LMCE_ENABLED)
+
+static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
+ struct msr_data *msr)
{
- uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+ uint64_t valid_bits;
- return !(val & ~valid_bits);
+ /*
+ * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
+ * exposed to the guest.
+ */
+ WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
+ ~KVM_SUPPORTED_FEATURE_CONTROL);
+
+ if (!msr->host_initiated &&
+ (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
+ return false;
+
+ if (msr->host_initiated)
+ valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
+ else
+ valid_bits = vmx->msr_ia32_feature_control_valid_bits;
+
+ return !(msr->data & ~valid_bits);
}
-static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+int vmx_get_feature_msr(u32 msr, u64 *data)
{
- switch (msr->index) {
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ switch (msr) {
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested)
return 1;
- return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
default:
- return 1;
+ return KVM_MSR_RET_UNSUPPORTED;
}
-
- return 0;
}
/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr;
+ struct vmx_uret_msr *msr;
u32 index;
switch (msr_info->index) {
@@ -1674,18 +2100,23 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
- case MSR_IA32_SPEC_CTRL:
+ case MSR_IA32_TSX_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
+ return 1;
+ goto find_uret_msr;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
return 1;
- msr_info->data = to_vmx(vcpu)->spec_ctrl;
+ msr_info->data = vmx->msr_ia32_umwait_control;
break;
- case MSR_IA32_ARCH_CAPABILITIES:
+ case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
- msr_info->data = to_vmx(vcpu)->arch_capabilities;
+
+ msr_info->data = to_vmx(vcpu)->spec_ctrl;
break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
@@ -1699,49 +2130,65 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
case MSR_IA32_MCG_EXT_CTL:
if (!msr_info->host_initiated &&
!(vmx->msr_ia32_feature_control &
- FEATURE_CONTROL_LMCE))
+ FEAT_CTL_LMCE_ENABLED))
return 1;
msr_info->data = vcpu->arch.mcg_ext_ctl;
break;
- case MSR_IA32_FEATURE_CONTROL:
+ case MSR_IA32_FEAT_CTL:
msr_info->data = vmx->msr_ia32_feature_control;
break;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
- if (!nested_vmx_allowed(vcpu))
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+ if (!msr_info->host_initiated &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
+ return 1;
+ msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+ [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+ break;
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return 1;
- return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
- &msr_info->data);
- case MSR_IA32_XSS:
- if (!vmx_xsaves_supported())
+ if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
+ &msr_info->data))
return 1;
- msr_info->data = vcpu->arch.ia32_xss;
+#ifdef CONFIG_KVM_HYPERV
+ /*
+ * Enlightened VMCS v1 doesn't have certain VMCS fields but
+ * instead of just ignoring the features, different Hyper-V
+ * versions are either trying to use them and fail or do some
+ * sanity checking and refuse to boot. Filter all unsupported
+ * features out.
+ */
+ if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
+ nested_evmcs_filter_control_msr(vcpu, msr_info->index,
+ &msr_info->data);
+#endif
break;
case MSR_IA32_RTIT_CTL:
- if (pt_mode != PT_MODE_HOST_GUEST)
+ if (!vmx_pt_mode_is_host_guest())
return 1;
msr_info->data = vmx->pt_desc.guest.ctl;
break;
case MSR_IA32_RTIT_STATUS:
- if (pt_mode != PT_MODE_HOST_GUEST)
+ if (!vmx_pt_mode_is_host_guest())
return 1;
msr_info->data = vmx->pt_desc.guest.status;
break;
case MSR_IA32_RTIT_CR3_MATCH:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cr3_filtering))
return 1;
msr_info->data = vmx->pt_desc.guest.cr3_match;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1750,7 +2197,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmx->pt_desc.guest.output_base;
break;
case MSR_IA32_RTIT_OUTPUT_MASK:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1760,22 +2207,29 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges)))
+ if (!vmx_pt_mode_is_host_guest() ||
+ (index >= 2 * vmx->pt_desc.num_address_ranges))
return 1;
if (index % 2)
msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
- case MSR_TSC_AUX:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
- /* Otherwise falls through */
+ case MSR_IA32_S_CET:
+ msr_info->data = vmcs_readl(GUEST_S_CET);
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ msr_info->data = vmcs_readl(GUEST_SSP);
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE);
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = vmx_guest_debugctl_read();
+ break;
default:
- msr = find_msr_entry(vmx, msr_info->index);
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_info->index);
if (msr) {
msr_info->data = msr->data;
break;
@@ -1786,15 +2240,56 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
+static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+ u64 data)
+{
+#ifdef CONFIG_X86_64
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ return (u32)data;
+#endif
+ return (unsigned long)data;
+}
+
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+{
+ u64 debugctl = 0;
+
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
+ debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+
+ if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) &&
+ (host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
+ debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
+ if (boot_cpu_has(X86_FEATURE_RTM) &&
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
+ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
+
+ return debugctl;
+}
+
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+{
+ u64 invalid;
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
+ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
+ }
+ return !invalid;
+}
+
/*
- * Writes msr value into into the appropriate "register".
+ * Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr;
+ struct vmx_uret_msr *msr;
int ret = 0;
u32 msr_index = msr_info->index;
u64 data = msr_info->data;
@@ -1816,37 +2311,95 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KERNEL_GS_BASE:
vmx_write_guest_kernel_gs_base(vmx, data);
break;
+ case MSR_IA32_XFD:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ /*
+ * Always intercepting WRMSR could incur non-negligible
+ * overhead given xfd might be changed frequently in
+ * guest context switch. Disable write interception
+ * upon the first write with a non-zero value (indicating
+ * potential usage on dynamic xfeatures). Also update
+ * exception bitmap to trap #NM for proper virtualization
+ * of guest xfd_err.
+ */
+ if (!ret && data) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
+ MSR_TYPE_RW);
+ vcpu->arch.xfd_no_write_intercept = true;
+ vmx_update_exception_bitmap(vcpu);
+ }
+ break;
#endif
case MSR_IA32_SYSENTER_CS:
+ if (is_guest_mode(vcpu))
+ get_vmcs12(vcpu)->guest_sysenter_cs = data;
vmcs_write32(GUEST_SYSENTER_CS, data);
break;
case MSR_IA32_SYSENTER_EIP:
+ if (is_guest_mode(vcpu)) {
+ data = nested_vmx_truncate_sysenter_addr(vcpu, data);
+ get_vmcs12(vcpu)->guest_sysenter_eip = data;
+ }
vmcs_writel(GUEST_SYSENTER_EIP, data);
break;
case MSR_IA32_SYSENTER_ESP:
+ if (is_guest_mode(vcpu)) {
+ data = nested_vmx_truncate_sysenter_addr(vcpu, data);
+ get_vmcs12(vcpu)->guest_sysenter_esp = data;
+ }
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
+ return 1;
+
+ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+
+ if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_SAVE_DEBUG_CONTROLS)
+ get_vmcs12(vcpu)->guest_ia32_debugctl = data;
+
+ vmx_guest_debugctl_write(vcpu, data);
+
+ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+ return 0;
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
- if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
+ if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
return 1;
+
+ if (is_guest_mode(vcpu) &&
+ ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) ||
+ (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
+ get_vmcs12(vcpu)->guest_bndcfgs = data;
+
vmcs_write64(GUEST_BNDCFGS, data);
break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
+ return 1;
+
+ /* The reserved bit 1 and non-32 bit [63:32] should be zero */
+ if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
+ return 1;
+
+ vmx->msr_ia32_umwait_control = data;
+ break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
- /* The STIBP bit doesn't fault even if it's not advertised */
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+ if (kvm_spec_ctrl_test_value(data))
return 1;
vmx->spec_ctrl = data;
-
if (!data)
break;
@@ -1857,188 +2410,191 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_merge_msr_bitmap. We should not touch the
+ * nested_vmx_prepare_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging. We update the vmcs01 here for L1 as well
* since it will end up touching the MSR anyway now.
*/
- vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+ vmx_disable_intercept_for_msr(vcpu,
MSR_IA32_SPEC_CTRL,
MSR_TYPE_RW);
break;
- case MSR_IA32_PRED_CMD:
+ case MSR_IA32_TSX_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
return 1;
-
- if (data & ~PRED_CMD_IBPB)
+ if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
return 1;
-
- if (!data)
+ goto find_uret_msr;
+ case MSR_IA32_CR_PAT:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ if (ret)
break;
- wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ if (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
+ get_vmcs12(vcpu)->guest_ia32_pat = data;
- /*
- * For non-nested:
- * When it's written (to non-zero) for the first time, pass
- * it through.
- *
- * For nested:
- * The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_merge_msr_bitmap. We should not touch the
- * vmcs02.msr_bitmap here since it gets completely overwritten
- * in the merging.
- */
- vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
- MSR_TYPE_W);
- break;
- case MSR_IA32_ARCH_CAPABILITIES:
- if (!msr_info->host_initiated)
- return 1;
- vmx->arch_capabilities = data;
- break;
- case MSR_IA32_CR_PAT:
- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
- return 1;
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, data);
- vcpu->arch.pat = data;
- break;
- }
- ret = kvm_set_msr_common(vcpu, msr_info);
- break;
- case MSR_IA32_TSC_ADJUST:
- ret = kvm_set_msr_common(vcpu, msr_info);
break;
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
- FEATURE_CONTROL_LMCE)) ||
+ FEAT_CTL_LMCE_ENABLED)) ||
(data & ~MCG_EXT_CTL_LMCE_EN))
return 1;
vcpu->arch.mcg_ext_ctl = data;
break;
- case MSR_IA32_FEATURE_CONTROL:
- if (!vmx_feature_control_msr_valid(vcpu, data) ||
- (to_vmx(vcpu)->msr_ia32_feature_control &
- FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
+ case MSR_IA32_FEAT_CTL:
+ if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
return 1;
+
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
+
+ /* SGX may be enabled/disabled by guest's firmware */
+ vmx_write_encls_bitmap(vcpu, NULL);
break;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
- if (!msr_info->host_initiated)
- return 1; /* they are read-only */
- if (!nested_vmx_allowed(vcpu))
- return 1;
- return vmx_set_vmx_msr(vcpu, msr_index, data);
- case MSR_IA32_XSS:
- if (!vmx_xsaves_supported())
- return 1;
+ case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
/*
- * The only supported bit as of Skylake is bit 8, but
- * it is not supported on KVM.
+ * On real hardware, the LE hash MSRs are writable before
+ * the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
+ * at which point SGX related bits in IA32_FEATURE_CONTROL
+ * become writable.
+ *
+ * KVM does not emulate SGX activation for simplicity, so
+ * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
+ * is unlocked. This is technically not architectural
+ * behavior, but it's close enough.
*/
- if (data != 0)
+ if (!msr_info->host_initiated &&
+ (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
+ ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
+ !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
return 1;
- vcpu->arch.ia32_xss = data;
- if (vcpu->arch.ia32_xss != host_xss)
- add_atomic_switch_msr(vmx, MSR_IA32_XSS,
- vcpu->arch.ia32_xss, host_xss, false);
- else
- clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
+ vmx->msr_ia32_sgxlepubkeyhash
+ [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
break;
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ if (!msr_info->host_initiated)
+ return 1; /* they are read-only */
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
+ return 1;
+ return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_RTIT_CTL:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
vmx_rtit_ctl_check(vcpu, data) ||
vmx->nested.vmxon)
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
vmx->pt_desc.guest.ctl = data;
- pt_update_intercept_for_msr(vmx);
+ pt_update_intercept_for_msr(vcpu);
break;
case MSR_IA32_RTIT_STATUS:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (data & MSR_IA32_RTIT_STATUS_MASK))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (data & MSR_IA32_RTIT_STATUS_MASK)
return 1;
vmx->pt_desc.guest.status = data;
break;
case MSR_IA32_RTIT_CR3_MATCH:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_cr3_filtering))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_cr3_filtering))
return 1;
vmx->pt_desc.guest.cr3_match = data;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (!intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_topa_output) &&
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_single_range_output)) ||
- (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
+ return 1;
+ if (!pt_output_base_valid(vcpu, data))
return 1;
vmx->pt_desc.guest.output_base = data;
break;
case MSR_IA32_RTIT_OUTPUT_MASK:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (!intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_topa_output) &&
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_single_range_output)))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
return 1;
vmx->pt_desc.guest.output_mask = data;
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!pt_can_write_msr(vmx))
+ return 1;
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges)))
+ if (index >= 2 * vmx->pt_desc.num_address_ranges)
+ return 1;
+ if (is_noncanonical_msr_address(data, vcpu))
return 1;
if (index % 2)
vmx->pt_desc.guest.addr_b[index / 2] = data;
else
vmx->pt_desc.guest.addr_a[index / 2] = data;
break;
- case MSR_TSC_AUX:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
- return 1;
- /* Check reserved bit, higher 32 bits should be zero */
- if ((data >> 32) != 0)
- return 1;
- /* Otherwise falls through */
- default:
- msr = find_msr_entry(vmx, msr_index);
- if (msr) {
- u64 old_msr_data = msr->data;
- msr->data = data;
- if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
- preempt_disable();
- ret = kvm_set_shared_msr(msr->index, msr->data,
- msr->mask);
- preempt_enable();
- if (ret)
- msr->data = old_msr_data;
- }
- break;
+ case MSR_IA32_S_CET:
+ vmcs_writel(GUEST_S_CET, data);
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ vmcs_writel(GUEST_SSP, data);
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ vmcs_writel(GUEST_INTR_SSP_TABLE, data);
+ break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (data & PERF_CAP_LBR_FMT) {
+ if ((data & PERF_CAP_LBR_FMT) !=
+ (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT))
+ return 1;
+ if (!cpuid_model_is_consistent(vcpu))
+ return 1;
+ }
+ if (data & PERF_CAP_PEBS_FORMAT) {
+ if ((data & PERF_CAP_PEBS_MASK) !=
+ (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
+ return 1;
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
+ return 1;
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
+ return 1;
+ if (!cpuid_model_is_consistent(vcpu))
+ return 1;
}
ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+
+ default:
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_index);
+ if (msr)
+ ret = vmx_set_guest_uret_msr(vmx, msr, data);
+ else
+ ret = kvm_set_msr_common(vcpu, msr_info);
}
+ /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
+ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
+ vmx_update_fb_clear_dis(vcpu, vmx);
+
return ret;
}
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ unsigned long guest_owned_bits;
+
+ kvm_register_mark_available(vcpu, reg);
+
switch (reg) {
case VCPU_REGS_RSP:
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
@@ -2050,131 +2606,44 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
if (enable_ept)
ept_save_pdptrs(vcpu);
break;
- default:
- break;
- }
-}
-
-static __init int cpu_has_kvm_support(void)
-{
- return cpu_has_vmx();
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
- u64 msr;
-
- rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
- if (msr & FEATURE_CONTROL_LOCKED) {
- /* launched w/ TXT and VMX disabled */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
- && tboot_enabled())
- return 1;
- /* launched w/o TXT and VMX only enabled w/ TXT */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
- && !tboot_enabled()) {
- printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
- "activate TXT before enabling KVM\n");
- return 1;
- }
- /* launched w/o TXT and VMX disabled */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && !tboot_enabled())
- return 1;
- }
-
- return 0;
-}
-
-static void kvm_cpu_vmxon(u64 addr)
-{
- cr4_set_bits(X86_CR4_VMXE);
- intel_pt_handle_vmx(1);
+ case VCPU_EXREG_CR0:
+ guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
- asm volatile ("vmxon %0" : : "m"(addr));
-}
-
-static int hardware_enable(void)
-{
- int cpu = raw_smp_processor_id();
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- u64 old, test_bits;
-
- if (cr4_read_shadow() & X86_CR4_VMXE)
- return -EBUSY;
-
- /*
- * This can happen if we hot-added a CPU but failed to allocate
- * VP assist page for it.
- */
- if (static_branch_unlikely(&enable_evmcs) &&
- !hv_get_vp_assist_page(cpu))
- return -EFAULT;
-
- INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-
- /*
- * Now we can enable the vmclear operation in kdump
- * since the loaded_vmcss_on_cpu list on this cpu
- * has been initialized.
- *
- * Though the cpu is not in VMX operation now, there
- * is no problem to enable the vmclear operation
- * for the loaded_vmcss_on_cpu list is empty!
- */
- crash_enable_local_vmclear(cpu);
-
- rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-
- test_bits = FEATURE_CONTROL_LOCKED;
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
- if (tboot_enabled())
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
+ vcpu->arch.cr0 &= ~guest_owned_bits;
+ vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
+ break;
+ case VCPU_EXREG_CR3:
+ /*
+ * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
+ * CR3 is loaded into hardware, not the guest's CR3.
+ */
+ if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ break;
+ case VCPU_EXREG_CR4:
+ guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
- if ((old & test_bits) != test_bits) {
- /* enable and lock */
- wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
+ vcpu->arch.cr4 &= ~guest_owned_bits;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
+ break;
+ default:
+ KVM_BUG_ON(1, vcpu->kvm);
+ break;
}
- kvm_cpu_vmxon(phys_addr);
- if (enable_ept)
- ept_sync_global();
-
- return 0;
-}
-
-static void vmclear_local_loaded_vmcss(void)
-{
- int cpu = raw_smp_processor_id();
- struct loaded_vmcs *v, *n;
-
- list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
- __loaded_vmcs_clear(v);
}
-
-/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
- * tricks.
+/*
+ * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
+ * directly instead of going through cpu_has(), to ensure KVM is trapping
+ * ENCLS whenever it's supported in hardware. It does not matter whether
+ * the host OS supports or has enabled SGX.
*/
-static void kvm_cpu_vmxoff(void)
+static bool cpu_has_sgx(void)
{
- asm volatile (__ex("vmxoff"));
-
- intel_pt_handle_vmx(0);
- cr4_clear_bits(X86_CR4_VMXE);
+ return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
}
-static void hardware_disable(void)
-{
- vmclear_local_loaded_vmcss();
- kvm_cpu_vmxoff();
-}
-
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32 *result)
+static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
@@ -2192,73 +2661,89 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
- struct vmx_capability *vmx_cap)
+static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+{
+ u64 allowed;
+
+ rdmsrq(msr, allowed);
+
+ return ctl_opt & allowed;
+}
+
+#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \
+({ \
+ int i, r = 0; \
+ \
+ BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \
+ BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \
+ \
+ for (i = 0; i < ARRAY_SIZE(pairs); i++) { \
+ typeof(entry_controls) n_ctrl = pairs[i].entry_control; \
+ typeof(exit_controls) x_ctrl = pairs[i].exit_control; \
+ \
+ if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \
+ continue; \
+ \
+ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \
+ "entry = %llx (%llx), exit = %llx (%llx)\n", \
+ (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \
+ (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \
+ \
+ if (error_on_inconsistent_vmcs_config) \
+ r = -EIO; \
+ \
+ entry_controls &= ~n_ctrl; \
+ exit_controls &= ~x_ctrl; \
+ } \
+ r; \
+})
+
+static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
+ struct vmx_capability *vmx_cap)
{
- u32 vmx_msr_low, vmx_msr_high;
- u32 min, opt, min2, opt2;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
+ u64 _cpu_based_3rd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
+ u64 basic_msr;
+ u64 misc_msr;
+
+ /*
+ * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
+ * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
+ * intercepts writes to PAT and EFER, i.e. never enables those controls.
+ */
+ struct {
+ u32 entry_control;
+ u32 exit_control;
+ } const vmcs_entry_exit_pairs[] = {
+ { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
+ { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
+ { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
+ { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
+ { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
+ { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE },
+ };
memset(vmcs_conf, 0, sizeof(*vmcs_conf));
- min = CPU_BASED_HLT_EXITING |
-#ifdef CONFIG_X86_64
- CPU_BASED_CR8_LOAD_EXITING |
- CPU_BASED_CR8_STORE_EXITING |
-#endif
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_UNCOND_IO_EXITING |
- CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETING |
- CPU_BASED_MWAIT_EXITING |
- CPU_BASED_MONITOR_EXITING |
- CPU_BASED_INVLPG_EXITING |
- CPU_BASED_RDPMC_EXITING;
-
- opt = CPU_BASED_TPR_SHADOW |
- CPU_BASED_USE_MSR_BITMAPS |
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
- &_cpu_based_exec_control) < 0)
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PROCBASED_CTLS,
+ &_cpu_based_exec_control))
return -EIO;
-#ifdef CONFIG_X86_64
- if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
- _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
- ~CPU_BASED_CR8_STORE_EXITING;
-#endif
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
- min2 = 0;
- opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
- SECONDARY_EXEC_WBINVD_EXITING |
- SECONDARY_EXEC_ENABLE_VPID |
- SECONDARY_EXEC_ENABLE_EPT |
- SECONDARY_EXEC_UNRESTRICTED_GUEST |
- SECONDARY_EXEC_PAUSE_LOOP_EXITING |
- SECONDARY_EXEC_DESC |
- SECONDARY_EXEC_RDTSCP |
- SECONDARY_EXEC_ENABLE_INVPCID |
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_SHADOW_VMCS |
- SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_RDSEED_EXITING |
- SECONDARY_EXEC_RDRAND_EXITING |
- SECONDARY_EXEC_ENABLE_PML |
- SECONDARY_EXEC_TSC_SCALING |
- SECONDARY_EXEC_PT_USE_GPA |
- SECONDARY_EXEC_PT_CONCEAL_VMX |
- SECONDARY_EXEC_ENABLE_VMFUNC |
- SECONDARY_EXEC_ENCLS_EXITING;
- if (adjust_vmx_controls(min2, opt2,
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
MSR_IA32_VMX_PROCBASED_CTLS2,
- &_cpu_based_2nd_exec_control) < 0)
+ &_cpu_based_2nd_exec_control))
return -EIO;
}
+ if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
@@ -2274,44 +2759,46 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
&vmx_cap->ept, &vmx_cap->vpid);
- if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
- /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
- enabled */
- _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_INVLPG_EXITING);
- } else if (vmx_cap->ept) {
- vmx_cap->ept = 0;
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+ vmx_cap->ept) {
pr_warn_once("EPT CAP should not exist if not support "
"1-setting enable EPT VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->ept = 0;
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
- vmx_cap->vpid) {
- vmx_cap->vpid = 0;
+ vmx_cap->vpid) {
pr_warn_once("VPID CAP should not exist if not support "
"1-setting enable VPID VM-execution control\n");
+
+ if (error_on_inconsistent_vmcs_config)
+ return -EIO;
+
+ vmx_cap->vpid = 0;
}
- min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
-#ifdef CONFIG_X86_64
- min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#endif
- opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
- VM_EXIT_SAVE_IA32_PAT |
- VM_EXIT_LOAD_IA32_PAT |
- VM_EXIT_LOAD_IA32_EFER |
- VM_EXIT_CLEAR_BNDCFGS |
- VM_EXIT_PT_CONCEAL_PIP |
- VM_EXIT_CLEAR_IA32_RTIT_CTL;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
- &_vmexit_control) < 0)
+ if (!cpu_has_sgx())
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
+
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
+ _cpu_based_3rd_exec_control =
+ adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PROCBASED_CTLS3);
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
+ KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
+ MSR_IA32_VMX_EXIT_CTLS,
+ &_vmexit_control))
return -EIO;
- min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
- opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
- PIN_BASED_VMX_PREEMPTION_TIMER;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
- &_pin_based_exec_control) < 0)
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PINBASED_CTLS,
+ &_pin_based_exec_control))
return -EIO;
if (cpu_has_broken_vmx_preemption_timer())
@@ -2320,15 +2807,14 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
- min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
- opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
- VM_ENTRY_LOAD_IA32_PAT |
- VM_ENTRY_LOAD_IA32_EFER |
- VM_ENTRY_LOAD_BNDCFGS |
- VM_ENTRY_PT_CONCEAL_PIP |
- VM_ENTRY_LOAD_IA32_RTIT_CTL;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
- &_vmentry_control) < 0)
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
+ KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
+ MSR_IA32_VMX_ENTRY_CTLS,
+ &_vmentry_control))
+ return -EIO;
+
+ if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
+ _vmentry_control, _vmexit_control))
return -EIO;
/*
@@ -2337,75 +2823,197 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
* IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
* MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
*/
- if (boot_cpu_data.x86 == 0x6) {
- switch (boot_cpu_data.x86_model) {
- case 26: /* AAK155 */
- case 30: /* AAP115 */
- case 37: /* AAT100 */
- case 44: /* BC86,AAY89,BD102 */
- case 46: /* BA97 */
- _vmexit_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
- _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
- pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
- "does not work properly. Using workaround\n");
- break;
- default:
- break;
- }
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM_EP: /* AAK155 */
+ case INTEL_NEHALEM: /* AAP115 */
+ case INTEL_WESTMERE: /* AAT100 */
+ case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
+ case INTEL_NEHALEM_EX: /* BA97 */
+ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ "does not work properly. Using workaround\n");
+ break;
+ default:
+ break;
}
-
- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+ rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
return -EIO;
#ifdef CONFIG_X86_64
- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
- if (vmx_msr_high & (1u<<16))
+ /*
+ * KVM expects to be able to shove all legal physical addresses into
+ * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
+ * 0 for processors that support Intel 64 architecture".
+ */
+ if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
return -EIO;
#endif
/* Require Write-Back (WB) memory type for VMCS accesses. */
- if (((vmx_msr_high >> 18) & 15) != 6)
+ if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
return -EIO;
- vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->order = get_order(vmcs_conf->size);
- vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
-
- vmcs_conf->revision_id = vmx_msr_low;
+ rdmsrq(MSR_IA32_VMX_MISC, misc_msr);
+ vmcs_conf->basic = basic_msr;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+ vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
+ vmcs_conf->misc = misc_msr;
- if (static_branch_unlikely(&enable_evmcs))
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (enlightened_vmcs)
evmcs_sanitize_exec_ctrls(vmcs_conf);
+#endif
return 0;
}
-struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
+static bool __kvm_is_vmx_supported(void)
+{
+ int cpu = smp_processor_id();
+
+ if (!(cpuid_ecx(1) & feature_bit(VMX))) {
+ pr_err("VMX not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static bool kvm_is_vmx_supported(void)
+{
+ bool supported;
+
+ migrate_disable();
+ supported = __kvm_is_vmx_supported();
+ migrate_enable();
+
+ return supported;
+}
+
+int vmx_check_processor_compat(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct vmcs_config vmcs_conf;
+ struct vmx_capability vmx_cap;
+
+ if (!__kvm_is_vmx_supported())
+ return -EIO;
+
+ if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
+ pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ if (nested)
+ nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
+ pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
+{
+ u64 msr;
+
+ cr4_set_bits(X86_CR4_VMXE);
+
+ asm goto("1: vmxon %[vmxon_pointer]\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ : : [vmxon_pointer] "m"(vmxon_pointer)
+ : : fault);
+ return 0;
+
+fault:
+ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ cr4_clear_bits(X86_CR4_VMXE);
+
+ return -EFAULT;
+}
+
+int vmx_enable_virtualization_cpu(void)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ int r;
+
+ if (cr4_read_shadow() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * VP assist page for it.
+ */
+ if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+ intel_pt_handle_vmx(1);
+
+ r = kvm_cpu_vmxon(phys_addr);
+ if (r) {
+ intel_pt_handle_vmx(0);
+ return r;
+ }
+
+ return 0;
+}
+
+static void vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v, *n;
+
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
+}
+
+void vmx_disable_virtualization_cpu(void)
+{
+ vmclear_local_loaded_vmcss();
+
+ if (kvm_cpu_vmxoff())
+ kvm_spurious_fault();
+
+ hv_reset_evmcs();
+
+ intel_pt_handle_vmx(0);
+}
+
+struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
struct page *pages;
struct vmcs *vmcs;
- pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
+ pages = __alloc_pages_node(node, flags, 0);
if (!pages)
return NULL;
vmcs = page_address(pages);
- memset(vmcs, 0, vmcs_config.size);
+ memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
/* KVM supports Enlightened VMCS v1 only */
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
else
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
if (shadow)
vmcs->hdr.shadow_vmcs = 1;
@@ -2414,7 +3022,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
void free_vmcs(struct vmcs *vmcs)
{
- free_pages((unsigned long)vmcs, vmcs_config.order);
+ free_page((unsigned long)vmcs);
}
/*
@@ -2438,26 +3046,24 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->vmcs)
return -ENOMEM;
+ vmcs_clear(loaded_vmcs->vmcs);
+
loaded_vmcs->shadow_vmcs = NULL;
- loaded_vmcs_init(loaded_vmcs);
+ loaded_vmcs->hv_timer_soft_disabled = false;
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
if (cpu_has_vmx_msr_bitmap()) {
- loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ loaded_vmcs->msr_bitmap = (unsigned long *)
+ __get_free_page(GFP_KERNEL_ACCOUNT);
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
- if (IS_ENABLED(CONFIG_HYPERV) &&
- static_branch_unlikely(&enable_evmcs) &&
- (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
- struct hv_enlightened_vmcs *evmcs =
- (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
- evmcs->hv_enlightenments_control.msr_bitmap = 1;
- }
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
+ memset(&loaded_vmcs->controls_shadow, 0,
+ sizeof(struct vmcs_controls_shadow));
return 0;
@@ -2483,7 +3089,7 @@ static __init int alloc_kvm_area(void)
for_each_possible_cpu(cpu) {
struct vmcs *vmcs;
- vmcs = alloc_vmcs_cpu(false, cpu);
+ vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
if (!vmcs) {
free_kvm_area();
return -ENOMEM;
@@ -2499,8 +3105,8 @@ static __init int alloc_kvm_area(void)
* still be marked with revision_id reported by
* physical CPU.
*/
- if (static_branch_unlikely(&enable_evmcs))
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ if (kvm_is_using_evmcs())
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
per_cpu(vmxarea, cpu) = vmcs;
}
@@ -2523,7 +3129,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
save->dpl = save->selector & SEGMENT_RPL_MASK;
save->s = 1;
}
- vmx_set_segment(vcpu, save, seg);
+ __vmx_set_segment(vcpu, save, seg);
}
static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2532,7 +3138,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
- * Update real mode segment cache. It may be not up-to-date if sement
+ * Update real mode segment cache. It may be not up-to-date if segment
* register was written while vcpu was in a guest mode.
*/
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
@@ -2544,9 +3150,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmx->rmode.vm86_active = 0;
- vmx_segment_cache_clear(vmx);
-
- vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@ -2556,7 +3160,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
@@ -2588,9 +3192,8 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save)
var.type = 0x3;
var.avl = 0;
if (save->base & 0xf)
- printk_once(KERN_WARNING "kvm: segment base is not "
- "paragraph aligned when entering "
- "protected mode (seg=%d)", seg);
+ pr_warn_once("segment base is not paragraph aligned "
+ "when entering protected mode (seg=%d)", seg);
}
vmcs_write16(sf->selector, var.selector);
@@ -2605,6 +3208,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+ /*
+ * KVM should never use VM86 to virtualize Real Mode when L2 is active,
+ * as using VM86 is unnecessary if unrestricted guest is enabled, and
+ * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
+ * should VM-Fail and KVM should reject userspace attempts to stuff
+ * CR0.PG=0 when L2 is active.
+ */
+ WARN_ON_ONCE(is_guest_mode(vcpu));
+
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
@@ -2615,14 +3227,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmx->rmode.vm86_active = 1;
- /*
- * Very old userspace does not call KVM_SET_TSS_ADDR before entering
- * vcpu. Warn the user that an update is overdue.
- */
- if (!kvm_vmx->tss_addr)
- printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
- "called before entering vcpu\n");
-
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
@@ -2636,7 +3240,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
@@ -2644,28 +3248,29 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
-
- kvm_mmu_reset_context(vcpu);
}
-void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
- if (!msr)
- return;
+ /* Nothing to do if hardware doesn't support EFER. */
+ if (!vmx_find_uret_msr(vmx, MSR_EFER))
+ return 0;
vcpu->arch.efer = efer;
- if (efer & EFER_LMA) {
- vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
- msr->data = efer;
- } else {
- vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
+#ifdef CONFIG_X86_64
+ if (efer & EFER_LMA)
+ vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
+ else
+ vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
+#else
+ if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
+ return 1;
+#endif
- msr->data = efer & ~EFER_LME;
- }
- setup_msrs(vmx);
+ vmx_setup_uret_msrs(vmx);
+ return 0;
}
#ifdef CONFIG_X86_64
@@ -2689,58 +3294,119 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
static void exit_lmode(struct kvm_vcpu *vcpu)
{
- vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}
#endif
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
+ * the CPU is not required to invalidate guest-physical mappings on
+ * VM-Entry, even if VPID is disabled. Guest-physical mappings are
+ * associated with the root EPT structure and not any particular VPID
+ * (INVVPID also isn't required to invalidate guest-physical mappings).
+ */
+ if (enable_ept) {
+ ept_sync_global();
+ } else if (enable_vpid) {
+ if (cpu_has_vmx_invvpid_global()) {
+ vpid_sync_vcpu_global();
+ } else {
+ vpid_sync_vcpu_single(vmx->vpid);
+ vpid_sync_vcpu_single(vmx->nested.vpid02);
+ }
+ }
+}
+
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
{
- int vpid = to_vmx(vcpu)->vpid;
+ if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
+}
+
+static u64 construct_eptp(hpa_t root_hpa)
+{
+ u64 eptp = root_hpa | VMX_EPTP_MT_WB;
+ struct kvm_mmu_page *root;
- if (!vpid_sync_vcpu_addr(vpid, addr))
- vpid_sync_context(vpid);
+ if (kvm_mmu_is_dummy_root(root_hpa))
+ return eptp | VMX_EPTP_PWL_4;
/*
- * If VPIDs are not supported or enabled, then the above is a no-op.
- * But we don't really need a TLB flush in that case anyway, because
- * each VM entry/exit includes an implicit flush when VPID is 0.
+ * EPT roots should always have an associated MMU page. Return a "bad"
+ * EPTP to induce VM-Fail instead of continuing on in a unknown state.
*/
+ root = root_to_sp(root_hpa);
+ if (WARN_ON_ONCE(!root))
+ return INVALID_PAGE;
+
+ eptp |= (root->role.level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
+ if (enable_ept_ad_bits && !root->role.ad_disabled)
+ eptp |= VMX_EPTP_AD_ENABLE_BIT;
+
+ return eptp;
}
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+static void vmx_flush_tlb_ept_root(hpa_t root_hpa)
{
- ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+ u64 eptp = construct_eptp(root_hpa);
- vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
- vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
+ if (VALID_PAGE(eptp))
+ ept_sync_context(eptp);
+ else
+ ept_sync_global();
}
-static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
- if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 root_hpa = mmu->root.hpa;
+
+ /* No flush required if the current context is invalid. */
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ if (enable_ept)
+ vmx_flush_tlb_ept_root(root_hpa);
+ else
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
-static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
- ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+ /*
+ * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
+ * vmx_flush_tlb_guest() for an explanation of why this is ok.
+ */
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
+}
- vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
- vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ /*
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
+ * i.e. no explicit INVVPID is necessary.
+ */
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
-static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty))
+ if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
return;
- if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+ if (is_pae_paging(vcpu)) {
vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
@@ -2752,57 +3418,46 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
- mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
- mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
- mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
- mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
- }
+ if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
+ return;
+
+ mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
}
-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
- unsigned long cr0,
- struct kvm_vcpu *vcpu)
+#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING)
+
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- vmx_decache_cr3(vcpu);
- if (!(cr0 & X86_CR0_PG)) {
- /* From paging/starting to nonpaging */
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
- vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
- (CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING));
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- } else if (!is_paging(vcpu)) {
- /* From nonpaging to paging */
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
- vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
- ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING));
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- }
+ if (is_guest_mode(vcpu))
+ return nested_guest_cr0_valid(vcpu, cr0);
- if (!(cr0 & X86_CR0_WP))
- *hw_cr0 &= ~X86_CR0_WP;
+ if (to_vmx(vcpu)->nested.vmxon)
+ return nested_host_cr0_valid(vcpu, cr0);
+
+ return true;
}
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long hw_cr0;
+ unsigned long hw_cr0, old_cr0_pg;
+ u32 tmp;
+
+ old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
if (enable_unrestricted_guest)
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+ if (!enable_ept)
+ hw_cr0 |= X86_CR0_WP;
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
@@ -2811,121 +3466,155 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
enter_rmode(vcpu);
}
+ vmcs_writel(CR0_READ_SHADOW, cr0);
+ vmcs_writel(GUEST_CR0, hw_cr0);
+ vcpu->arch.cr0 = cr0;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
+ if (!old_cr0_pg && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
+ else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
exit_lmode(vcpu);
}
#endif
- if (enable_ept && !enable_unrestricted_guest)
- ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+ if (enable_ept && !enable_unrestricted_guest) {
+ /*
+ * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
+ * the below code _enables_ CR3 exiting, vmx_cache_reg() will
+ * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
+ * KVM's CR3 is installed.
+ */
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
- vmcs_writel(CR0_READ_SHADOW, cr0);
- vmcs_writel(GUEST_CR0, hw_cr0);
- vcpu->arch.cr0 = cr0;
+ /*
+ * When running with EPT but not unrestricted guest, KVM must
+ * intercept CR3 accesses when paging is _disabled_. This is
+ * necessary because restricted guests can't actually run with
+ * paging disabled, and so KVM stuffs its own CR3 in order to
+ * run the guest when identity mapped page tables.
+ *
+ * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
+ * update, it may be stale with respect to CR3 interception,
+ * e.g. after nested VM-Enter.
+ *
+ * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
+ * stores to forward them to L1, even if KVM does not need to
+ * intercept them to preserve its identity mapped page tables.
+ */
+ if (!(cr0 & X86_CR0_PG)) {
+ exec_controls_setbit(vmx, CR3_EXITING_BITS);
+ } else if (!is_guest_mode(vcpu)) {
+ exec_controls_clearbit(vmx, CR3_EXITING_BITS);
+ } else {
+ tmp = exec_controls_get(vmx);
+ tmp &= ~CR3_EXITING_BITS;
+ tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
+ exec_controls_set(vmx, tmp);
+ }
+
+ /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
+ if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+
+ /*
+ * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
+ * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
+ */
+ if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ }
/* depends on vcpu->arch.cr0 to be set to a new value */
- vmx->emulation_required = emulation_required(vcpu);
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}
-static int get_ept_level(struct kvm_vcpu *vcpu)
+static int vmx_get_max_ept_level(void)
{
- if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+ if (cpu_has_vmx_ept_5levels())
return 5;
return 4;
}
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
-{
- u64 eptp = VMX_EPTP_MT_WB;
-
- eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
-
- if (enable_ept_ad_bits &&
- (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
- eptp |= VMX_EPTP_AD_ENABLE_BIT;
- eptp |= (root_hpa & PAGE_MASK);
-
- return eptp;
-}
-
-void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
struct kvm *kvm = vcpu->kvm;
+ bool update_guest_cr3 = true;
unsigned long guest_cr3;
- u64 eptp;
- guest_cr3 = cr3;
if (enable_ept) {
- eptp = construct_eptp(vcpu, cr3);
- vmcs_write64(EPT_POINTER, eptp);
-
- if (kvm_x86_ops->tlb_remote_flush) {
- spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
- to_vmx(vcpu)->ept_pointer = eptp;
- to_kvm_vmx(kvm)->ept_pointers_match
- = EPT_POINTERS_CHECK;
- spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
- }
+ KVM_MMU_WARN_ON(root_to_sp(root_hpa) &&
+ root_level != root_to_sp(root_hpa)->role.level);
+ vmcs_write64(EPT_POINTER, construct_eptp(root_hpa));
- if (enable_unrestricted_guest || is_paging(vcpu) ||
- is_guest_mode(vcpu))
- guest_cr3 = kvm_read_cr3(vcpu);
- else
+ hv_track_root_tdp(vcpu, root_hpa);
+
+ if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
- ept_load_pdptrs(vcpu);
+ else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
+ guest_cr3 = vcpu->arch.cr3;
+ else /* vmcs.GUEST_CR3 is already up-to-date. */
+ update_guest_cr3 = false;
+ vmx_ept_load_pdptrs(vcpu);
+ } else {
+ guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
+ kvm_get_active_cr3_lam_bits(vcpu);
}
- vmcs_writel(GUEST_CR3, guest_cr3);
+ if (update_guest_cr3)
+ vmcs_writel(GUEST_CR3, guest_cr3);
+}
+
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ /*
+ * We operate under the default treatment of SMM, so VMX cannot be
+ * enabled under SMM. Note, whether or not VMXE is allowed at all,
+ * i.e. is a reserved bit, is handled by common x86 code.
+ */
+ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
+ return false;
+
+ if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
+ return false;
+
+ return true;
}
-int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr4;
+
/*
* Pass through host's Machine Check Enable value to hw_cr4, which
* is in force while we are in guest mode. Do not let guests control
* this bit, even if host CR4.MCE == 0.
*/
- unsigned long hw_cr4;
-
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
if (enable_unrestricted_guest)
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
- else if (to_vmx(vcpu)->rmode.vm86_active)
+ else if (vmx->rmode.vm86_active)
hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
else
hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
- if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
+ if (vmx_umip_emulated()) {
if (cr4 & X86_CR4_UMIP) {
- vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_DESC);
+ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
hw_cr4 &= ~X86_CR4_UMIP;
} else if (!is_guest_mode(vcpu) ||
- !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_DESC);
- }
-
- if (cr4 & X86_CR4_VMXE) {
- /*
- * To use VMXON (and later other VMX instructions), a guest
- * must first be able to turn on cr4.VMXE (see handle_vmon()).
- * So basically the check on whether to allow nested VMX
- * is here. We operate under the default treatment of SMM,
- * so VMX cannot be enabled under SMM.
- */
- if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
- return 1;
+ !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
+ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
+ }
}
- if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
- return 1;
-
vcpu->arch.cr4 = cr4;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
if (!enable_unrestricted_guest) {
if (enable_ept) {
@@ -2954,7 +3643,9 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
- return 0;
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
}
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
@@ -2993,7 +3684,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
var->g = (ar >> 15) & 1;
}
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment s;
@@ -3004,39 +3695,49 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
}
-int vmx_get_cpl(struct kvm_vcpu *vcpu)
+static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ar;
if (unlikely(vmx->rmode.vm86_active))
return 0;
- else {
- int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
- return VMX_AR_DPL(ar);
- }
+
+ if (no_cache)
+ ar = vmcs_read32(GUEST_SS_AR_BYTES);
+ else
+ ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
+ return VMX_AR_DPL(ar);
+}
+
+int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ return __vmx_get_cpl(vcpu, false);
+}
+
+int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu)
+{
+ return __vmx_get_cpl(vcpu, true);
}
static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
- if (var->unusable || !var->present)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ ar |= (var->unusable || !var->present) << 16;
return ar;
}
-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -3049,7 +3750,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
vmcs_write16(sf->selector, var->selector);
else if (var->s)
fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
- goto out;
+ return;
}
vmcs_writel(sf->base, var->base);
@@ -3067,16 +3768,20 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
* tree. Newer qemu binaries with that qemu fix would not need this
* kvm hack.
*/
- if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+ if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
var->type |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
+}
-out:
- vmx->emulation_required = emulation_required(vcpu);
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+ __vmx_set_segment(vcpu, var, seg);
+
+ to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu);
}
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
@@ -3084,25 +3789,25 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
*l = (ar >> 13) & 1;
}
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
dt->address = vmcs_readl(GUEST_IDTR_BASE);
}
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
vmcs_writel(GUEST_IDTR_BASE, dt->address);
}
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
dt->address = vmcs_readl(GUEST_GDTR_BASE);
}
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
vmcs_writel(GUEST_GDTR_BASE, dt->address);
@@ -3256,11 +3961,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
* not.
* We assume that registers are always usable
*/
-static bool guest_state_valid(struct kvm_vcpu *vcpu)
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
{
- if (enable_unrestricted_guest)
- return true;
-
/* real mode guest state checks */
if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
@@ -3304,78 +4006,65 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
return true;
}
-static int init_rmode_tss(struct kvm *kvm)
+static int init_rmode_tss(struct kvm *kvm, void __user *ua)
{
- gfn_t fn;
- u16 data = 0;
- int idx, r;
+ const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+ u16 data;
+ int i;
+
+ for (i = 0; i < 3; i++) {
+ if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE))
+ return -EFAULT;
+ }
- idx = srcu_read_lock(&kvm->srcu);
- fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
- r = kvm_write_guest_page(kvm, fn++, &data,
- TSS_IOPB_BASE_OFFSET, sizeof(u16));
- if (r < 0)
- goto out;
- r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
- r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
- if (r < 0)
- goto out;
+ if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16)))
+ return -EFAULT;
+
data = ~0;
- r = kvm_write_guest_page(kvm, fn, &data,
- RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
- sizeof(u8));
-out:
- srcu_read_unlock(&kvm->srcu, idx);
- return r;
+ if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8)))
+ return -EFAULT;
+
+ return 0;
}
static int init_rmode_identity_map(struct kvm *kvm)
{
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
- int i, idx, r = 0;
- kvm_pfn_t identity_map_pfn;
+ int i, r = 0;
+ void __user *uaddr;
u32 tmp;
/* Protect kvm_vmx->ept_identity_pagetable_done. */
mutex_lock(&kvm->slots_lock);
if (likely(kvm_vmx->ept_identity_pagetable_done))
- goto out2;
+ goto out;
if (!kvm_vmx->ept_identity_map_addr)
kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
- identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
-
- r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
- kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
- if (r < 0)
- goto out2;
- idx = srcu_read_lock(&kvm->srcu);
- r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
- if (r < 0)
+ uaddr = __x86_set_memory_region(kvm,
+ IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm_vmx->ept_identity_map_addr,
+ PAGE_SIZE);
+ if (IS_ERR(uaddr)) {
+ r = PTR_ERR(uaddr);
goto out;
+ }
+
/* Set up identity-mapping pagetable for EPT in real mode */
- for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
+ for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) {
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
- r = kvm_write_guest_page(kvm, identity_map_pfn,
- &tmp, i * sizeof(tmp), sizeof(tmp));
- if (r < 0)
+ if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) {
+ r = -EFAULT;
goto out;
+ }
}
kvm_vmx->ept_identity_pagetable_done = true;
out:
- srcu_read_unlock(&kvm->srcu, idx);
-
-out2:
mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -3395,36 +4084,6 @@ static void seg_setup(int seg)
vmcs_write32(sf->ar_bytes, ar);
}
-static int alloc_apic_access_page(struct kvm *kvm)
-{
- struct page *page;
- int r = 0;
-
- mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_page_done)
- goto out;
- r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
- if (r)
- goto out;
-
- page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (is_error_page(page)) {
- r = -EFAULT;
- goto out;
- }
-
- /*
- * Do not pin the page in memory, so that memory hot-unplug
- * is able to migrate it.
- */
- put_page(page);
- kvm->arch.apic_access_page_done = true;
-out:
- mutex_unlock(&kvm->slots_lock);
- return r;
-}
-
int allocate_vpid(void)
{
int vpid;
@@ -3450,232 +4109,200 @@ void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type)
+static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
{
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
- return;
-
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
-
/*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ * When KVM is a nested hypervisor on top of Hyper-V and uses
+ * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
+ * bitmap has changed.
*/
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R)
- /* read-low */
- __clear_bit(msr, msr_bitmap + 0x000 / f);
-
- if (type & MSR_TYPE_W)
- /* write-low */
- __clear_bit(msr, msr_bitmap + 0x800 / f);
-
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R)
- /* read-high */
- __clear_bit(msr, msr_bitmap + 0x400 / f);
-
- if (type & MSR_TYPE_W)
- /* write-high */
- __clear_bit(msr, msr_bitmap + 0xc00 / f);
+ if (kvm_is_using_evmcs()) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+ if (evmcs->hv_enlightenments_control.msr_bitmap)
+ evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
}
+
+ vmx->nested.force_msr_bitmap_recalc = true;
}
-static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type)
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
- int f = sizeof(unsigned long);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
if (!cpu_has_vmx_msr_bitmap())
return;
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ vmx_msr_bitmap_l01_changed(vmx);
- /*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R)
- /* read-low */
- __set_bit(msr, msr_bitmap + 0x000 / f);
-
- if (type & MSR_TYPE_W)
- /* write-low */
- __set_bit(msr, msr_bitmap + 0x800 / f);
-
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R)
- /* read-high */
- __set_bit(msr, msr_bitmap + 0x400 / f);
-
- if (type & MSR_TYPE_W)
- /* write-high */
- __set_bit(msr, msr_bitmap + 0xc00 / f);
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
+ }
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
}
-static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type, bool value)
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
- if (value)
- vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
- else
- vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
-}
+ /*
+ * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
+ * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
+ * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
+ */
+ const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
+ const int write_idx = read_idx + (0x800 / sizeof(u64));
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
+ u8 mode;
-static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
-{
- u8 mode = 0;
+ if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
+ return;
if (cpu_has_secondary_exec_ctrls() &&
- (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+ (secondary_exec_controls_get(vmx) &
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
- mode |= MSR_BITMAP_MODE_X2APIC;
+ mode = MSR_BITMAP_MODE_X2APIC;
if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ } else {
+ mode = 0;
}
- return mode;
-}
+ if (mode == vmx->x2apic_msr_bitmap_mode)
+ return;
-static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
- u8 mode)
-{
- int msr;
+ vmx->x2apic_msr_bitmap_mode = mode;
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned word = msr / BITS_PER_LONG;
- msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
- msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
- }
+ /*
+ * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
+ * registers (0x840 and above) intercepted, KVM doesn't support them.
+ * Intercept all writes by default and poke holes as needed. Pass
+ * through reads for all valid registers by default in x2APIC+APICv
+ * mode, only the current timer count needs on-demand emulation by KVM.
+ */
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
+ msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
+ else
+ msr_bitmap[read_idx] = ~0ull;
+ msr_bitmap[write_idx] = ~0ull;
- if (mode & MSR_BITMAP_MODE_X2APIC) {
- /*
- * TPR reads and writes can be virtualized even if virtual interrupt
- * delivery is not in use.
- */
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
- if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
- vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
- }
+ /*
+ * TPR reads and writes can be virtualized even if virtual interrupt
+ * delivery is not in use.
+ */
+ vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
+ !(mode & MSR_BITMAP_MODE_X2APIC));
+
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+ vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ if (enable_ipiv)
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
}
}
-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
- u8 mode = vmx_msr_bitmap_mode(vcpu);
- u8 changed = mode ^ vmx->msr_bitmap_mode;
-
- if (!changed)
- return;
-
- if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
- vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
-
- vmx->msr_bitmap_mode = mode;
-}
-
-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
-{
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
u32 i;
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
- MSR_TYPE_RW, flag);
- for (i = 0; i < vmx->pt_desc.addr_range; i++) {
- vmx_set_intercept_for_msr(msr_bitmap,
- MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap,
- MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
}
}
-static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
+static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
- return enable_apicv;
-}
+ bool intercept;
-static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- void *vapic_page;
- u32 vppr;
- int rvi;
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
- if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
- !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
- WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
- return false;
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ }
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+ }
- rvi = vmx_get_rvi();
+ /* PT MSRs can be passed through iff PT is exposed to the guest. */
+ if (vmx_pt_mode_is_host_guest())
+ pt_update_intercept_for_msr(vcpu);
- vapic_page = kmap(vmx->nested.virtual_apic_page);
- vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
- kunmap(vmx->nested.virtual_apic_page);
+ if (vcpu->arch.xfd_no_write_intercept)
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
- return ((rvi & 0xf0) > (vppr & 0xf0));
-}
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !to_vmx(vcpu)->spec_ctrl);
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
- bool nested)
-{
-#ifdef CONFIG_SMP
- int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
- if (vcpu->mode == IN_GUEST_MODE) {
- /*
- * The vector of interrupt to be delivered to vcpu had
- * been set in PIR before this function.
- *
- * Following cases will be reached in this block, and
- * we always send a notification event in all cases as
- * explained below.
- *
- * Case 1: vcpu keeps in non-root mode. Sending a
- * notification event posts the interrupt to vcpu.
- *
- * Case 2: vcpu exits to root mode and is still
- * runnable. PIR will be synced to vIRR before the
- * next vcpu entry. Sending a notification event in
- * this case has no effect, as vcpu is not in root
- * mode.
- *
- * Case 3: vcpu exits to root mode and is blocked.
- * vcpu_block() has already synced PIR to vIRR and
- * never blocks vcpu if vIRR is not cleared. Therefore,
- * a blocked vcpu here does not wait for any requested
- * interrupts in PIR, and sending a notification event
- * which has no effect is safe here.
- */
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
- apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
- return true;
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept);
}
-#endif
- return false;
+
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) {
+ intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept);
+ }
+
+ /*
+ * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
+ * filtered by userspace.
+ */
+}
+
+void vmx_recalc_intercepts(struct kvm_vcpu *vcpu)
+{
+ vmx_recalc_msr_intercepts(vcpu);
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -3683,6 +4310,13 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+ * and freed, and must not be accessed outside of vcpu->mutex. The
+ * vCPU's cached PI NV is valid if and only if posted interrupts
+ * enabled in its vmcs12, i.e. checking the vector also checks that
+ * L1 has enabled posted interrupts for L2.
+ */
if (is_guest_mode(vcpu) &&
vector == vmx->nested.posted_intr_nv) {
/*
@@ -3691,9 +4325,21 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
*/
vmx->nested.pi_pending = true;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * This pairs with the smp_mb_*() after setting vcpu->mode in
+ * vcpu_enter_guest() to guarantee the vCPU sees the event
+ * request if triggering a posted interrupt "fails" because
+ * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
+ * the smb_wmb() in kvm_make_request() only ensures everything
+ * done before making the request is visible when the request
+ * is visible, it doesn't ensure ordering between the store to
+ * vcpu->requests and the load from vcpu->mode.
+ */
+ smp_mb__after_atomic();
+
/* the PIR and ON have been set by L1. */
- if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
- kvm_vcpu_kick(vcpu);
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
return 0;
}
return -1;
@@ -3705,24 +4351,36 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
* 2. If target vcpu isn't running(root mode), kick it to pick up the
* interrupt from PIR in next vmentry.
*/
-static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
int r;
r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
if (!r)
- return;
+ return 0;
- if (pi_test_and_set_pir(vector, &vmx->pi_desc))
- return;
+ /* Note, this is called iff the local APIC is in-kernel. */
+ if (!vcpu->arch.apic->apicv_active)
+ return -1;
- /* If a previous notification has sent the IPI, nothing to do. */
- if (pi_test_and_set_on(&vmx->pi_desc))
- return;
+ __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector);
+ return 0;
+}
- if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+
+ if (vmx_deliver_posted_interrupt(vcpu, vector)) {
+ kvm_lapic_set_irr(vector, apic);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
+ } else {
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
+ }
}
/*
@@ -3735,7 +4393,6 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
{
u32 low32, high32;
unsigned long tmpl;
- struct desc_ptr dt;
unsigned long cr0, cr3, cr4;
cr0 = read_cr0();
@@ -3771,15 +4428,23 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
- store_idt(&dt);
- vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
- vmx->host_idt_base = dt.address;
+ vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
- rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+
+ /*
+ * SYSENTER is used for 32-bit system calls on either 32-bit or
+ * 64-bit kernels. It is always zero If neither is allowed, otherwise
+ * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
+ * have already done so!).
+ */
+ if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+
+ rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl);
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
@@ -3788,18 +4453,38 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
}
if (cpu_has_load_ia32_efer())
- vmcs_write64(HOST_IA32_EFER, host_efer);
+ vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
+
+ /*
+ * Supervisor shadow stack is not enabled on host side, i.e.,
+ * host IA32_S_CET.SHSTK_EN bit is guaranteed to 0 now, per SDM
+ * description(RDSSP instruction), SSP is not readable in CPL0,
+ * so resetting the two registers to 0s at VM-Exit does no harm
+ * to kernel execution. When execution flow exits to userspace,
+ * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter
+ * 3 and 4 for details.
+ */
+ if (cpu_has_load_cet_ctrl()) {
+ vmcs_writel(HOST_S_CET, kvm_host.s_cet);
+ vmcs_writel(HOST_SSP, 0);
+ vmcs_writel(HOST_INTR_SSP_TABLE, 0);
+ }
}
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
{
- vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
- if (enable_ept)
- vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+ struct kvm_vcpu *vcpu = &vmx->vcpu;
+
+ vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
+ ~vcpu->arch.cr4_guest_rsvd_bits;
+ if (!enable_ept) {
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
+ }
if (is_guest_mode(&vmx->vcpu))
- vmx->vcpu.arch.cr4_guest_owned_bits &=
- ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
- vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+ vcpu->arch.cr4_guest_owned_bits &=
+ ~get_vmcs12(vcpu)->cr4_guest_host_mask;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
}
static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
@@ -3812,49 +4497,106 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
if (!enable_vnmi)
pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
- /* Enable the preemption timer dynamically */
- pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ if (!enable_preemption_timer)
+ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
return pin_based_exec_ctrl;
}
-static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+static u32 vmx_get_initial_vmentry_ctrl(void)
+{
+ u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+ VM_ENTRY_LOAD_IA32_RTIT_CTL);
+ /*
+ * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
+ */
+ vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
+ VM_ENTRY_LOAD_IA32_EFER |
+ VM_ENTRY_IA32E_MODE);
+
+ return vmentry_ctrl;
+}
+
+static u32 vmx_get_initial_vmexit_ctrl(void)
+{
+ u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
+
+ /*
+ * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
+ * nested virtualization and thus allowed to be set in vmcs12.
+ */
+ vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
+
+ if (vmx_pt_mode_is_system())
+ vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
+ VM_EXIT_CLEAR_IA32_RTIT_CTL);
+ /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
+ return vmexit_ctrl &
+ ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
+}
+
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
- if (cpu_has_secondary_exec_ctrls()) {
- if (kvm_vcpu_apicv_active(vcpu))
- vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
- else
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.update_vmcs01_apicv_status = true;
+ return;
}
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(vcpu);
+ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
+
+ secondary_exec_controls_changebit(vmx,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY,
+ kvm_vcpu_apicv_active(vcpu));
+ if (enable_ipiv)
+ tertiary_exec_controls_changebit(vmx, TERTIARY_EXEC_IPI_VIRT,
+ kvm_vcpu_apicv_active(vcpu));
+
+ vmx_update_msr_bitmap_x2apic(vcpu);
}
-u32 vmx_exec_control(struct vcpu_vmx *vmx)
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+ /*
+ * Not used by KVM, but fully supported for nesting, i.e. are allowed in
+ * vmcs12 and propagated to vmcs02 when set in vmcs12.
+ */
+ exec_control &= ~(CPU_BASED_RDTSC_EXITING |
+ CPU_BASED_USE_IO_BITMAPS |
+ CPU_BASED_MONITOR_TRAP_FLAG |
+ CPU_BASED_PAUSE_EXITING);
+
+ /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
+ exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING);
+
if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
exec_control &= ~CPU_BASED_MOV_DR_EXITING;
- if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
+ if (!cpu_need_tpr_shadow(&vmx->vcpu))
exec_control &= ~CPU_BASED_TPR_SHADOW;
+
#ifdef CONFIG_X86_64
+ if (exec_control & CPU_BASED_TPR_SHADOW)
+ exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING);
+ else
exec_control |= CPU_BASED_CR8_STORE_EXITING |
CPU_BASED_CR8_LOAD_EXITING;
#endif
- }
- if (!enable_ept)
- exec_control |= CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_INVLPG_EXITING;
+ /* No need to intercept CR3 access or INVPLG when using EPT. */
+ if (enable_ept)
+ exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
if (kvm_mwait_in_guest(vmx->vcpu.kvm))
exec_control &= ~(CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING);
@@ -3863,14 +4605,91 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx)
return exec_control;
}
+static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
+{
+ u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
+
+ /*
+ * IPI virtualization relies on APICv. Disable IPI virtualization if
+ * APICv is inhibited.
+ */
+ if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu))
+ exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
-static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
+ return exec_control;
+}
+
+/*
+ * Adjust a single secondary execution control bit to intercept/allow an
+ * instruction in the guest. This is usually done based on whether or not a
+ * feature has been exposed to the guest in order to correctly emulate faults.
+ */
+static inline void
+vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
+ u32 control, bool enabled, bool exiting)
+{
+ /*
+ * If the control is for an opt-in feature, clear the control if the
+ * feature is not exposed to the guest, i.e. not enabled. If the
+ * control is opt-out, i.e. an exiting control, clear the control if
+ * the feature _is_ exposed to the guest, i.e. exiting/interception is
+ * disabled for the associated instruction. Note, the caller is
+ * responsible presetting exec_control to set all supported bits.
+ */
+ if (enabled == exiting)
+ *exec_control &= ~control;
+
+ /*
+ * Update the nested MSR settings so that a nested VMM can/can't set
+ * controls for features that are/aren't exposed to the guest.
+ */
+ if (nested &&
+ kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
+ /*
+ * All features that can be added or removed to VMX MSRs must
+ * be supported in the first place for nested virtualization.
+ */
+ if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
+ enabled = false;
+
+ if (enabled)
+ vmx->nested.msrs.secondary_ctls_high |= control;
+ else
+ vmx->nested.msrs.secondary_ctls_high &= ~control;
+ }
+}
+
+/*
+ * Wrapper macro for the common case of adjusting a secondary execution control
+ * based on a single guest CPUID bit, with a dedicated feature bit. This also
+ * verifies that the control is actually supported by KVM and hardware.
+ */
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+({ \
+ struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \
+ bool __enabled; \
+ \
+ if (cpu_has_vmx_##name()) { \
+ __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \
+ vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
+ __enabled, exiting); \
+ } \
+})
+
+/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
+#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
+
+#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
+
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
struct kvm_vcpu *vcpu = &vmx->vcpu;
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
if (!cpu_need_virtualize_apic_accesses(vcpu))
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
@@ -3878,6 +4697,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
enable_unrestricted_guest = 0;
}
if (!enable_unrestricted_guest)
@@ -3889,6 +4709,12 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ /*
+ * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
+ * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
+ */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
* in vmx_set_cr4. */
exec_control &= ~SECONDARY_EXEC_DESC;
@@ -3900,137 +4726,112 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
*/
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- if (!enable_pml)
+ /*
+ * PML is enabled/disabled when dirty logging of memsmlots changes, but
+ * it needs to be set here when dirty logging is already active, e.g.
+ * if this vCPU was created after dirty logging was enabled.
+ */
+ if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- if (vmx_xsaves_supported()) {
- /* Exposing XSAVES only when XSAVE is exposed */
- bool xsaves_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
-
- if (!xsaves_enabled)
- exec_control &= ~SECONDARY_EXEC_XSAVES;
-
- if (nested) {
- if (xsaves_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_XSAVES;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_XSAVES;
- }
- }
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
- if (vmx_rdtscp_supported()) {
- bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
- if (!rdtscp_enabled)
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
+ /*
+ * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
+ * feature is exposed to the guest. This creates a virtualization hole
+ * if both are supported in hardware but only one is exposed to the
+ * guest, but letting the guest execute RDTSCP or RDPID when either one
+ * is advertised is preferable to emulating the advertised instruction
+ * in KVM on #UD, and obviously better than incorrectly injecting #UD.
+ */
+ if (cpu_has_vmx_rdtscp()) {
+ bool rdpid_or_rdtscp_enabled =
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
- if (nested) {
- if (rdtscp_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDTSCP;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDTSCP;
- }
+ vmx_adjust_secondary_exec_control(vmx, &exec_control,
+ SECONDARY_EXEC_ENABLE_RDTSCP,
+ rdpid_or_rdtscp_enabled, false);
}
- if (vmx_invpcid_supported()) {
- /* Exposing INVPCID only when PCID is exposed */
- bool invpcid_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PCID);
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
- if (!invpcid_enabled) {
- exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
- guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
- }
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
- if (nested) {
- if (invpcid_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_INVPCID;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_ENABLE_INVPCID;
- }
- }
+ vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
+ ENABLE_USR_WAIT_PAUSE, false);
- if (vmx_rdrand_supported()) {
- bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
- if (rdrand_enabled)
- exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
+ if (!vcpu->kvm->arch.bus_lock_detection_enabled)
+ exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
- if (nested) {
- if (rdrand_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDRAND_EXITING;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDRAND_EXITING;
- }
- }
+ if (!kvm_notify_vmexit_enabled(vcpu->kvm))
+ exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
- if (vmx_rdseed_supported()) {
- bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
- if (rdseed_enabled)
- exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
+ return exec_control;
+}
- if (nested) {
- if (rdseed_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDSEED_EXITING;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDSEED_EXITING;
- }
- }
+static inline int vmx_get_pid_table_order(struct kvm *kvm)
+{
+ return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
+}
+
+static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
+{
+ struct page *pages;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+
+ if (!irqchip_in_kernel(kvm) || !enable_ipiv)
+ return 0;
+
+ if (kvm_vmx->pid_table)
+ return 0;
- vmx->secondary_exec_control = exec_control;
+ pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ vmx_get_pid_table_order(kvm));
+ if (!pages)
+ return -ENOMEM;
+
+ kvm_vmx->pid_table = (void *)page_address(pages);
+ return 0;
}
-static void ept_set_mmio_spte_mask(void)
+int vmx_vcpu_precreate(struct kvm *kvm)
{
- /*
- * EPT Misconfigurations can be generated if the value of bits 2:0
- * of an EPT paging-structure entry is 110b (write/execute).
- */
- kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
- VMX_EPT_MISCONFIG_WX_VALUE);
+ return vmx_alloc_ipiv_pid_table(kvm);
}
#define VMX_XSS_EXIT_BITMAP 0
-/*
- * Sets up the vmcs for emulated real mode.
- */
-static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
+static void init_vmcs(struct vcpu_vmx *vmx)
{
- int i;
+ struct kvm *kvm = vmx->vcpu.kvm;
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
if (nested)
- nested_vmx_vcpu_setup();
+ nested_vmx_set_vmcs_shadowing_bitmap();
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
- vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
/* Control */
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
- vmx->hv_deadline_tsc = -1;
+ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
+ exec_controls_set(vmx, vmx_exec_control(vmx));
if (cpu_has_secondary_exec_ctrls()) {
- vmx_compute_secondary_exec_control(vmx);
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- vmx->secondary_exec_control);
+ secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS,
+ __pa(vmx->ve_info));
}
- if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
+
+ if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4039,15 +4840,23 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write16(GUEST_INTR_STATUS, 0);
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
- vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc)));
+ }
+
+ if (vmx_can_use_ipiv(&vmx->vcpu)) {
+ vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
+ vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
}
- if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
+ if (!kvm_pause_in_guest(kvm)) {
vmcs_write32(PLE_GAP, ple_gap);
vmx->ple_window = ple_window;
vmx->ple_window_dirty = true;
}
+ if (kvm_notify_vmexit_enabled(kvm))
+ vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
+
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
@@ -4070,75 +4879,99 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
- u32 index = vmx_msr_index[i];
- u32 data_low, data_high;
- int j = vmx->nmsrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
- vmx->guest_msrs[j].index = i;
- vmx->guest_msrs[j].data = 0;
- vmx->guest_msrs[j].mask = -1ull;
- ++vmx->nmsrs;
- }
-
- vmx->arch_capabilities = kvm_get_arch_capabilities();
-
- vm_exit_controls_init(vmx, vmx_vmexit_ctrl());
+ vm_exit_controls_set(vmx, vmx_get_initial_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
- vm_entry_controls_init(vmx, vmx_vmentry_ctrl());
+ vm_entry_controls_set(vmx, vmx_get_initial_vmentry_ctrl());
- vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
- vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
+ vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
set_cr4_guest_host_mask(vmx);
- if (vmx_xsaves_supported())
+ if (vmx->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
+ if (cpu_has_vmx_xsaves())
vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
if (enable_pml) {
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
- if (cpu_has_vmx_encls_vmexit())
- vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+ vmx_write_encls_bitmap(&vmx->vcpu, NULL);
- if (pt_mode == PT_MODE_HOST_GUEST) {
+ if (vmx_pt_mode_is_host_guest()) {
memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
/* Bit[6~0] are forced to 1, writes are ignored. */
vmx->pt_desc.guest.output_mask = 0x7F;
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
}
+
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+
+ vmx_guest_debugctl_write(&vmx->vcpu, 0);
+
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (cpu_need_tpr_shadow(&vmx->vcpu))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ __pa(vmx->vcpu.arch.apic->regs));
+ vmcs_write32(TPR_THRESHOLD, 0);
+ }
+
+ vmx_setup_uret_msrs(vmx);
+}
+
+static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ init_vmcs(vmx);
+
+ if (nested &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
+ memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
+
+ vcpu_setup_sgx_lepubkeyhash(vcpu);
+
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
+ vmx->nested.current_vmptr = INVALID_GPA;
+
+#ifdef CONFIG_KVM_HYPERV
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+#endif
+
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
+
+ /*
+ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+ * or POSTED_INTR_WAKEUP_VECTOR.
+ */
+ vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
+ __pi_set_sn(&vmx->vt.pi_desc);
}
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct msr_data apic_base_msr;
- u64 cr0;
+
+ if (!init_event)
+ __vmx_vcpu_reset(vcpu);
vmx->rmode.vm86_active = 0;
vmx->spec_ctrl = 0;
- vcpu->arch.microcode_version = 0x100000000ULL;
- vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
- kvm_set_cr8(vcpu, 0);
+ vmx->msr_ia32_umwait_control = 0;
- if (!init_event) {
- apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(vcpu))
- apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
- apic_base_msr.host_initiated = true;
- kvm_set_apic_base(vcpu, &apic_base_msr);
- }
-
- vmx_segment_cache_clear(vmx);
+ vmx->hv_deadline_tsc = -1;
+ kvm_set_cr8(vcpu, 0);
seg_setup(VCPU_SREG_CS);
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
@@ -4160,91 +4993,68 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
- if (!init_event) {
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
- }
-
- kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
- kvm_rip_write(vcpu, 0xfff0);
-
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+ vmx_segment_cache_clear(vmx);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
+
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
if (kvm_mpx_supported())
vmcs_write64(GUEST_BNDCFGS, 0);
- setup_msrs(vmx);
-
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
- if (cpu_has_vmx_tpr_shadow() && !init_event) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (cpu_need_tpr_shadow(vcpu))
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- __pa(vcpu->arch.apic->regs));
- vmcs_write32(TPR_THRESHOLD, 0);
+ if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
+ vmcs_writel(GUEST_SSP, 0);
+ vmcs_writel(GUEST_INTR_SSP_TABLE, 0);
}
+ if (kvm_cpu_cap_has(X86_FEATURE_IBT) ||
+ kvm_cpu_cap_has(X86_FEATURE_SHSTK))
+ vmcs_writel(GUEST_S_CET, 0);
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (vmx->vpid != 0)
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-
- cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx->vcpu.arch.cr0 = cr0;
- vmx_set_cr0(vcpu, cr0); /* enter rmode */
- vmx_set_cr4(vcpu, 0);
- vmx_set_efer(vcpu, 0);
-
- update_exception_bitmap(vcpu);
-
vpid_sync_context(vmx->vpid);
- if (init_event)
- vmx_clear_hlt(vcpu);
+
+ vmx_update_fb_clear_dis(vcpu, vmx);
}
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
- vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
- CPU_BASED_VIRTUAL_INTR_PENDING);
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
if (!enable_vnmi ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
- enable_irq_window(vcpu);
+ vmx_enable_irq_window(vcpu);
return;
}
- vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
- CPU_BASED_VIRTUAL_NMI_PENDING);
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
-static void vmx_inject_irq(struct kvm_vcpu *vcpu)
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
uint32_t intr;
int irq = vcpu->arch.interrupt.nr;
- trace_kvm_inj_virq(irq);
+ trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
++vcpu->stat.irq_injections;
if (vmx->rmode.vm86_active) {
int inc_eip = 0;
if (vcpu->arch.interrupt.soft)
inc_eip = vcpu->arch.event_exit_inst_len;
- if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
return;
}
intr = irq | INTR_INFO_VALID_MASK;
@@ -4259,7 +5069,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
vmx_clear_hlt(vcpu);
}
-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4280,8 +5090,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->nmi_known_unmasked = false;
if (vmx->rmode.vm86_active) {
- if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
return;
}
@@ -4325,44 +5134,82 @@ void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
}
}
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
+bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+ return false;
+
+ if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+ return true;
+
+ return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_NMI));
+}
+
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
- return 0;
+ return -EBUSY;
- if (!enable_vnmi &&
- to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
- return 0;
+ /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+ return -EBUSY;
- return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
- | GUEST_INTR_STATE_NMI));
+ return !vmx_nmi_blocked(vcpu);
}
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
- return (!to_vmx(vcpu)->nested.nested_run_pending &&
- vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
- !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+ return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
- int ret;
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return false;
+
+ return __vmx_interrupt_blocked(vcpu);
+}
+
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return -EBUSY;
+
+ /*
+ * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+ * e.g. if the IRQ arrived asynchronously after checking nested events.
+ */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return -EBUSY;
+
+ return !vmx_interrupt_blocked(vcpu);
+}
+
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ void __user *ret;
if (enable_unrestricted_guest)
return 0;
- ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
- PAGE_SIZE * 3);
- if (ret)
- return ret;
+ mutex_lock(&kvm->slots_lock);
+ ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
+ PAGE_SIZE * 3);
+ mutex_unlock(&kvm->slots_lock);
+
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
+
to_kvm_vmx(kvm)->tss_addr = addr;
- return init_rmode_tss(kvm);
+
+ return init_rmode_tss(kvm, ret);
}
-static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
return 0;
@@ -4380,12 +5227,10 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
return false;
- /* fall through */
+ fallthrough;
case DB_VECTOR:
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- return false;
- /* fall through */
+ return !(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
case DE_VECTOR:
case OF_VECTOR:
case BR_VECTOR:
@@ -4395,7 +5240,6 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
case GP_VECTOR:
case MF_VECTOR:
return true;
- break;
}
return false;
}
@@ -4408,10 +5252,10 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
- if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
+ if (kvm_emulate_instruction(vcpu, 0)) {
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
return 1;
}
@@ -4427,65 +5271,99 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
return 1;
}
+static int handle_machine_check(struct kvm_vcpu *vcpu)
+{
+ /* handled by vmx_vcpu_run() */
+ return 1;
+}
+
/*
- * Trigger machine check on the host. We assume all the MSRs are already set up
- * by the CPU and that we still run on the same CPU as the MCE occurred on.
- * We pass a fake environment to the machine check handler because we want
- * the guest to be always treated like user space, no matter what context
- * it used internally.
+ * If the host has split lock detection disabled, then #AC is
+ * unconditionally injected into the guest, which is the pre split lock
+ * detection behaviour.
+ *
+ * If the host has split lock detection enabled then #AC is
+ * only injected into the guest when:
+ * - Guest CPL == 3 (user mode)
+ * - Guest has #AC detection enabled in CR0
+ * - Guest EFLAGS has AC bit set
*/
-static void kvm_machine_check(void)
+bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
{
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
- struct pt_regs regs = {
- .cs = 3, /* Fake ring 3 no matter what the guest ran on */
- .flags = X86_EFLAGS_IF,
- };
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return true;
- do_machine_check(&regs, 0);
-#endif
+ return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
+ (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}
-static int handle_machine_check(struct kvm_vcpu *vcpu)
+static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
{
- /* already handled by vcpu_run */
- return 1;
+ return vcpu->arch.guest_fpu.fpstate->xfd &&
+ !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
}
-static int handle_exception(struct kvm_vcpu *vcpu)
+static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_run *kvm_run = vcpu->run;
u32 intr_info, ex_no, error_code;
- unsigned long cr2, rip, dr6;
+ unsigned long cr2, dr6;
u32 vect_info;
- enum emulation_result er;
vect_info = vmx->idt_vectoring_info;
- intr_info = vmx->exit_intr_info;
+ intr_info = vmx_get_intr_info(vcpu);
- if (is_machine_check(intr_info))
- return handle_machine_check(vcpu);
+ /*
+ * Machine checks are handled by handle_exception_irqoff(), or by
+ * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
+ * vmx_vcpu_enter_exit().
+ */
+ if (is_machine_check(intr_info) || is_nmi(intr_info))
+ return 1;
- if (is_nmi(intr_info))
- return 1; /* already handled by vmx_vcpu_run() */
+ /*
+ * Queue the exception here instead of in handle_nm_fault_irqoff().
+ * This ensures the nested_vmx check is not skipped so vmexit can
+ * be reflected to L1 (when it intercepts #NM) before reaching this
+ * point.
+ */
+ if (is_nm_fault(intr_info)) {
+ kvm_queue_exception_p(vcpu, NM_VECTOR,
+ is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0);
+ return 1;
+ }
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
+ if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+
+ WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
+ "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
+ dump_vmcs(vcpu);
+ kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
+ return 1;
+ }
+
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
WARN_ON_ONCE(!enable_vmware_backdoor);
- er = kvm_emulate_instruction(vcpu,
- EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
- if (er == EMULATE_USER_EXIT)
- return 0;
- else if (er != EMULATE_DONE)
+
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
+ * error code on #GP.
+ */
+ if (error_code) {
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
- return 1;
+ return 1;
+ }
+ return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
}
/*
@@ -4497,18 +5375,26 @@ static int handle_exception(struct kvm_vcpu *vcpu)
!(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
- vcpu->run->internal.ndata = 3;
+ vcpu->run->internal.ndata = 4;
vcpu->run->internal.data[0] = vect_info;
vcpu->run->internal.data[1] = intr_info;
vcpu->run->internal.data[2] = error_code;
+ vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
return 0;
}
if (is_page_fault(intr_info)) {
- cr2 = vmcs_readl(EXIT_QUALIFICATION);
- /* EPT won't cause page fault directly */
- WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
- return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+ cr2 = vmx_get_exit_qual(vcpu);
+ if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+ /*
+ * EPT will cause page fault only if we need to
+ * detect illegal GPAs.
+ */
+ WARN_ON_ONCE(!allow_smaller_maxphyaddr);
+ kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
+ return 1;
+ } else
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -4517,24 +5403,46 @@ static int handle_exception(struct kvm_vcpu *vcpu)
return handle_rmode_exception(vcpu, ex_no, error_code);
switch (ex_no) {
- case AC_VECTOR:
- kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
- return 1;
case DB_VECTOR:
- dr6 = vmcs_readl(EXIT_QUALIFICATION);
+ dr6 = vmx_get_exit_qual(vcpu);
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
- vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= dr6 | DR6_RTM;
+ /*
+ * If the #DB was due to ICEBP, a.k.a. INT1, skip the
+ * instruction. ICEBP generates a trap-like #DB, but
+ * despite its interception control being tied to #DB,
+ * is an instruction intercept, i.e. the VM-Exit occurs
+ * on the ICEBP itself. Use the inner "skip" helper to
+ * avoid single-step #DB and MTF updates, as ICEBP is
+ * higher priority. Note, skipping ICEBP still clears
+ * STI and MOVSS blocking.
+ *
+ * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
+ * if single-step is enabled in RFLAGS and STI or MOVSS
+ * blocking is active, as the CPU doesn't set the bit
+ * on VM-Exit due to #DB interception. VM-Entry has a
+ * consistency check that a single-step #DB is pending
+ * in this scenario as the previous instruction cannot
+ * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
+ * don't modify RFLAGS), therefore the one instruction
+ * delay when activating single-step breakpoints must
+ * have already expired. Note, the CPU sets/clears BS
+ * as appropriate for all other VM-Exits types.
+ */
if (is_icebp(intr_info))
- skip_emulated_instruction(vcpu);
-
- kvm_queue_exception(vcpu, DB_VECTOR);
+ WARN_ON(!skip_emulated_instruction(vcpu));
+ else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
+
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
}
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
- /* fall through */
+ fallthrough;
case BP_VECTOR:
/*
* Update instruction length as we may reinject #BP from
@@ -4544,10 +5452,23 @@ static int handle_exception(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.event_exit_inst_len =
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_run->exit_reason = KVM_EXIT_DEBUG;
- rip = kvm_rip_read(vcpu);
- kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
kvm_run->debug.arch.exception = ex_no;
break;
+ case AC_VECTOR:
+ if (vmx_guest_inject_ac(vcpu)) {
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
+ }
+
+ /*
+ * Handle split lock. Depending on detection mode this will
+ * either warn and disable split lock detection for this
+ * task or force SIGBUS on it.
+ */
+ if (handle_guest_split_lock(kvm_rip_read(vcpu)))
+ return 1;
+ fallthrough;
default:
kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
kvm_run->ex.exception = ex_no;
@@ -4557,7 +5478,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
return 0;
}
-static int handle_external_interrupt(struct kvm_vcpu *vcpu)
+static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.irq_exits;
return 1;
@@ -4576,13 +5497,13 @@ static int handle_io(struct kvm_vcpu *vcpu)
int size, in, string;
unsigned port;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
string = (exit_qualification & 16) != 0;
++vcpu->stat.io_exits;
if (string)
- return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0);
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@@ -4591,8 +5512,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
return kvm_fast_pio(vcpu, size, port, in);
}
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
/*
* Patch in the VMCALL instruction:
@@ -4620,18 +5540,11 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
val = (val & ~vmcs12->cr0_guest_host_mask) |
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
- if (!nested_guest_cr0_valid(vcpu, val))
- return 1;
-
if (kvm_set_cr0(vcpu, val))
return 1;
vmcs_writel(CR0_READ_SHADOW, orig_val);
return 0;
} else {
- if (to_vmx(vcpu)->nested.vmxon &&
- !nested_host_cr0_valid(vcpu, val))
- return 1;
-
return kvm_set_cr0(vcpu, val);
}
}
@@ -4655,8 +5568,14 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
static int handle_desc(struct kvm_vcpu *vcpu)
{
- WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
- return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ /*
+ * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
+ * and other code needs to be updated if UMIP can be guest owned.
+ */
+ BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
+
+ WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
+ return kvm_emulate_instruction(vcpu, 0);
}
static int handle_cr(struct kvm_vcpu *vcpu)
@@ -4667,12 +5586,12 @@ static int handle_cr(struct kvm_vcpu *vcpu)
int err;
int ret;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
cr = exit_qualification & 15;
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
- val = kvm_register_readl(vcpu, reg);
+ val = kvm_register_read(vcpu, reg);
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
@@ -4680,6 +5599,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return kvm_complete_insn_gp(vcpu, err);
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
err = kvm_set_cr3(vcpu, val);
return kvm_complete_insn_gp(vcpu, err);
case 4:
@@ -4705,14 +5625,13 @@ static int handle_cr(struct kvm_vcpu *vcpu)
}
break;
case 2: /* clts */
- WARN_ONCE(1, "Guest should always own CR0.TS");
- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
- trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
- return kvm_skip_emulated_instruction(vcpu);
+ KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
+ return -EIO;
case 1: /*mov from cr*/
switch (cr) {
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
val = kvm_read_cr3(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
@@ -4726,7 +5645,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
break;
case 3: /* lmsw */
val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
- trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
kvm_lmsw(vcpu, val);
return kvm_skip_emulated_instruction(vcpu);
@@ -4743,17 +5662,18 @@ static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
int dr, dr7, reg;
+ int err = 1;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
/* First, if DR does not exist, trigger UD */
if (!kvm_require_dr(vcpu, dr))
return 1;
- /* Do not handle if the CPL > 0, will trigger GP on re-entry */
- if (!kvm_require_cpl(vcpu, 0))
- return 1;
+ if (vmx_get_cpl(vcpu) > 0)
+ goto out;
+
dr7 = vmcs_readl(GUEST_DR7);
if (dr7 & DR7_GD) {
/*
@@ -4762,23 +5682,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
* guest debugging itself.
*/
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
- vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
+ vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
vcpu->run->debug.arch.dr7 = dr7;
vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
vcpu->run->debug.arch.exception = DB_VECTOR;
vcpu->run->exit_reason = KVM_EXIT_DEBUG;
return 0;
} else {
- vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
- kvm_queue_exception(vcpu, DB_VECTOR);
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
return 1;
}
}
if (vcpu->guest_debug == 0) {
- vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
- CPU_BASED_MOV_DR_EXITING);
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
/*
* No more DR vmexits; force a reload of the debug registers
@@ -4791,28 +5708,17 @@ static int handle_dr(struct kvm_vcpu *vcpu)
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
if (exit_qualification & TYPE_MOV_FROM_DR) {
- unsigned long val;
-
- if (kvm_get_dr(vcpu, dr, &val))
- return 1;
- kvm_register_write(vcpu, reg, val);
- } else
- if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
- return 1;
-
- return kvm_skip_emulated_instruction(vcpu);
-}
-
-static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.dr6;
-}
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
+ err = 0;
+ } else {
+ err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
+ }
-static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
-{
+out:
+ return kvm_complete_insn_gp(vcpu, err);
}
-static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
@@ -4822,58 +5728,18 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
- vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
-}
-
-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
-{
- vmcs_writel(GUEST_DR7, val);
-}
-
-static int handle_cpuid(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_cpuid(vcpu);
-}
-
-static int handle_rdmsr(struct kvm_vcpu *vcpu)
-{
- u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- struct msr_data msr_info;
-
- msr_info.index = ecx;
- msr_info.host_initiated = false;
- if (vmx_get_msr(vcpu, &msr_info)) {
- trace_kvm_msr_read_ex(ecx);
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
- trace_kvm_msr_read(ecx, msr_info.data);
-
- /* FIXME: handling of bits 32:63 of rax, rdx */
- vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
- vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
- return kvm_skip_emulated_instruction(vcpu);
+ /*
+ * exc_debug expects dr6 to be cleared after it runs, avoid that it sees
+ * a stale dr6 from the guest.
+ */
+ set_debugreg(DR6_RESERVED, 6);
}
-static int handle_wrmsr(struct kvm_vcpu *vcpu)
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
- struct msr_data msr;
- u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
- | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
-
- msr.data = data;
- msr.index = ecx;
- msr.host_initiated = false;
- if (kvm_set_msr(vcpu, &msr) != 0) {
- trace_kvm_msr_write_ex(ecx, data);
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- trace_kvm_msr_write(ecx, data);
- return kvm_skip_emulated_instruction(vcpu);
+ vmcs_writel(GUEST_DR7, val);
}
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
@@ -4884,8 +5750,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
- vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
- CPU_BASED_VIRTUAL_INTR_PENDING);
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -4893,70 +5758,18 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_halt(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_halt(vcpu);
-}
-
-static int handle_vmcall(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_hypercall(vcpu);
-}
-
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
-}
-
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
kvm_mmu_invlpg(vcpu, exit_qualification);
return kvm_skip_emulated_instruction(vcpu);
}
-static int handle_rdpmc(struct kvm_vcpu *vcpu)
-{
- int err;
-
- err = kvm_rdpmc(vcpu);
- return kvm_complete_insn_gp(vcpu, err);
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_wbinvd(vcpu);
-}
-
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
-{
- u64 new_bv = kvm_read_edx_eax(vcpu);
- u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
-
- if (kvm_set_xcr(vcpu, index, new_bv) == 0)
- return kvm_skip_emulated_instruction(vcpu);
- return 1;
-}
-
-static int handle_xsaves(struct kvm_vcpu *vcpu)
-{
- kvm_skip_emulated_instruction(vcpu);
- WARN(1, "this should never happen\n");
- return 1;
-}
-
-static int handle_xrstors(struct kvm_vcpu *vcpu)
-{
- kvm_skip_emulated_instruction(vcpu);
- WARN(1, "this should never happen\n");
- return 1;
-}
-
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
if (likely(fasteoi)) {
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
int access_type, offset;
access_type = exit_qualification & APIC_ACCESS_TYPE;
@@ -4972,12 +5785,12 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
}
- return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0);
}
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
int vector = exit_qualification & 0xff;
/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
@@ -4987,10 +5800,17 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
static int handle_apic_write(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 offset = exit_qualification & 0xfff;
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+
+ /*
+ * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
+ * hardware has done any necessary aliasing, offset adjustments, etc...
+ * for the access. I.e. the correct value has already been written to
+ * the vAPIC page for the correct 16-byte chunk. KVM needs only to
+ * retrieve the register value and emulate the access.
+ */
+ u32 offset = exit_qualification & 0xff0;
- /* APIC-write VM exit is trap-like and thus no need to adjust IP */
kvm_apic_write_nodecode(vcpu, offset);
return 1;
}
@@ -5008,7 +5828,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
reason = (u32)exit_qualification >> 30;
if (reason == TASK_SWITCH_GATE && idt_v) {
@@ -5028,7 +5848,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
error_code =
vmcs_read32(IDT_VECTORING_ERROR_CODE);
}
- /* fall through */
+ fallthrough;
case INTR_TYPE_SOFT_EXCEPTION:
kvm_clear_exception_queue(vcpu);
break;
@@ -5041,32 +5861,21 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
type != INTR_TYPE_EXT_INTR &&
type != INTR_TYPE_NMI_INTR))
- skip_emulated_instruction(vcpu);
-
- if (kvm_task_switch(vcpu, tss_selector,
- type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
- has_error_code, error_code) == EMULATE_FAIL) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return 0;
- }
+ WARN_ON(!skip_emulated_instruction(vcpu));
/*
* TODO: What about debug traps on tss switch?
* Are we supposed to inject them and update dr6?
*/
-
- return 1;
+ return kvm_task_switch(vcpu, tss_selector,
+ type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
+ reason, has_error_code, error_code);
}
static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification;
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
gpa_t gpa;
- u64 error_code;
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
/*
* EPT violation happened while executing iret from NMI,
@@ -5080,34 +5889,29 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- trace_kvm_page_fault(gpa, exit_qualification);
-
- /* Is it a read fault? */
- error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
- ? PFERR_USER_MASK : 0;
- /* Is it a write fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
- ? PFERR_WRITE_MASK : 0;
- /* Is it a fetch fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
- ? PFERR_FETCH_MASK : 0;
- /* ept page table entry is present? */
- error_code |= (exit_qualification &
- (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
- EPT_VIOLATION_EXECUTABLE))
- ? PFERR_PRESENT_MASK : 0;
-
- error_code |= (exit_qualification & 0x100) != 0 ?
- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
-
- vcpu->arch.exit_qualification = exit_qualification;
- return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+ trace_kvm_page_fault(vcpu, gpa, exit_qualification);
+
+ /*
+ * Check that the GPA doesn't exceed physical memory limits, as that is
+ * a guest page fault. We have to emulate the instruction here, because
+ * if the illegal address is that of a paging structure, then
+ * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
+ * would also use advanced VM-exit information for EPT violations to
+ * reconstruct the page fault error code.
+ */
+ if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
+ return kvm_emulate_instruction(vcpu, 0);
+
+ return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
}
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
gpa_t gpa;
+ if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
+ return 1;
+
/*
* A nested guest cannot optimize MMIO vmexits, because we have an
* nGPA here instead of the required GPA.
@@ -5116,21 +5920,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
if (!is_guest_mode(vcpu) &&
!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
trace_kvm_fast_mmio(gpa);
- /*
- * Doing kvm_skip_emulated_instruction() depends on undefined
- * behavior: Intel's manual doesn't mandate
- * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
- * occurs and while on real hardware it was observed to be set,
- * other hypervisors (namely Hyper-V) don't set it, we end up
- * advancing IP with some random value. Disable fast mmio when
- * running nested and keep it for real hardware in hope that
- * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
- */
- if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
- return kvm_skip_emulated_instruction(vcpu);
- else
- return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
- EMULATE_DONE;
+ return kvm_skip_emulated_instruction(vcpu);
}
return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@ -5138,138 +5928,104 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
- WARN_ON_ONCE(!enable_vnmi);
- vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
- CPU_BASED_VIRTUAL_NMI_PENDING);
+ if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
+ return -EIO;
+
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
-static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
+/*
+ * Returns true if emulation is required (due to the vCPU having invalid state
+ * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the
+ * current vCPU state.
+ */
+static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- enum emulation_result err = EMULATE_DONE;
- int ret = 1;
- u32 cpu_exec_ctrl;
- bool intr_window_requested;
- unsigned count = 130;
+
+ if (!vmx->vt.emulation_required)
+ return false;
/*
- * We should never reach the point where we are emulating L2
- * due to invalid guest state as that means we incorrectly
- * allowed a nested VMEntry with an invalid vmcs12.
+ * It is architecturally impossible for emulation to be required when a
+ * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
+ * guest state is invalid and unrestricted guest is disabled, i.e. KVM
+ * should synthesize VM-Fail instead emulation L2 code. This path is
+ * only reachable if userspace modifies L2 guest state after KVM has
+ * performed the nested VM-Enter consistency checks.
*/
- WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
+ if (vmx->nested.nested_run_pending)
+ return true;
- cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
+ /*
+ * KVM only supports emulating exceptions if the vCPU is in Real Mode.
+ * If emulation is required, KVM can't perform a successful VM-Enter to
+ * inject the exception.
+ */
+ return !vmx->rmode.vm86_active &&
+ (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
+}
+
+static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool intr_window_requested;
+ unsigned count = 130;
+
+ intr_window_requested = exec_controls_get(vmx) &
+ CPU_BASED_INTR_WINDOW_EXITING;
- while (vmx->emulation_required && count-- != 0) {
- if (intr_window_requested && vmx_interrupt_allowed(vcpu))
+ while (vmx->vt.emulation_required && count-- != 0) {
+ if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
return handle_interrupt_window(&vmx->vcpu);
if (kvm_test_request(KVM_REQ_EVENT, vcpu))
return 1;
- err = kvm_emulate_instruction(vcpu, 0);
-
- if (err == EMULATE_USER_EXIT) {
- ++vcpu->stat.mmio_exits;
- ret = 0;
- goto out;
- }
+ /*
+ * Ensure that any updates to kvm->buses[] observed by the
+ * previous instruction (emulated or otherwise) are also
+ * visible to the instruction KVM is about to emulate.
+ */
+ smp_rmb();
- if (err != EMULATE_DONE)
- goto emulation_error;
+ if (!kvm_emulate_instruction(vcpu, 0))
+ return 0;
- if (vmx->emulation_required && !vmx->rmode.vm86_active &&
- vcpu->arch.exception.pending)
- goto emulation_error;
+ if (vmx_unhandleable_emulation_required(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- ret = kvm_vcpu_halt(vcpu);
- goto out;
+ return kvm_emulate_halt_noskip(vcpu);
}
- if (signal_pending(current))
- goto out;
- if (need_resched())
- schedule();
+ /*
+ * Note, return 1 and not 0, vcpu_run() will invoke
+ * xfer_to_guest_mode() which will create a proper return
+ * code.
+ */
+ if (__xfer_to_guest_mode_work_pending())
+ return 1;
}
-out:
- return ret;
-
-emulation_error:
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return 0;
-}
-
-static void grow_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- int old = vmx->ple_window;
-
- vmx->ple_window = __grow_ple_window(old, ple_window,
- ple_window_grow,
- ple_window_max);
-
- if (vmx->ple_window != old)
- vmx->ple_window_dirty = true;
-
- trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
-}
-
-static void shrink_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- int old = vmx->ple_window;
-
- vmx->ple_window = __shrink_ple_window(old, ple_window,
- ple_window_shrink,
- ple_window);
-
- if (vmx->ple_window != old)
- vmx->ple_window_dirty = true;
-
- trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
+ return 1;
}
-/*
- * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
- */
-static void wakeup_handler(void)
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu;
- int cpu = smp_processor_id();
-
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
- list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
- blocked_vcpu_list) {
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (pi_test_on(pi_desc) == 1)
- kvm_vcpu_kick(vcpu);
+ if (vmx_unhandleable_emulation_required(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
}
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-}
-static void vmx_enable_tdp(void)
-{
- kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
- enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
- enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
- 0ull, VMX_EPT_EXECUTABLE_MASK,
- cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
- VMX_EPT_RWX_MASK, 0ull);
-
- ept_set_mmio_spte_mask();
- kvm_enable_tdp();
+ return 1;
}
/*
@@ -5291,131 +6047,40 @@ static int handle_pause(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-static int handle_nop(struct kvm_vcpu *vcpu)
-{
- return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_mwait(struct kvm_vcpu *vcpu)
-{
- printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
- return handle_nop(vcpu);
-}
-
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
-{
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
-}
-
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
return 1;
}
-static int handle_monitor(struct kvm_vcpu *vcpu)
-{
- printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
- return handle_nop(vcpu);
-}
-
static int handle_invpcid(struct kvm_vcpu *vcpu)
{
u32 vmx_instruction_info;
unsigned long type;
- bool pcid_enabled;
gva_t gva;
- struct x86_exception e;
- unsigned i;
- unsigned long roots_to_free = 0;
struct {
u64 pcid;
u64 gla;
} operand;
+ int gpr_index;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
-
- if (type > 3) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info);
+ type = kvm_register_read(vcpu, gpr_index);
/* According to the Intel instruction reference, the memory operand
* is read even if it isn't needed (e.g., for type==all)
*/
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmx_instruction_info, false, &gva))
- return 1;
-
- if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
- kvm_inject_page_fault(vcpu, &e);
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_instruction_info, false,
+ sizeof(operand), &gva))
return 1;
- }
-
- if (operand.pcid >> 12 != 0) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
-
- switch (type) {
- case INVPCID_TYPE_INDIV_ADDR:
- if ((!pcid_enabled && (operand.pcid != 0)) ||
- is_noncanonical_address(operand.gla, vcpu)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
- return kvm_skip_emulated_instruction(vcpu);
-
- case INVPCID_TYPE_SINGLE_CTXT:
- if (!pcid_enabled && (operand.pcid != 0)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- if (kvm_get_active_pcid(vcpu) == operand.pcid) {
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- }
-
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
- == operand.pcid)
- roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
- /*
- * If neither the current cr3 nor any of the prev_roots use the
- * given PCID, then nothing needs to be done here because a
- * resync will happen anyway before switching to any other CR3.
- */
-
- return kvm_skip_emulated_instruction(vcpu);
-
- case INVPCID_TYPE_ALL_NON_GLOBAL:
- /*
- * Currently, KVM doesn't mark global entries in the shadow
- * page tables, so a non-global flush just degenerates to a
- * global flush. If needed, we could optimize this later by
- * keeping track of global entries in shadow page tables.
- */
-
- /* fall-through */
- case INVPCID_TYPE_ALL_INCL_GLOBAL:
- kvm_mmu_unload(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
-
- default:
- BUG(); /* We have already checked above that type <= 3 */
- }
+ return kvm_handle_invpcid(vcpu, type, gva);
}
static int handle_pml_full(struct kvm_vcpu *vcpu)
@@ -5424,7 +6089,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
trace_kvm_pml_full(vcpu->vcpu_id);
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
/*
* PML buffer FULL happened while executing iret from NMI,
@@ -5443,16 +6108,52 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * In the *extremely* unlikely scenario that this is a spurious VM-Exit
+ * due to the timer expiring while it was "soft" disabled, just eat the
+ * exit and re-enter the guest.
+ */
+ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
+ return EXIT_FASTPATH_REENTER_GUEST;
+
+ /*
+ * If the timer expired because KVM used it to force an immediate exit,
+ * then mission accomplished.
+ */
+ if (force_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ /*
+ * If L2 is active, go down the slow path as emulating the guest timer
+ * expiration likely requires synthesizing a nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
+}
+
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
- if (!to_vmx(vcpu)->req_immediate_exit)
- kvm_lapic_expired_hv_timer(vcpu);
+ /*
+ * This non-fastpath handler is reached if and only if the preemption
+ * timer was being used to emulate a guest timer while L2 is active.
+ * All other scenarios are supposed to be handled in the fastpath.
+ */
+ WARN_ON_ONCE(!is_guest_mode(vcpu));
+ kvm_lapic_expired_hv_timer(vcpu);
return 1;
}
/*
* When nested=0, all VMX instruction VM Exits filter here. The handlers
- * are overwritten by nested_vmx_setup() when nested=1.
+ * are overwritten by nested_vmx_hardware_setup() when nested=1.
*/
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
@@ -5460,16 +6161,78 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
/*
- * SGX virtualization is not yet supported. There is no software
- * enable bit for SGX, so we have to trap ENCLS and inject a #UD
- * to prevent the guest from executing ENCLS.
+ * SGX virtualization is disabled. There is no software enable bit for
+ * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
+ * the guest from executing ENCLS (when SGX is supported by hardware).
*/
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
+#endif /* CONFIG_X86_SGX_KVM */
+
+static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
+ * VM-Exits. Unconditionally set the flag here and leave the handling to
+ * vmx_handle_exit().
+ */
+ to_vt(vcpu)->exit_reason.bus_lock_detected = true;
+ return 1;
+}
+
+static int handle_notify(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual = vmx_get_exit_qual(vcpu);
+ bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
+
+ ++vcpu->stat.notify_window_exits;
+
+ /*
+ * Notify VM exit happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ */
+ if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER ||
+ context_invalid) {
+ vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
+ vcpu->run->notify.flags = context_invalid ?
+ KVM_NOTIFY_CONTEXT_INVALID : 0;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int vmx_get_msr_imm_reg(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_instr_info_reg(vmcs_read32(VMX_INSTRUCTION_INFO));
+}
+
+static int handle_rdmsr_imm(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_rdmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
+}
+
+static int handle_wrmsr_imm(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
+}
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
@@ -5477,22 +6240,22 @@ static int handle_encls(struct kvm_vcpu *vcpu)
* to be done to userspace and return 0.
*/
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
- [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
+ [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
[EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
[EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
[EXIT_REASON_CR_ACCESS] = handle_cr,
[EXIT_REASON_DR_ACCESS] = handle_dr,
- [EXIT_REASON_CPUID] = handle_cpuid,
- [EXIT_REASON_MSR_READ] = handle_rdmsr,
- [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
- [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
- [EXIT_REASON_HLT] = handle_halt,
- [EXIT_REASON_INVD] = handle_invd,
+ [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
+ [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
+ [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
+ [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
+ [EXIT_REASON_HLT] = kvm_emulate_halt,
+ [EXIT_REASON_INVD] = kvm_emulate_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
- [EXIT_REASON_RDPMC] = handle_rdpmc,
- [EXIT_REASON_VMCALL] = handle_vmcall,
+ [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
+ [EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
[EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
[EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
[EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
@@ -5506,8 +6269,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_APIC_WRITE] = handle_apic_write,
[EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
- [EXIT_REASON_WBINVD] = handle_wbinvd,
- [EXIT_REASON_XSETBV] = handle_xsetbv,
+ [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
+ [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_GDTR_IDTR] = handle_desc,
@@ -5515,29 +6278,57 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
- [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
[EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
- [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
[EXIT_REASON_INVEPT] = handle_vmx_instruction,
[EXIT_REASON_INVVPID] = handle_vmx_instruction,
- [EXIT_REASON_RDRAND] = handle_invalid_op,
- [EXIT_REASON_RDSEED] = handle_invalid_op,
- [EXIT_REASON_XSAVES] = handle_xsaves,
- [EXIT_REASON_XRSTORS] = handle_xrstors,
+ [EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
+ [EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_INVPCID] = handle_invpcid,
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
[EXIT_REASON_ENCLS] = handle_encls,
+ [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
+ [EXIT_REASON_NOTIFY] = handle_notify,
+ [EXIT_REASON_SEAMCALL] = handle_tdx_instruction,
+ [EXIT_REASON_TDCALL] = handle_tdx_instruction,
+ [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
+ [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
};
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
- *info1 = vmcs_readl(EXIT_QUALIFICATION);
- *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ *reason = vmx->vt.exit_reason.full;
+ *info1 = vmx_get_exit_qual(vcpu);
+ if (!(vmx->vt.exit_reason.failed_vmentry)) {
+ *info2 = vmx->idt_vectoring_info;
+ *intr_info = vmx_get_intr_info(vcpu);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ else
+ *error_code = 0;
+ } else {
+ *info2 = 0;
+ *intr_info = 0;
+ *error_code = 0;
+ }
+}
+
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
+{
+ *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+ else
+ *error_code = 0;
}
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
@@ -5551,50 +6342,40 @@ static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u16 pml_idx, pml_tail_index;
u64 *pml_buf;
- u16 pml_idx;
+ int i;
pml_idx = vmcs_read16(GUEST_PML_INDEX);
/* Do nothing if PML buffer is empty */
- if (pml_idx == (PML_ENTITY_NUM - 1))
+ if (pml_idx == PML_HEAD_INDEX)
return;
+ /*
+ * PML index always points to the next available PML buffer entity
+ * unless PML log has just overflowed.
+ */
+ pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
- /* PML index always points to next available PML buffer entity */
- if (pml_idx >= PML_ENTITY_NUM)
- pml_idx = 0;
- else
- pml_idx++;
-
+ /*
+ * PML log is written backwards: the CPU first writes the entry 511
+ * then the entry 510, and so on.
+ *
+ * Read the entries in the same order they were written, to ensure that
+ * the dirty ring is filled in the same order the CPU wrote them.
+ */
pml_buf = page_address(vmx->pml_pg);
- for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+
+ for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
u64 gpa;
- gpa = pml_buf[pml_idx];
+ gpa = pml_buf[i];
WARN_ON(gpa & (PAGE_SIZE - 1));
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
}
/* reset PML index */
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
-}
-
-/*
- * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
- * Called before reporting dirty_bitmap to userspace.
- */
-static void kvm_flush_pml_buffers(struct kvm *kvm)
-{
- int i;
- struct kvm_vcpu *vcpu;
- /*
- * We only need to kick vcpu out of guest mode here, as PML buffer
- * is flushed at beginning of all VMEXITs, and it's obvious that only
- * vcpus running in guest are possible to have unflushed GPAs in PML
- * buffer.
- */
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_vcpu_kick(vcpu);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
static void vmx_dump_sel(char *name, uint32_t sel)
@@ -5613,20 +6394,48 @@ static void vmx_dump_dtsel(char *name, uint32_t limit)
vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
}
-static void dump_vmcs(void)
+static void vmx_dump_msrs(char *name, struct vmx_msrs *m)
{
- u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
- u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
- u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
- u32 secondary_exec_control = 0;
- unsigned long cr4 = vmcs_readl(GUEST_CR4);
- u64 efer = vmcs_read64(GUEST_IA32_EFER);
- int i, n;
+ unsigned int i;
+ struct vmx_msr_entry *e;
+
+ pr_err("MSR %s:\n", name);
+ for (i = 0, e = m->val; i < m->nr; ++i, ++e)
+ pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
+}
+
+void dump_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmentry_ctl, vmexit_ctl;
+ u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
+ u64 tertiary_exec_control;
+ unsigned long cr4;
+ int efer_slot;
+
+ if (!dump_invalid_vmcs) {
+ pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
+ return;
+ }
+
+ vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
+ vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
+ cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+ cr4 = vmcs_readl(GUEST_CR4);
if (cpu_has_secondary_exec_ctrls())
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ else
+ secondary_exec_control = 0;
+
+ if (cpu_has_tertiary_exec_ctrls())
+ tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
+ else
+ tertiary_exec_control = 0;
+ pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
+ vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
pr_err("*** Guest State ***\n");
pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
@@ -5634,9 +6443,7 @@ static void dump_vmcs(void)
pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
- if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
- (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
- {
+ if (cpu_has_vmx_ept()) {
pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
@@ -5659,10 +6466,20 @@ static void dump_vmcs(void)
vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
- if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
- (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
- pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
- efer, vmcs_read64(GUEST_IA32_PAT));
+ efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
+ else if (efer_slot >= 0)
+ pr_err("EFER= 0x%016llx (autoload)\n",
+ vmx->msr_autoload.guest.val[efer_slot].value);
+ else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer | (EFER_LMA | EFER_LME));
+ else
+ pr_err("EFER= 0x%016llx (effective)\n",
+ vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
vmcs_read64(GUEST_IA32_DEBUGCTL),
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
@@ -5678,7 +6495,15 @@ static void dump_vmcs(void)
if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
pr_err("InterruptStatus = %04x\n",
vmcs_read16(GUEST_INTR_STATUS));
-
+ if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
+ if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
+ vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);
+
+ if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE)
+ pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+ vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP),
+ vmcs_readl(GUEST_INTR_SSP_TABLE));
pr_err("*** Host State ***\n");
pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
@@ -5699,19 +6524,26 @@ static void dump_vmcs(void)
vmcs_readl(HOST_IA32_SYSENTER_ESP),
vmcs_read32(HOST_IA32_SYSENTER_CS),
vmcs_readl(HOST_IA32_SYSENTER_EIP));
- if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
- pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
- vmcs_read64(HOST_IA32_EFER),
- vmcs_read64(HOST_IA32_PAT));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
+ pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
+ pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
if (cpu_has_load_perf_global_ctrl() &&
vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
pr_err("PerfGlobCtl = 0x%016llx\n",
vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+ if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
+ vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);
+ if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE)
+ pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n",
+ vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP),
+ vmcs_readl(HOST_INTR_SSP_TABLE));
pr_err("*** Control State ***\n");
- pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
- pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
- pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+ pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
+ cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
+ pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
+ pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
vmcs_read32(EXCEPTION_BITMAP),
vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
@@ -5733,101 +6565,155 @@ static void dump_vmcs(void)
if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
pr_err("TSC Multiplier = 0x%016llx\n",
vmcs_read64(TSC_MULTIPLIER));
- if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
- pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+ if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
+ u16 status = vmcs_read16(GUEST_INTR_STATUS);
+ pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
+ }
+ pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
+ pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
+ pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
+ }
if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
- n = vmcs_read32(CR3_TARGET_COUNT);
- for (i = 0; i + 1 < n; i += 4)
- pr_err("CR3 target%u=%016lx target%u=%016lx\n",
- i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
- i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
- if (i < n)
- pr_err("CR3 target%u=%016lx\n",
- i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
pr_err("PLE Gap=%08x Window=%08x\n",
vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
pr_err("Virtual processor ID = 0x%04x\n",
vmcs_read16(VIRTUAL_PROCESSOR_ID));
+ if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+ u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
+
+ /*
+ * If KVM is dumping the VMCS, then something has gone wrong
+ * already. Derefencing an address from the VMCS, which could
+ * very well be corrupted, is a terrible idea. The virtual
+ * address is known so use it.
+ */
+ pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
+ ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
+ pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+ ve_info->exit_reason, ve_info->delivery,
+ ve_info->exit_qualification,
+ ve_info->guest_linear_address,
+ ve_info->guest_physical_address, ve_info->eptp_index);
+ }
}
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
-static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
u32 vectoring_info = vmx->idt_vectoring_info;
-
- trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+ u16 exit_handler_index;
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
* updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
* querying dirty_bitmap, we only need to kick all vcpus out of guest
* mode as if vcpus is in root mode, the PML buffer must has been
- * flushed already.
+ * flushed already. Note, PML is never enabled in hardware while
+ * running L2.
*/
- if (enable_pml)
+ if (enable_pml && !is_guest_mode(vcpu))
vmx_flush_pml_buffer(vcpu);
- /* If guest state is invalid, start emulating */
- if (vmx->emulation_required)
- return handle_invalid_guest_state(vcpu);
+ /*
+ * KVM should never reach this point with a pending nested VM-Enter.
+ * More specifically, short-circuiting VM-Entry to emulate L2 due to
+ * invalid guest state should never happen as that means KVM knowingly
+ * allowed a nested VM-Enter with an invalid vmcs12. More below.
+ */
+ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
+ return -EIO;
+
+ if (is_guest_mode(vcpu)) {
+ /*
+ * PML is never enabled when running L2, bail immediately if a
+ * PML full exit occurs as something is horribly wrong.
+ */
+ if (exit_reason.basic == EXIT_REASON_PML_FULL)
+ goto unexpected_vmexit;
+
+ /*
+ * The host physical addresses of some pages of guest memory
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+ */
+ nested_mark_vmcs12_pages_dirty(vcpu);
+
+ /*
+ * Synthesize a triple fault if L2 state is invalid. In normal
+ * operation, nested VM-Enter rejects any attempt to enter L2
+ * with invalid state. However, those checks are skipped if
+ * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
+ * L2 state is invalid, it means either L1 modified SMRAM state
+ * or userspace provided bad state. Synthesize TRIPLE_FAULT as
+ * doing so is architecturally allowed in the RSM case, and is
+ * the least awful solution for the userspace case without
+ * risking false positives.
+ */
+ if (vmx->vt.emulation_required) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+ return 1;
+ }
+
+ if (nested_vmx_reflect_vmexit(vcpu))
+ return 1;
+ }
- if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
- return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+ /* If guest state is invalid, start emulating. L2 is handled above. */
+ if (vmx->vt.emulation_required)
+ return handle_invalid_guest_state(vcpu);
- if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
- dump_vmcs();
+ if (exit_reason.failed_vmentry) {
+ dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
- = exit_reason;
+ = exit_reason.full;
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
if (unlikely(vmx->fail)) {
+ dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR);
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
- /*
- * Note:
- * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
- * delivery event since it indicates guest is accessing MMIO.
- * The vm-exit can be triggered again after return to guest that
- * will cause infinite loop.
- */
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
- (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
- exit_reason != EXIT_REASON_EPT_VIOLATION &&
- exit_reason != EXIT_REASON_PML_FULL &&
- exit_reason != EXIT_REASON_TASK_SWITCH)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
- vcpu->run->internal.ndata = 3;
- vcpu->run->internal.data[0] = vectoring_info;
- vcpu->run->internal.data[1] = exit_reason;
- vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
- if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
- vcpu->run->internal.ndata++;
- vcpu->run->internal.data[3] =
- vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- }
+ (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
+ exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason.basic != EXIT_REASON_PML_FULL &&
+ exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
+ exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
+ exit_reason.basic != EXIT_REASON_NOTIFY &&
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
+ kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
return 0;
}
if (unlikely(!enable_vnmi &&
vmx->loaded_vmcs->soft_vnmi_blocked)) {
- if (vmx_interrupt_allowed(vcpu)) {
+ if (!vmx_interrupt_blocked(vcpu)) {
vmx->loaded_vmcs->soft_vnmi_blocked = 0;
} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
vcpu->arch.nmi_pending) {
@@ -5844,105 +6730,78 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
}
- if (exit_reason < kvm_vmx_max_exit_handlers
- && kvm_vmx_exit_handlers[exit_reason])
- return kvm_vmx_exit_handlers[exit_reason](vcpu);
- else {
- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
- exit_reason);
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
- }
-}
-/*
- * Software based L1D cache flush which is used when microcode providing
- * the cache control MSR is not loaded.
- *
- * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
- * flush it is required to read in 64 KiB because the replacement algorithm
- * is not exactly LRU. This could be sized at runtime via topology
- * information but as all relevant affected CPUs have 32KiB L1D cache size
- * there is no point in doing so.
- */
-static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
-{
- int size = PAGE_SIZE << L1D_CACHE_ORDER;
+ if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
+ goto unexpected_vmexit;
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
+ return kvm_emulate_wrmsr(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
+ return handle_wrmsr_imm(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
+ return handle_preemption_timer(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
+ return handle_interrupt_window(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+ return handle_external_interrupt(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_HLT)
+ return kvm_emulate_halt(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
+ return handle_ept_misconfig(vcpu);
+#endif
- /*
- * This code is only executed when the the flush mode is 'cond' or
- * 'always'
- */
- if (static_branch_likely(&vmx_l1d_flush_cond)) {
- bool flush_l1d;
+ exit_handler_index = array_index_nospec((u16)exit_reason.basic,
+ kvm_vmx_max_exit_handlers);
+ if (!kvm_vmx_exit_handlers[exit_handler_index])
+ goto unexpected_vmexit;
- /*
- * Clear the per-vcpu flush bit, it gets set again
- * either from vcpu_run() or from one of the unsafe
- * VMEXIT handlers.
- */
- flush_l1d = vcpu->arch.l1tf_flush_l1d;
- vcpu->arch.l1tf_flush_l1d = false;
+ return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
- /*
- * Clear the per-cpu flush bit, it gets set again from
- * the interrupt handlers.
- */
- flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
- kvm_clear_cpu_l1tf_flush_l1d();
+unexpected_vmexit:
+ dump_vmcs(vcpu);
+ kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full);
+ return 0;
+}
- if (!flush_l1d)
- return;
- }
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+ int ret = __vmx_handle_exit(vcpu, exit_fastpath);
- vcpu->stat.l1d_flush++;
+ /*
+ * Exit to user space when bus lock detected to inform that there is
+ * a bus lock in guest.
+ */
+ if (vmx_get_exit_reason(vcpu).bus_lock_detected) {
+ if (ret > 0)
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
- if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
- return;
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+ return 0;
}
-
- asm volatile(
- /* First ensure the pages are in the TLB */
- "xorl %%eax, %%eax\n"
- ".Lpopulate_tlb:\n\t"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $4096, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lpopulate_tlb\n\t"
- "xorl %%eax, %%eax\n\t"
- "cpuid\n\t"
- /* Now fill the cache */
- "xorl %%eax, %%eax\n"
- ".Lfill_cache:\n"
- "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
- "addl $64, %%eax\n\t"
- "cmpl %%eax, %[size]\n\t"
- "jne .Lfill_cache\n\t"
- "lfence\n"
- :: [flush_pages] "r" (vmx_l1d_flush_pages),
- [size] "r" (size)
- : "eax", "ebx", "ecx", "edx");
+ return ret;
}
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ int tpr_threshold;
if (is_guest_mode(vcpu) &&
nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
return;
- if (irr == -1 || tpr < irr) {
- vmcs_write32(TPR_THRESHOLD, 0);
- return;
- }
-
- vmcs_write32(TPR_THRESHOLD, irr);
+ tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
+ if (is_guest_mode(vcpu))
+ to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
+ else
+ vmcs_write32(TPR_THRESHOLD, tpr_threshold);
}
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 sec_exec_control;
if (!lapic_in_kernel(vcpu))
@@ -5954,24 +6813,33 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
/* Postpone execution until vmcs01 is the current VMCS. */
if (is_guest_mode(vcpu)) {
- to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
+ vmx->nested.change_vmcs01_virtual_apic_mode = true;
return;
}
- sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ sec_exec_control = secondary_exec_controls_get(vmx);
sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
switch (kvm_get_apic_mode(vcpu)) {
case LAPIC_MODE_INVALID:
WARN_ONCE(true, "Invalid local APIC state");
+ break;
case LAPIC_MODE_DISABLED:
break;
case LAPIC_MODE_XAPIC:
if (flexpriority_enabled) {
sec_exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- vmx_flush_tlb(vcpu, true);
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+ /*
+ * Flush the TLB, reloading the APIC access page will
+ * only do so if its physical address has changed, but
+ * the guest may have inserted a non-APIC mapping into
+ * the TLB while the APIC access page was disabled.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
break;
case LAPIC_MODE_X2APIC:
@@ -5980,24 +6848,108 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
break;
}
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+ secondary_exec_controls_set(vmx, sec_exec_control);
- vmx_update_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap_x2apic(vcpu);
}
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
- if (!is_guest_mode(vcpu)) {
- vmcs_write64(APIC_ACCESS_ADDR, hpa);
- vmx_flush_tlb(vcpu, true);
+ const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *slot;
+ struct page *refcounted_page;
+ unsigned long mmu_seq;
+ kvm_pfn_t pfn;
+ bool writable;
+
+ /* Defer reload until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
+ return;
}
+
+ if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ return;
+
+ /*
+ * Explicitly grab the memslot using KVM's internal slot ID to ensure
+ * KVM doesn't unintentionally grab a userspace memslot. It _should_
+ * be impossible for userspace to create a memslot for the APIC when
+ * APICv is enabled, but paranoia won't hurt in this case.
+ */
+ slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return;
+
+ /*
+ * Ensure that the mmu_notifier sequence count is read before KVM
+ * retrieves the pfn from the primary MMU. Note, the memslot is
+ * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb()
+ * in kvm_mmu_invalidate_end().
+ */
+ mmu_seq = kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ /*
+ * No need to retry if the memslot does not exist or is invalid. KVM
+ * controls the APIC-access page memslot, and only deletes the memslot
+ * if APICv is permanently inhibited, i.e. the memslot won't reappear.
+ */
+ pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
+ if (is_error_noslot_pfn(pfn))
+ return;
+
+ read_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+ else
+ vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
+
+ /*
+ * Do not pin the APIC access page in memory so that it can be freely
+ * migrated, the MMU notifier will call us again if it is migrated or
+ * swapped out. KVM backs the memslot with anonymous memory, the pfn
+ * should always point at a refcounted page (if the pfn is valid).
+ */
+ if (!WARN_ON_ONCE(!refcounted_page))
+ kvm_release_page_clean(refcounted_page);
+
+ /*
+ * No need for a manual TLB flush at this point, KVM has already done a
+ * flush if there were SPTEs pointing at the previous page.
+ */
+ read_unlock(&vcpu->kvm->mmu_lock);
}
-static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
u16 status;
u8 old;
+ /*
+ * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
+ * is only relevant for if and only if Virtual Interrupt Delivery is
+ * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
+ * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested
+ * VM-Exit, otherwise L1 with run with a stale SVI.
+ */
+ if (is_guest_mode(vcpu)) {
+ /*
+ * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
+ * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
+ * Note, userspace can stuff state while L2 is active; assert
+ * that VID is disabled if and only if the vCPU is in KVM_RUN
+ * to avoid false positives if userspace is setting APIC state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run &&
+ nested_cpu_has_vid(get_vmcs12(vcpu)));
+ to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
+ return;
+ }
+
if (max_isr == -1)
max_isr = 0;
@@ -6027,59 +6979,53 @@ static void vmx_set_rvi(int vector)
}
}
-static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
- /*
- * When running L2, updating RVI is only relevant when
- * vmcs12 virtual-interrupt-delivery enabled.
- * However, it can be enabled only when L1 also
- * intercepts external-interrupts and in that case
- * we should not update vmcs02 RVI but instead intercept
- * interrupt. Therefore, do nothing when running L2.
- */
- if (!is_guest_mode(vcpu))
- vmx_set_rvi(max_irr);
-}
-
-static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
int max_irr;
- bool max_irr_updated;
+ bool got_posted_interrupt;
- WARN_ON(!vcpu->arch.apicv_active);
- if (pi_test_on(&vmx->pi_desc)) {
- pi_clear_on(&vmx->pi_desc);
+ if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
+ return -EIO;
+
+ if (pi_test_on(&vt->pi_desc)) {
+ pi_clear_on(&vt->pi_desc);
/*
- * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+ * IOMMU can write to PID.ON, so the barrier matters even on UP.
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
- max_irr_updated =
- kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
-
- /*
- * If we are running L2 and L1 has a new pending interrupt
- * which can be injected, we should re-evaluate
- * what should be done with this new L1 interrupt.
- * If L1 intercepts external-interrupts, we should
- * exit from L2 to L1. Otherwise, interrupt should be
- * delivered directly to L2.
- */
- if (is_guest_mode(vcpu) && max_irr_updated) {
- if (nested_exit_on_intr(vcpu))
- kvm_vcpu_exiting_guest_mode(vcpu);
- else
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
+ got_posted_interrupt =
+ kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
+ got_posted_interrupt = false;
}
- vmx_hwapic_irr_update(vcpu, max_irr);
+
+ /*
+ * Newly recognized interrupts are injected via either virtual interrupt
+ * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+ * disabled in two cases:
+ *
+ * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+ * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+ * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+ * into L2, but KVM doesn't use virtual interrupt delivery to inject
+ * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+ *
+ * 2) If APICv is disabled for this vCPU, assigned devices may still
+ * attempt to post interrupts. The posted interrupt vector will cause
+ * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+ */
+ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+ vmx_set_rvi(max_irr);
+ else if (got_posted_interrupt)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return max_irr;
}
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
return;
@@ -6090,95 +7036,105 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
-static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
+void vmx_do_interrupt_irqoff(unsigned long entry);
+void vmx_do_nmi_irqoff(void);
+
+static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * Save xfd_err to guest_fpu before interrupt is enabled, so the
+ * MSR value is not clobbered by the host activity before the guest
+ * has chance to consume it.
+ *
+ * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
+ * interception may have been caused by L1 interception. Per the SDM,
+ * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
+ *
+ * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
+ * unlike CR2 and DR6, the value is not a payload that is attached to
+ * the #NM exception.
+ */
+ if (is_xfd_nm_fault(vcpu))
+ rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+}
- pi_clear_on(&vmx->pi_desc);
- memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
+static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
+{
+ /* if exit due to PF check for async PF */
+ if (is_page_fault(intr_info))
+ vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ /* if exit due to NM, handle before interrupts are enabled */
+ else if (is_nm_fault(intr_info))
+ handle_nm_fault_irqoff(vcpu);
+ /* Handle machine checks before interrupts are enabled */
+ else if (is_machine_check(intr_info))
+ kvm_machine_check();
}
-static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
+ u32 intr_info)
{
- u32 exit_intr_info = 0;
- u16 basic_exit_reason = (u16)vmx->exit_reason;
+ unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
- if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
- || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
+ if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
+ "unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- vmx->exit_intr_info = exit_intr_info;
-
- /* if exit due to PF check for async PF */
- if (is_page_fault(exit_intr_info))
- vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
-
- /* Handle machine checks before interrupts are enabled */
- if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
- is_machine_check(exit_intr_info))
- kvm_machine_check();
+ /*
+ * Invoke the kernel's IRQ handler for the vector. Use the FRED path
+ * when it's available even if FRED isn't fully enabled, e.g. even if
+ * FRED isn't supported in hardware, in order to avoid the indirect
+ * CALL in the non-FRED path.
+ */
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
+ if (IS_ENABLED(CONFIG_X86_FRED))
+ fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
+ else
+ vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
+ kvm_after_interrupt(vcpu);
- /* We need to handle NMIs before interrupts are enabled */
- if (is_nmi(exit_intr_info)) {
- kvm_before_interrupt(&vmx->vcpu);
- asm("int $2");
- kvm_after_interrupt(&vmx->vcpu);
- }
+ vcpu->arch.at_instruction_boundary = true;
}
-static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
- u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
- == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
- unsigned int vector;
- unsigned long entry;
- gate_desc *desc;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-#ifdef CONFIG_X86_64
- unsigned long tmp;
-#endif
+ if (to_vt(vcpu)->emulation_required)
+ return;
- vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
- desc = (gate_desc *)vmx->host_idt_base + vector;
- entry = gate_offset(desc);
- asm volatile(
-#ifdef CONFIG_X86_64
- "mov %%" _ASM_SP ", %[sp]\n\t"
- "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
- "push $%c[ss]\n\t"
- "push %[sp]\n\t"
-#endif
- "pushf\n\t"
- __ASM_SIZE(push) " $%c[cs]\n\t"
- CALL_NOSPEC
- :
-#ifdef CONFIG_X86_64
- [sp]"=&r"(tmp),
-#endif
- ASM_CALL_CONSTRAINT
- :
- THUNK_TARGET(entry),
- [ss]"i"(__KERNEL_DS),
- [cs]"i"(__KERNEL_CS)
- );
+ switch (vmx_get_exit_reason(vcpu).basic) {
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
+ break;
+ case EXIT_REASON_EXCEPTION_NMI:
+ handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
+ break;
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ kvm_machine_check();
+ break;
+ default:
+ break;
}
}
-STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
-static bool vmx_has_emulated_msr(int index)
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
/*
* We cannot do SMM unless we can run the guest in big
* real mode.
*/
return enable_unrestricted_guest || emulate_invalid_guest_state;
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
+ return nested;
case MSR_AMD64_VIRT_SPEC_CTRL:
+ case MSR_AMD64_TSC_RATIO:
/* This is AMD only. */
return false;
default:
@@ -6186,11 +7142,6 @@ static bool vmx_has_emulated_msr(int index)
}
}
-static bool vmx_pt_supported(void)
-{
- return pt_mode == PT_MODE_HOST_GUEST;
-}
-
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -6203,11 +7154,8 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
if (enable_vnmi) {
if (vmx->loaded_vmcs->nmi_known_unmasked)
return;
- /*
- * Can't use vmx->exit_intr_info since we're not sure what
- * the exit reason is.
- */
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
/*
@@ -6269,17 +7217,21 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
break;
case INTR_TYPE_SOFT_EXCEPTION:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
- /* fall through */
- case INTR_TYPE_HARD_EXCEPTION:
- if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- u32 err = vmcs_read32(error_code_field);
- kvm_requeue_exception_e(vcpu, vector, err);
- } else
- kvm_requeue_exception(vcpu, vector);
+ fallthrough;
+ case INTR_TYPE_HARD_EXCEPTION: {
+ u32 error_code = 0;
+
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(error_code_field);
+
+ kvm_requeue_exception(vcpu, vector,
+ idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
+ error_code);
break;
+ }
case INTR_TYPE_SOFT_INTR:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
- /* fall through */
+ fallthrough;
case INTR_TYPE_EXT_INTR:
kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
break;
@@ -6295,7 +7247,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
IDT_VECTORING_ERROR_CODE);
}
-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
__vmx_complete_interrupts(vcpu,
vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
@@ -6309,9 +7261,14 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
int i, nr_msrs;
struct perf_guest_switch_msr *msrs;
+ struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
- msrs = perf_guest_get_msrs(&nr_msrs);
+ pmu->host_cross_mapped_mask = 0;
+ if (pmu->pebs_enable & pmu->global_ctrl)
+ intel_pmu_cross_mapped_check(pmu);
+ /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
+ msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
if (!msrs)
return;
@@ -6323,27 +7280,16 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
-static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
-{
- vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
- if (!vmx->loaded_vmcs->hv_timer_armed)
- vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
- vmx->loaded_vmcs->hv_timer_armed = true;
-}
-
-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
u32 delta_tsc;
- if (vmx->req_immediate_exit) {
- vmx_arm_hv_timer(vmx, 0);
- return;
- }
-
- if (vmx->hv_deadline_tsc != -1) {
+ if (force_immediate_exit) {
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
+ vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+ } else if (vmx->hv_deadline_tsc != -1) {
tscl = rdtsc();
if (vmx->hv_deadline_tsc > tscl)
/* set_hv_timer ensures the delta fits in 32-bits */
@@ -6352,44 +7298,188 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
else
delta_tsc = 0;
- vmx_arm_hv_timer(vmx, delta_tsc);
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+ vmx->loaded_vmcs->hv_timer_soft_disabled = false;
+ } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
+ vmx->loaded_vmcs->hv_timer_soft_disabled = true;
+ }
+}
+
+void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+{
+ if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
+ vmx->loaded_vmcs->host_state.rsp = host_rsp;
+ vmcs_writel(HOST_RSP, host_rsp);
+ }
+}
+
+void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ unsigned int flags)
+{
+ u64 hostval = this_cpu_read(x86_spec_ctrl_current);
+
+ if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
+ return;
+
+ if (flags & VMX_RUN_SAVE_SPEC_CTRL)
+ vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);
+
+ /*
+ * If the guest/host SPEC_CTRL values differ, restore the host value.
+ *
+ * For legacy IBRS, the IBRS bit always needs to be written after
+ * transitioning from a less privileged predictor mode, regardless of
+ * whether the guest/host values differ.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
+ vmx->spec_ctrl != hostval)
+ native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);
+
+ barrier_nospec();
+}
+
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
+{
+ /*
+ * If L2 is active, some VMX preemption timer exits can be handled in
+ * the fastpath even, all other exits must use the slow path.
+ */
+ if (is_guest_mode(vcpu) &&
+ vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
+ return EXIT_FASTPATH_NONE;
+
+ switch (vmx_get_exit_reason(vcpu).basic) {
+ case EXIT_REASON_MSR_WRITE:
+ return handle_fastpath_wrmsr(vcpu);
+ case EXIT_REASON_MSR_WRITE_IMM:
+ return handle_fastpath_wrmsr_imm(vcpu, vmx_get_exit_qual(vcpu),
+ vmx_get_msr_imm_reg(vcpu));
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+ case EXIT_REASON_HLT:
+ return handle_fastpath_hlt(vcpu);
+ case EXIT_REASON_INVD:
+ return handle_fastpath_invd(vcpu);
+ default:
+ return EXIT_FASTPATH_NONE;
+ }
+}
+
+noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
+ !is_nmi(vmx_get_intr_info(vcpu)))
return;
+
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
+ else
+ vmx_do_nmi_irqoff();
+ kvm_after_interrupt(vcpu);
+}
+
+static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ unsigned int flags)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ guest_state_enter_irqoff();
+
+ vmx_l1d_flush(vcpu);
+
+ vmx_disable_fb_clear(vmx);
+
+ if (vcpu->arch.cr2 != native_read_cr2())
+ native_write_cr2(vcpu->arch.cr2);
+
+ vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ flags);
+
+ vcpu->arch.cr2 = native_read_cr2();
+ vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
+
+ vmx->idt_vectoring_info = 0;
+
+ vmx_enable_fb_clear(vmx);
+
+ if (unlikely(vmx->fail)) {
+ vmx->vt.exit_reason.full = 0xdead;
+ goto out;
}
- if (vmx->loaded_vmcs->hv_timer_armed)
- vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
- vmx->loaded_vmcs->hv_timer_armed = false;
+ vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+ vmx_handle_nmi(vcpu);
+
+out:
+ guest_state_exit_irqoff();
}
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4, evmcs_rsp;
+ unsigned long cr3, cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
vmx->loaded_vmcs->soft_vnmi_blocked))
vmx->loaded_vmcs->entry_time = ktime_get();
- /* Don't enter VMX if guest state is invalid, let the exit handler
- start emulation until we arrive back to a valid state */
- if (vmx->emulation_required)
- return;
+ /*
+ * Don't enter VMX if guest state is invalid, let the exit handler
+ * start emulation until we arrive back to a valid state. Synthesize a
+ * consistency check VM-Exit due to invalid guest state and bail.
+ */
+ if (unlikely(vmx->vt.emulation_required)) {
+ vmx->fail = 0;
+
+ vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
+ vmx->vt.exit_reason.failed_vmentry = 1;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ vmx->vt.exit_intr_info = 0;
+ return EXIT_FASTPATH_NONE;
+ }
+
+ trace_kvm_entry(vcpu, force_immediate_exit);
if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
vmcs_write32(PLE_WINDOW, vmx->ple_window);
}
- if (vmx->nested.need_vmcs12_sync)
- nested_sync_from_vmcs12(vcpu);
+ /*
+ * We did this in prepare_switch_to_guest, because it needs to
+ * be within srcu_read_lock.
+ */
+ WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
- if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ vcpu->arch.regs_dirty = 0;
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+
+ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
+ vmx_reload_guest_debugctl(vcpu);
+
+ /*
+ * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
+ * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
+ * it switches back to the current->mm, which can occur in KVM context
+ * when switching to a temporary mm to patch kernel code, e.g. if KVM
+ * toggles a static key while handling a VM-Exit.
+ */
cr3 = __get_current_cr3_fast();
if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
vmcs_writel(HOST_CR3, cr3);
@@ -6410,195 +7500,33 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- if (static_cpu_has(X86_FEATURE_PKU) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
- vcpu->arch.pkru != vmx->host_pkru)
- __write_pkru(vcpu->arch.pkru);
-
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx);
+ if (intel_pmu_lbr_is_enabled(vcpu))
+ vmx_passthrough_lbr_msrs(vcpu);
- vmx_update_hv_timer(vcpu);
-
- /*
- * If this vCPU has touched SPEC_CTRL, restore the guest's value if
- * it's non-zero. Since vmentry is serialising on affected CPUs, there
- * is no need to worry about the conditional branch over the wrmsr
- * being speculatively taken.
- */
- x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
-
- vmx->__launched = vmx->loaded_vmcs->launched;
-
- evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
- (unsigned long)&current_evmcs->host_rsp : 0;
-
- if (static_branch_unlikely(&vmx_l1d_should_flush))
- vmx_l1d_flush(vcpu);
-
- asm(
- /* Store host registers */
- "push %%" _ASM_DX "; push %%" _ASM_BP ";"
- "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
- "push %%" _ASM_CX " \n\t"
- "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
- "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
- "je 1f \n\t"
- "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
- /* Avoid VMWRITE when Enlightened VMCS is in use */
- "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
- "jz 2f \n\t"
- "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
- "jmp 1f \n\t"
- "2: \n\t"
- __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
- "1: \n\t"
- "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
-
- /* Reload cr2 if changed */
- "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
- "mov %%cr2, %%" _ASM_DX " \n\t"
- "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
- "je 3f \n\t"
- "mov %%" _ASM_AX", %%cr2 \n\t"
- "3: \n\t"
- /* Check if vmlaunch or vmresume is needed */
- "cmpl $0, %c[launched](%%" _ASM_CX ") \n\t"
- /* Load guest registers. Don't clobber flags. */
- "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
- "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t"
- "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t"
- "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t"
- "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t"
- "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t"
-#ifdef CONFIG_X86_64
- "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t"
- "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t"
- "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t"
- "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t"
- "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t"
- "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t"
- "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t"
- "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t"
-#endif
- /* Load guest RCX. This kills the vmx_vcpu pointer! */
- "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t"
-
- /* Enter guest mode */
- "call vmx_vmenter\n\t"
-
- /* Save guest's RCX to the stack placeholder (see above) */
- "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t"
+ if (enable_preemption_timer)
+ vmx_update_hv_timer(vcpu, force_immediate_exit);
+ else if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
- /* Load host's RCX, i.e. the vmx_vcpu pointer */
- "pop %%" _ASM_CX " \n\t"
+ kvm_wait_lapic_expire(vcpu);
- /* Set vmx->fail based on EFLAGS.{CF,ZF} */
- "setbe %c[fail](%%" _ASM_CX ")\n\t"
-
- /* Save all guest registers, including RCX from the stack */
- "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t"
- __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t"
-#ifdef CONFIG_X86_64
- "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t"
- "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t"
- "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t"
- "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t"
- "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t"
- "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t"
- "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t"
- "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t"
- /*
- * Clear host registers marked as clobbered to prevent
- * speculative use.
- */
- "xor %%r8d, %%r8d \n\t"
- "xor %%r9d, %%r9d \n\t"
- "xor %%r10d, %%r10d \n\t"
- "xor %%r11d, %%r11d \n\t"
- "xor %%r12d, %%r12d \n\t"
- "xor %%r13d, %%r13d \n\t"
- "xor %%r14d, %%r14d \n\t"
- "xor %%r15d, %%r15d \n\t"
-#endif
- "mov %%cr2, %%" _ASM_AX " \n\t"
- "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t"
-
- "xor %%eax, %%eax \n\t"
- "xor %%ebx, %%ebx \n\t"
- "xor %%esi, %%esi \n\t"
- "xor %%edi, %%edi \n\t"
- "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
- : ASM_CALL_CONSTRAINT
- : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
- [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
- [fail]"i"(offsetof(struct vcpu_vmx, fail)),
- [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
- [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
- [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
-#ifdef CONFIG_X86_64
- [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
- [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
- [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
-#endif
- [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
- [wordsize]"i"(sizeof(ulong))
- : "cc", "memory"
-#ifdef CONFIG_X86_64
- , "rax", "rbx", "rdi"
- , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
-#else
- , "eax", "ebx", "edi"
-#endif
- );
-
- /*
- * We do not use IBRS in the kernel. If this vCPU has used the
- * SPEC_CTRL MSR it may have left it on; save the value and
- * turn it off. This is much more efficient than blindly adding
- * it to the atomic save/restore list. Especially as the former
- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
- *
- * For non-nested case:
- * If the L01 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- *
- * For nested case:
- * If the L02 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- */
- if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
- vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
-
- x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
-
- /* Eliminate branch target predictions from guest mode */
- vmexit_fill_RSB();
+ /* The actual VMENTER/EXIT is in the .noinstr.text section. */
+ vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
/* All fields are clean at this point */
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs()) {
current_evmcs->hv_clean_fields |=
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+ current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
+ }
+
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
- if (vmx->host_debugctlmsr)
- update_debugctlmsr(vmx->host_debugctlmsr);
+ if (vcpu->arch.host_debugctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
#ifndef CONFIG_X86_64
/*
@@ -6613,195 +7541,155 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
loadsegment(es, __USER_DS);
#endif
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_RFLAGS)
- | (1 << VCPU_EXREG_PDPTR)
- | (1 << VCPU_EXREG_SEGMENTS)
- | (1 << VCPU_EXREG_CR3));
- vcpu->arch.regs_dirty = 0;
-
pt_guest_exit(vmx);
- /*
- * eager fpu is enabled if PKEY is supported and CR4 is switched
- * back on host, so it is safe to read guest PKRU from current
- * XSAVE.
- */
- if (static_cpu_has(X86_FEATURE_PKU) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
- vcpu->arch.pkru = __read_pkru();
- if (vcpu->arch.pkru != vmx->host_pkru)
- __write_pkru(vmx->host_pkru);
+ if (is_guest_mode(vcpu)) {
+ /*
+ * Track VMLAUNCH/VMRESUME that have made past guest state
+ * checking.
+ */
+ if (vmx->nested.nested_run_pending &&
+ !vmx_get_exit_reason(vcpu).failed_vmentry)
+ ++vcpu->stat.nested_run;
+
+ vmx->nested.nested_run_pending = 0;
}
- vmx->nested.nested_run_pending = 0;
- vmx->idt_vectoring_info = 0;
+ if (unlikely(vmx->fail))
+ return EXIT_FASTPATH_NONE;
- vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
- if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
- return;
+ trace_kvm_exit(vcpu, KVM_ISA_VMX);
+
+ if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
+ return EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
- vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
-}
-STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
-
-static struct kvm *vmx_vm_alloc(void)
-{
- struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
- return &kvm_vmx->kvm;
-}
-static void vmx_vm_free(struct kvm *kvm)
-{
- vfree(to_kvm_vmx(kvm));
+ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (enable_pml)
vmx_destroy_pml_buffer(vmx);
free_vpid(vmx->vpid);
- leave_guest_mode(vcpu);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
- kfree(vmx->guest_msrs);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
- kmem_cache_free(kvm_vcpu_cache, vmx);
+ free_page((unsigned long)vmx->ve_info);
}
-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
- int err;
- struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
- unsigned long *msr_bitmap;
- int cpu;
-
- if (!vmx)
- return ERR_PTR(-ENOMEM);
-
- vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
- if (!vmx->vcpu.arch.guest_fpu) {
- printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
- err = -ENOMEM;
- goto free_partial_vcpu;
- }
+ struct vmx_uret_msr *tsx_ctrl;
+ struct vcpu_vmx *vmx;
+ int i, err;
- vmx->vpid = allocate_vpid();
+ BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
+ vmx = to_vmx(vcpu);
- err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
- if (err)
- goto free_vcpu;
+ INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);
err = -ENOMEM;
+ vmx->vpid = allocate_vpid();
+
/*
* If PML is turned on, failure on enabling PML just results in failure
* of creating the vcpu, therefore we can simplify PML logic (by
* avoiding dealing with cases, such as enabling PML partially on vcpus
- * for the guest, etc.
+ * for the guest), etc.
*/
if (enable_pml) {
- vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!vmx->pml_pg)
- goto uninit_vcpu;
+ goto free_vpid;
}
- vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
- BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
- > PAGE_SIZE);
-
- if (!vmx->guest_msrs)
- goto free_pml;
+ for (i = 0; i < kvm_nr_uret_msrs; ++i)
+ vmx->guest_uret_msrs[i].mask = -1ull;
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
+ * Keep the host value unchanged to avoid changing CPUID bits
+ * under the host kernel's feet.
+ */
+ tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ if (tsx_ctrl)
+ tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ }
err = alloc_loaded_vmcs(&vmx->vmcs01);
if (err < 0)
- goto free_msrs;
-
- msr_bitmap = vmx->vmcs01.msr_bitmap;
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
- vmx->msr_bitmap_mode = 0;
+ goto free_pml;
+
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01, KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (kvm_is_using_evmcs() &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
vmx->loaded_vmcs = &vmx->vmcs01;
- cpu = get_cpu();
- vmx_vcpu_load(&vmx->vcpu, cpu);
- vmx->vcpu.cpu = cpu;
- vmx_vcpu_setup(vmx);
- vmx_vcpu_put(&vmx->vcpu);
- put_cpu();
- if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
- err = alloc_apic_access_page(kvm);
+
+ if (cpu_need_virtualize_apic_accesses(vcpu)) {
+ err = kvm_alloc_apic_access_page(vcpu->kvm);
if (err)
goto free_vmcs;
}
if (enable_ept && !enable_unrestricted_guest) {
- err = init_rmode_identity_map(kvm);
+ err = init_rmode_identity_map(vcpu->kvm);
if (err)
goto free_vmcs;
}
- if (nested)
- nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
- vmx_capability.ept,
- kvm_vcpu_apicv_active(&vmx->vcpu));
- else
- memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
+ err = -ENOMEM;
+ if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct page *page;
- vmx->nested.posted_intr_nv = -1;
- vmx->nested.current_vmptr = -1ull;
+ BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
- vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+ /* ve_info must be page aligned. */
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!page)
+ goto free_vmcs;
- /*
- * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
- * or POSTED_INTR_WAKEUP_VECTOR.
- */
- vmx->pi_desc.nv = POSTED_INTR_VECTOR;
- vmx->pi_desc.sn = 1;
+ vmx->ve_info = page_to_virt(page);
+ }
- vmx->ept_pointer = INVALID_PAGE;
+ if (vmx_can_use_ipiv(vcpu))
+ WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
+ __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);
- return &vmx->vcpu;
+ return 0;
free_vmcs:
free_loaded_vmcs(vmx->loaded_vmcs);
-free_msrs:
- kfree(vmx->guest_msrs);
free_pml:
vmx_destroy_pml_buffer(vmx);
-uninit_vcpu:
- kvm_vcpu_uninit(&vmx->vcpu);
-free_vcpu:
+free_vpid:
free_vpid(vmx->vpid);
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
-free_partial_vcpu:
- kmem_cache_free(kvm_vcpu_cache, vmx);
- return ERR_PTR(err);
+ return err;
}
-#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
-#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
-static int vmx_vm_init(struct kvm *kvm)
+int vmx_vm_init(struct kvm *kvm)
{
- spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
-
if (!ple_gap)
- kvm->arch.pause_in_guest = true;
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
switch (l1tf_mitigation) {
@@ -6809,6 +7697,7 @@ static int vmx_vm_init(struct kvm *kvm)
case L1TF_MITIGATION_FLUSH_NOWARN:
/* 'I explicitly don't care' is set */
break;
+ case L1TF_MITIGATION_AUTO:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
case L1TF_MITIGATION_FULL:
@@ -6816,7 +7705,7 @@ static int vmx_vm_init(struct kvm *kvm)
* Warn upon starting the first VM in a potentially
* insecure environment.
*/
- if (cpu_smt_control == CPU_SMT_ENABLED)
+ if (sched_smt_active())
pr_warn_once(L1TF_MSG_SMT);
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
pr_warn_once(L1TF_MSG_L1D);
@@ -6826,79 +7715,40 @@ static int vmx_vm_init(struct kvm *kvm)
break;
}
}
+
+ if (enable_pml)
+ kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
return 0;
}
-static void __init vmx_check_processor_compat(void *rtn)
+static inline bool vmx_ignore_guest_pat(struct kvm *kvm)
{
- struct vmcs_config vmcs_conf;
- struct vmx_capability vmx_cap;
-
- *(int *)rtn = 0;
- if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
- *(int *)rtn = -EIO;
- if (nested)
- nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
- enable_apicv);
- if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
- printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
- smp_processor_id());
- *(int *)rtn = -EIO;
- }
+ /*
+ * Non-coherent DMA devices need the guest to flush CPU properly.
+ * In that case it is not possible to map all guest RAM as WB, so
+ * always trust guest PAT.
+ */
+ return !kvm_arch_has_noncoherent_dma(kvm) &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
}
-static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- u8 cache;
- u64 ipat = 0;
-
- /* For VT-d and EPT combination
- * 1. MMIO: always map as UC
- * 2. EPT with VT-d:
- * a. VT-d without snooping control feature: can't guarantee the
- * result, try to trust guest.
- * b. VT-d with snooping control feature: snooping control feature of
- * VT-d engine can guarantee the cache correctness. Just set it
- * to WB to keep consistent with host. So the same as item 3.
- * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
- * consistent with host MTRR
+ /*
+ * Force UC for host MMIO regions, as allowing the guest to access MMIO
+ * with cacheable accesses will result in Machine Checks.
*/
- if (is_mmio) {
- cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
+ if (is_mmio)
+ return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
- ipat = VMX_EPT_IPAT_BIT;
- cache = MTRR_TYPE_WRBACK;
- goto exit;
- }
-
- if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
- ipat = VMX_EPT_IPAT_BIT;
- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- cache = MTRR_TYPE_WRBACK;
- else
- cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
-
- cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+ /* Force WB if ignoring guest PAT */
+ if (vmx_ignore_guest_pat(vcpu->kvm))
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
-exit:
- return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
-}
-
-static int vmx_get_lpage_level(void)
-{
- if (enable_ept && !cpu_has_vmx_ept_1g_page())
- return PT_DIRECTORY_LEVEL;
- else
- /* For shadow and EPT supported 1GB page */
- return PT_PDPE_LEVEL;
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
}
-static void vmcs_set_secondary_exec_control(u32 new_ctl)
+static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
{
/*
* These bits in the secondary execution controls field
@@ -6912,10 +7762,9 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_DESC;
- u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ u32 cur_ctl = secondary_exec_controls_get(vmx);
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
- (new_ctl & ~mask) | (cur_ctl & mask));
+ secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
}
/*
@@ -6935,49 +7784,38 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
} while (0)
- entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
- cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
- cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
- cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
- cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
- cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
- cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
- cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
- cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
- cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
- cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
- cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
- cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
- cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
- cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
-
- entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
- cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
- cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
- cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
- cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
- cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
+ entry = kvm_find_cpuid_entry(vcpu, 0x1);
+ cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
+ cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
+ cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
+ cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
+ cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
+ cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
+ cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
+ cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
+ cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
+ cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
+ cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
+ cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
+
+ entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
+ cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
+ cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
+ cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
+ cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
+ cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
+ cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
+ cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK));
+ cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT));
+
+ entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
+ cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM));
#undef cr4_fixed1_update
}
-static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (kvm_mpx_supported()) {
- bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
-
- if (mpx_enabled) {
- vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
- vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
- } else {
- vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
- vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
- }
- }
-}
-
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6985,7 +7823,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
int i;
for (i = 0; i < PT_CPUID_LEAVES; i++) {
- best = kvm_find_cpuid_entry(vcpu, 0x14, i);
+ best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
if (!best)
return;
vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
@@ -6995,12 +7833,13 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
}
/* Get the number of configurable Address Ranges for filtering */
- vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
+ vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges);
/* Initialize and clear the no dependency bits */
vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
- RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
+ RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
+ RTIT_CTL_BRANCH_EN);
/*
* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
@@ -7018,12 +7857,11 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
/*
- * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
- * MTCFreq can be set
+ * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
*/
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
- RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
+ RTIT_CTL_MTC_RANGE);
/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
@@ -7038,72 +7876,332 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
- /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */
+ /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
/* unmask address range configure area */
- for (i = 0; i < vmx->pt_desc.addr_range; i++)
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
-static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (cpu_has_secondary_exec_ctrls()) {
- vmx_compute_secondary_exec_control(vmx);
- vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
- }
+ /*
+ * XSAVES is effectively enabled if and only if XSAVE is also exposed
+ * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
+ * set if and only if XSAVE is supported.
+ */
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
+ guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);
- if (nested_vmx_allowed(vcpu))
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
- FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ vmx_setup_uret_msrs(vmx);
+
+ if (cpu_has_secondary_exec_ctrls())
+ vmcs_set_secondary_exec_control(vmx,
+ vmx_secondary_exec_control(vmx));
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
+ vmx->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
else
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
- ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ vmx->msr_ia32_feature_control_valid_bits &=
+ ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
- if (nested_vmx_allowed(vcpu)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
nested_vmx_cr_fixed1_bits_update(vcpu);
- nested_vmx_entry_exit_ctls_update(vcpu);
- }
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
- guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
+ guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
update_intel_pt_cfg(vcpu);
+
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ struct vmx_uret_msr *msr;
+ msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ if (msr) {
+ bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
+ vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
+ }
+ }
+
+ set_cr4_guest_host_mask(vmx);
+
+ vmx_write_encls_bitmap(vcpu, NULL);
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
+ vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
+ vmx->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_SGX_LC_ENABLED;
+ else
+ vmx->msr_ia32_feature_control_valid_bits &=
+ ~FEAT_CTL_SGX_LC_ENABLED;
+
+ /* Refresh #PF interception to account for MAXPHYADDR changes. */
+ vmx_update_exception_bitmap(vcpu);
}
-static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+static __init u64 vmx_get_perf_capabilities(void)
{
- if (func == 1 && nested)
- entry->ecx |= bit(X86_FEATURE_VMX);
+ u64 perf_cap = PERF_CAP_FW_WRITES;
+ u64 host_perf_cap = 0;
+
+ if (!enable_pmu)
+ return 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
+ x86_perf_get_lbr(&vmx_lbr_caps);
+
+ /*
+ * KVM requires LBR callstack support, as the overhead due to
+ * context switching LBRs without said support is too high.
+ * See intel_pmu_create_guest_lbr_event() for more info.
+ */
+ if (!vmx_lbr_caps.has_callstack)
+ memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
+ else if (vmx_lbr_caps.nr)
+ perf_cap |= host_perf_cap & PERF_CAP_LBR_FMT;
+ }
+
+ if (vmx_pebs_supported()) {
+ perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
+
+ /*
+ * Disallow adaptive PEBS as it is functionally broken, can be
+ * used by the guest to read *host* LBRs, and can be used to
+ * bypass userspace event filters. To correctly and safely
+ * support adaptive PEBS, KVM needs to:
+ *
+ * 1. Account for the ADAPTIVE flag when (re)programming fixed
+ * counters.
+ *
+ * 2. Gain support from perf (or take direct control of counter
+ * programming) to support events without adaptive PEBS
+ * enabled for the hardware counter.
+ *
+ * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
+ * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
+ *
+ * 4. Document which PMU events are effectively exposed to the
+ * guest via adaptive PEBS, and make adaptive PEBS mutually
+ * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
+ */
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+ }
+
+ return perf_cap;
}
-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+static __init void vmx_set_cpu_caps(void)
{
- to_vmx(vcpu)->req_immediate_exit = true;
+ kvm_set_cpu_caps();
+
+ /* CPUID 0x1 */
+ if (nested)
+ kvm_cpu_cap_set(X86_FEATURE_VMX);
+
+ /* CPUID 0x7 */
+ if (kvm_mpx_supported())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
+ if (!cpu_has_vmx_invpcid())
+ kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
+ if (vmx_pt_mode_is_host_guest())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+ if (vmx_pebs_supported()) {
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
+ kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
+ }
+
+ if (!enable_pmu)
+ kvm_cpu_cap_clear(X86_FEATURE_PDCM);
+ kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
+
+ if (!enable_sgx) {
+ kvm_cpu_cap_clear(X86_FEATURE_SGX);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX1);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
+ }
+
+ if (vmx_umip_emulated())
+ kvm_cpu_cap_set(X86_FEATURE_UMIP);
+
+ /* CPUID 0xD.1 */
+ if (!cpu_has_vmx_xsaves())
+ kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
+
+ /* CPUID 0x80000001 and 0x7 (RDPID) */
+ if (!cpu_has_vmx_rdtscp()) {
+ kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
+
+ if (cpu_has_vmx_waitpkg())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
+
+ /*
+ * Disable CET if unrestricted_guest is unsupported as KVM doesn't
+ * enforce CET HW behaviors in emulator. On platforms with
+ * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code
+ * fails, so disable CET in this case too.
+ */
+ if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest ||
+ !cpu_has_vmx_basic_no_hw_errcode_cc()) {
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+ }
}
-static int vmx_check_intercept(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info,
- enum x86_intercept_stage stage)
+static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ unsigned long *exit_qualification)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ unsigned short port;
+ int size;
+ bool imm;
/*
- * RDPID causes #UD if disabled through secondary execution controls.
- * Because it is marked as EmulateOnUD, we need to intercept it here.
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
*/
- if (info->intercept == x86_intercept_rdtscp &&
- !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
- ctxt->exception.vector = UD_VECTOR;
- ctxt->exception.error_code_valid = false;
- return X86EMUL_PROPAGATE_FAULT;
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ port = info->src_val;
+ size = info->dst_bytes;
+ imm = info->src_type == OP_IMM;
+ } else {
+ port = info->dst_val;
+ size = info->src_bytes;
+ imm = info->dst_type == OP_IMM;
}
+
+ *exit_qualification = ((unsigned long)port << 16) | (size - 1);
+
+ if (info->intercept == x86_intercept_ins ||
+ info->intercept == x86_intercept_outs)
+ *exit_qualification |= BIT(4);
+
+ if (info->rep_prefix)
+ *exit_qualification |= BIT(5);
+
+ if (imm)
+ *exit_qualification |= BIT(6);
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
+
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long exit_qualification = 0;
+ u32 vm_exit_reason;
+ u64 exit_insn_len;
+
+ switch (info->intercept) {
+ case x86_intercept_rdpid:
+ /*
+ * RDPID causes #UD if not enabled through secondary execution
+ * controls (ENABLE_RDTSCP). Note, the implicit MSR access to
+ * TSC_AUX is NOT subject to interception, i.e. checking only
+ * the dedicated execution control is architecturally correct.
+ */
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
+ exception->vector = UD_VECTOR;
+ exception->error_code_valid = false;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ return X86EMUL_CONTINUE;
+
+ case x86_intercept_in:
+ case x86_intercept_ins:
+ case x86_intercept_out:
+ case x86_intercept_outs:
+ if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
+ break;
+
+ case x86_intercept_lgdt:
+ case x86_intercept_lidt:
+ case x86_intercept_lldt:
+ case x86_intercept_ltr:
+ case x86_intercept_sgdt:
+ case x86_intercept_sidt:
+ case x86_intercept_sldt:
+ case x86_intercept_str:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+ return X86EMUL_CONTINUE;
+
+ if (info->intercept == x86_intercept_lldt ||
+ info->intercept == x86_intercept_ltr ||
+ info->intercept == x86_intercept_sldt ||
+ info->intercept == x86_intercept_str)
+ vm_exit_reason = EXIT_REASON_LDTR_TR;
+ else
+ vm_exit_reason = EXIT_REASON_GDTR_IDTR;
+ /*
+ * FIXME: Decode the ModR/M to generate the correct exit
+ * qualification for memory operands.
+ */
+ break;
+
+ case x86_intercept_hlt:
+ if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_HLT;
+ break;
+
+ case x86_intercept_pause:
+ /*
+ * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
+ * with vanilla NOPs in the emulator. Apply the interception
+ * check only to actual PAUSE instructions. Don't check
+ * PAUSE-loop-exiting, software can't expect a given PAUSE to
+ * exit, i.e. KVM is within its rights to allow L2 to execute
+ * the PAUSE.
+ */
+ if ((info->rep_prefix != REPE_PREFIX) ||
+ !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
+ break;
+
/* TODO: check more intercepts... */
- return X86EMUL_CONTINUE;
+ default:
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
+ if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
+ return X86EMUL_UNHANDLEABLE;
+
+ __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
+ exit_insn_len);
+ return X86EMUL_INTERCEPTED;
}
#ifdef CONFIG_X86_64
@@ -7125,19 +8223,19 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
return 0;
}
-static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
{
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
-
- if (kvm_mwait_in_guest(vcpu->kvm))
- return -EOPNOTSUPP;
+ struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
vmx = to_vmx(vcpu);
tscl = rdtsc();
guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
- lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
+ lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
+ ktimer->timer_advance_ns);
if (delta_tsc > lapic_timer_advance_cycles)
delta_tsc -= lapic_timer_advance_cycles;
@@ -7145,11 +8243,10 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
delta_tsc = 0;
/* Convert to host delta tsc if tsc scaling is enabled */
- if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
- u64_shl_div_u64(delta_tsc,
- kvm_tsc_scaling_ratio_frac_bits,
- vcpu->arch.tsc_scaling_ratio,
- &delta_tsc))
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
+ delta_tsc && u64_shl_div_u64(delta_tsc,
+ kvm_caps.tsc_scaling_ratio_frac_bits,
+ vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
return -ERANGE;
/*
@@ -7162,389 +8259,268 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
return -ERANGE;
vmx->hv_deadline_tsc = tscl + delta_tsc;
- return delta_tsc == 0;
+ *expired = !delta_tsc;
+ return 0;
}
-static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif
-static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
- if (!kvm_pause_in_guest(vcpu->kvm))
- shrink_ple_window(vcpu);
-}
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
-static void vmx_slot_enable_log_dirty(struct kvm *kvm,
- struct kvm_memory_slot *slot)
-{
- kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
- kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
+ if (WARN_ON_ONCE(!enable_pml))
+ return;
+
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.update_vmcs01_cpu_dirty_logging = true;
+ return;
+ }
+
+ /*
+ * Note, nr_memslots_dirty_logging can be changed concurrent with this
+ * code, but in that case another update request will be made and so
+ * the guest will never run with a stale PML value.
+ */
+ if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
+ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
+ else
+ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
-static void vmx_slot_disable_log_dirty(struct kvm *kvm,
- struct kvm_memory_slot *slot)
+void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
- kvm_mmu_slot_set_dirty(kvm, slot);
+ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEAT_CTL_LMCE_ENABLED;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEAT_CTL_LMCE_ENABLED;
}
-static void vmx_flush_log_dirty(struct kvm *kvm)
+#ifdef CONFIG_KVM_SMM
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
- kvm_flush_pml_buffers(kvm);
+ /* we need a nested vmexit to enter SMM, postpone if run is pending */
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return -EBUSY;
+ return !is_smm(vcpu);
}
-static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
- struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- gpa_t gpa;
- struct page *page = NULL;
- u64 *pml_address;
- if (is_guest_mode(vcpu)) {
- WARN_ON_ONCE(vmx->nested.pml_full);
+ /*
+ * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
+ * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong
+ * SMI and RSM only modify state that is saved and restored via SMRAM.
+ * E.g. most MSRs are left untouched, but many are modified by VM-Exit
+ * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
+ */
+ vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
+ if (vmx->nested.smm.guest_mode)
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
- /*
- * Check if PML is enabled for the nested guest.
- * Whether eptp bit 6 is set is already checked
- * as part of A/D emulation.
- */
- vmcs12 = get_vmcs12(vcpu);
- if (!nested_cpu_has_pml(vmcs12))
- return 0;
+ vmx->nested.smm.vmxon = vmx->nested.vmxon;
+ vmx->nested.vmxon = false;
+ vmx_clear_hlt(vcpu);
+ return 0;
+}
- if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
- vmx->nested.pml_full = true;
- return 1;
- }
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ret;
- gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+ if (vmx->nested.smm.vmxon) {
+ vmx->nested.vmxon = true;
+ vmx->nested.smm.vmxon = false;
+ }
- page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
- if (is_error_page(page))
- return 0;
+ if (vmx->nested.smm.guest_mode) {
+ ret = nested_vmx_enter_non_root_mode(vcpu, false);
+ if (ret)
+ return ret;
- pml_address = kmap(page);
- pml_address[vmcs12->guest_pml_index--] = gpa;
- kunmap(page);
- kvm_release_page_clean(page);
+ vmx->nested.nested_run_pending = 1;
+ vmx->nested.smm.guest_mode = false;
}
-
return 0;
}
-static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
- gfn_t offset, unsigned long mask)
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
- kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
+ /* RSM will cause a vmexit anyway. */
}
+#endif
-static void __pi_post_block(struct kvm_vcpu *vcpu)
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
-
- do {
- old.control = new.control = pi_desc->control;
- WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
- "Wakeup handler not enabled while the VCPU is blocked\n");
-
- dest = cpu_physical_id(vcpu->cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- vcpu->pre_pcpu = -1;
- }
+ return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}
-/*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- * we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- * 'NDST' <-- vcpu->pre_pcpu
- * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- * interrupt is posted for this vCPU, we cannot block it, in
- * this case, return 1, otherwise, return 0.
- *
- */
-static int pi_pre_block(struct kvm_vcpu *vcpu)
+void vmx_migrate_timers(struct kvm_vcpu *vcpu)
{
- unsigned int dest;
- struct pi_desc old, new;
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return 0;
+ if (is_guest_mode(vcpu)) {
+ struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
- WARN_ON(irqs_disabled());
- local_irq_disable();
- if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
- vcpu->pre_pcpu = vcpu->cpu;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_add_tail(&vcpu->blocked_vcpu_list,
- &per_cpu(blocked_vcpu_on_cpu,
- vcpu->pre_pcpu));
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ if (hrtimer_try_to_cancel(timer) == 1)
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}
-
- do {
- old.control = new.control = pi_desc->control;
-
- WARN((pi_desc->sn == 1),
- "Warning: SN field of posted-interrupts "
- "is set before blocking\n");
-
- /*
- * Since vCPU can be preempted during this process,
- * vcpu->cpu could be different with pre_pcpu, we
- * need to set pre_pcpu as the destination of wakeup
- * notification event, then we can find the right vCPU
- * to wakeup in wakeup handler if interrupts happen
- * when the vCPU is in blocked state.
- */
- dest = cpu_physical_id(vcpu->pre_pcpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'wakeup vector' */
- new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- /* We should not block the vCPU if an interrupt is posted for it. */
- if (pi_test_on(pi_desc) == 1)
- __pi_post_block(vcpu);
-
- local_irq_enable();
- return (vcpu->pre_pcpu == -1);
}
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
+void vmx_hardware_unsetup(void)
{
- if (pi_pre_block(vcpu))
- return 1;
+ kvm_set_posted_intr_wakeup_handler(NULL);
- if (kvm_lapic_hv_timer_in_use(vcpu))
- kvm_lapic_switch_to_sw_timer(vcpu);
-
- return 0;
-}
-
-static void pi_post_block(struct kvm_vcpu *vcpu)
-{
- if (vcpu->pre_pcpu == -1)
- return;
+ if (nested)
+ nested_vmx_hardware_unsetup();
- WARN_ON(irqs_disabled());
- local_irq_disable();
- __pi_post_block(vcpu);
- local_irq_enable();
+ free_kvm_area();
}
-static void vmx_post_block(struct kvm_vcpu *vcpu)
+void vmx_vm_destroy(struct kvm *kvm)
{
- if (kvm_x86_ops->set_hv_timer)
- kvm_lapic_switch_to_hv_timer(vcpu);
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
- pi_post_block(vcpu);
+ free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm));
}
/*
- * vmx_update_pi_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
+ * Note, the SDM states that the linear address is masked *after* the modified
+ * canonicality check, whereas KVM masks (untags) the address and then performs
+ * a "normal" canonicality check. Functionally, the two methods are identical,
+ * and when the masking occurs relative to the canonicality check isn't visible
+ * to software, i.e. KVM's behavior doesn't violate the SDM.
*/
-static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu;
- struct vcpu_data vcpu_info;
- int idx, ret = 0;
-
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
- return 0;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
- }
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
- /*
- * VT-d PI cannot support posting multicast/broadcast
- * interrupts to a vCPU, we still use interrupt remapping
- * for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- *
- * We will support full lowest-priority interrupt later.
- */
+gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags)
+{
+ int lam_bit;
+ unsigned long cr3_bits;
- kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
- /*
- * Make sure the IRTE is in remapped mode if
- * we don't handle it in posted mode.
- */
- ret = irq_set_vcpu_affinity(host_irq, NULL);
- if (ret < 0) {
- printk(KERN_INFO
- "failed to back to remapped mode, irq: %u\n",
- host_irq);
- goto out;
- }
+ if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG))
+ return gva;
- continue;
- }
+ if (!is_64_bit_mode(vcpu))
+ return gva;
- vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
- vcpu_info.vector = irq.vector;
-
- trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
- vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+ /*
+ * Bit 63 determines if the address should be treated as user address
+ * or a supervisor address.
+ */
+ if (!(gva & BIT_ULL(63))) {
+ cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
+ if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48)))
+ return gva;
- if (set)
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- else
- ret = irq_set_vcpu_affinity(host_irq, NULL);
+ /* LAM_U48 is ignored if LAM_U57 is set. */
+ lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47;
+ } else {
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
+ return gva;
- if (ret < 0) {
- printk(KERN_INFO "%s: failed to update PI IRTE\n",
- __func__);
- goto out;
- }
+ lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47;
}
- ret = 0;
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
+ /*
+ * Untag the address by sign-extending the lam_bit, but NOT to bit 63.
+ * Bit 63 is retained from the raw virtual address so that untagging
+ * doesn't change a user access to a supervisor access, and vice versa.
+ */
+ return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
}
-static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+static unsigned int vmx_handle_intel_pt_intr(void)
{
- if (vcpu->arch.mcg_cap & MCG_LMCE_P)
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
- FEATURE_CONTROL_LMCE;
- else
- to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
- ~FEATURE_CONTROL_LMCE;
-}
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
-{
- /* we need a nested vmexit to enter SMM, postpone if run is pending */
- if (to_vmx(vcpu)->nested.nested_run_pending)
+ /* '0' on failure so that the !PT case can use a RET0 static call. */
+ if (!vcpu || !kvm_handling_nmi_from_guest(vcpu))
return 0;
+
+ kvm_make_request(KVM_REQ_PMI, vcpu);
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+ (unsigned long *)&vcpu->arch.pmu.global_status);
return 1;
}
-static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static __init void vmx_setup_user_return_msrs(void)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
- if (vmx->nested.smm.guest_mode)
- nested_vmx_vmexit(vcpu, -1, 0, 0);
+ /*
+ * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
+ * will emulate SYSCALL in legacy mode if the vendor string in guest
+ * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
+ * support this emulation, MSR_STAR is included in the list for i386,
+ * but is never loaded into hardware. MSR_CSTAR is also never loaded
+ * into hardware and is here purely for emulation purposes.
+ */
+ const u32 vmx_uret_msrs_list[] = {
+ #ifdef CONFIG_X86_64
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+ #endif
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ MSR_IA32_TSX_CTRL,
+ };
+ int i;
- vmx->nested.smm.vmxon = vmx->nested.vmxon;
- vmx->nested.vmxon = false;
- vmx_clear_hlt(vcpu);
- return 0;
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+ kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
}
-static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+static void __init vmx_setup_me_spte_mask(void)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- int ret;
-
- if (vmx->nested.smm.vmxon) {
- vmx->nested.vmxon = true;
- vmx->nested.smm.vmxon = false;
- }
-
- if (vmx->nested.smm.guest_mode) {
- vcpu->arch.hflags &= ~HF_SMM_MASK;
- ret = nested_vmx_enter_non_root_mode(vcpu, false);
- vcpu->arch.hflags |= HF_SMM_MASK;
- if (ret)
- return ret;
+ u64 me_mask = 0;
- vmx->nested.smm.guest_mode = false;
- }
- return 0;
-}
+ /*
+ * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
+ * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
+ * boot_cpu_data.x86_phys_bits holds the actual physical address
+ * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
+ * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits.
+ */
+ if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
+ me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
+ kvm_host.maxphyaddr - 1);
-static int enable_smi_window(struct kvm_vcpu *vcpu)
-{
- return 0;
+ /*
+ * Unlike SME, host kernel doesn't support setting up any
+ * MKTME KeyID on Intel platforms. No memory encryption
+ * bits should be included into the SPTE.
+ */
+ kvm_mmu_set_me_spte_mask(0, me_mask);
}
-static __init int hardware_setup(void)
+__init int vmx_hardware_setup(void)
{
unsigned long host_bndcfgs;
- int r, i;
+ struct desc_ptr dt;
+ int r;
- rdmsrl_safe(MSR_EFER, &host_efer);
+ store_idt(&dt);
+ host_idt_base = dt.address;
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
- kvm_define_shared_msr(i, vmx_msr_index[i]);
+ vmx_setup_user_return_msrs();
- if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
- return -EIO;
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
if (boot_cpu_has(X86_FEATURE_MPX)) {
- rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
- WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+ rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs);
+ WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
}
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
+ if (!cpu_has_vmx_mpx())
+ kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
@@ -7556,6 +8532,20 @@ static __init int hardware_setup(void)
!cpu_has_vmx_invept_global())
enable_ept = 0;
+ /* NX support is required for shadow paging. */
+ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Shadow paging doesn't have a (further) performance penalty
+ * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
+ * by default
+ */
+ if (!enable_ept)
+ allow_smaller_maxphyaddr = true;
+
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
@@ -7568,26 +8558,27 @@ static __init int hardware_setup(void)
if (!cpu_has_virtual_nmis())
enable_vnmi = 0;
+#ifdef CONFIG_X86_SGX_KVM
+ if (!cpu_has_vmx_encls_vmexit())
+ enable_sgx = false;
+#endif
+
/*
* set_apic_access_page_addr() is used to reload apic access
* page upon invalidation. No need to do anything if not
* using the APIC_ACCESS_ADDR VMCS field.
*/
if (!flexpriority_enabled)
- kvm_x86_ops->set_apic_access_page_addr = NULL;
+ vt_x86_ops.set_apic_access_page_addr = NULL;
if (!cpu_has_vmx_tpr_shadow())
- kvm_x86_ops->update_cr8_intercept = NULL;
-
- if (enable_ept && !cpu_has_vmx_ept_2m_page())
- kvm_disable_largepages();
+ vt_x86_ops.update_cr8_intercept = NULL;
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb;
- kvm_x86_ops->tlb_remote_flush_with_range =
- hv_remote_flush_tlb_with_range;
+ vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
#endif
@@ -7599,23 +8590,38 @@ static __init int hardware_setup(void)
ple_window_shrink = 0;
}
- if (!cpu_has_vmx_apicv()) {
+ if (!cpu_has_vmx_apicv())
enable_apicv = 0;
- kvm_x86_ops->sync_pir_to_irr = NULL;
- }
+ if (!enable_apicv)
+ vt_x86_ops.sync_pir_to_irr = NULL;
- if (cpu_has_vmx_tsc_scaling()) {
- kvm_has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 48;
- }
+ if (!enable_apicv || !cpu_has_vmx_ipiv())
+ enable_ipiv = false;
+
+ if (cpu_has_vmx_tsc_scaling())
+ kvm_caps.has_tsc_control = true;
+
+ kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_caps.tsc_scaling_ratio_frac_bits = 48;
+ kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+ kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
if (enable_ept)
- vmx_enable_tdp();
+ kvm_mmu_set_ept_masks(enable_ept_ad_bits,
+ cpu_has_vmx_ept_execute_only());
else
- kvm_disable_tdp();
+ vt_x86_ops.get_mt_mask = NULL;
+
+ /*
+ * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
+ * bits to shadow_zero_check.
+ */
+ vmx_setup_me_spte_mask();
+
+ kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
+ ept_caps_to_lpage_level(vmx_capability.ept));
/*
* Only enable PML when hardware supports PML feature, and both EPT
@@ -7624,39 +8630,56 @@ static __init int hardware_setup(void)
if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
enable_pml = 0;
- if (!enable_pml) {
- kvm_x86_ops->slot_enable_log_dirty = NULL;
- kvm_x86_ops->slot_disable_log_dirty = NULL;
- kvm_x86_ops->flush_log_dirty = NULL;
- kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
- }
-
if (!cpu_has_vmx_preemption_timer())
- kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+ enable_preemption_timer = false;
- if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
- u64 vmx_msr;
+ if (enable_preemption_timer) {
+ u64 use_timer_freq = 5000ULL * 1000 * 1000;
- rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
cpu_preemption_timer_multi =
- vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
- } else {
- kvm_x86_ops->set_hv_timer = NULL;
- kvm_x86_ops->cancel_hv_timer = NULL;
+ vmx_misc_preemption_timer_rate(vmcs_config.misc);
+
+ if (tsc_khz)
+ use_timer_freq = (u64)tsc_khz * 1000;
+ use_timer_freq >>= cpu_preemption_timer_multi;
+
+ /*
+ * KVM "disables" the preemption timer by setting it to its max
+ * value. Don't use the timer if it might cause spurious exits
+ * at a rate faster than 0.1 Hz (of uninterrupted guest time).
+ */
+ if (use_timer_freq > 0xffffffffu / 10)
+ enable_preemption_timer = false;
}
- kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+ if (!enable_preemption_timer) {
+ vt_x86_ops.set_hv_timer = NULL;
+ vt_x86_ops.cancel_hv_timer = NULL;
+ }
- kvm_mce_cap_supported |= MCG_LMCE_P;
+ kvm_caps.supported_mce_cap |= MCG_LMCE_P;
+ kvm_caps.supported_mce_cap |= MCG_CMCI_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
return -EINVAL;
- if (!enable_ept || !cpu_has_vmx_intel_pt())
+ if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
+ if (pt_mode == PT_MODE_HOST_GUEST)
+ vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ else
+ vt_init_ops.handle_intel_pt_intr = NULL;
+
+ setup_default_sgx_lepubkeyhash();
+ vmx_set_cpu_caps();
+
+ /*
+ * Configure nested capabilities after core CPU capabilities so that
+ * nested support can be conditional on base support, e.g. so that KVM
+ * can hide/show features based on kvm_cpu_cap_has().
+ */
if (nested) {
- nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
- vmx_capability.ept, enable_apicv);
+ nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
if (r)
@@ -7664,272 +8687,88 @@ static __init int hardware_setup(void)
}
r = alloc_kvm_area();
- if (r)
+ if (r && nested)
nested_vmx_hardware_unsetup();
- return r;
-}
-static __exit void hardware_unsetup(void)
-{
- if (nested)
- nested_vmx_hardware_unsetup();
-
- free_kvm_area();
-}
-
-static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
- .cpu_has_kvm_support = cpu_has_kvm_support,
- .disabled_by_bios = vmx_disabled_by_bios,
- .hardware_setup = hardware_setup,
- .hardware_unsetup = hardware_unsetup,
- .check_processor_compatibility = vmx_check_processor_compat,
- .hardware_enable = hardware_enable,
- .hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = report_flexpriority,
- .has_emulated_msr = vmx_has_emulated_msr,
-
- .vm_init = vmx_vm_init,
- .vm_alloc = vmx_vm_alloc,
- .vm_free = vmx_vm_free,
-
- .vcpu_create = vmx_create_vcpu,
- .vcpu_free = vmx_free_vcpu,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_guest_switch = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .update_bp_intercept = update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
- .decache_cr3 = vmx_decache_cr3,
- .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
- .set_cr0 = vmx_set_cr0,
- .set_cr3 = vmx_set_cr3,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .get_dr6 = vmx_get_dr6,
- .set_dr6 = vmx_set_dr6,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
-
- .tlb_flush = vmx_flush_tlb,
- .tlb_flush_gva = vmx_flush_tlb_gva,
-
- .run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .set_irq = vmx_inject_irq,
- .set_nmi = vmx_inject_nmi,
- .queue_exception = vmx_queue_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .get_enable_apicv = vmx_get_enable_apicv,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_post_state_restore = vmx_apicv_post_state_restore,
- .hwapic_irr_update = vmx_hwapic_irr_update,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
-
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
- .get_tdp_level = get_ept_level,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .get_lpage_level = vmx_get_lpage_level,
-
- .cpuid_update = vmx_cpuid_update,
-
- .rdtscp_supported = vmx_rdtscp_supported,
- .invpcid_supported = vmx_invpcid_supported,
-
- .set_supported_cpuid = vmx_set_supported_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
- .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
-
- .set_tdp_cr3 = vmx_set_cr3,
-
- .check_intercept = vmx_check_intercept,
- .handle_external_intr = vmx_handle_external_intr,
- .mpx_supported = vmx_mpx_supported,
- .xsaves_supported = vmx_xsaves_supported,
- .umip_emulated = vmx_umip_emulated,
- .pt_supported = vmx_pt_supported,
-
- .request_immediate_exit = vmx_request_immediate_exit,
-
- .sched_in = vmx_sched_in,
-
- .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
- .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
- .flush_log_dirty = vmx_flush_log_dirty,
- .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
- .write_log_dirty = vmx_write_pml_buffer,
-
- .pre_block = vmx_pre_block,
- .post_block = vmx_post_block,
-
- .pmu_ops = &intel_pmu_ops,
-
- .update_pi_irte = vmx_update_pi_irte,
-
-#ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
-#endif
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
- .setup_mce = vmx_setup_mce,
+ /*
+ * On Intel CPUs that lack self-snoop feature, letting the guest control
+ * memory types may result in unexpected behavior. So always ignore guest
+ * PAT on those CPUs and map VM as writeback, not allowing userspace to
+ * disable the quirk.
+ *
+ * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is
+ * supported, UC is slow enough to cause issues with some older guests (e.g.
+ * an old version of bochs driver uses ioremap() instead of ioremap_wc() to
+ * map the video RAM, causing wayland desktop to fail to get started
+ * correctly). To avoid breaking those older guests that rely on KVM to force
+ * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the
+ * safer (for performance) default behavior.
+ *
+ * On top of this, non-coherent DMA devices need the guest to flush CPU
+ * caches properly. This also requires honoring guest PAT, and is forced
+ * independent of the quirk in vmx_ignore_guest_pat().
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
- .smi_allowed = vmx_smi_allowed,
- .pre_enter_smm = vmx_pre_enter_smm,
- .pre_leave_smm = vmx_pre_leave_smm,
- .enable_smi_window = enable_smi_window,
+ kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
- .check_nested_events = NULL,
- .get_nested_state = NULL,
- .set_nested_state = NULL,
- .get_vmcs12_pages = NULL,
- .nested_enable_evmcs = NULL,
-};
-
-static void vmx_cleanup_l1d_flush(void)
-{
- if (vmx_l1d_flush_pages) {
- free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
- vmx_l1d_flush_pages = NULL;
- }
- /* Restore state so sysfs ignores VMX */
- l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+ return r;
}
-static void vmx_exit(void)
+void vmx_exit(void)
{
-#ifdef CONFIG_KEXEC_CORE
- RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
- synchronize_rcu();
-#endif
-
- kvm_exit();
+ allow_smaller_maxphyaddr = false;
-#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
-
- if (!vp_ap)
- continue;
-
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
-
- static_branch_disable(&enable_evmcs);
- }
-#endif
vmx_cleanup_l1d_flush();
+
+ kvm_x86_vendor_exit();
}
-module_exit(vmx_exit);
-static int __init vmx_init(void)
+int __init vmx_init(void)
{
- int r;
+ int r, cpu;
+
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
+
+ if (!kvm_is_vmx_supported())
+ return -EOPNOTSUPP;
-#if IS_ENABLED(CONFIG_HYPERV)
/*
- * Enlightened VMCS usage should be recommended and the host needs
- * to support eVMCS v1 or above. We can also disable eVMCS support
- * with module parameter.
+ * Note, VMCS and eVMCS configuration only touch VMX knobs/variables,
+ * i.e. there's nothing to unwind if a later step fails.
*/
- if (enlightened_vmcs &&
- ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
- (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
- KVM_EVMCS_VERSION) {
- int cpu;
+ hv_init_evmcs();
- /* Check that we have assist pages on all online CPUs */
- for_each_online_cpu(cpu) {
- if (!hv_get_vp_assist_page(cpu)) {
- enlightened_vmcs = false;
- break;
- }
- }
-
- if (enlightened_vmcs) {
- pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
- static_branch_enable(&enable_evmcs);
- }
- } else {
- enlightened_vmcs = false;
- }
-#endif
+ /*
+ * Parse the VMCS config and VMX capabilities before anything else, so
+ * that the information is available to all setup flows.
+ */
+ if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
+ return -EIO;
- r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ r = kvm_x86_vendor_init(&vt_init_ops);
if (r)
return r;
- /*
- * Must be called after kvm_init() so enable_ept is properly set
- * up. Hand the parameter mitigation value in which was stored in
- * the pre module init parser. If no parameter was given, it will
- * contain 'auto' which will be turned into the default 'cond'
- * mitigation mode.
- */
- if (boot_cpu_has(X86_BUG_L1TF)) {
- r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
+ /* Must be called after common x86 init so enable_ept is setup. */
+ r = vmx_setup_l1d_flush();
+ if (r)
+ goto err_l1d_flush;
+
+ for_each_possible_cpu(cpu) {
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+ pi_init_cpu(cpu);
}
-#ifdef CONFIG_KEXEC_CORE
- rcu_assign_pointer(crash_vmclear_loaded_vmcss,
- crash_vmclear_local_loaded_vmcss);
-#endif
vmx_check_vmcs12_offsets();
return 0;
+
+err_l1d_flush:
+ kvm_x86_vendor_exit();
+ return r;
}
-module_init(vmx_init);