summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--arch/x86/kvm/vmx.c538
1 files changed, 429 insertions, 109 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1689f433f3a0..46b428c0990e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -188,6 +188,150 @@ module_param(ple_window_max, uint, 0444);
extern const ulong vmx_return;
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
+static DEFINE_MUTEX(vmx_l1d_flush_mutex);
+
+/* Storage for pre module init parameter parsing */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
+
+static const struct {
+ const char *option;
+ enum vmx_l1d_flush_state cmd;
+} vmentry_l1d_param[] = {
+ {"auto", VMENTER_L1D_FLUSH_AUTO},
+ {"never", VMENTER_L1D_FLUSH_NEVER},
+ {"cond", VMENTER_L1D_FLUSH_COND},
+ {"always", VMENTER_L1D_FLUSH_ALWAYS},
+};
+
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+{
+ struct page *page;
+ unsigned int i;
+
+ if (!enable_ept) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+ return 0;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+ u64 msr;
+
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+ }
+
+ /* If set to auto use the default l1tf mitigation method */
+ if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ l1tf = VMENTER_L1D_FLUSH_NEVER;
+ break;
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ l1tf = VMENTER_L1D_FLUSH_COND;
+ break;
+ case L1TF_MITIGATION_FULL:
+ case L1TF_MITIGATION_FULL_FORCE:
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ break;
+ }
+ } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ }
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
+ !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+ if (!page)
+ return -ENOMEM;
+ vmx_l1d_flush_pages = page_address(page);
+
+ /*
+ * Initialize each page with a different pattern in
+ * order to protect against KSM in the nested
+ * virtualization case.
+ */
+ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
+ memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
+ PAGE_SIZE);
+ }
+ }
+
+ l1tf_vmx_mitigation = l1tf;
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+ static_branch_enable(&vmx_l1d_should_flush);
+ else
+ static_branch_disable(&vmx_l1d_should_flush);
+
+ if (l1tf == VMENTER_L1D_FLUSH_COND)
+ static_branch_enable(&vmx_l1d_flush_cond);
+ else
+ static_branch_disable(&vmx_l1d_flush_cond);
+ return 0;
+}
+
+static int vmentry_l1d_flush_parse(const char *s)
+{
+ unsigned int i;
+
+ if (s) {
+ for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+ if (sysfs_streq(s, vmentry_l1d_param[i].option))
+ return vmentry_l1d_param[i].cmd;
+ }
+ }
+ return -EINVAL;
+}
+
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ int l1tf, ret;
+
+ if (!boot_cpu_has(X86_BUG_L1TF))
+ return 0;
+
+ l1tf = vmentry_l1d_flush_parse(s);
+ if (l1tf < 0)
+ return l1tf;
+
+ /*
+ * Has vmx_init() run already? If not then this is the pre init
+ * parameter parsing. In that case just store the value and let
+ * vmx_init() do the proper setup after enable_ept has been
+ * established.
+ */
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
+ vmentry_l1d_flush_param = l1tf;
+ return 0;
+ }
+
+ mutex_lock(&vmx_l1d_flush_mutex);
+ ret = vmx_setup_l1d_flush(l1tf);
+ mutex_unlock(&vmx_l1d_flush_mutex);
+ return ret;
+}
+
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
+}
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+ .set = vmentry_l1d_flush_set,
+ .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+
struct kvm_vmx {
struct kvm kvm;
@@ -757,6 +901,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
+struct vmx_msrs {
+ unsigned int nr;
+ struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
+};
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
@@ -790,9 +939,8 @@ struct vcpu_vmx {
struct loaded_vmcs *loaded_vmcs;
bool __launched; /* temporary, used in vmx_vcpu_run */
struct msr_autoload {
- unsigned nr;
- struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
- struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+ struct vmx_msrs guest;
+ struct vmx_msrs host;
} msr_autoload;
struct {
int loaded;
@@ -2377,9 +2525,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
vm_exit_controls_clearbit(vmx, exit);
}
+static int find_msr(struct vmx_msrs *m, unsigned int msr)
+{
+ unsigned int i;
+
+ for (i = 0; i < m->nr; ++i) {
+ if (m->val[i].index == msr)
+ return i;
+ }
+ return -ENOENT;
+}
+
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
- unsigned i;
+ int i;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
@@ -2400,18 +2559,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
}
break;
}
+ i = find_msr(&m->guest, msr);
+ if (i < 0)
+ goto skip_guest;
+ --m->guest.nr;
+ m->guest.val[i] = m->guest.val[m->guest.nr];
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
- for (i = 0; i < m->nr; ++i)
- if (m->guest[i].index == msr)
- break;
-
- if (i == m->nr)
+skip_guest:
+ i = find_msr(&m->host, msr);
+ if (i < 0)
return;
- --m->nr;
- m->guest[i] = m->guest[m->nr];
- m->host[i] = m->host[m->nr];
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+
+ --m->host.nr;
+ m->host.val[i] = m->host.val[m->host.nr];
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
@@ -2426,9 +2588,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
}
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
- u64 guest_val, u64 host_val)
+ u64 guest_val, u64 host_val, bool entry_only)
{
- unsigned i;
+ int i, j = 0;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
@@ -2463,24 +2625,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
- for (i = 0; i < m->nr; ++i)
- if (m->guest[i].index == msr)
- break;
+ i = find_msr(&m->guest, msr);
+ if (!entry_only)
+ j = find_msr(&m->host, msr);
- if (i == NR_AUTOLOAD_MSRS) {
+ if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
- } else if (i == m->nr) {
- ++m->nr;
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}
+ if (i < 0) {
+ i = m->guest.nr++;
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+ }
+ m->guest.val[i].index = msr;
+ m->guest.val[i].value = guest_val;
- m->guest[i].index = msr;
- m->guest[i].value = guest_val;
- m->host[i].index = msr;
- m->host[i].value = host_val;
+ if (entry_only)
+ return;
+
+ if (j < 0) {
+ j = m->host.nr++;
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+ }
+ m->host.val[j].index = msr;
+ m->host.val[j].value = host_val;
}
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
@@ -2524,7 +2693,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
guest_efer &= ~EFER_LME;
if (guest_efer != host_efer)
add_atomic_switch_msr(vmx, MSR_EFER,
- guest_efer, host_efer);
+ guest_efer, host_efer, false);
return false;
} else {
guest_efer &= ~ignore_bits;
@@ -2571,6 +2740,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
#ifdef CONFIG_X86_64
int cpu = raw_smp_processor_id();
+ unsigned long fs_base, kernel_gs_base;
#endif
int i;
@@ -2586,12 +2756,20 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
#ifdef CONFIG_X86_64
- save_fsgs_for_kvm();
- vmx->host_state.fs_sel = current->thread.fsindex;
- vmx->host_state.gs_sel = current->thread.gsindex;
-#else
- savesegment(fs, vmx->host_state.fs_sel);
- savesegment(gs, vmx->host_state.gs_sel);
+ if (likely(is_64bit_mm(current->mm))) {
+ save_fsgs_for_kvm();
+ vmx->host_state.fs_sel = current->thread.fsindex;
+ vmx->host_state.gs_sel = current->thread.gsindex;
+ fs_base = current->thread.fsbase;
+ kernel_gs_base = current->thread.gsbase;
+ } else {
+#endif
+ savesegment(fs, vmx->host_state.fs_sel);
+ savesegment(gs, vmx->host_state.gs_sel);
+#ifdef CONFIG_X86_64
+ fs_base = read_msr(MSR_FS_BASE);
+ kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+ }
#endif
if (!(vmx->host_state.fs_sel & 7)) {
vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
@@ -2611,10 +2789,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
savesegment(ds, vmx->host_state.ds_sel);
savesegment(es, vmx->host_state.es_sel);
- vmcs_writel(HOST_FS_BASE, current->thread.fsbase);
+ vmcs_writel(HOST_FS_BASE, fs_base);
vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
- vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+ vmx->msr_host_kernel_gs_base = kernel_gs_base;
if (is_long_mode(&vmx->vcpu))
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
@@ -3978,7 +4156,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.ia32_xss = data;
if (vcpu->arch.ia32_xss != host_xss)
add_atomic_switch_msr(vmx, MSR_IA32_XSS,
- vcpu->arch.ia32_xss, host_xss);
+ vcpu->arch.ia32_xss, host_xss, false);
else
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
@@ -4322,11 +4500,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
- /* KVM supports Enlightened VMCS v1 only */
- if (static_branch_unlikely(&enable_evmcs))
- vmcs_conf->revision_id = KVM_EVMCS_VERSION;
- else
- vmcs_conf->revision_id = vmx_msr_low;
+ vmcs_conf->revision_id = vmx_msr_low;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
@@ -4396,7 +4570,13 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
return NULL;
vmcs = page_address(pages);
memset(vmcs, 0, vmcs_config.size);
- vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
+
+ /* KVM supports Enlightened VMCS v1 only */
+ if (static_branch_unlikely(&enable_evmcs))
+ vmcs->revision_id = KVM_EVMCS_VERSION;
+ else
+ vmcs->revision_id = vmcs_config.revision_id;
+
return vmcs;
}
@@ -4564,6 +4744,19 @@ static __init int alloc_kvm_area(void)
return -ENOMEM;
}
+ /*
+ * When eVMCS is enabled, alloc_vmcs_cpu() sets
+ * vmcs->revision_id to KVM_EVMCS_VERSION instead of
+ * revision_id reported by MSR_IA32_VMX_BASIC.
+ *
+ * However, even though not explictly documented by
+ * TLFS, VMXArea passed as VMXON argument should
+ * still be marked with revision_id reported by
+ * physical CPU.
+ */
+ if (static_branch_unlikely(&enable_evmcs))
+ vmcs->revision_id = vmcs_config.revision_id;
+
per_cpu(vmxarea, cpu) = vmcs;
}
return 0;
@@ -6250,9 +6443,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
- vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
@@ -6272,8 +6465,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
+ vmx->arch_capabilities = kvm_get_arch_capabilities();
vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
@@ -7869,6 +8061,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
HRTIMER_MODE_REL_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+ vmx->nested.vpid02 = allocate_vpid();
+
vmx->nested.vmxon = true;
return 0;
@@ -8456,21 +8650,20 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
/* Emulate the VMPTRST instruction */
static int handle_vmptrst(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- gva_t vmcs_gva;
+ unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
struct x86_exception e;
+ gva_t gva;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, true, &vmcs_gva))
+ if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
return 1;
/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
- if (kvm_write_guest_virt_system(vcpu, vmcs_gva,
- (void *)&to_vmx(vcpu)->nested.current_vmptr,
- sizeof(u64), &e)) {
+ if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
+ sizeof(gpa_t), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
@@ -9523,6 +9716,79 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
}
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
+ * flush it is required to read in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information but as all relevant affected CPUs have 32KiB L1D cache size
+ * there is no point in doing so.
+ */
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ /*
+ * This code is only executed when the the flush mode is 'cond' or
+ * 'always'
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ bool flush_l1d;
+
+ /*
+ * Clear the per-vcpu flush bit, it gets set again
+ * either from vcpu_run() or from one of the unsafe
+ * VMEXIT handlers.
+ */
+ flush_l1d = vcpu->arch.l1tf_flush_l1d;
+ vcpu->arch.l1tf_flush_l1d = false;
+
+ /*
+ * Clear the per-cpu flush bit, it gets set again from
+ * the interrupt handlers.
+ */
+ flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
+ kvm_clear_cpu_l1tf_flush_l1d();
+
+ if (!flush_l1d)
+ return;
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -9924,7 +10190,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
clear_atomic_switch_msr(vmx, msrs[i].msr);
else
add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
- msrs[i].host);
+ msrs[i].host, false);
}
static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
@@ -10019,6 +10285,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
(unsigned long)&current_evmcs->host_rsp : 0;
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
+
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -10346,11 +10615,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vmcs;
}
- if (nested) {
+ if (nested)
nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
kvm_vcpu_apicv_active(&vmx->vcpu));
- vmx->nested.vpid02 = allocate_vpid();
- }
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
@@ -10367,7 +10634,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
return &vmx->vcpu;
free_vmcs:
- free_vpid(vmx->nested.vpid02);
free_loaded_vmcs(vmx->loaded_vmcs);
free_msrs:
kfree(vmx->guest_msrs);
@@ -10381,10 +10647,37 @@ free_vcpu:
return ERR_PTR(err);
}
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+
static int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
kvm->arch.pause_in_guest = true;
+
+ if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ /* 'I explicitly don't care' is set */
+ break;
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ /*
+ * Warn upon starting the first VM in a potentially
+ * insecure environment.
+ */
+ if (cpu_smt_control == CPU_SMT_ENABLED)
+ pr_warn_once(L1TF_MSG_SMT);
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
+ pr_warn_once(L1TF_MSG_L1D);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ /* Flush is enforced */
+ break;
+ }
+ }
return 0;
}
@@ -11238,10 +11531,10 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
* Set the MSR load/store lists to match L0's settings.
*/
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
- vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
- vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
set_cr4_guest_host_mask(vmx);
@@ -11753,7 +12046,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- u32 msr_entry_idx;
u32 exit_qual;
int r;
@@ -11775,10 +12067,10 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
nested_get_vmcs12_pages(vcpu, vmcs12);
r = EXIT_REASON_MSR_LOAD_FAIL;
- msr_entry_idx = nested_vmx_load_msr(vcpu,
- vmcs12->vm_entry_msr_load_addr,
- vmcs12->vm_entry_msr_load_count);
- if (msr_entry_idx)
+ exit_qual = nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (exit_qual)
goto fail;
/*
@@ -11878,6 +12170,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return ret;
}
+ /* Hide L1D cache contents from the nested guest. */
+ vmx->vcpu.arch.l1tf_flush_l1d = true;
+
/*
* If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
* by event injection, halt vcpu.
@@ -12398,8 +12693,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx_segment_cache_clear(vmx);
/* Update any VMCS fields that might have changed while L2 ran */
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
if (vmx->hv_deadline_tsc == -1)
vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
@@ -13116,6 +13411,51 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.enable_smi_window = enable_smi_window,
};
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
+static void vmx_exit(void)
+{
+#ifdef CONFIG_KEXEC_CORE
+ RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
+ synchronize_rcu();
+#endif
+
+ kvm_exit();
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (static_branch_unlikely(&enable_evmcs)) {
+ int cpu;
+ struct hv_vp_assist_page *vp_ap;
+ /*
+ * Reset everything to support using non-enlightened VMCS
+ * access later (e.g. when we reload the module with
+ * enlightened_vmcs=0)
+ */
+ for_each_online_cpu(cpu) {
+ vp_ap = hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+ }
+
+ static_branch_disable(&enable_evmcs);
+ }
+#endif
+ vmx_cleanup_l1d_flush();
+}
+module_exit(vmx_exit);
+
static int __init vmx_init(void)
{
int r;
@@ -13150,10 +13490,25 @@ static int __init vmx_init(void)
#endif
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
+ /*
+ * Must be called after kvm_init() so enable_ept is properly set
+ * up. Hand the parameter mitigation value in which was stored in
+ * the pre module init parser. If no parameter was given, it will
+ * contain 'auto' which will be turned into the default 'cond'
+ * mitigation mode.
+ */
+ if (boot_cpu_has(X86_BUG_L1TF)) {
+ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ if (r) {
+ vmx_exit();
+ return r;
+ }
+ }
+
#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
@@ -13162,39 +13517,4 @@ static int __init vmx_init(void)
return 0;
}
-
-static void __exit vmx_exit(void)
-{
-#ifdef CONFIG_KEXEC_CORE
- RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
- synchronize_rcu();
-#endif
-
- kvm_exit();
-
-#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
-
- if (!vp_ap)
- continue;
-
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
-
- static_branch_disable(&enable_evmcs);
- }
-#endif
-}
-
-module_init(vmx_init)
-module_exit(vmx_exit)
+module_init(vmx_init);