diff options
Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 225 |
1 files changed, 134 insertions, 91 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ba66c171d951..63615d242bdf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -602,15 +602,13 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, unsigned int slot = msr - vmx->guest_uret_msrs; int ret = 0; - u64 old_msr_data = msr->data; - msr->data = data; if (msr->load_into_hardware) { preempt_disable(); - ret = kvm_set_user_return_msr(slot, msr->data, msr->mask); + ret = kvm_set_user_return_msr(slot, data, msr->mask); preempt_enable(); - if (ret) - msr->data = old_msr_data; } + if (!ret) + msr->data = data; return ret; } @@ -1105,6 +1103,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif + unsigned long cr3; unsigned long fs_base, gs_base; u16 fs_sel, gs_sel; int i; @@ -1169,6 +1168,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); + + /* Host CR3 including its PCID is stable when guest state is loaded. 
*/ + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != host_state->cr3)) { + vmcs_writel(HOST_CR3, cr3); + host_state->cr3 = cr3; + } + vmx->guest_state_loaded = true; } @@ -1271,7 +1278,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, if (!already_loaded) { void *gdt = get_current_gdt_ro(); - unsigned long sysenter_esp; /* * Flush all EPTP/VPID contexts, the new pCPU may have stale @@ -1287,8 +1293,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { + /* 22.2.3 */ + vmcs_writel(HOST_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(cpu) + 1)); + } vmx->loaded_vmcs->cpu = cpu; } @@ -1748,7 +1757,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) } /* - * Reads an msr value (of 'msr_index') into 'pdata'. + * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. 
*/ @@ -2095,9 +2104,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } ret = kvm_set_msr_common(vcpu, msr_info); break; - case MSR_IA32_TSC_ADJUST: - ret = kvm_set_msr_common(vcpu, msr_info); - break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated && !(to_vmx(vcpu)->msr_ia32_feature_control & @@ -2646,15 +2652,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) if (!loaded_vmcs->msr_bitmap) goto out_vmcs; memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); - - if (IS_ENABLED(CONFIG_HYPERV) && - static_branch_unlikely(&enable_evmcs) && - (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { - struct hv_enlightened_vmcs *evmcs = - (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; - - evmcs->hv_enlightenments_control.msr_bitmap = 1; - } } memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); @@ -2918,6 +2915,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) } } +static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + return nested_get_vpid02(vcpu); + return to_vmx(vcpu)->vpid; +} + static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -2930,31 +2934,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) if (enable_ept) ept_sync_context(construct_eptp(vcpu, root_hpa, mmu->shadow_root_level)); - else if (!is_guest_mode(vcpu)) - vpid_sync_context(to_vmx(vcpu)->vpid); else - vpid_sync_context(nested_get_vpid02(vcpu)); + vpid_sync_context(vmx_get_current_vpid(vcpu)); } static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { /* - * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in + * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in * vmx_flush_tlb_guest() for an explanation of why this is ok. 
*/ - vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr); + vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); } static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) { /* - * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0 - * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit - * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is + * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a + * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are + * required to flush GVA->{G,H}PA mappings from the TLB if vpid is * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), * i.e. no explicit INVVPID is necessary. */ - vpid_sync_context(to_vmx(vcpu)->vpid); + vpid_sync_context(vmx_get_current_vpid(vcpu)); } void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -2984,7 +2986,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); } #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ @@ -3109,9 +3111,9 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; - else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) guest_cr3 = vcpu->arch.cr3; - else /* vmcs01.GUEST_CR3 is already up-to-date. */ + else /* vmcs.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { @@ -3686,6 +3688,19 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } +static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) +{ + /* + * When KVM is a nested hypervisor on top of Hyper-V and uses + * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR + * bitmap has changed. 
+ */ + if (static_branch_unlikely(&enable_evmcs)) + evmcs_touch_msr_bitmap(); + + vmx->nested.force_msr_bitmap_recalc = true; +} + void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3694,8 +3709,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) if (!cpu_has_vmx_msr_bitmap()) return; - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed @@ -3739,8 +3753,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) if (!cpu_has_vmx_msr_bitmap()) return; - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed @@ -3930,6 +3943,19 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, */ vmx->nested.pi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * This pairs with the smp_mb_*() after setting vcpu->mode in + * vcpu_enter_guest() to guarantee the vCPU sees the event + * request if triggering a posted interrupt "fails" because + * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as + * the smp_wmb() in kvm_make_request() only ensures everything + * done before making the request is visible when the request + * is visible, it doesn't ensure ordering between the store to + * vcpu->requests and the load from vcpu->mode. + */ + smp_mb__after_atomic(); + /* the PIR and ON have been set by L1. 
*/ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) kvm_vcpu_kick(vcpu); @@ -3963,6 +3989,12 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return 0; + /* + * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() + * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is + * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a + * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. + */ if (vcpu != kvm_get_running_vcpu() && !kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); @@ -4021,6 +4053,12 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); + + /* + * If 32-bit syscall is enabled, vmx_vcpu_load_vmcs rewrites + * HOST_IA32_SYSENTER_ESP. + */ + vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ @@ -4039,8 +4077,10 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & ~vcpu->arch.cr4_guest_rsvd_bits; - if (!enable_ept) - vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; + if (!enable_ept) { + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; + } if (is_guest_mode(&vmx->vcpu)) vcpu->arch.cr4_guest_owned_bits &= ~get_vmcs12(vcpu)->cr4_guest_host_mask; @@ -4692,7 +4732,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (kvm_emulate_instruction(vcpu, 0)) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } return 1; } @@ -5363,7 +5403,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } /* @@ 
-6262,9 +6302,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; - bool max_irr_updated; + bool got_posted_interrupt; - if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm)) + if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; if (pi_test_on(&vmx->pi_desc)) { @@ -6274,22 +6314,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); - max_irr_updated = + got_posted_interrupt = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); - - /* - * If we are running L2 and L1 has a new pending interrupt - * which can be injected, this may cause a vmexit or it may - * be injected into L2. Either way, this interrupt will be - * processed via KVM_REQ_EVENT, not RVI, because we do not use - * virtual interrupt delivery to inject L1 interrupts into L2. - */ - if (is_guest_mode(vcpu) && max_irr_updated) - kvm_make_request(KVM_REQ_EVENT, vcpu); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); + got_posted_interrupt = false; } - vmx_hwapic_irr_update(vcpu, max_irr); + + /* + * Newly recognized interrupts are injected via either virtual interrupt + * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is + * disabled in two cases: + * + * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 + * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a + * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected + * into L2, but KVM doesn't use virtual interrupt delivery to inject + * interrupts into L2, and so KVM_REQ_EVENT is again needed. + * + * 2) If APICv is disabled for this vCPU, assigned devices may still + * attempt to post interrupts. The posted interrupt vector will cause + * a VM-Exit and the subsequent entry will call sync_pir_to_irr. 
+ */ + if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) + vmx_set_rvi(max_irr); + else if (got_posted_interrupt) + kvm_make_request(KVM_REQ_EVENT, vcpu); + return max_irr; } @@ -6588,7 +6639,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; + unsigned long cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -6631,12 +6682,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); - - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } + vcpu->arch.regs_dirty = 0; cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { @@ -6725,7 +6771,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) loadsegment(es, __USER_DS); #endif - vmx_register_cache_reset(vcpu); + vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; pt_guest_exit(vmx); @@ -6826,6 +6872,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) if (err < 0) goto free_pml; + /* + * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a + * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the + * feature only for vmcs01, KVM currently isn't equipped to realize any + * performance benefits from enabling it for vmcs02. 
+ */ + if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) && + (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { + struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; + + evmcs->hv_enlightenments_control.msr_bitmap = 1; + } + /* The MSR bitmap starts with all ones */ bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); @@ -6931,7 +6990,6 @@ static int __init vmx_check_processor_compat(void) static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; - u64 ipat = 0; /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in * memory aliases with conflicting memory types and sometimes MCEs. @@ -6951,30 +7009,22 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * EPT memory type is used to emulate guest CD/MTRR. */ - if (is_mmio) { - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } + if (is_mmio) + return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { - ipat = VMX_EPT_IPAT_BIT; - cache = MTRR_TYPE_WRBACK; - goto exit; - } + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; if (kvm_read_cr0(vcpu) & X86_CR0_CD) { - ipat = VMX_EPT_IPAT_BIT; if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cache = MTRR_TYPE_WRBACK; else cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); + return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; + } -exit: - return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; + return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; } static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) @@ -7509,6 +7559,7 @@ static void hardware_unsetup(void) static bool vmx_check_apicv_inhibit_reasons(ulong bit) { ulong supported = 
BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_ABSENT) | BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ); @@ -7683,7 +7734,7 @@ static __init int hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; - int r, ept_lpage_level; + int r; store_idt(&dt); host_idt_base = dt.address; @@ -7761,10 +7812,10 @@ static __init int hardware_setup(void) ple_window_shrink = 0; } - if (!cpu_has_vmx_apicv()) { + if (!cpu_has_vmx_apicv()) enable_apicv = 0; + if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; - } if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; @@ -7780,16 +7831,8 @@ static __init int hardware_setup(void) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); - if (!enable_ept) - ept_lpage_level = 0; - else if (cpu_has_vmx_ept_1g_page()) - ept_lpage_level = PG_LEVEL_1G; - else if (cpu_has_vmx_ept_2m_page()) - ept_lpage_level = PG_LEVEL_2M; - else - ept_lpage_level = PG_LEVEL_4K; kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), - ept_lpage_level); + ept_caps_to_lpage_level(vmx_capability.ept)); /* * Only enable PML when hardware supports PML feature, and both EPT |