From 0f107682cb0398265d80237c353a6fa93161d219 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 28 Sep 2017 18:06:24 -0700 Subject: KVM: VMX: Don't expose PLE enable if there is no hardware support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM doesn't expose the PLE capability to the L1 hypervisor, however, ple_window still shows the default value on L1 hypervisor. This patch fixes it by clearing all the PLE related module parameter if there is no PLE capability. Reviewed-by: Konrad Rzeszutek Wilk Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 95a01609d7ee..941185e0606b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6779,8 +6779,13 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); - if (!cpu_has_vmx_ple()) + if (!cpu_has_vmx_ple()) { ple_gap = 0; + ple_window = 0; + ple_window_grow = 0; + ple_window_max = 0; + ple_window_shrink = 0; + } if (!cpu_has_vmx_apicv()) { enable_apicv = 0; -- cgit From c69518c86be5000bf4d8eb0e0d279d40779d2a18 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 5 Oct 2017 03:53:51 -0700 Subject: KVM: LAPIC: Fix lapic timer mode transition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SDM 10.5.4.1 TSC-Deadline Mode mentioned that "Transitioning between TSC-Deadline mode and other timer modes also disarms the timer". So the APIC Timer Initial Count Register for one-shot/periodic mode should be reset. This patch do it. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li [Removed unnecessary definition of APIC_LVT_TIMER_MASK.] Signed-off-by: Radim Krčmář --- arch/x86/kvm/lapic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 69c5612be786..6723e2ca3d9d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1307,6 +1307,9 @@ static void apic_update_lvtt(struct kvm_lapic *apic) apic->lapic_timer.timer_mode_mask; if (apic->lapic_timer.timer_mode != timer_mode) { + if (apic_lvtt_tscdeadline(apic) != (timer_mode == + APIC_LVT_TIMER_TSCDEADLINE)) + kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.timer_mode = timer_mode; hrtimer_cancel(&apic->lapic_timer.timer); } -- cgit From ccbfa1d39b9e6d6e578176d091362287c53c7d4d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 5 Oct 2017 18:54:24 -0700 Subject: KVM: LAPIC: Introduce limit_periodic_timer_frequency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the logic of limit lapic periodic timer frequency to a new function, this function will be used by later patches. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/kvm/lapic.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6723e2ca3d9d..8841bb539c90 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1301,6 +1301,27 @@ static void update_divide_count(struct kvm_lapic *apic) apic->divide_count); } +static void limit_periodic_timer_frequency(struct kvm_lapic *apic) +{ + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } + } +} + static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & @@ -1445,23 +1466,7 @@ static bool set_target_expiration(struct kvm_lapic *apic) if (!apic->lapic_timer.period) return false; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (apic_lvtt_period(apic)) { - s64 min_period = min_timer_period_us * 1000LL; - - if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " - "lapic timer period limited to %lld ns\n", - apic->vcpu->vcpu_id, - apic->lapic_timer.period, min_period); - apic->lapic_timer.period = min_period; - } - } + limit_periodic_timer_frequency(apic); apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " -- cgit From dedf9c5e216902c6d34b5a0d0c40f4acbb3706d8 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 5 Oct 2017 18:54:25 -0700 Subject: KVM: LAPIC: Keep timer running when switching between one-shot and periodic mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we take TSC-deadline mode timer out of the picture, the Intel SDM does not say that the timer is disable when the timer mode is change, either from one-shot to periodic or vice versa. After this patch, the timer is no longer disarmed on change of mode, so the counter (TMCCT) keeps counting down. So what does a write to LVTT changes ? On baremetal, the change of mode is probably taken into account only when the counter reach 0. When this happen, LVTT is use to figure out if the counter should restard counting down from TMICT (so periodic mode) or stop counting (if one-shot mode). This patch is based on observation of the behavior of the APIC timer on baremetal as well as check that they does not go against the description written in the Intel SDM. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li [Fixed rate limiting of periodic timer.] Signed-off-by: Radim Krčmář --- arch/x86/kvm/lapic.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8841bb539c90..f7ad11255557 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1308,7 +1308,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) * interval, since the hrtimers are not throttled by the host * scheduler. */ - if (apic_lvtt_period(apic)) { + if (apic_lvtt_period(apic) && apic->lapic_timer.period) { s64 min_period = min_timer_period_us * 1000LL; if (apic->lapic_timer.period < min_period) { @@ -1329,10 +1329,12 @@ static void apic_update_lvtt(struct kvm_lapic *apic) if (apic->lapic_timer.timer_mode != timer_mode) { if (apic_lvtt_tscdeadline(apic) != (timer_mode == - APIC_LVT_TIMER_TSCDEADLINE)) + APIC_LVT_TIMER_TSCDEADLINE)) { kvm_lapic_set_reg(apic, APIC_TMICT, 0); + hrtimer_cancel(&apic->lapic_timer.timer); + } apic->lapic_timer.timer_mode = timer_mode; - hrtimer_cancel(&apic->lapic_timer.timer); + limit_periodic_timer_frequency(apic); } } -- cgit From c301b909e4b94e6395251787e28b335c51309fff Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 6 Oct 2017 07:38:32 -0700 Subject: KVM: LAPIC: Apply change to TDCR right away to the timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The description in the Intel SDM of how the divide configuration register is used: "The APIC timer frequency will be the processor's bus clock or core crystal clock frequency divided by the value specified in the divide configuration register." Observation of baremetal shown that when the TDCR is change, the TMCCT does not change or make a big jump in value, but the rate at which it count down change. The patch update the emulation to APIC timer to so that a change to the divide configuration would be reflected in the value of the counter and when the next interrupt is triggered. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li [Fixed some whitespace and added a check for negative delta and running timer. - Radim] Signed-off-by: Radim Krčmář --- arch/x86/kvm/lapic.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f7ad11255557..2c9e88a82738 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1456,6 +1456,30 @@ static void start_sw_period(struct kvm_lapic *apic) HRTIMER_MODE_ABS_PINNED); } +static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) +{ + ktime_t now, remaining; + u64 ns_remaining_old, ns_remaining_new; + + apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + limit_periodic_timer_frequency(apic); + + now = ktime_get(); + remaining = ktime_sub(apic->lapic_timer.target_expiration, now); + if (ktime_to_ns(remaining) < 0) + remaining = 0; + + ns_remaining_old = ktime_to_ns(remaining); + ns_remaining_new = mul_u64_u32_div(ns_remaining_old, + apic->divide_count, old_divisor); + + apic->lapic_timer.tscdeadline += + nsec_to_cycles(apic->vcpu, ns_remaining_new) - + nsec_to_cycles(apic->vcpu, ns_remaining_old); + apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new); +} + static bool set_target_expiration(struct kvm_lapic *apic) { ktime_t now; @@ -1748,13 +1772,21 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) start_apic_timer(apic); break; - case APIC_TDCR: + case APIC_TDCR: { + uint32_t old_divisor = apic->divide_count; + if (val & 4) apic_debug("KVM_WRITE:TDCR %x\n", val); kvm_lapic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); + if (apic->divide_count != old_divisor && + apic->lapic_timer.period) { + hrtimer_cancel(&apic->lapic_timer.timer); + update_target_expiration(apic, old_divisor); + restart_apic_timer(apic); + } break; - + } case APIC_ESR: if (apic_x2apic_mode(apic) && val != 0) { apic_debug("KVM_WRITE:ESR not zero %x\n", val); -- cgit From bb606a9b802584f753f48b6e3369a3026bf5a824 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:23 +0200 Subject: KVM: x86: mmu: returning void in a void function is strange MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's just drop the return. Reviewed-by: Radim Krčmář Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7a69cf053711..2586c7fd0dea 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2424,7 +2424,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) { - return __shadow_walk_next(iterator, *iterator->sptep); + __shadow_walk_next(iterator, *iterator->sptep); } static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, -- cgit From 87ca74ad920ccc8b9fd1c00bd7177070f17ebc95 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:24 +0200 Subject: KVM: x86: mmu: free_page can handle NULL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Radim Krčmář Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2586c7fd0dea..3b7f94715c57 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4993,8 +4993,7 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu.pae_root); - if (vcpu->arch.mmu.lm_root != NULL) - free_page((unsigned long)vcpu->arch.mmu.lm_root); + free_page((unsigned long)vcpu->arch.mmu.lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) -- cgit From 26de7988499b96f3bca09e5bdf16c0f46541fcc6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:25 +0200 Subject: KVM: x86: drop BUG_ON(vcpu->kvm) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit And also get rid of that superfluous local variable "kvm". Reviewed-by: Radim Krčmář Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03869eb7fcd6..69be798b40af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7974,16 +7974,12 @@ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; - struct kvm *kvm; int r; - BUG_ON(vcpu->kvm == NULL); - kvm = vcpu->kvm; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; - if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; @@ -8001,7 +7997,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (r < 0) goto fail_free_pio_data; - if (irqchip_in_kernel(kvm)) { + if (irqchip_in_kernel(vcpu->kvm)) { r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; -- cgit From 12d79917a4d67efd4a80dcd57748da10f890508a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:26 +0200 Subject: KVM: VMX: vmx_vcpu_setup() cannot fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make it a void and drop error handling code. Reviewed-by: Radim Krčmář Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 941185e0606b..ecac113fd0db 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5426,7 +5426,7 @@ static void ept_set_mmio_spte_mask(void) /* * Sets up the vmcs for emulated real mode. */ -static int vmx_vcpu_setup(struct vcpu_vmx *vmx) +static void vmx_vcpu_setup(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 unsigned long a; @@ -5539,8 +5539,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } - - return 0; } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9564,11 +9562,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); vmx->vcpu.cpu = cpu; - err = vmx_vcpu_setup(vmx); + vmx_vcpu_setup(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); - if (err) - goto free_vmcs; if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { err = alloc_apic_access_page(kvm); if (err) -- cgit From f2d1da696f515a4ad2df3f03ad922306867c391a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:27 +0200 Subject: KVM: x86: no need to inititalize vcpu members to 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vmx and svm use zalloc, so this is not necessary. Reviewed-by: Radim Krčmář Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 69be798b40af..632561b2a3f6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7977,7 +7977,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) int r; vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); - vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -8019,10 +8018,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) fx_init(vcpu); - vcpu->arch.ia32_tsc_adjust_msr = 0x0; - vcpu->arch.pv_time_enabled = false; - - vcpu->arch.guest_supported_xcr0 = 0; vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); -- cgit From 0e1252dc46b9c9a805d704cf3a123132ebb0f8cf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:28 +0200 Subject: KVM: VMX: drop enable_ept check from ept_sync_context() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is only called with enable_ept. Reviewed-by: Radim Krčmář Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ecac113fd0db..c757070a32a7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1604,12 +1604,10 @@ static inline void ept_sync_global(void) static inline void ept_sync_context(u64 eptp) { - if (enable_ept) { - if (cpu_has_vmx_invept_context()) - __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); - else - ept_sync_global(); - } + if (cpu_has_vmx_invept_context()) + __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + else + ept_sync_global(); } static __always_inline void vmcs_check16(unsigned long field) -- cgit From fdf288bf726a491ff1e300c245194be48a6882fd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:29 +0200 Subject: KVM: VMX: call ept_sync_global() with enable_ept only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ept_* function should only be called with enable_ept being set. Reviewed-by: Radim Krčmář Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c757070a32a7..7d944d3feb8a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3542,7 +3542,8 @@ static int hardware_enable(void) wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } kvm_cpu_vmxon(phys_addr); - ept_sync_global(); + if (enable_ept) + ept_sync_global(); return 0; } -- cgit From f5f51586dba5bb29b21ed5d4c649f4edf72af327 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:30 +0200 Subject: KVM: VMX: require INVEPT GLOBAL for EPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, we won't be able to do any flushes, so let's just require it. Should be absent in very strange configurations. Suggested-by: Paolo Bonzini Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7d944d3feb8a..7fffdd4446cf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1598,8 +1598,7 @@ static inline void vpid_sync_context(int vpid) static inline void ept_sync_global(void) { - if (cpu_has_vmx_invept_global()) - __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); + __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); } static inline void ept_sync_context(u64 eptp) @@ -6747,7 +6746,8 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels() || - !cpu_has_vmx_ept_mt_wb()) { + !cpu_has_vmx_ept_mt_wb() || + !cpu_has_vmx_invept_global()) { enable_ept = 0; enable_unrestricted_guest = 0; enable_ept_ad_bits = 0; -- cgit From 9522ea9ef95e03bef2a1886ef3db1bde9d384915 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:31 +0200 Subject: KVM: VMX: drop unnecessary function declarations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Radim Krčmář Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7fffdd4446cf..40a54649d0d2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -900,14 +900,12 @@ static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); static bool vmx_xsaves_supported(void); -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static int alloc_identity_pagetable(struct kvm *kvm); static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); -- cgit From 0ee096d0064ca7eddf13ac5f52fd008b6133f501 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:32 +0200 Subject: KVM: nVMX: no need to set vcpu->cpu when switching vmcs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vcpu->cpu is not cleared when doing a vmx_vcpu_put/load, so this can be dropped. Reviewed-by: Radim Krčmář Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 40a54649d0d2..8cf506de30c0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9478,7 +9478,6 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; put_cpu(); } -- cgit From 1c13bffd946ebb75d7b18e57b2984d581b6b0859 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:33 +0200 Subject: KVM: nVMX: no need to set ept/vpid caps to 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit They are inititally 0, so no need to reset them to 0. Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8cf506de30c0..daf377554c1f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2826,8 +2826,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_PML; vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; } - } else - vmx->nested.nested_vmx_ept_caps = 0; + } if (cpu_has_vmx_vmfunc()) { vmx->nested.nested_vmx_secondary_ctls_high |= @@ -2851,8 +2850,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_VPID; vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | VMX_VPID_EXTENT_SUPPORTED_MASK; - } else - vmx->nested.nested_vmx_vpid_caps = 0; + } if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= -- cgit From d8a6e365b208a36f9e789ee50c55096b71367431 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:34 +0200 Subject: KVM: VMX: cleanup init_rmode_identity_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No need for another enable_ept check. kvm->arch.ept_identity_map_addr only has to be inititalized once. Having alloc_identity_pagetable() is overkill and dropping BUG_ONs is always nice. Reviewed-by: Radim Krčmář Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index daf377554c1f..400b028e5dfb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -907,7 +907,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); -static int alloc_identity_pagetable(struct kvm *kvm); static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, @@ -4775,18 +4774,18 @@ static int init_rmode_identity_map(struct kvm *kvm) kvm_pfn_t identity_map_pfn; u32 tmp; - if (!enable_ept) - return 0; - /* Protect kvm->arch.ept_identity_pagetable_done. */ mutex_lock(&kvm->slots_lock); if (likely(kvm->arch.ept_identity_pagetable_done)) goto out2; + if (!kvm->arch.ept_identity_map_addr) + kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; - r = alloc_identity_pagetable(kvm); + r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + kvm->arch.ept_identity_map_addr, PAGE_SIZE); if (r < 0) goto out2; @@ -4858,20 +4857,6 @@ out: return r; } -static int alloc_identity_pagetable(struct kvm *kvm) -{ - /* Called with kvm->slots_lock held. */ - - int r = 0; - - BUG_ON(kvm->arch.ept_identity_pagetable_done); - - r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, - kvm->arch.ept_identity_map_addr, PAGE_SIZE); - - return r; -} - static int allocate_vpid(void) { int vpid; @@ -9566,9 +9551,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) } if (enable_ept) { - if (!kvm->arch.ept_identity_map_addr) - kvm->arch.ept_identity_map_addr = - VMX_EPT_IDENTITY_PAGETABLE_ADDR; err = init_rmode_identity_map(kvm); if (err) goto free_vmcs; -- cgit From 1af1ac910bb3394ac1c0062f5781983dde40a8c0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:36 +0200 Subject: KVM: x86: allow setting identity map addr with no vcpus only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changing it afterwards doesn't make too much sense and will only result in inconsistencies. Reviewed-by: Radim Krčmář Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 632561b2a3f6..b0d291518e88 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4034,10 +4034,16 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_SET_IDENTITY_MAP_ADDR: { u64 ident_addr; + mutex_lock(&kvm->lock); + r = -EINVAL; + if (kvm->created_vcpus) + goto set_identity_unlock; r = -EFAULT; if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) - goto out; + goto set_identity_unlock; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); +set_identity_unlock: + mutex_unlock(&kvm->lock); break; } case KVM_SET_NR_MMU_PAGES: -- cgit From 736fdf72518b400321f8b20c770bfb500f829928 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 24 Aug 2017 20:51:37 +0200 Subject: KVM: VMX: rename RDSEED and RDRAND vmx ctrls to reflect exiting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's just name these according to the SDM. This should make it clearer that the are used to enable exiting and not the feature itself. Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/include/asm/vmx.h | 4 ++-- arch/x86/kvm/vmx.c | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index caec8417539f..8b6780751132 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -70,11 +70,11 @@ #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 -#define SECONDARY_EXEC_RDRAND 0x00000800 +#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 -#define SECONDARY_EXEC_RDSEED 0x00010000 +#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 400b028e5dfb..ffea07ede222 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3650,8 +3650,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_RDSEED | - SECONDARY_EXEC_RDRAND | + SECONDARY_EXEC_RDSEED_EXITING | + SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_ENABLE_VMFUNC; @@ -5261,13 +5261,13 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static bool vmx_rdrand_supported(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDRAND; + SECONDARY_EXEC_RDRAND_EXITING; } static bool vmx_rdseed_supported(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDSEED; + SECONDARY_EXEC_RDSEED_EXITING; } static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) @@ -5361,30 +5361,30 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (vmx_rdrand_supported()) { bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); if (rdrand_enabled) - exec_control &= ~SECONDARY_EXEC_RDRAND; + exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; if (nested) { if (rdrand_enabled) vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_RDRAND; + SECONDARY_EXEC_RDRAND_EXITING; else vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDRAND; + ~SECONDARY_EXEC_RDRAND_EXITING; } } if (vmx_rdseed_supported()) { bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); if (rdseed_enabled) - exec_control &= ~SECONDARY_EXEC_RDSEED; + exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; if (nested) { if (rdseed_enabled) vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_RDSEED; + SECONDARY_EXEC_RDSEED_EXITING; else vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDSEED; + ~SECONDARY_EXEC_RDSEED_EXITING; } } @@ -8401,9 +8401,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); case EXIT_REASON_RDRAND: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); case EXIT_REASON_RDSEED: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: -- cgit From 46bea48ac241fe0b413805952dda74dd0c09ba8b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 5 Oct 2017 18:07:24 -0700 Subject: kvm, mm: account kvm related kmem slabs to kmemcg The kvm slabs can consume a significant amount of system memory and indeed in our production environment we have observed that a lot of machines are spending significant amount of memory that can not be left as system memory overhead. Also the allocations from these slabs can be triggered directly by user space applications which has access to kvm and thus a buggy application can leak such memory. So, these caches should be accounted to kmemcg. Signed-off-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3b7f94715c57..cdedf5320145 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5475,13 +5475,13 @@ int kvm_mmu_module_init(void) pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), - 0, 0, NULL); + 0, SLAB_ACCOUNT, NULL); if (!pte_list_desc_cache) goto nomem; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", sizeof(struct kvm_mmu_page), - 0, 0, NULL); + 0, SLAB_ACCOUNT, NULL); if (!mmu_page_header_cache) goto nomem; -- cgit From 86bbc1e6d7ce016f1b2e8c29864cd7b75b00ff96 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 6 Oct 2017 19:25:53 +0200 Subject: KVM: x86: handle 0 write to TSC_DEADLINE MSR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 0 should disable the timer, but start_hv_timer will recognize it as an expired timer instead. Signed-off-by: Radim Krčmář Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2c9e88a82738..39c1ae11ce1d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1549,6 +1549,9 @@ static bool start_hv_timer(struct kvm_lapic *apic) if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) return false; + if (!ktimer->tscdeadline) + return false; + r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); if (r < 0) return false; -- cgit From 5d74a6999368ad1991491b1913bb80faf1925e67 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 6 Oct 2017 19:25:54 +0200 Subject: KVM: x86: really disarm lapic timer when clearing TMICT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit preemption timer only looks at tscdeadline and could inject already disarmed timer. Signed-off-by: Radim Krčmář Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 39c1ae11ce1d..96ade848ae0b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1489,8 +1489,10 @@ static bool set_target_expiration(struct kvm_lapic *apic) apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) * APIC_BUS_CYCLE_NS * apic->divide_count; - if (!apic->lapic_timer.period) + if (!apic->lapic_timer.period) { + apic->lapic_timer.tscdeadline = 0; return false; + } limit_periodic_timer_frequency(apic); -- cgit From 44275932589a84a24849290b0d5c22157016a5e6 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 6 Oct 2017 19:25:55 +0200 Subject: KVM: x86: thoroughly disarm LAPIC timer around TSC deadline switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our routines look at tscdeadline and period when deciding state of a timer. The timer is disarmed when switching between TSC deadline and other modes, so we should set everything to disarmed state. Signed-off-by: Radim Krčmář Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 96ade848ae0b..a778f1ae2927 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1330,8 +1330,10 @@ static void apic_update_lvtt(struct kvm_lapic *apic) if (apic->lapic_timer.timer_mode != timer_mode) { if (apic_lvtt_tscdeadline(apic) != (timer_mode == APIC_LVT_TIMER_TSCDEADLINE)) { - kvm_lapic_set_reg(apic, APIC_TMICT, 0); hrtimer_cancel(&apic->lapic_timer.timer); + kvm_lapic_set_reg(apic, APIC_TMICT, 0); + apic->lapic_timer.period = 0; + apic->lapic_timer.tscdeadline = 0; } apic->lapic_timer.timer_mode = timer_mode; limit_periodic_timer_frequency(apic); -- cgit From a554d207dc46b205dcd707888ba31b13c7cfc009 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Oct 2017 05:10:19 -0700 Subject: KVM: X86: Processor States following Reset or INIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - XCR0 is reset to 1 by RESET but not INIT - XSS is zeroed by both RESET and INIT - BNDCFGU, BND0-BND3, BNDCFGS, BNDSTATUS are zeroed by both RESET and INIT This patch does this according to SDM. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jim Mattson Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 ++ arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ffea07ede222..cbd7afa53bd0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5581,6 +5581,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); + if (kvm_mpx_supported()) + vmcs_write64(GUEST_BNDCFGS, 0); setup_msrs(vmx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b0d291518e88..4ac261000e7e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7804,18 +7804,40 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; + if (kvm_mpx_supported()) { + void *mpx_state_buffer; + + /* + * To avoid have the INIT path from kvm_apic_has_events() that be + * called with loaded FPU and does not let userspace fix the state. + */ + kvm_put_guest_fpu(vcpu); + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, + XFEATURE_MASK_BNDREGS); + if (mpx_state_buffer) + memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, + XFEATURE_MASK_BNDCSR); + if (mpx_state_buffer) + memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + } + if (!init_event) { kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; vcpu->arch.msr_misc_features_enables = 0; + + vcpu->arch.xcr0 = XFEATURE_MASK_FP; } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + vcpu->arch.ia32_xss = 0; + kvm_x86_ops->vcpu_reset(vcpu, init_event); } -- cgit From 8ad8182e935030f5af5122ce0c137529cb53b013 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 9 Oct 2017 15:51:53 -0700 Subject: KVM: VMX: Don't expose unrestricted_guest is enabled if ept is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SDM mentioned: "If either the “unrestricted guest†VM-execution control or the “mode-based execute control for EPT†VM- execution control is 1, the “enable EPT†VM-execution control must also be 1." However, we can still observe unrestricted_guest is Y after inserting the kvm-intel.ko w/ ept=N. It depends on later starts a guest in order that the function vmx_compute_secondary_exec_control() can be executed, then both the module parameter and exec control fields will be amended. This patch fixes it by amending module parameter immediately during vmcs data setup. Reviewed-by: Jim Mattson Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jim Mattson Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cbd7afa53bd0..74582375e0be 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6730,16 +6730,13 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels() || !cpu_has_vmx_ept_mt_wb() || - !cpu_has_vmx_invept_global()) { + !cpu_has_vmx_invept_global()) enable_ept = 0; - enable_unrestricted_guest = 0; - enable_ept_ad_bits = 0; - } if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; - if (!cpu_has_vmx_unrestricted_guest()) + if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) enable_unrestricted_guest = 0; if (!cpu_has_vmx_flexpriority()) -- cgit From c1bd743e54cd653bd5e7082255dc236cfd40dbf0 Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Sat, 7 Oct 2017 23:15:23 -0400 Subject: arch/x86: remove redundant null checks before kmem_cache_destroy Remove redundant null checks before calling kmem_cache_destroy. Found with make coccicheck M=arch/x86/kvm on linux-next tag next-20170929. Signed-off-by: Tim Hansen Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cdedf5320145..d7a7eafc5ade 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5463,10 +5463,8 @@ static struct shrinker mmu_shrinker = { static void mmu_destroy_caches(void) { - if (pte_list_desc_cache) - kmem_cache_destroy(pte_list_desc_cache); - if (mmu_page_header_cache) - kmem_cache_destroy(mmu_page_header_cache); + kmem_cache_destroy(pte_list_desc_cache); + kmem_cache_destroy(mmu_page_header_cache); } int kvm_mmu_module_init(void) -- cgit From 1cf53587c03025885af359da3bd0be091aa34b53 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Oct 2017 12:51:56 +0200 Subject: KVM: SVM: unconditionally wake up VCPU on IOMMU interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Checking the mode is unnecessary, and is done without a memory barrier separating the LAPIC write from the vcpu->mode read; in addition, kvm_vcpu_wake_up is already doing a check for waiters on the wait queue that has the same effect. In practice it's safe because spin_lock has full-barrier semantics on x86, but don't be too clever. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0e68f0b3cbf7..3e4337ee59cb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1034,15 +1034,12 @@ static int avic_ga_log_notifier(u32 ga_tag) } spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); - if (!vcpu) - return 0; - /* Note: * At this point, the IOMMU should have already set the pending * bit in the vAPIC backing page. So, we just need to schedule * in the vcpu. */ - if (vcpu->mode == OUTSIDE_GUEST_MODE) + if (vcpu) kvm_vcpu_wake_up(vcpu); return 0; -- cgit From d0006530576f1c7a49b2010eac7afdcb5a3613ae Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 11 Aug 2017 18:36:43 +0200 Subject: KVM: SVM: limit kvm_handle_page_fault to #PF handling It has always annoyed me a bit how SVM_EXIT_NPF is handled by pf_interception. This is also the only reason behind the under-documented need_unprotect argument to kvm_handle_page_fault. Let NPF go straight to kvm_mmu_page_fault, just like VMX does in handle_ept_violation and handle_ept_misconfig. Reviewed-by: Brijesh Singh Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 5 ++--- arch/x86/kvm/mmu.h | 3 +-- arch/x86/kvm/svm.c | 15 +++++++++++++-- arch/x86/kvm/vmx.c | 3 +-- 4 files changed, 17 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d7a7eafc5ade..e4fb82c0a5d0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3820,8 +3820,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, - u64 fault_address, char *insn, int insn_len, - bool need_unprotect) + u64 fault_address, char *insn, int insn_len) { int r = 1; @@ -3829,7 +3828,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, default: trace_kvm_page_fault(fault_address, error_code); - if (need_unprotect && kvm_event_needs_reinjection(vcpu)) + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 64a2dbd2b1af..1092302aa16a 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -65,8 +65,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, - u64 fault_address, char *insn, int insn_len, - bool need_unprotect); + u64 fault_address, char *insn, int insn_len); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3e4337ee59cb..84f18634d87c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2141,7 +2141,18 @@ static int pf_interception(struct vcpu_svm *svm) return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, svm->vmcb->control.insn_bytes, - svm->vmcb->control.insn_len, !npt_enabled); + svm->vmcb->control.insn_len); +} + +static int npf_interception(struct vcpu_svm *svm) +{ + u64 fault_address = svm->vmcb->control.exit_info_2; + u64 error_code = svm->vmcb->control.exit_info_1; + + trace_kvm_page_fault(fault_address, error_code); + return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + svm->vmcb->control.insn_bytes, + svm->vmcb->control.insn_len); } static int db_interception(struct vcpu_svm *svm) @@ -4128,7 +4139,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MONITOR] = monitor_interception, [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, - [SVM_EXIT_NPF] = pf_interception, + [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = emulate_on_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 74582375e0be..c9214e3a01df 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5894,8 +5894,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) cr2 = vmcs_readl(EXIT_QUALIFICATION); /* EPT won't cause page fault directly */ WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); - return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0, - true); + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; -- cgit From 0234bf885236a41ef05376039f2a8ebe7028a388 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Wed, 11 Oct 2017 16:54:40 +0200 Subject: KVM: x86: introduce ISA specific SMM entry/exit callbacks Entering and exiting SMM may require ISA specific handling under certain circumstances. This commit adds two new callbacks with empty implementations. Actual functionality will be added in following commits. * pre_enter_smm() is to be called when injecting an SMM, before any SMM related vcpu state has been changed * pre_leave_smm() is to be called when emulating the RSM instruction, when the vcpu is in real mode and before any SMM related vcpu state has been restored Signed-off-by: Ladi Prosek Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/emulate.c | 9 +++++++++ arch/x86/kvm/svm.c | 15 +++++++++++++++ arch/x86/kvm/vmx.c | 15 +++++++++++++++ arch/x86/kvm/x86.c | 15 ++++++++++++++- 6 files changed, 58 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index fa2558e12024..ad38c5e918ec 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -225,6 +225,8 @@ struct x86_emulate_ops { unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); + int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase); + }; typedef u32 __attribute__((vector_size(16))) sse128_t; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c73e493adf07..23a9a5339f3f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1061,6 +1061,9 @@ struct kvm_x86_ops { void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); void (*setup_mce)(struct kvm_vcpu *vcpu); + + int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); + int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d90cdc77e077..8079d141792a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2591,6 +2591,15 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) ctxt->ops->set_msr(ctxt, MSR_EFER, efer); smbase = ctxt->ops->get_smbase(ctxt); + + /* + * Give pre_leave_smm() a chance to make ISA-specific changes to the + * vCPU state (e.g. enter guest mode) before loading state from the SMM + * state-save area. + */ + if (ctxt->ops->pre_leave_smm(ctxt, smbase)) + return X86EMUL_UNHANDLEABLE; + if (emulator_has_longmode(ctxt)) ret = rsm_load_state_64(ctxt, smbase + 0x8000); else diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 84f18634d87c..c4e9b99d48d8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5401,6 +5401,18 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } +static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +{ + /* TODO: Implement */ + return 0; +} + +static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +{ + /* TODO: Implement */ + return 0; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5511,6 +5523,9 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .deliver_posted_interrupt = svm_deliver_avic_intr, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, + + .pre_enter_smm = svm_pre_enter_smm, + .pre_leave_smm = svm_pre_leave_smm, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c9214e3a01df..1305bb65688b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11916,6 +11916,18 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEATURE_CONTROL_LMCE; } +static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +{ + /* TODO: Implement */ + return 0; +} + +static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +{ + /* TODO: Implement */ + return 0; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -12041,6 +12053,9 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { #endif .setup_mce = vmx_setup_mce, + + .pre_enter_smm = vmx_pre_enter_smm, + .pre_leave_smm = vmx_pre_leave_smm, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4ac261000e7e..9e85a69ccb12 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5281,6 +5281,11 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); } +static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5322,6 +5327,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, .set_hflags = emulator_set_hflags, + .pre_leave_smm = emulator_pre_leave_smm, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6647,13 +6653,20 @@ static void enter_smm(struct kvm_vcpu *vcpu) u32 cr0; trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); - vcpu->arch.hflags |= HF_SMM_MASK; memset(buf, 0, 512); if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) enter_smm_save_state_64(vcpu, buf); else enter_smm_save_state_32(vcpu, buf); + /* + * Give pre_enter_smm() a chance to make ISA-specific changes to the + * vCPU state (e.g. leave guest mode) after we've saved the state into + * the SMM state-save area. + */ + kvm_x86_ops->pre_enter_smm(vcpu, buf); + + vcpu->arch.hflags |= HF_SMM_MASK; kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); if (kvm_x86_ops->get_nmi_mask(vcpu)) -- cgit From 72d7b374b14d67e973bce476e4a75552478cc42d Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Wed, 11 Oct 2017 16:54:41 +0200 Subject: KVM: x86: introduce ISA specific smi_allowed callback Similar to NMI, there may be ISA specific reasons why an SMI cannot be injected into the guest. This commit adds a new smi_allowed callback to be implemented in following commits. Signed-off-by: Ladi Prosek Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 2 +- 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 23a9a5339f3f..411ddbbaeabf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1062,6 +1062,7 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); + int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); }; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c4e9b99d48d8..e3c61a32249d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5401,6 +5401,11 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } +static int svm_smi_allowed(struct kvm_vcpu *vcpu) +{ + return 1; +} + static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { /* TODO: Implement */ @@ -5524,6 +5529,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, + .smi_allowed = svm_smi_allowed, .pre_enter_smm = svm_pre_enter_smm, .pre_leave_smm = svm_pre_leave_smm, }; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1305bb65688b..156ecbaad1e6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11916,6 +11916,11 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEATURE_CONTROL_LMCE; } +static int vmx_smi_allowed(struct kvm_vcpu *vcpu) +{ + return 1; +} + static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { /* TODO: Implement */ @@ -12054,6 +12059,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .setup_mce = vmx_setup_mce, + .smi_allowed = vmx_smi_allowed, .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e85a69ccb12..693bf8d01128 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6438,7 +6438,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) } kvm_x86_ops->queue_exception(vcpu); - } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) { + } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) { vcpu->arch.smi_pending = false; enter_smm(vcpu); } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { -- cgit From 21f2d551183847bc7fbe8d866151d00cdad18752 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Wed, 11 Oct 2017 16:54:42 +0200 Subject: KVM: nVMX: set IDTR and GDTR limits when loading L1 host state Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers: "The GDTR and IDTR limits are each set to FFFFH." Signed-off-by: Ladi Prosek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 156ecbaad1e6..6f4e29b343a8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11303,6 +11303,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); + vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) -- cgit From 72e9cbdb43384ceacc49e2fb6b8c8fb7c5988778 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Wed, 11 Oct 2017 16:54:43 +0200 Subject: KVM: nVMX: fix SMI injection in guest mode Entering SMM while running in guest mode wasn't working very well because several pieces of the vcpu state were left set up for nested operation. Some of the issues observed: * L1 was getting unexpected VM exits (using L1 interception controls but running in SMM execution environment) * SMM handler couldn't write to vmx_set_cr4 because of incorrect validity checks predicated on nested.vmxon * MMU was confused (walk_mmu was still set to nested_mmu) Intel SDM actually prescribes the logical processor to "leave VMX operation" upon entering SMM in 34.14.1 Default Treatment of SMI Delivery. What we need to do is basically get out of guest mode and set nested.vmxon to false for the duration of SMM. All this completely transparent to L1, i.e. L1 is not given control and no L1 observable state changes. To avoid code duplication this commit takes advantage of the existing nested vmexit and run functionality, perhaps at the cost of efficiency. To get out of guest mode, nested_vmx_vmexit with exit_reason == -1 is called, a trick already used in vmx_leave_nested. Re-entering is cleaner, using enter_vmx_non_root_mode. This commit fixes running Windows Server 2016 with Hyper-V enabled in a VM with OVMF firmware (OVMF_CODE-need-smm.fd). Signed-off-by: Ladi Prosek Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 60 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6f4e29b343a8..c460b0b439d3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -486,6 +486,14 @@ struct nested_vmx { u64 nested_vmx_cr4_fixed1; u64 nested_vmx_vmcs_enum; u64 nested_vmx_vmfunc_controls; + + /* SMM related state */ + struct { + /* in VMX operation on SMM entry? */ + bool vmxon; + /* in guest mode on SMM entry? */ + bool guest_mode; + } smm; }; #define POSTED_INTR_ON 0 @@ -11401,8 +11409,11 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, leave_guest_mode(vcpu); if (likely(!vmx->fail)) { - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); + if (exit_reason == -1) + sync_vmcs12(vcpu, vmcs12); + else + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, vmcs12->vm_exit_msr_store_count)) @@ -11466,7 +11477,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (enable_shadow_vmcs) + if (enable_shadow_vmcs && exit_reason != -1) vmx->nested.sync_shadow_vmcs = true; /* in case we halted in L2 */ @@ -11490,12 +11501,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; } - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); + if (exit_reason != -1) + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); load_vmcs12_host_state(vcpu, vmcs12); @@ -11920,18 +11932,44 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) static int vmx_smi_allowed(struct kvm_vcpu *vcpu) { + /* we need a nested vmexit to enter SMM, postpone if run is pending */ + if (to_vmx(vcpu)->nested.nested_run_pending) + return 0; return 1; } static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { - /* TODO: Implement */ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + vmx->nested.smm.guest_mode = is_guest_mode(vcpu); + if (vmx->nested.smm.guest_mode) + nested_vmx_vmexit(vcpu, -1, 0, 0); + + vmx->nested.smm.vmxon = vmx->nested.vmxon; + vmx->nested.vmxon = false; return 0; } static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) { - /* TODO: Implement */ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int ret; + + if (vmx->nested.smm.vmxon) { + vmx->nested.vmxon = true; + vmx->nested.smm.vmxon = false; + } + + if (vmx->nested.smm.guest_mode) { + vcpu->arch.hflags &= ~HF_SMM_MASK; + ret = enter_vmx_non_root_mode(vcpu, false); + vcpu->arch.hflags |= HF_SMM_MASK; + if (ret) + return ret; + + vmx->nested.smm.guest_mode = false; + } return 0; } -- cgit From c26340651b75d649bea585eba45e32b871188e6e Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Wed, 11 Oct 2017 16:54:44 +0200 Subject: KVM: nSVM: refactor nested_svm_vmrun Analogous to 858e25c06fb0 ("kvm: nVMX: Refactor nested_vmx_run()"), this commit splits nested_svm_vmrun into two parts. The newly introduced enter_svm_guest_mode modifies the vcpu state to transition from L1 to L2, while the code left in nested_svm_vmrun handles the VMRUN instruction. Signed-off-by: Ladi Prosek Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 132 ++++++++++++++++++++++++++++------------------------- 1 file changed, 69 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e3c61a32249d..6edefabd5a82 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2924,70 +2924,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) return true; } -static bool nested_svm_vmrun(struct vcpu_svm *svm) +static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, + struct vmcb *nested_vmcb, struct page *page) { - struct vmcb *nested_vmcb; - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; - struct page *page; - u64 vmcb_gpa; - - vmcb_gpa = svm->vmcb->save.rax; - - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) - return false; - - if (!nested_vmcb_checks(nested_vmcb)) { - nested_vmcb->control.exit_code = SVM_EXIT_ERR; - nested_vmcb->control.exit_code_hi = 0; - nested_vmcb->control.exit_info_1 = 0; - nested_vmcb->control.exit_info_2 = 0; - - nested_svm_unmap(page); - - return false; - } - - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, - nested_vmcb->save.rip, - nested_vmcb->control.int_ctl, - nested_vmcb->control.event_inj, - nested_vmcb->control.nested_ctl); - - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); - - /* Clear internal status */ - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); - - /* - * Save the old vmcb, so we don't need to pick what we save, but can - * restore everything when a VMEXIT occurs - */ - hsave->save.es = vmcb->save.es; - hsave->save.cs = vmcb->save.cs; - hsave->save.ss = vmcb->save.ss; - hsave->save.ds = vmcb->save.ds; - hsave->save.gdtr = vmcb->save.gdtr; - hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.efer; - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); - hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = kvm_get_rflags(&svm->vcpu); - hsave->save.rip = kvm_rip_read(&svm->vcpu); - hsave->save.rsp = vmcb->save.rsp; - hsave->save.rax = vmcb->save.rax; - if (npt_enabled) - hsave->save.cr3 = vmcb->save.cr3; - else - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); - - copy_vmcb_control_area(hsave, vmcb); - if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; else @@ -3080,6 +3019,73 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) enable_gif(svm); mark_all_dirty(svm->vmcb); +} + +static bool nested_svm_vmrun(struct vcpu_svm *svm) +{ + struct vmcb *nested_vmcb; + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + struct page *page; + u64 vmcb_gpa; + + vmcb_gpa = svm->vmcb->save.rax; + + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); + if (!nested_vmcb) + return false; + + if (!nested_vmcb_checks(nested_vmcb)) { + nested_vmcb->control.exit_code = SVM_EXIT_ERR; + nested_vmcb->control.exit_code_hi = 0; + nested_vmcb->control.exit_info_1 = 0; + nested_vmcb->control.exit_info_2 = 0; + + nested_svm_unmap(page); + + return false; + } + + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, + nested_vmcb->save.rip, + nested_vmcb->control.int_ctl, + nested_vmcb->control.event_inj, + nested_vmcb->control.nested_ctl); + + trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, + nested_vmcb->control.intercept_cr >> 16, + nested_vmcb->control.intercept_exceptions, + nested_vmcb->control.intercept); + + /* Clear internal status */ + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); + + /* + * Save the old vmcb, so we don't need to pick what we save, but can + * restore everything when a VMEXIT occurs + */ + hsave->save.es = vmcb->save.es; + hsave->save.cs = vmcb->save.cs; + hsave->save.ss = vmcb->save.ss; + hsave->save.ds = vmcb->save.ds; + hsave->save.gdtr = vmcb->save.gdtr; + hsave->save.idtr = vmcb->save.idtr; + hsave->save.efer = svm->vcpu.arch.efer; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); + hsave->save.cr4 = svm->vcpu.arch.cr4; + hsave->save.rflags = kvm_get_rflags(&svm->vcpu); + hsave->save.rip = kvm_rip_read(&svm->vcpu); + hsave->save.rsp = vmcb->save.rsp; + hsave->save.rax = vmcb->save.rax; + if (npt_enabled) + hsave->save.cr3 = vmcb->save.cr3; + else + hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); + + copy_vmcb_control_area(hsave, vmcb); + + enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page); return true; } -- cgit From 05cade71cf3b925042569c3e8dc1fa68a2b26995 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Wed, 11 Oct 2017 16:54:45 +0200 Subject: KVM: nSVM: fix SMI injection in guest mode Entering SMM while running in guest mode wasn't working very well because several pieces of the vcpu state were left set up for nested operation. Some of the issues observed: * L1 was getting unexpected VM exits (using L1 interception controls but running in SMM execution environment) * MMU was confused (walk_mmu was still set to nested_mmu) * INTERCEPT_SMI was not emulated for L1 (KVM never injected SVM_EXIT_SMI) Intel SDM actually prescribes the logical processor to "leave VMX operation" upon entering SMM in 34.14.1 Default Treatment of SMI Delivery. AMD doesn't seem to document this but they provide fields in the SMM state-save area to stash the current state of SVM. What we need to do is basically get out of guest mode for the duration of SMM. All this completely transparent to L1, i.e. L1 is not given control and no L1 observable state changes. To avoid code duplication this commit takes advantage of the existing nested vmexit and run functionality, perhaps at the cost of efficiency. To get out of guest mode, nested_svm_vmexit is called, unchanged. Re-entering is performed using enter_svm_guest_mode. This commit fixes running Windows Server 2016 with Hyper-V enabled in a VM with OVMF firmware (OVMF_CODE-need-smm.fd). Signed-off-by: Ladi Prosek Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 58 ++++++++++++++++++++++++++++++++++++++--- arch/x86/kvm/x86.c | 3 --- 3 files changed, 58 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 411ddbbaeabf..8700b845f780 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1430,4 +1430,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #endif } +#define put_smstate(type, buf, offset, val) \ + *(type *)((buf) + (offset) - 0x7e00) = val + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6edefabd5a82..ff94552f85d0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5409,19 +5409,71 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) static int svm_smi_allowed(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + + /* Per APM Vol.2 15.22.2 "Response to SMI" */ + if (!gif_set(svm)) + return 0; + + if (is_guest_mode(&svm->vcpu) && + svm->nested.intercept & (1ULL << INTERCEPT_SMI)) { + /* TODO: Might need to set exit_info_1 and exit_info_2 here */ + svm->vmcb->control.exit_code = SVM_EXIT_SMI; + svm->nested.exit_required = true; + return 0; + } + return 1; } static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { - /* TODO: Implement */ + struct vcpu_svm *svm = to_svm(vcpu); + int ret; + + if (is_guest_mode(vcpu)) { + /* FED8h - SVM Guest */ + put_smstate(u64, smstate, 0x7ed8, 1); + /* FEE0h - SVM Guest VMCB Physical Address */ + put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb); + + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + + ret = nested_svm_vmexit(svm); + if (ret) + return ret; + } return 0; } static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) { - /* TODO: Implement */ - return 0; + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *nested_vmcb; + struct page *page; + struct { + u64 guest; + u64 vmcb; + } svm_state_save; + int ret; + + ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save, + sizeof(svm_state_save)); + if (ret) + return ret; + + if (svm_state_save.guest) { + vcpu->arch.hflags &= ~HF_SMM_MASK; + nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page); + if (nested_vmcb) + enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page); + else + ret = 1; + vcpu->arch.hflags |= HF_SMM_MASK; + } + return ret; } static struct kvm_x86_ops svm_x86_ops __ro_after_init = { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 693bf8d01128..5669af09b732 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6485,9 +6485,6 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } -#define put_smstate(type, buf, offset, val) \ - *(type *)((buf) + (offset) - 0x7e00) = val - static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) { u32 flags = 0; -- cgit From 9b8ebbdb74b5ad76b9dfd8b101af17839174b126 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 17 Aug 2017 15:03:32 +0200 Subject: KVM: x86: extend usage of RET_MMIO_PF_* constants The x86 MMU if full of code that returns 0 and 1 for retry/emulate. Use the existing RET_MMIO_PF_RETRY/RET_MMIO_PF_EMULATE enum, renaming it to drop the MMIO part. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 95 ++++++++++++++++++++++------------------------ arch/x86/kvm/paging_tmpl.h | 18 ++++----- 2 files changed, 55 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e4fb82c0a5d0..0b481cc9c725 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -150,6 +150,20 @@ module_param(dbg, bool, 0644); /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 +/* + * Return values of handle_mmio_page_fault and mmu.page_fault: + * RET_PF_RETRY: let CPU fault again on the address. + * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * + * For handle_mmio_page_fault only: + * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + */ +enum { + RET_PF_RETRY = 0, + RET_PF_EMULATE = 1, + RET_PF_INVALID = 2, +}; + struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -2794,13 +2808,13 @@ done: return ret; } -static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, - int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, - bool speculative, bool host_writable) +static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, + int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, + bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; - bool emulate = false; + int ret = RET_PF_RETRY; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2830,12 +2844,12 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, true, host_writable)) { if (write_fault) - emulate = true; + ret = RET_PF_EMULATE; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } if (unlikely(is_mmio_spte(*sptep))) - emulate = true; + ret = RET_PF_EMULATE; pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", @@ -2855,7 +2869,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, kvm_release_pfn_clean(pfn); - return emulate; + return ret; } static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2994,14 +3008,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. - * Return 1 to tell kvm to emulate it. */ if (pfn == KVM_PFN_ERR_RO_FAULT) - return 1; + return RET_PF_EMULATE; if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); - return 0; + return RET_PF_RETRY; } return -EFAULT; @@ -3286,13 +3299,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, } if (fast_page_fault(vcpu, v, level, error_code)) - return 0; + return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) - return 0; + return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) return r; @@ -3312,7 +3325,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } @@ -3659,54 +3672,38 @@ exit: return reserved; } -/* - * Return values of handle_mmio_page_fault: - * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction - * directly. - * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page - * fault path update the mmio spte. - * RET_MMIO_PF_RETRY: let CPU fault again on the address. - * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). - */ -enum { - RET_MMIO_PF_EMULATE = 1, - RET_MMIO_PF_INVALID = 2, - RET_MMIO_PF_RETRY = 0, - RET_MMIO_PF_BUG = -1 -}; - static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; if (mmio_info_in_cache(vcpu, addr, direct)) - return RET_MMIO_PF_EMULATE; + return RET_PF_EMULATE; reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); if (WARN_ON(reserved)) - return RET_MMIO_PF_BUG; + return -EINVAL; if (is_mmio_spte(spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); if (!check_mmio_spte(vcpu, spte)) - return RET_MMIO_PF_INVALID; + return RET_PF_INVALID; if (direct) addr = 0; trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); - return RET_MMIO_PF_EMULATE; + return RET_PF_EMULATE; } /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ - return RET_MMIO_PF_RETRY; + return RET_PF_RETRY; } EXPORT_SYMBOL_GPL(handle_mmio_page_fault); @@ -3756,7 +3753,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return 1; + return RET_PF_EMULATE; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3875,7 +3872,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return 1; + return RET_PF_EMULATE; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3892,13 +3889,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, } if (fast_page_fault(vcpu, gpa, level, error_code)) - return 0; + return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return 0; + return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) return r; @@ -3918,7 +3915,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } static void nonpaging_init_context(struct kvm_vcpu *vcpu, @@ -4917,25 +4914,25 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, vcpu->arch.gpa_val = cr2; } + r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, cr2, direct); - if (r == RET_MMIO_PF_EMULATE) { + if (r == RET_PF_EMULATE) { emulation_type = 0; goto emulate; } - if (r == RET_MMIO_PF_RETRY) - return 1; - if (r < 0) - return r; - /* Must be RET_MMIO_PF_INVALID. */ } - r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), - false); + if (r == RET_PF_INVALID) { + r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), + false); + WARN_ON(r == RET_PF_INVALID); + } + + if (r == RET_PF_RETRY) + return 1; if (r < 0) return r; - if (!r) - return 1; /* * Before emulating the instruction, check if the error code diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f18d1f8d332b..5abae72266b7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -593,7 +593,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, emulate; + int top_level, ret; direct_access = gw->pte_access; @@ -659,15 +659,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } clear_sp_write_flooding_count(it.sptep); - emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, gw->gfn, pfn, prefault, map_writable); + ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, + it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - return emulate; + return ret; out_gpte_changed: kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } /* @@ -762,12 +762,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (!prefault) inject_page_fault(vcpu, &walker.fault); - return 0; + return RET_PF_RETRY; } if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { shadow_page_table_clear_flood(vcpu, addr); - return 1; + return RET_PF_EMULATE; } vcpu->arch.write_fault_to_shadow_pgtable = false; @@ -789,7 +789,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, &map_writable)) - return 0; + return RET_PF_RETRY; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) return r; @@ -834,7 +834,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); - return 0; + return RET_PF_RETRY; } static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) -- cgit From cc3d967f7e32ceeb9b78dc962126ebcf1a2b24b2 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Tue, 17 Oct 2017 16:02:39 +0200 Subject: KVM: SVM: detect opening of SMI window using STGI intercept MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 05cade71cf3b ("KVM: nSVM: fix SMI injection in guest mode") made KVM mask SMI if GIF=0 but it didn't do anything to unmask it when GIF is enabled. The issue manifests for me as a significantly longer boot time of Windows guests when running with SMM-enabled OVMF. This commit fixes it by intercepting STGI instead of requesting immediate exit if the reason why SMM was masked is GIF. Fixes: 05cade71cf3b ("KVM: nSVM: fix SMI injection in guest mode") Signed-off-by: Ladi Prosek Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 16 +++++++++++++++- arch/x86/kvm/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 22 ++++++++++++++-------- 4 files changed, 36 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8700b845f780..7233445a20bd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1065,6 +1065,7 @@ struct kvm_x86_ops { int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); + int (*enable_smi_window)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ff94552f85d0..b71daed3cca2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3187,7 +3187,7 @@ static int stgi_interception(struct vcpu_svm *svm) /* * If VGIF is enabled, the STGI intercept is only added to - * detect the opening of the NMI window; remove it now. + * detect the opening of the SMI/NMI window; remove it now. */ if (vgif_enabled(svm)) clr_intercept(svm, INTERCEPT_STGI); @@ -5476,6 +5476,19 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) return ret; } +static int enable_smi_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!gif_set(svm)) { + if (vgif_enabled(svm)) + set_intercept(svm, INTERCEPT_STGI); + /* STGI will cause a vm exit */ + return 1; + } + return 0; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5590,6 +5603,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .smi_allowed = svm_smi_allowed, .pre_enter_smm = svm_pre_enter_smm, .pre_leave_smm = svm_pre_leave_smm, + .enable_smi_window = enable_smi_window, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c460b0b439d3..69d45734091f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11973,6 +11973,11 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) return 0; } +static int enable_smi_window(struct kvm_vcpu *vcpu) +{ + return 0; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -12102,6 +12107,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .smi_allowed = vmx_smi_allowed, .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, + .enable_smi_window = enable_smi_window, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5669af09b732..3b51c8659741 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6892,17 +6892,23 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (inject_pending_event(vcpu, req_int_win) != 0) req_immediate_exit = true; else { - /* Enable NMI/IRQ window open exits if needed. + /* Enable SMI/NMI/IRQ window open exits if needed. * - * SMIs have two cases: 1) they can be nested, and - * then there is nothing to do here because RSM will - * cause a vmexit anyway; 2) or the SMI can be pending - * because inject_pending_event has completed the - * injection of an IRQ or NMI from the previous vmexit, - * and then we request an immediate exit to inject the SMI. + * SMIs have three cases: + * 1) They can be nested, and then there is nothing to + * do here because RSM will cause a vmexit anyway. + * 2) There is an ISA-specific reason why SMI cannot be + * injected, and the moment when this changes can be + * intercepted. + * 3) Or the SMI can be pending because + * inject_pending_event has completed the injection + * of an IRQ or NMI from the previous vmexit, and + * then we request an immediate exit to inject the + * SMI. */ if (vcpu->arch.smi_pending && !is_smm(vcpu)) - req_immediate_exit = true; + if (!kvm_x86_ops->enable_smi_window(vcpu)) + req_immediate_exit = true; if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) -- cgit From 575b3a2cb439b03fd603ea77c73c76f3ed237596 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 19 Oct 2017 07:00:34 +0800 Subject: KVM: nVMX: Fix EPT switching advertising MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I can use vmxcap tool to observe "EPTP Switching yes" even if EPT is not exposed to L1. EPT switching is advertised unconditionally since it is emulated, however, it can be treated as an extended feature for EPT and it should not be advertised if EPT itself is not exposed. This patch fixes it. Reviewed-by: David Hildenbrand Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jim Mattson Signed-off-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 69d45734091f..dba0f6ad4e57 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2842,8 +2842,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * Advertise EPTP switching unconditionally * since we emulate it */ - vmx->nested.nested_vmx_vmfunc_controls = - VMX_VMFUNC_EPTP_SWITCHING; + if (enable_ept) + vmx->nested.nested_vmx_vmfunc_controls = + VMX_VMFUNC_EPTP_SWITCHING; } /* -- cgit From 61f1dd9099aba56b7e6e3c3c4b9ad13199bba06e Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 18 Oct 2017 16:02:19 -0700 Subject: KVM: VMX: Fix VPID capability detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In my setup, EPT is not exposed to L1, the VPID capability is exposed and can be observed by vmxcap tool in L1: INVVPID supported yes Individual-address INVVPID yes Single-context INVVPID yes All-context INVVPID yes Single-context-retaining-globals INVVPID yes However, the module parameter of VPID observed in L1 is always N, the cpu_has_vmx_invvpid() check in L1 KVM fails since vmx_capability.vpid is 0 and it is not read from MSR due to EPT is not exposed. The VPID can be used to tag linear mappings when EPT is not enabled. However, current logic just detects VPID capability if EPT is enabled, this patch fixes it. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jim Mattson Signed-off-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index dba0f6ad4e57..e6c8ffa84968 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3681,14 +3681,25 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, + &vmx_capability.ept, &vmx_capability.vpid); + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); - rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, - vmx_capability.ept, vmx_capability.vpid); + } else if (vmx_capability.ept) { + vmx_capability.ept = 0; + pr_warn_once("EPT CAP should not exist if not support " + "1-setting enable EPT VM-execution control\n"); + } + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && + vmx_capability.vpid) { + vmx_capability.vpid = 0; + pr_warn_once("VPID CAP should not exist if not support " + "1-setting enable VPID VM-execution control\n"); } min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; -- cgit From 9ffd986c6e4e59c11857cbc78e4217e9569f3725 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 19 Oct 2017 06:47:56 -0700 Subject: KVM: X86: #GP when guest attempts to write MCi_STATUS register w/o 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both Intel SDM and AMD APM mentioned that MCi_STATUS, when the register is implemented, this register can be cleared by explicitly writing 0s to this register. Writing 1s to this register will cause a general-protection exception. The mce is emulated in qemu, so just the guest attempts to write 1 to this register should cause a #GP, this patch does it. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jim Mattson Signed-off-by: Wanpeng Li Reviewed-by: Jim Mattson Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b51c8659741..34c85aa2e2d1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2006,10 +2006,12 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } -static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) +static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 msr = msr_info->index; + u64 data = msr_info->data; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -2034,6 +2036,9 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) if ((offset & 0x3) == 0 && data != 0 && (data | (1 << 10)) != ~(u64)0) return -1; + if (!msr_info->host_initiated && + (offset & 0x3) == 1 && data != 0) + return -1; vcpu->arch.mce_banks[offset] = data; break; } @@ -2283,7 +2288,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return set_msr_mce(vcpu, msr, data); + return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: -- cgit