diff options
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r-- | arch/x86/kvm/vmx/capabilities.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 20 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/posted_intr.c | 183 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/posted_intr.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 68 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 3 |
6 files changed, 140 insertions, 146 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index c8029b7845b6..959b59d13b5a 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -5,6 +5,7 @@ #include <asm/vmx.h> #include "lapic.h" +#include "x86.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -389,6 +390,9 @@ static inline u64 vmx_get_perf_capabilities(void) { u64 perf_cap = 0; + if (!enable_pmu) + return perf_cap; + if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5e0ac57d6d1b..466d18fc0c5d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -21,7 +21,6 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) static struct kvm_event_hw_type_mapping intel_arch_events[] = { - /* Index must match CPUID 0x0A.EBX bit vector */ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, @@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + /* The above index must match CPUID 0x0A.EBX bit vector */ [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, }; @@ -75,11 +75,17 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; - for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) - if (intel_arch_events[i].eventsel == event_select && - intel_arch_events[i].unit_mask == unit_mask && - (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i))) - break; + for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) { + if (intel_arch_events[i].eventsel != event_select || + intel_arch_events[i].unit_mask != unit_mask) + continue; + + /* disable event that reported as not present by cpuid */ + if ((i < 7) && !(pmu->available_event_types & (1 << i))) + return PERF_COUNT_HW_MAX + 1; + + break; + } if (i == ARRAY_SIZE(intel_arch_events)) return PERF_COUNT_HW_MAX; @@ -481,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) + if (!entry || !enable_pmu) return; eax.full = entry->eax; edx.full = entry->edx; diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 88c53c521094..aa1fe9085d77 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -19,7 +19,7 @@ * wake the target vCPUs. vCPUs are removed from the list and the notification * vector is reset when the vCPU is scheduled in. */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); /* * Protect the per-CPU list with a per-CPU spinlock to handle task migration. * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the @@ -27,7 +27,7 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will * occur if a wakeup IRQ arrives and attempts to acquire the lock. */ -static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); +static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { @@ -51,7 +51,9 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; + unsigned long flags; unsigned int dest; /* @@ -62,23 +64,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!enable_apicv || !lapic_in_kernel(vcpu)) return; - /* Nothing to do if PI.SN and PI.NDST both have the desired value. */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + /* + * If the vCPU wasn't on the wakeup list and wasn't migrated, then the + * full update can be skipped as neither the vector nor the destination + * needs to be changed. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { + /* + * Clear SN if it was set due to being preempted. Again, do + * this even if there is no assigned device for simplicity. + */ + if (pi_test_and_clear_sn(pi_desc)) + goto after_clear_sn; return; + } + + local_irq_save(flags); /* - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block is the one expected to change PID.NDST and the - * wakeup handler expects the vCPU to be on the blocked_vcpu_list that - * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up - * correctly. + * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup + * list of the _previous_ pCPU, which will not be the same as the + * current pCPU if the task was migrated. */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - goto after_clear_sn; + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_del(&vmx->pi_wakeup_list); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); } - /* The full case. Set the new destination and clear SN. */ dest = cpu_physical_id(cpu); if (!x2apic_mode) dest = (dest << 8) & 0xFF00; @@ -86,10 +99,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) do { old.control = new.control = READ_ONCE(pi_desc->control); + /* + * Clear SN (as above) and refresh the destination APIC ID to + * handle task migration (@cpu != vcpu->cpu). + */ new.ndst = dest; new.sn = 0; + + /* + * Restore the notification vector; in the blocking case, the + * descriptor was modified on "put" to use the wakeup vector. + */ + new.nv = POSTED_INTR_VECTOR; } while (pi_try_set_control(pi_desc, old.control, new.control)); + local_irq_restore(flags); + after_clear_sn: /* @@ -111,83 +136,25 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) irq_remapping_cap(IRQ_POSTING_CAP); } -void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!vmx_can_use_vtd_pi(vcpu->kvm)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - -static void __pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - /* - * Remove the vCPU from the wakeup list of the _previous_ pCPU, which - * will not be the same as the current pCPU if the task was migrated. - */ - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - - dest = cpu_physical_id(vcpu->cpu); - if (!x2apic_mode) - dest = (dest << 8) & 0xFF00; - - WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the vCPU was blocking"); - - do { - old.control = new.control = READ_ONCE(pi_desc->control); - - new.ndst = dest; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); - - vcpu->pre_pcpu = -1; -} - /* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * + * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set + * WAKEUP as the notification vector in the PI descriptor. */ -int pi_pre_block(struct kvm_vcpu *vcpu) +static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { - struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct pi_desc old, new; unsigned long flags; - if (!vmx_can_use_vtd_pi(vcpu->kvm) || - vmx_interrupt_blocked(vcpu)) - return 0; - local_irq_save(flags); - vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_add_tail(&vmx->pi_wakeup_list, + &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - WARN(pi_desc->sn == 1, - "Posted Interrupt Suppress Notification set before blocking"); + WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); do { old.control = new.control = READ_ONCE(pi_desc->control); @@ -196,24 +163,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu) new.nv = POSTED_INTR_WAKEUP_VECTOR; } while (pi_try_set_control(pi_desc, old.control, new.control)); - /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc)) - __pi_post_block(vcpu); + /* + * Send a wakeup IPI to this CPU if an interrupt may have been posted + * before the notification vector was updated, in which case the IRQ + * will arrive on the non-wakeup vector. An IPI is needed as calling + * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not + * enabled until it is safe to call try_to_wake_up() on the task being + * scheduled out). + */ + if (pi_test_on(&new)) + apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); local_irq_restore(flags); - return (vcpu->pre_pcpu == -1); } -void pi_post_block(struct kvm_vcpu *vcpu) +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { - unsigned long flags; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (vcpu->pre_pcpu == -1) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return; - local_irq_save(flags); - __pi_post_block(vcpu); - local_irq_restore(flags); + if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) + pi_enable_wakeup_handler(vcpu); + + /* + * Set SN when the vCPU is preempted. Note, the vCPU can both be seen + * as blocking and preempted, e.g. if it's preempted between setting + * its wait state and manually scheduling out. + */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } /* @@ -221,24 +201,23 @@ void pi_post_block(struct kvm_vcpu *vcpu) */ void pi_wakeup_handler(void) { - struct kvm_vcpu *vcpu; int cpu = smp_processor_id(); + struct vcpu_vmx *vmx; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); + list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu), + pi_wakeup_list) { - if (pi_test_on(pi_desc)) - kvm_vcpu_kick(vcpu); + if (pi_test_on(&vmx->pi_desc)) + kvm_vcpu_wake_up(&vmx->vcpu); } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } void __init pi_init_cpu(int cpu) { - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu)); + raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) @@ -254,7 +233,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) * Bail out of the block loop if the VM has an assigned * device, but the blocking vCPU didn't reconfigure the * PI.NV to the wakeup vector, i.e. the assigned device - * came along after the initial check in pi_pre_block(). + * came along after the initial check in vmx_vcpu_pi_put(). */ void vmx_pi_start_assignment(struct kvm *kvm) { diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 36ae035f14aa..eb14e76b84ef 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); @@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); -int pi_pre_block(struct kvm_vcpu *vcpu); -void pi_post_block(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1b2e9d8c5cc9..4ac676066d60 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3931,12 +3931,10 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) pt_update_intercept_for_msr(vcpu); } -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - bool nested) +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + int pi_vec) { #ifdef CONFIG_SMP - int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; - if (vcpu->mode == IN_GUEST_MODE) { /* * The vector of interrupt to be delivered to vcpu had @@ -3964,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, */ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return true; + return; } #endif - return false; + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); } static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, @@ -3997,8 +4000,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, smp_mb__after_atomic(); /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) - kvm_vcpu_kick(vcpu); + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); return 0; } return -1; @@ -4035,9 +4037,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) - kvm_vcpu_kick(vcpu); - + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); return 0; } @@ -5426,6 +5426,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } +static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->emulation_required && !vmx->rmode.vm86_active && + vcpu->arch.exception.pending; +} + static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5445,8 +5453,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) { + if (vmx_emulation_required_with_pending_exception(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5468,6 +5475,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) return 1; } +static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (vmx_emulation_required_with_pending_exception(vcpu)) { + kvm_prepare_emulation_failure_exit(vcpu); + return 0; + } + + return 1; +} + static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6928,6 +6945,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); + INIT_LIST_HEAD(&vmx->pi_wakeup_list); + err = -ENOMEM; vmx->vpid = allocate_vpid(); @@ -7549,25 +7568,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - if (pi_pre_block(vcpu)) - return 1; - - if (kvm_lapic_hv_timer_in_use(vcpu)) - kvm_lapic_switch_to_sw_timer(vcpu); - - return 0; -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - if (kvm_x86_ops.set_hv_timer) - kvm_lapic_switch_to_hv_timer(vcpu); - - pi_post_block(vcpu); -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7710,6 +7710,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .tlb_flush_gva = vmx_flush_tlb_gva, .tlb_flush_guest = vmx_flush_tlb_guest, + .vcpu_pre_run = vmx_vcpu_pre_run, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, @@ -7768,9 +7769,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f8fc7441baea..7f2c82e7f38f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -317,6 +317,9 @@ struct vcpu_vmx { /* Posted interrupt descriptor */ struct pi_desc pi_desc; + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; |