diff options
Diffstat (limited to 'arch/x86/kvm/pmu.c')
| -rw-r--r-- | arch/x86/kvm/pmu.c | 581 |
1 files changed, 443 insertions, 138 deletions
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 1690d41c1830..487ad19a236e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -26,21 +26,31 @@ /* This is enough to filter the vast majority of currently defined events. */ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 +/* Unadultered PMU capabilities of the host, i.e. of hardware. */ +static struct x86_pmu_capability __read_mostly kvm_host_pmu; + +/* KVM's PMU capabilities, i.e. the intersection of KVM and hardware support. */ struct x86_pmu_capability __read_mostly kvm_pmu_cap; -EXPORT_SYMBOL_GPL(kvm_pmu_cap); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_cap); + +struct kvm_pmu_emulated_event_selectors { + u64 INSTRUCTIONS_RETIRED; + u64 BRANCH_INSTRUCTIONS_RETIRED; +}; +static struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; /* Precise Distribution of Instructions Retired (PDIR) */ static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + X86_MATCH_VFM(INTEL_ICELAKE_D, NULL), + X86_MATCH_VFM(INTEL_ICELAKE_X, NULL), /* Instruction-Accurate PDIR (PDIR++) */ - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL), {} }; /* Precise Distribution (PDist) */ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL), {} }; @@ -66,8 +76,8 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { * code. Each pmc, stored in kvm_pmc.idx field, is unique across * all perf counters (both gp and fixed). The mapping relationship * between pmc and perf counters is as the following: - * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters - * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed + * * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters + * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters */ @@ -93,17 +103,54 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) +void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) { - return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc); -} + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; -static void kvm_pmi_trigger_fn(struct irq_work *irq_work) -{ - struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); - struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + /* + * Hybrid PMUs don't play nice with virtualization without careful + * configuration by userspace, and KVM's APIs for reporting supported + * vPMU features do not account for hybrid PMUs. Disable vPMU support + * for hybrid PMUs until KVM gains a way to let userspace opt-in. + */ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { + enable_pmu = false; + memset(&kvm_host_pmu, 0, sizeof(kvm_host_pmu)); + } else { + perf_get_x86_pmu_capability(&kvm_host_pmu); + } + + if (enable_pmu) { + /* + * WARN if perf did NOT disable hardware PMU if the number of + * architecturally required GP counters aren't present, i.e. if + * there are a non-zero number of counters, but fewer than what + * is architecturally required. + */ + if (!kvm_host_pmu.num_counters_gp || + WARN_ON_ONCE(kvm_host_pmu.num_counters_gp < min_nr_gp_ctrs)) + enable_pmu = false; + else if (is_intel && !kvm_host_pmu.version) + enable_pmu = false; + } + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + return; + } - kvm_pmu_deliver_pmi(vcpu); + memcpy(&kvm_pmu_cap, &kvm_host_pmu, sizeof(kvm_host_pmu)); + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_MAX_NR_FIXED_COUNTERS); + + kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); + kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) @@ -129,20 +176,7 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); } - if (!pmc->intr || skip_pmi) - return; - - /* - * Inject PMI. If vcpu was in a guest mode during NMI PMI - * can be ejected on a guest mode re-entry. Otherwise we can't - * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until - * woken up. So we should wake it, but this is impossible from - * NMI context. Do it from irq work instead. - */ - if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu)) - irq_work_queue(&pmc_to_pmu(pmc)->irq_work); - else + if (pmc->intr && !skip_pmi) kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } @@ -153,9 +187,9 @@ static void kvm_perf_overflow(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; /* - * Ignore overflow events for counters that are scheduled to be - * reprogrammed, e.g. if a PMI for the previous event races with KVM's - * handling of a related guest WRMSR. + * Ignore asynchronous overflow events for counters that are scheduled + * to be reprogrammed, e.g. if a PMI for the previous event races with + * KVM's handling of a related guest WRMSR. */ if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi)) return; @@ -187,6 +221,15 @@ static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc) return 1; } +static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value) +{ + u64 sample_period = (-counter_value) & pmc_bitmask(pmc); + + if (!sample_period) + sample_period = pmc_bitmask(pmc) + 1; + return sample_period; +} + static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) @@ -208,7 +251,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, attr.sample_period = get_sample_period(pmc, pmc->counter); if ((attr.config & HSW_IN_TX_CHECKPOINTED) && - guest_cpuid_is_intel(pmc->vcpu)) { + (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) { /* * HSW_IN_TX_CHECKPOINTED is not supported with nonzero * period. Just clear the sample period so at least @@ -241,17 +284,30 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, return 0; } -static void pmc_pause_counter(struct kvm_pmc *pmc) +static bool pmc_pause_counter(struct kvm_pmc *pmc) { u64 counter = pmc->counter; - - if (!pmc->perf_event || pmc->is_paused) - return; + u64 prev_counter; /* update counter, reset event value to avoid redundant accumulation */ - counter += perf_event_pause(pmc->perf_event, true); + if (pmc->perf_event && !pmc->is_paused) + counter += perf_event_pause(pmc->perf_event, true); + + /* + * Snapshot the previous counter *after* accumulating state from perf. + * If overflow already happened, hardware (via perf) is responsible for + * generating a PMI. KVM just needs to detect overflow on emulated + * counter events that haven't yet been processed. + */ + prev_counter = counter & pmc_bitmask(pmc); + + counter += pmc->emulated_counter; pmc->counter = counter & pmc_bitmask(pmc); + + pmc->emulated_counter = 0; pmc->is_paused = true; + + return pmc->counter < prev_counter; } static bool pmc_resume_counter(struct kvm_pmc *pmc) @@ -276,6 +332,51 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } +static void pmc_release_perf_event(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + pmc->current_config = 0; + pmc_to_pmu(pmc)->event_count--; + } +} + +static void pmc_stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = pmc_read_counter(pmc); + pmc_release_perf_event(pmc); + } +} + +static void pmc_update_sample_period(struct kvm_pmc *pmc) +{ + if (!pmc->perf_event || pmc->is_paused || + !is_sampling_event(pmc->perf_event)) + return; + + perf_event_period(pmc->perf_event, + get_sample_period(pmc, pmc->counter)); +} + +void pmc_write_counter(struct kvm_pmc *pmc, u64 val) +{ + /* + * Drop any unconsumed accumulated counts, the WRMSR is a write, not a + * read-modify-write. Adjust the counter value so that its value is + * relative to the current count, as reading the current count from + * perf is faster than pausing and repgrogramming the event in order to + * reset it to '0'. Note, this very sneakily offsets the accumulated + * emulated count too, by using pmc_read_counter()! + */ + pmc->emulated_counter = 0; + pmc->counter += val - pmc_read_counter(pmc); + pmc->counter &= pmc_bitmask(pmc); + pmc_update_sample_period(pmc); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(pmc_write_counter); + static int filter_cmp(const void *pa, const void *pb, u64 mask) { u64 a = *(u64 *)pa & mask; @@ -370,7 +471,7 @@ static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f, static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, int idx) { - int fixed_idx = idx - INTEL_PMC_IDX_FIXED; + int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX; if (filter->action == KVM_PMU_EVENT_DENY && test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) @@ -382,14 +483,11 @@ static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, return true; } -static bool check_pmu_event_filter(struct kvm_pmc *pmc) +static bool pmc_is_event_allowed(struct kvm_pmc *pmc) { struct kvm_x86_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; - if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) - return false; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (!filter) return true; @@ -400,25 +498,21 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) return is_fixed_event_allowed(filter, pmc->idx); } -static bool pmc_event_is_allowed(struct kvm_pmc *pmc) -{ - return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && - check_pmu_event_filter(pmc); -} - -static void reprogram_counter(struct kvm_pmc *pmc) +static int reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 eventsel = pmc->eventsel; u64 new_config = eventsel; + bool emulate_overflow; u8 fixed_ctr_ctrl; - pmc_pause_counter(pmc); + emulate_overflow = pmc_pause_counter(pmc); - if (!pmc_event_is_allowed(pmc)) - goto reprogram_complete; + if (!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || + !pmc_is_event_allowed(pmc)) + return 0; - if (pmc->counter < pmc->prev_counter) + if (emulate_overflow) __kvm_perf_overflow(pmc, false); if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) @@ -426,70 +520,125 @@ static void reprogram_counter(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) { fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED); - if (fixed_ctr_ctrl & 0x1) + pmc->idx - KVM_FIXED_PMC_BASE_IDX); + if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL) eventsel |= ARCH_PERFMON_EVENTSEL_OS; - if (fixed_ctr_ctrl & 0x2) + if (fixed_ctr_ctrl & INTEL_FIXED_0_USER) eventsel |= ARCH_PERFMON_EVENTSEL_USR; - if (fixed_ctr_ctrl & 0x8) + if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI) eventsel |= ARCH_PERFMON_EVENTSEL_INT; new_config = (u64)fixed_ctr_ctrl; } if (pmc->current_config == new_config && pmc_resume_counter(pmc)) - goto reprogram_complete; + return 0; pmc_release_perf_event(pmc); pmc->current_config = new_config; + return pmc_reprogram_counter(pmc, PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT); +} + +static bool pmc_is_event_match(struct kvm_pmc *pmc, u64 eventsel) +{ + /* + * Ignore checks for edge detect (all events currently emulated by KVM + * are always rising edges), pin control (unsupported by modern CPUs), + * and counter mask and its invert flag (KVM doesn't emulate multiple + * events in a single clock cycle). + * + * Note, the uppermost nibble of AMD's mask overlaps Intel's IN_TX (bit + * 32) and IN_TXCP (bit 33), as well as two reserved bits (bits 35:34). + * Checking the "in HLE/RTM transaction" flags is correct as the vCPU + * can't be in a transaction if KVM is emulating an instruction. + * + * Checking the reserved bits might be wrong if they are defined in the + * future, but so could ignoring them, so do the simple thing for now. + */ + return !((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB); +} + +void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc) +{ + bitmap_clear(pmu->pmc_counting_instructions, pmc->idx, 1); + bitmap_clear(pmu->pmc_counting_branches, pmc->idx, 1); + /* - * If reprogramming fails, e.g. due to contention, leave the counter's - * regprogram bit set, i.e. opportunistically try again on the next PMU - * refresh. Don't make a new request as doing so can stall the guest - * if reprogramming repeatedly fails. + * Do NOT consult the PMU event filters, as the filters must be checked + * at the time of emulation to ensure KVM uses fresh information, e.g. + * omitting a PMC from a bitmap could result in a missed event if the + * filter is changed to allow counting the event. */ - if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW, - (eventsel & pmu->raw_event_mask), - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT)) + if (!pmc_is_locally_enabled(pmc)) return; -reprogram_complete: - clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); - pmc->prev_counter = 0; + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED)) + bitmap_set(pmu->pmc_counting_instructions, pmc->idx, 1); + + if (pmc_is_event_match(pmc, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED)) + bitmap_set(pmu->pmc_counting_branches, pmc->idx, 1); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_recalc_pmc_emulation); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { + DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; int bit; - for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit); + bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX); - if (unlikely(!pmc)) { - clear_bit(bit, pmu->reprogram_pmi); - continue; - } + /* + * The reprogramming bitmap can be written asynchronously by something + * other than the task that holds vcpu->mutex, take care to clear only + * the bits that will actually processed. + */ + BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t)); + atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi); - reprogram_counter(pmc); + kvm_for_each_pmc(pmu, pmc, bit, bitmap) { + /* + * If reprogramming fails, e.g. due to contention, re-set the + * regprogram bit set, i.e. opportunistically try again on the + * next PMU refresh. Don't make a new request as doing so can + * stall the guest if reprogramming repeatedly fails. + */ + if (reprogram_counter(pmc)) + set_bit(pmc->idx, pmu->reprogram_pmi); } /* - * Unused perf_events are only released if the corresponding MSRs - * weren't accessed during the last vCPU time slice. kvm_arch_sched_in - * triggers KVM_REQ_PMU if cleanup is needed. + * Release unused perf_events if the corresponding guest MSRs weren't + * accessed during the last vCPU time slice (need_cleanup is set when + * the vCPU is scheduled back in). */ if (unlikely(pmu->need_cleanup)) kvm_pmu_cleanup(vcpu); + + kvm_for_each_pmc(pmu, pmc, bit, bitmap) + kvm_pmu_recalc_pmc_emulation(pmu, pmc); } -/* check if idx is a valid index to access PMU */ -bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) { - return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx); + /* + * On Intel, VMX interception has priority over RDPMC exceptions that + * aren't already handled by the emulator, i.e. there are no additional + * check needed for Intel PMUs. + * + * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts, + * i.e. an invalid PMC results in a #GP, not #VMEXIT. + */ + if (!kvm_pmu_ops.check_rdpmc_early) + return 0; + + return kvm_pmu_call(check_rdpmc_early)(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -528,10 +677,9 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) { - bool fast_mode = idx & (1u << 31); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; - u64 mask = fast_mode ? ~0u : ~0ull; + u64 mask = ~0ull; if (!pmu->version) return 1; @@ -539,12 +687,12 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask); + pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask); if (!pmc) return 1; if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) && - (static_call(kvm_x86_get_cpl)(vcpu) != 0) && + (kvm_x86_call(get_cpl)(vcpu) != 0) && kvm_is_cr0_bit_set(vcpu, X86_CR0_PE)) return 1; @@ -555,21 +703,29 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { - static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu); + kvm_pmu_call(deliver_pmi)(vcpu); kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } } bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || - static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)); + default: + break; + } + return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) || + kvm_pmu_call(is_valid_msr)(vcpu, msr); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr); + struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr); if (pmc) __set_bit(pmc->idx, pmu->pmc_in_use); @@ -577,34 +733,167 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + msr_info->data = pmu->global_status; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_CORE_PERF_GLOBAL_CTRL: + msr_info->data = pmu->global_ctrl; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + msr_info->data = 0; + break; + default: + return kvm_pmu_call(get_msr)(vcpu, msr_info); + } + + return 0; } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + u64 data = msr_info->data; + u64 diff; + + /* + * Note, AMD ignores writes to reserved bits and read-only PMU MSRs, + * whereas Intel generates #GP on attempts to write reserved/RO MSRs. + */ + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + if (!msr_info->host_initiated) + return 1; /* RO MSR */ + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + /* Per PPR, Read-only MSR. Writes are ignored. */ + if (!msr_info->host_initiated) + break; + + if (data & pmu->global_status_rsvd) + return 1; + + pmu->global_status = data; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + data &= ~pmu->global_ctrl_rsvd; + fallthrough; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (!kvm_valid_perf_global_ctrl(pmu, data)) + return 1; + + if (pmu->global_ctrl != data) { + diff = pmu->global_ctrl ^ data; + pmu->global_ctrl = data; + reprogram_counters(pmu, diff); + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + /* + * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in + * GLOBAL_STATUS, and so the set of reserved bits is the same. + */ + if (data & pmu->global_status_rsvd) + return 1; + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: + if (!msr_info->host_initiated) + pmu->global_status |= data & ~pmu->global_status_rsvd; + break; + default: + kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); + return kvm_pmu_call(set_msr)(vcpu, msr_info); + } + + return 0; } -/* refresh PMU settings. This function generally is called when underlying - * settings are changed (such as changes of PMU CPUID by guest VMs), which - * should rarely happen. +static void kvm_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int i; + + pmu->need_cleanup = false; + + bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX); + + kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) { + pmc_stop_counter(pmc); + pmc->counter = 0; + pmc->emulated_counter = 0; + + if (pmc_is_gp(pmc)) + pmc->eventsel = 0; + } + + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; + + kvm_pmu_call(reset)(vcpu); +} + + +/* + * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID + * and/or PERF_CAPABILITIES. */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) return; - bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); - static_call(kvm_x86_pmu_refresh)(vcpu); -} + /* + * Stop/release all existing counters/events before realizing the new + * vPMU model. + */ + kvm_pmu_reset(vcpu); -void kvm_pmu_reset(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + pmu->version = 0; + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->reserved_bits = 0xffffffff00200000ull; + pmu->raw_event_mask = X86_RAW_EVENT_MASK; + pmu->global_ctrl_rsvd = ~0ull; + pmu->global_status_rsvd = ~0ull; + pmu->fixed_ctr_ctrl_rsvd = ~0ull; + pmu->pebs_enable_rsvd = ~0ull; + pmu->pebs_data_cfg_rsvd = ~0ull; + bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); + + if (!vcpu->kvm->arch.enable_pmu) + return; + + kvm_pmu_call(refresh)(vcpu); + + /* + * At RESET, both Intel and AMD CPUs set all enable bits for general + * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that + * was written for v1 PMUs don't unknowingly leave GP counters disabled + * in the global controls). Emulate that behavior when refreshing the + * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. + */ + if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) + pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); - irq_work_sync(&pmu->irq_work); - static_call(kvm_x86_pmu_reset)(vcpu); + bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); + bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX, + pmu->nr_arch_fixed_counters); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -612,11 +901,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); - static_call(kvm_x86_pmu_init)(vcpu); - init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); - pmu->event_count = 0; - pmu->need_cleanup = false; - kvm_pmu_refresh(vcpu); + kvm_pmu_call(init)(vcpu); } /* Release perf_events for vPMCs that have been unused for a full time slice. */ @@ -632,14 +917,12 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) bitmap_andnot(bitmask, pmu->all_valid_pmc_idx, pmu->pmc_in_use, X86_PMC_IDX_MAX); - for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) { - pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); - - if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc)) + kvm_for_each_pmc(pmu, pmc, i, bitmask) { + if (pmc->perf_event && !pmc_is_locally_enabled(pmc)) pmc_stop_counter(pmc); } - static_call_cond(kvm_x86_pmu_cleanup)(vcpu); + kvm_pmu_call(cleanup)(vcpu); bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); } @@ -651,18 +934,10 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - pmc->prev_counter = pmc->counter; - pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); + pmc->emulated_counter++; kvm_pmu_request_counter_reprogram(pmc); } -static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, - unsigned int perf_hw_id) -{ - return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) & - AMD64_RAW_EVENT_MASK_NB); -} - static inline bool cpl_is_matched(struct kvm_pmc *pmc) { bool select_os, select_user; @@ -674,32 +949,62 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) select_user = config & ARCH_PERFMON_EVENTSEL_USR; } else { config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED); - select_os = config & 0x1; - select_user = config & 0x2; + pmc->idx - KVM_FIXED_PMC_BASE_IDX); + select_os = config & INTEL_FIXED_0_KERNEL; + select_user = config & INTEL_FIXED_0_USER; } - return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user; + /* + * Skip the CPL lookup, which isn't free on Intel, if the result will + * be the same regardless of the CPL. + */ + if (select_os == select_user) + return select_os; + + return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os : + select_user; } -void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) +static void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, + const unsigned long *event_pmcs) { + DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; - int i; + int i, idx; - for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { - pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); + BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX); - if (!pmc || !pmc_event_is_allowed(pmc)) + if (bitmap_empty(event_pmcs, X86_PMC_IDX_MAX)) + return; + + if (!kvm_pmu_has_perf_global_ctrl(pmu)) + bitmap_copy(bitmap, event_pmcs, X86_PMC_IDX_MAX); + else if (!bitmap_and(bitmap, event_pmcs, + (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX)) + return; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_for_each_pmc(pmu, pmc, i, bitmap) { + if (!pmc_is_event_allowed(pmc) || !cpl_is_matched(pmc)) continue; - /* Ignore checks for edge detect, pin control, invert and CMASK bits */ - if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc)) - kvm_pmu_incr_counter(pmc); + kvm_pmu_incr_counter(pmc); } + srcu_read_unlock(&vcpu->kvm->srcu, idx); +} + +void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu) +{ + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_instructions); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_instruction_retired); + +void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu) +{ + kvm_pmu_trigger_event(vcpu, vcpu_to_pmu(vcpu)->pmc_counting_branches); } -EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_pmu_branch_retired); static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) { |
