From 571d91dcadfa3cef499010b4eddb9b58b0da4d24 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:19 -0700 Subject: perf: Add branch stack counters Currently, the additional information of a branch entry is stored in a u64 space. With more and more information added, the space is running out. For example, the information of occurrences of events will be added for each branch. Two places were suggested to append the counters. https://lore.kernel.org/lkml/20230802215814.GH231007@hirez.programming.kicks-ass.net/ One place is right after the flags of each branch entry. It changes the existing struct perf_branch_entry. The later ARCH specific implementation has to be really careful to consistently pick the right struct. The other place is right after the entire struct perf_branch_stack. The disadvantage is that the pointer of the extra space has to be recorded. The common interface perf_sample_save_brstack() has to be updated. The latter is much straightforward, and should be easily understood and maintained. It is implemented in the patch. Add a new branch sample type, PERF_SAMPLE_BRANCH_COUNTERS, to indicate the event which is recorded in the branch info. The "u64 counters" may store the occurrences of several events. The information regarding the number of events/counters and the width of each counter should be exposed via sysfs as a reference for the perf tool. Define the branch_counter_nr and branch_counter_width ABI here. The support will be implemented later in the Intel-specific patch. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-1-kan.liang@linux.intel.com --- arch/x86/events/amd/core.c | 2 +- arch/x86/events/core.c | 2 +- arch/x86/events/intel/core.c | 2 +- arch/x86/events/intel/ds.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e24976593a29..4ee6390b45c9 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -940,7 +940,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) continue; if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 40ad1425ffa2..40c9af124128 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1702,7 +1702,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a08f794a0e79..41a164764a84 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3047,7 +3047,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index bf97ab904d40..cb3f329f8fa4 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1755,7 +1755,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, setup_pebs_time(event, data, pebs->tsc); if (has_branch_stack(event)) - perf_sample_save_brstack(data, event, &cpuc->lbr_stack); + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); } static void adaptive_pebs_save_regs(struct pt_regs *regs, @@ -1912,7 +1912,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); - perf_sample_save_brstack(data, event, &cpuc->lbr_stack); + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); } } -- cgit From 85846b27072defc7ab3dcee7ff36563a040079dc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:20 -0700 Subject: perf/x86: Add PERF_X86_EVENT_NEEDS_BRANCH_STACK flag Currently, branch_sample_type !=0 is used to check whether a branch stack setup is required. But it doesn't check the sample type, unnecessary branch stack setup may be done for a counting event. E.g., perf record -e "{branch-instructions,branch-misses}:S" -j any Also, the event only with the new PERF_SAMPLE_BRANCH_COUNTERS branch sample type may not require a branch stack setup either. Add a new flag NEEDS_BRANCH_STACK to indicate whether the event requires a branch stack setup. Replace the needs_branch_stack() by checking the new flag. The counting event check is implemented here. The later patch will take the new PERF_SAMPLE_BRANCH_COUNTERS into account. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-2-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 14 +++++++++++--- arch/x86/events/perf_event_flags.h | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 41a164764a84..a99449c0d77c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2527,9 +2527,14 @@ static void intel_pmu_assign_event(struct perf_event *event, int idx) perf_report_aux_output_id(event, idx); } +static __always_inline bool intel_pmu_needs_branch_stack(struct perf_event *event) +{ + return event->hw.flags & PERF_X86_EVENT_NEEDS_BRANCH_STACK; +} + static void intel_pmu_del_event(struct perf_event *event) { - if (needs_branch_stack(event)) + if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_del(event); if (event->attr.precise_ip) intel_pmu_pebs_del(event); @@ -2820,7 +2825,7 @@ static void intel_pmu_add_event(struct perf_event *event) { if (event->attr.precise_ip) intel_pmu_pebs_add(event); - if (needs_branch_stack(event)) + if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_add(event); } @@ -3897,7 +3902,10 @@ static int intel_pmu_hw_config(struct perf_event *event) x86_pmu.pebs_aliases(event); } - if (needs_branch_stack(event)) { + if (needs_branch_stack(event) && is_sampling_event(event)) + event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK; + + if (intel_pmu_needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 1dc19b9b4426..a1685981c520 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -20,3 +20,4 @@ PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ +PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */ -- cgit From 1f2376cd03dd3b965d130ed46a7c92769d614ba1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:21 -0700 Subject: perf: Add branch_sample_call_stack Add a helper function to check call stack sample type. The later patch will invoke the function in several places. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-3-kan.liang@linux.intel.com --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 40c9af124128..09050641ce5d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -601,7 +601,7 @@ int x86_pmu_hw_config(struct perf_event *event) } } - if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) + if (branch_sample_call_stack(event)) event->attach_state |= PERF_ATTACH_TASK_DATA; /* -- cgit From 318c4985911245508f7e0bab5265e208a38b5f18 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:22 -0700 Subject: perf/x86/intel: Reorganize attrs and is_visible Some attrs and is_visible implementations are rather far away from one another which makes the whole thing hard to interpret. There are only two attribute groups which have both .attrs and .is_visible, group_default and group_caps_lbr. Move them together. No functional changes. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-4-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a99449c0d77c..584b58df7bf6 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5540,6 +5540,12 @@ static struct attribute *lbr_attrs[] = { NULL }; +static umode_t +lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} + static char pmu_name_str[30]; static ssize_t pmu_name_show(struct device *cdev, @@ -5566,6 +5572,15 @@ static struct attribute *intel_pmu_attrs[] = { NULL, }; +static umode_t +default_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + if (attr == &dev_attr_allow_tsx_force_abort.attr) + return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0; + + return attr->mode; +} + static umode_t tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i) { @@ -5587,27 +5602,12 @@ mem_is_visible(struct kobject *kobj, struct attribute *attr, int i) return pebs_is_visible(kobj, attr, i); } -static umode_t -lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i) -{ - return x86_pmu.lbr_nr ? attr->mode : 0; -} - static umode_t exra_is_visible(struct kobject *kobj, struct attribute *attr, int i) { return x86_pmu.version >= 2 ? attr->mode : 0; } -static umode_t -default_is_visible(struct kobject *kobj, struct attribute *attr, int i) -{ - if (attr == &dev_attr_allow_tsx_force_abort.attr) - return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0; - - return attr->mode; -} - static struct attribute_group group_events_td = { .name = "events", }; -- cgit From 33744916196b4ed7a50f6f47af7c3ad46b730ce6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:23 -0700 Subject: perf/x86/intel: Support branch counters logging The branch counters logging (A.K.A LBR event logging) introduces a per-counter indication of precise event occurrences in LBRs. It can provide a means to attribute exposed retirement latency to combinations of events across a block of instructions. It also provides a means of attributing Timed LBR latencies to events. The feature is first introduced on SRF/GRR. It is an enhancement of the ARCH LBR. It adds new fields in the LBR_INFO MSRs to log the occurrences of events on the GP counters. The information is displayed by the order of counters. The design proposed in this patch requires that the events which are logged must be in a group with the event that has LBR. If there are more than one LBR group, the counters logging information only from the current group (overflowed) are stored for the perf tool, otherwise the perf tool cannot know which and when other groups are scheduled especially when multiplexing is triggered. The user can ensure it uses the maximum number of counters that support LBR info (4 by now) by making the group large enough. The HW only logs events by the order of counters. The order may be different from the order of enabling which the perf tool can understand. When parsing the information of each branch entry, convert the counter order to the enabled order, and store the enabled order in the extension space. Unconditionally reset LBRs for an LBR event group when it's deleted. The logged counter information is only valid for the current LBR group. If another LBR group is scheduled later, the information from the stale LBRs would be otherwise wrongly interpreted. Add a sanity check in intel_pmu_hw_config(). Disable the feature if other counter filters (inv, cmask, edge, in_tx) are set or LBR call stack mode is enabled. (For the LBR call stack mode, we cannot simply flush the LBR, since it will break the call stack. Also, there is no obvious usage with the call stack mode for now.) Only applying the PERF_SAMPLE_BRANCH_COUNTERS doesn't require any branch stack setup. Expose the maximum number of supported counters and the width of the counters into the sysfs. The perf tool can use the information to parse the logged counters in each branch. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-5-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 103 ++++++++++++++++++++++++++++++++++--- arch/x86/events/intel/ds.c | 2 +- arch/x86/events/intel/lbr.c | 85 +++++++++++++++++++++++++++++- arch/x86/events/perf_event.h | 12 +++++ arch/x86/events/perf_event_flags.h | 1 + 5 files changed, 195 insertions(+), 8 deletions(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 584b58df7bf6..e068a96aeb54 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2792,6 +2792,7 @@ static void intel_pmu_enable_fixed(struct perf_event *event) static void intel_pmu_enable_event(struct perf_event *event) { + u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; @@ -2800,8 +2801,10 @@ static void intel_pmu_enable_event(struct perf_event *event) switch (idx) { case 0 ... INTEL_PMC_IDX_FIXED - 1: + if (branch_sample_counters(event)) + enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: @@ -3052,7 +3055,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); + intel_pmu_lbr_save_brstack(&data, cpuc, event); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); @@ -3617,6 +3620,13 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (cpuc->excl_cntrs) return intel_get_excl_constraints(cpuc, event, idx, c2); + /* Not all counters support the branch counter feature. */ + if (branch_sample_counters(event)) { + c2 = dyn_constraint(cpuc, c2, idx); + c2->idxmsk64 &= x86_pmu.lbr_counters; + c2->weight = hweight64(c2->idxmsk64); + } + return c2; } @@ -3905,6 +3915,58 @@ static int intel_pmu_hw_config(struct perf_event *event) if (needs_branch_stack(event) && is_sampling_event(event)) event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK; + if (branch_sample_counters(event)) { + struct perf_event *leader, *sibling; + int num = 0; + + if (!(x86_pmu.flags & PMU_FL_BR_CNTR) || + (event->attr.config & ~INTEL_ARCH_EVENT_MASK)) + return -EINVAL; + + /* + * The branch counter logging is not supported in the call stack + * mode yet, since we cannot simply flush the LBR during e.g., + * multiplexing. Also, there is no obvious usage with the call + * stack mode. Simply forbids it for now. + * + * If any events in the group enable the branch counter logging + * feature, the group is treated as a branch counter logging + * group, which requires the extra space to store the counters. + */ + leader = event->group_leader; + if (branch_sample_call_stack(leader)) + return -EINVAL; + if (branch_sample_counters(leader)) + num++; + leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS; + + for_each_sibling_event(sibling, leader) { + if (branch_sample_call_stack(sibling)) + return -EINVAL; + if (branch_sample_counters(sibling)) + num++; + } + + if (num > fls(x86_pmu.lbr_counters)) + return -EINVAL; + /* + * Only applying the PERF_SAMPLE_BRANCH_COUNTERS doesn't + * require any branch stack setup. + * Clear the bit to avoid unnecessary branch stack setup. + */ + if (0 == (event->attr.branch_sample_type & + ~(PERF_SAMPLE_BRANCH_PLM_ALL | + PERF_SAMPLE_BRANCH_COUNTERS))) + event->hw.flags &= ~PERF_X86_EVENT_NEEDS_BRANCH_STACK; + + /* + * Force the leader to be a LBR event. So LBRs can be reset + * with the leader event. See intel_pmu_lbr_del() for details. + */ + if (!intel_pmu_needs_branch_stack(leader)) + return -EINVAL; + } + if (intel_pmu_needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); if (ret) @@ -4383,8 +4445,13 @@ cmt_get_event_constraints(struct cpu_hw_events *cpuc, int idx, */ if (event->attr.precise_ip == 3) { /* Force instruction:ppp on PMC0, 1 and Fixed counter 0 */ - if (constraint_match(&fixed0_constraint, event->hw.config)) - return &fixed0_counter0_1_constraint; + if (constraint_match(&fixed0_constraint, event->hw.config)) { + /* The fixed counter 0 doesn't support LBR event logging. */ + if (branch_sample_counters(event)) + return &counter0_1_constraint; + else + return &fixed0_counter0_1_constraint; + } switch (c->idxmsk64 & 0x3ull) { case 0x1: @@ -4563,7 +4630,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) goto err; } - if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA)) { + if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) { size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); @@ -5535,15 +5602,39 @@ static ssize_t branches_show(struct device *cdev, static DEVICE_ATTR_RO(branches); +static ssize_t branch_counter_nr_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", fls(x86_pmu.lbr_counters)); +} + +static DEVICE_ATTR_RO(branch_counter_nr); + +static ssize_t branch_counter_width_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", LBR_INFO_BR_CNTR_BITS); +} + +static DEVICE_ATTR_RO(branch_counter_width); + static struct attribute *lbr_attrs[] = { &dev_attr_branches.attr, + &dev_attr_branch_counter_nr.attr, + &dev_attr_branch_counter_width.attr, NULL }; static umode_t lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i) { - return x86_pmu.lbr_nr ? attr->mode : 0; + /* branches */ + if (i == 0) + return x86_pmu.lbr_nr ? attr->mode : 0; + + return (x86_pmu.flags & PMU_FL_BR_CNTR) ? attr->mode : 0; } static char pmu_name_str[30]; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index cb3f329f8fa4..d49d661ec0a7 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1912,7 +1912,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); - perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); + intel_pmu_lbr_save_brstack(data, cpuc, event); } } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index c3b0d15a9841..78cd5084104e 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -676,6 +676,25 @@ void intel_pmu_lbr_del(struct perf_event *event) WARN_ON_ONCE(cpuc->lbr_users < 0); WARN_ON_ONCE(cpuc->lbr_pebs_users < 0); perf_sched_cb_dec(event->pmu); + + /* + * The logged occurrences information is only valid for the + * current LBR group. If another LBR group is scheduled in + * later, the information from the stale LBRs will be wrongly + * interpreted. Reset the LBRs here. + * + * Only clear once for a branch counter group with the leader + * event. Because + * - Cannot simply reset the LBRs with the !cpuc->lbr_users. + * Because it's possible that the last LBR user is not in a + * branch counter group, e.g., a branch_counters group + + * several normal LBR events. + * - The LBR reset can be done with any one of the events in a + * branch counter group, since they are always scheduled together. + * It's easy to force the leader event an LBR event. + */ + if (is_branch_counters_group(event) && event == event->group_leader) + intel_pmu_lbr_reset(); } static inline bool vlbr_exclude_host(void) @@ -866,6 +885,8 @@ static __always_inline u16 get_lbr_cycles(u64 info) return cycles; } +static_assert((64 - PERF_BRANCH_ENTRY_INFO_BITS_MAX) > LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS); + static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, struct lbr_entry *entries) { @@ -898,11 +919,67 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->abort = !!(info & LBR_INFO_ABORT); e->cycles = get_lbr_cycles(info); e->type = get_lbr_br_type(info); + + /* + * Leverage the reserved field of cpuc->lbr_entries[i] to + * temporarily store the branch counters information. + * The later code will decide what content can be disclosed + * to the perf tool. Pleae see intel_pmu_lbr_counters_reorder(). + */ + e->reserved = (info >> LBR_INFO_BR_CNTR_OFFSET) & LBR_INFO_BR_CNTR_FULL_MASK; } cpuc->lbr_stack.nr = i; } +/* + * The enabled order may be different from the counter order. + * Update the lbr_counters with the enabled order. + */ +static void intel_pmu_lbr_counters_reorder(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + int i, j, pos = 0, order[X86_PMC_IDX_MAX]; + struct perf_event *leader, *sibling; + u64 src, dst, cnt; + + leader = event->group_leader; + if (branch_sample_counters(leader)) + order[pos++] = leader->hw.idx; + + for_each_sibling_event(sibling, leader) { + if (!branch_sample_counters(sibling)) + continue; + order[pos++] = sibling->hw.idx; + } + + WARN_ON_ONCE(!pos); + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + src = cpuc->lbr_entries[i].reserved; + dst = 0; + for (j = 0; j < pos; j++) { + cnt = (src >> (order[j] * LBR_INFO_BR_CNTR_BITS)) & LBR_INFO_BR_CNTR_MASK; + dst |= cnt << j * LBR_INFO_BR_CNTR_BITS; + } + cpuc->lbr_counters[i] = dst; + cpuc->lbr_entries[i].reserved = 0; + } +} + +void intel_pmu_lbr_save_brstack(struct perf_sample_data *data, + struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (is_branch_counters_group(event)) { + intel_pmu_lbr_counters_reorder(cpuc, event); + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, cpuc->lbr_counters); + return; + } + + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); +} + static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) { intel_pmu_store_lbr(cpuc, NULL); @@ -1173,8 +1250,10 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) for (i = 0; i < cpuc->lbr_stack.nr; ) { if (!cpuc->lbr_entries[i].from) { j = i; - while (++j < cpuc->lbr_stack.nr) + while (++j < cpuc->lbr_stack.nr) { cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; + cpuc->lbr_counters[j-1] = cpuc->lbr_counters[j]; + } cpuc->lbr_stack.nr--; if (!cpuc->lbr_entries[i].from) continue; @@ -1525,8 +1604,12 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_mispred = ecx.split.lbr_mispred; x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr; x86_pmu.lbr_br_type = ecx.split.lbr_br_type; + x86_pmu.lbr_counters = ecx.split.lbr_counters; x86_pmu.lbr_nr = lbr_nr; + if (!!x86_pmu.lbr_counters) + x86_pmu.flags |= PMU_FL_BR_CNTR; + if (x86_pmu.lbr_mispred) static_branch_enable(&x86_lbr_mispred); if (x86_pmu.lbr_timed_lbr) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 53dd5d495ba6..fb56518356ec 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -110,6 +110,11 @@ static inline bool is_topdown_event(struct perf_event *event) return is_metric_event(event) || is_slots_event(event); } +static inline bool is_branch_counters_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -283,6 +288,7 @@ struct cpu_hw_events { int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + u64 lbr_counters[MAX_LBR_ENTRIES]; /* branch stack extra */ union { struct er_account *lbr_sel; struct er_account *lbr_ctl; @@ -888,6 +894,7 @@ struct x86_pmu { unsigned int lbr_mispred:1; unsigned int lbr_timed_lbr:1; unsigned int lbr_br_type:1; + unsigned int lbr_counters:4; void (*lbr_reset)(void); void (*lbr_read)(struct cpu_hw_events *cpuc); @@ -1012,6 +1019,7 @@ do { \ #define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */ #define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */ +#define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -1552,6 +1560,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); +void intel_pmu_lbr_save_brstack(struct perf_sample_data *data, + struct cpu_hw_events *cpuc, + struct perf_event *event); + void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc, struct perf_event_pmu_context *next_epc); diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index a1685981c520..6c977c19f2cd 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -21,3 +21,4 @@ PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */ +PERF_ARCH(BRANCH_COUNTERS, 0x80000) /* logs the counters in the extra space of each branch */ -- cgit From 243218ca93037631f0224fdbefea045912cb761a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 16 Nov 2023 06:22:42 -0800 Subject: perf/x86/intel/cstate: Cleanup duplicate attr_groups The events of the cstate_core and cstate_pkg PMU have the same format. They both need to create a "events" group (with empty attrs). The attr_groups can be shared. Remove the dedicated attr_groups for each cstate PMU. Use the shared cstate_attr_groups to replace. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231116142245.1233485-1-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 44 +++++++++++------------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index cbeb6d2bf5b4..693bdcd92e8c 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -189,20 +189,20 @@ static struct attribute *attrs_empty[] = { * "events" group (with empty attrs) before updating * it with detected events. */ -static struct attribute_group core_events_attr_group = { +static struct attribute_group cstate_events_attr_group = { .name = "events", .attrs = attrs_empty, }; -DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); -static struct attribute *core_format_attrs[] = { - &format_attr_core_event.attr, +DEFINE_CSTATE_FORMAT_ATTR(cstate_event, event, "config:0-63"); +static struct attribute *cstate_format_attrs[] = { + &format_attr_cstate_event.attr, NULL, }; -static struct attribute_group core_format_attr_group = { +static struct attribute_group cstate_format_attr_group = { .name = "format", - .attrs = core_format_attrs, + .attrs = cstate_format_attrs, }; static cpumask_t cstate_core_cpu_mask; @@ -217,9 +217,9 @@ static struct attribute_group cpumask_attr_group = { .attrs = cstate_cpumask_attrs, }; -static const struct attribute_group *core_attr_groups[] = { - &core_events_attr_group, - &core_format_attr_group, +static const struct attribute_group *cstate_attr_groups[] = { + &cstate_events_attr_group, + &cstate_format_attr_group, &cpumask_attr_group, NULL, }; @@ -268,30 +268,8 @@ static struct perf_msr pkg_msr[] = { [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr }, }; -static struct attribute_group pkg_events_attr_group = { - .name = "events", - .attrs = attrs_empty, -}; - -DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); -static struct attribute *pkg_format_attrs[] = { - &format_attr_pkg_event.attr, - NULL, -}; -static struct attribute_group pkg_format_attr_group = { - .name = "format", - .attrs = pkg_format_attrs, -}; - static cpumask_t cstate_pkg_cpu_mask; -static const struct attribute_group *pkg_attr_groups[] = { - &pkg_events_attr_group, - &pkg_format_attr_group, - &cpumask_attr_group, - NULL, -}; - static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) @@ -478,7 +456,7 @@ static const struct attribute_group *pkg_attr_update[] = { }; static struct pmu cstate_core_pmu = { - .attr_groups = core_attr_groups, + .attr_groups = cstate_attr_groups, .attr_update = core_attr_update, .name = "cstate_core", .task_ctx_nr = perf_invalid_context, @@ -493,7 +471,7 @@ static struct pmu cstate_core_pmu = { }; static struct pmu cstate_pkg_pmu = { - .attr_groups = pkg_attr_groups, + .attr_groups = cstate_attr_groups, .attr_update = pkg_attr_update, .name = "cstate_pkg", .task_ctx_nr = perf_invalid_context, -- cgit From 3877d55a0db2688c2e4ab8a319614a0c81f8e2d2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 16 Nov 2023 06:22:44 -0800 Subject: perf/x86/intel/cstate: Add Sierra Forest support A new module C6 Residency Counter is introduced in the Sierra Forest. The scope of the new counter is module (A cluster of cores shared L2 cache). Create a brand new cstate_module PMU to profile the new counter. The only differences between the new cstate_module PMU and the existing cstate PMU are the scope and events. Regarding the choice of the new cstate_module PMU name, the current naming rule of a cstate PMU is "cstate_" + the scope of the PMU. The scope of the PMU is the cores shared L2. On SRF, Intel calls it "module", while the internal Linux sched code calls it "cluster". The "cstate_module" is used as the new PMU name, because - The Cstate PMU driver is a Intel specific driver. It doesn't impact other ARCHs. The name makes it consistent with the documentation. - The "cluster" mainly be used by the scheduler developer, while the user of cstate PMU is more likely a researcher reading HW docs and optimizing power. - In the Intel's SDM, the "cluster" has a different meaning/scope for topology. Using it will mislead the end users. Besides the module C6, the core C1/C6 and pkg C6 residency counters are supported in the Sierra Forest as well. Suggested-by: Artem Bityutskiy Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231116142245.1233485-3-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 113 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 4 deletions(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 693bdcd92e8c..4a46ef315284 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL + * MTL,SRF * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -52,7 +52,7 @@ * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, - * TGL,TNT,RKL,ADL,RPL,SPR,MTL + * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 @@ -75,7 +75,7 @@ * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, - * TGL,TNT,RKL,ADL,RPL,SPR,MTL + * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -97,6 +97,10 @@ * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, * TNT,RKL,ADL,RPL,MTL * Scope: Package (physical package) + * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. + * perf code: 0x00 + * Available model: SRF + * Scope: A cluster of cores shared L2 cache * */ @@ -130,6 +134,7 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, struct cstate_model { unsigned long core_events; unsigned long pkg_events; + unsigned long module_events; unsigned long quirks; }; @@ -270,6 +275,28 @@ static struct perf_msr pkg_msr[] = { static cpumask_t cstate_pkg_cpu_mask; +/* cstate_module PMU */ +static struct pmu cstate_module_pmu; +static bool has_cstate_module; + +enum perf_cstate_module_events { + PERF_CSTATE_MODULE_C6_RES = 0, + + PERF_CSTATE_MODULE_EVENT_MAX, +}; + +PMU_EVENT_ATTR_STRING(c6-residency, attr_cstate_module_c6, "event=0x00"); + +static unsigned long module_msr_mask; + +PMU_EVENT_GROUP(events, cstate_module_c6); + +static struct perf_msr module_msr[] = { + [PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr }, +}; + +static cpumask_t cstate_module_cpu_mask; + static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) @@ -280,6 +307,8 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); else if (pmu == &cstate_pkg_pmu) return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); + else if (pmu == &cstate_module_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask); else return 0; } @@ -320,6 +349,15 @@ static int cstate_pmu_event_init(struct perf_event *event) event->hw.event_base = pkg_msr[cfg].msr; cpu = cpumask_any_and(&cstate_pkg_cpu_mask, topology_die_cpumask(event->cpu)); + } else if (event->pmu == &cstate_module_pmu) { + if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX) + return -EINVAL; + cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_MODULE_EVENT_MAX); + if (!(module_msr_mask & (1 << cfg))) + return -EINVAL; + event->hw.event_base = module_msr[cfg].msr; + cpu = cpumask_any_and(&cstate_module_cpu_mask, + topology_cluster_cpumask(event->cpu)); } else { return -ENOENT; } @@ -407,6 +445,17 @@ static int cstate_cpu_exit(unsigned int cpu) perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); } } + + if (has_cstate_module && + cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) { + + target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu); + /* Migrate events if there is a valid target */ + if (target < nr_cpu_ids) { + cpumask_set_cpu(target, &cstate_module_cpu_mask); + perf_pmu_migrate_context(&cstate_module_pmu, cpu, target); + } + } return 0; } @@ -433,6 +482,15 @@ static int cstate_cpu_init(unsigned int cpu) if (has_cstate_pkg && target >= nr_cpu_ids) cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); + /* + * If this is the first online thread of that cluster, set it + * in the cluster cpu mask as the designated reader. + */ + target = cpumask_any_and(&cstate_module_cpu_mask, + topology_cluster_cpumask(cpu)); + if (has_cstate_module && target >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_module_cpu_mask); + return 0; } @@ -455,6 +513,11 @@ static const struct attribute_group *pkg_attr_update[] = { NULL, }; +static const struct attribute_group *module_attr_update[] = { + &group_cstate_module_c6, + NULL +}; + static struct pmu cstate_core_pmu = { .attr_groups = cstate_attr_groups, .attr_update = core_attr_update, @@ -485,6 +548,21 @@ static struct pmu cstate_pkg_pmu = { .module = THIS_MODULE, }; +static struct pmu cstate_module_pmu = { + .attr_groups = cstate_attr_groups, + .attr_update = module_attr_update, + .name = "cstate_module", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, + .del = cstate_pmu_event_del, + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, + .module = THIS_MODULE, +}; + static const struct cstate_model nhm_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -599,6 +677,15 @@ static const struct cstate_model glm_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model srf_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES), + + .module_events = BIT(PERF_CSTATE_MODULE_C6_RES), +}; + static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_cstates), @@ -651,6 +738,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &srf_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), @@ -692,10 +780,14 @@ static int __init cstate_probe(const struct cstate_model *cm) pkg_msr_mask = perf_msr_probe(pkg_msr, PERF_CSTATE_PKG_EVENT_MAX, true, (void *) &cm->pkg_events); + module_msr_mask = perf_msr_probe(module_msr, PERF_CSTATE_MODULE_EVENT_MAX, + true, (void *) &cm->module_events); + has_cstate_core = !!core_msr_mask; has_cstate_pkg = !!pkg_msr_mask; + has_cstate_module = !!module_msr_mask; - return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; + return (has_cstate_core || has_cstate_pkg || has_cstate_module) ? 0 : -ENODEV; } static inline void cstate_cleanup(void) @@ -708,6 +800,9 @@ static inline void cstate_cleanup(void) if (has_cstate_pkg) perf_pmu_unregister(&cstate_pkg_pmu); + + if (has_cstate_module) + perf_pmu_unregister(&cstate_module_pmu); } static int __init cstate_init(void) @@ -744,6 +839,16 @@ static int __init cstate_init(void) return err; } } + + if (has_cstate_module) { + err = perf_pmu_register(&cstate_module_pmu, cstate_module_pmu.name, -1); + if (err) { + has_cstate_module = false; + pr_info("Failed to register cstate cluster pmu\n"); + cstate_cleanup(); + return err; + } + } return 0; } -- cgit From bbb968696d0f3442ab823598def3b756cf4735c6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 16 Nov 2023 06:22:45 -0800 Subject: perf/x86/intel/cstate: Add Grand Ridge support The same as the Sierra Forest, the Grand Ridge supports core C1/C6 and module C6. But it doesn't support pkg C6 residency counter. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231116142245.1233485-4-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4a46ef315284..4b50a3a9818a 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL,SRF + * MTL,SRF,GRR * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -52,7 +52,8 @@ * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, - * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF + * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, + * GRR * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 @@ -99,7 +100,7 @@ * Scope: Package (physical package) * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. * perf code: 0x00 - * Available model: SRF + * Available model: SRF,GRR * Scope: A cluster of cores shared L2 cache * */ @@ -677,6 +678,13 @@ static const struct cstate_model glm_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model grr_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .module_events = BIT(PERF_CSTATE_MODULE_C6_RES), +}; + static const struct cstate_model srf_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -739,6 +747,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &srf_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &grr_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), -- cgit From cf35791476fcb3230b98a42241a56242d60ebdd3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Nov 2023 08:39:35 -0800 Subject: perf/x86/intel/uncore: Generic uncore_get_uncores and MMIO format of SPR Factor out SPR_UNCORE_MMIO_COMMON_FORMAT which can be reused by Granite Rapids in the following patch. Granite Rapids have more uncore units than Sapphire Rapids. Add new parameters to support adjustable uncore units. No functional change. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ammy Yi Link: https://lore.kernel.org/r/20231117163939.2468007-1-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 8250f0f59c2b..fc6587016af7 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6079,13 +6079,16 @@ static struct uncore_event_desc spr_uncore_imc_events[] = { { /* end: all zeroes */ }, }; +#define SPR_UNCORE_MMIO_COMMON_FORMAT() \ + SPR_UNCORE_COMMON_FORMAT(), \ + .ops = &spr_uncore_mmio_ops + static struct intel_uncore_type spr_uncore_imc = { - SPR_UNCORE_COMMON_FORMAT(), + SPR_UNCORE_MMIO_COMMON_FORMAT(), .name = "imc", .fixed_ctr_bits = 48, .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, - .ops = &spr_uncore_mmio_ops, .event_descs = spr_uncore_imc_events, }; @@ -6412,7 +6415,8 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type, static struct intel_uncore_type ** uncore_get_uncores(enum uncore_access_type type_id, int num_extra, - struct intel_uncore_type **extra) + struct intel_uncore_type **extra, int max_num_types, + struct intel_uncore_type **uncores) { struct intel_uncore_type **types, **start_types; int i; @@ -6421,9 +6425,9 @@ uncore_get_uncores(enum uncore_access_type type_id, int num_extra, /* Only copy the customized features */ for (; *types; types++) { - if ((*types)->type_id >= UNCORE_SPR_NUM_UNCORE_TYPES) + if ((*types)->type_id >= max_num_types) continue; - uncore_type_customized_copy(*types, spr_uncores[(*types)->type_id]); + uncore_type_customized_copy(*types, uncores[(*types)->type_id]); } for (i = 0; i < num_extra; i++, types++) @@ -6470,7 +6474,9 @@ void spr_uncore_cpu_init(void) uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, UNCORE_SPR_MSR_EXTRA_UNCORES, - spr_msr_uncores); + spr_msr_uncores, + UNCORE_SPR_NUM_UNCORE_TYPES, + spr_uncores); type = uncore_find_type_by_id(uncore_msr_uncores, UNCORE_SPR_CHA); if (type) { @@ -6552,7 +6558,9 @@ int spr_uncore_pci_init(void) spr_update_device_location(UNCORE_SPR_M3UPI); uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, UNCORE_SPR_PCI_EXTRA_UNCORES, - spr_pci_uncores); + spr_pci_uncores, + UNCORE_SPR_NUM_UNCORE_TYPES, + spr_uncores); return 0; } @@ -6560,12 +6568,16 @@ void spr_uncore_mmio_init(void) { int ret = snbep_pci2phy_map_init(0x3250, SKX_CPUNODEID, SKX_GIDNIDMAP, true); - if (ret) - uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL); - else { + if (ret) { + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL, + UNCORE_SPR_NUM_UNCORE_TYPES, + spr_uncores); + } else { uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, UNCORE_SPR_MMIO_EXTRA_UNCORES, - spr_mmio_uncores); + spr_mmio_uncores, + UNCORE_SPR_NUM_UNCORE_TYPES, + spr_uncores); spr_uncore_imc_free_running.num_boxes = uncore_type_max_boxes(uncore_mmio_uncores, UNCORE_SPR_IMC) / 2; } -- cgit From b560e0cd882b11921c84307efe139f1247434c5e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Nov 2023 08:39:36 -0800 Subject: perf/x86/uncore: Use u64 to replace unsigned for the uncore offsets array The current perf doesn't save the complete address of an uncore unit. The complete address of each unit is calculated by the base address + offset. The type of the base address is u64, while the type of offset is unsigned. In the old platforms (without the discovery table method), the base address and offset are hard coded in the driver. Perf can always use the lowest address as the base address. Everything works well. In the new platforms (starting from SPR), the discovery table provides a complete address for all uncore units. To follow the current framework/codes, when parsing the discovery table, the complete address of the first box is stored as a base address. The offset of the following units is calculated by the complete address of the unit minus the base address (the address of the first unit). On GNR, the latter units may have a lower address compared to the first unit. So the offset is a negative value. The upper 32 bits are lost when casting a negative u64 to an unsigned type. Use u64 to replace unsigned for the uncore offsets array to correct the above case. There is no functional change. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ammy Yi Link: https://lore.kernel.org/r/20231117163939.2468007-2-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.h | 6 +++--- arch/x86/events/intel/uncore_discovery.c | 5 +++-- arch/x86/events/intel/uncore_discovery.h | 2 +- arch/x86/events/intel/uncore_nhmex.c | 2 +- arch/x86/events/intel/uncore_snbep.c | 6 +++--- 5 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index c30fb5bb1222..7428ecaddf72 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -72,9 +72,9 @@ struct intel_uncore_type { unsigned single_fixed:1; unsigned pair_ctr_ctl:1; union { - unsigned *msr_offsets; - unsigned *pci_offsets; - unsigned *mmio_offsets; + u64 *msr_offsets; + u64 *pci_offsets; + u64 *mmio_offsets; }; unsigned *box_ids; struct event_constraint unconstrainted; diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index cb488e41807c..9a698a92962a 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -125,7 +125,8 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit, int die, bool parsed) { struct intel_uncore_discovery_type *type; - unsigned int *box_offset, *ids; + unsigned int *ids; + u64 *box_offset; int i; if (!unit->ctl || !unit->ctl_offset || !unit->ctr_offset) { @@ -153,7 +154,7 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit, if (!type) return; - box_offset = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL); + box_offset = kcalloc(type->num_boxes + 1, sizeof(u64), GFP_KERNEL); if (!box_offset) return; diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 6ee80ad3423e..22e769a81103 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -125,7 +125,7 @@ struct intel_uncore_discovery_type { u8 ctr_offset; /* Counter 0 offset */ u16 num_boxes; /* number of boxes for the uncore block */ unsigned int *ids; /* Box IDs */ - unsigned int *box_offset; /* Box offset */ + u64 *box_offset; /* Box offset */ }; bool intel_uncore_has_discovery_tables(int *ignore); diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index 173e2674be6e..56eea2c66cfb 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -306,7 +306,7 @@ static const struct attribute_group nhmex_uncore_cbox_format_group = { }; /* msr offset for each instance of cbox */ -static unsigned nhmex_cbox_msr_offsets[] = { +static u64 nhmex_cbox_msr_offsets[] = { 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index fc6587016af7..344319ab6dd5 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5278,7 +5278,7 @@ void snr_uncore_mmio_init(void) /* ICX uncore support */ -static unsigned icx_cha_msr_offsets[] = { +static u64 icx_cha_msr_offsets[] = { 0x2a0, 0x2ae, 0x2bc, 0x2ca, 0x2d8, 0x2e6, 0x2f4, 0x302, 0x310, 0x31e, 0x32c, 0x33a, 0x348, 0x356, 0x364, 0x372, 0x380, 0x38e, 0x3aa, 0x3b8, 0x3c6, 0x3d4, 0x3e2, 0x3f0, 0x3fe, 0x40c, 0x41a, @@ -5326,7 +5326,7 @@ static struct intel_uncore_type icx_uncore_chabox = { .format_group = &snr_uncore_chabox_format_group, }; -static unsigned icx_msr_offsets[] = { +static u64 icx_msr_offsets[] = { 0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0, }; @@ -6184,7 +6184,7 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { */ #define SPR_UNCORE_UPI_NUM_BOXES 4 -static unsigned int spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = { +static u64 spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = { 0, 0x8000, 0x10000, 0x18000 }; -- cgit From 632c4bf6d007862307440b177d9fee829857e8bb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Nov 2023 08:39:37 -0800 Subject: perf/x86/intel/uncore: Support Granite Rapids The same as Sapphire Rapids, Granite Rapids also supports the discovery table feature. All the basic uncore PMON information can be retrieved from the discovery table which resides in the BIOS. There are 4 new units are added on Granite Rapids, b2cmi, b2cxl, ubox, and mdf_sbo. The layout of the counters is exactly the same as the generic uncore counters. Only add a name for the new units. All the details can be retrieved from the discovery table. The description of the new units can be found at https://www.intel.com/content/www/us/en/secure/content-details/772943/content-details.html The other units, e.g., cha, iio, irp, pcu, and imc, are the same as Sapphire Rapids. Ignore the upi and b2upi units in the discovery table, which are broken for now. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ammy Yi Link: https://lore.kernel.org/r/20231117163939.2468007-3-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 10 +++++ arch/x86/events/intel/uncore.h | 4 ++ arch/x86/events/intel/uncore_snbep.c | 87 ++++++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 01023aa5125b..7fb1c54c9879 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1814,6 +1814,14 @@ static const struct intel_uncore_init_fun spr_uncore_init __initconst = { .uncore_units_ignore = spr_uncore_units_ignore, }; +static const struct intel_uncore_init_fun gnr_uncore_init __initconst = { + .cpu_init = gnr_uncore_cpu_init, + .pci_init = gnr_uncore_pci_init, + .mmio_init = gnr_uncore_mmio_init, + .use_discovery = true, + .uncore_units_ignore = gnr_uncore_units_ignore, +}; + static const struct intel_uncore_init_fun generic_uncore_init __initconst = { .cpu_init = intel_uncore_generic_uncore_cpu_init, .pci_init = intel_uncore_generic_uncore_pci_init, @@ -1865,6 +1873,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &mtl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &gnr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &gnr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init), {}, diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 7428ecaddf72..4838502d89ae 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -593,6 +593,7 @@ extern struct list_head pci2phy_map_head; extern struct pci_extra_dev *uncore_extra_pci_dev; extern struct event_constraint uncore_constraint_empty; extern int spr_uncore_units_ignore[]; +extern int gnr_uncore_units_ignore[]; /* uncore_snb.c */ int snb_uncore_pci_init(void); @@ -634,6 +635,9 @@ void icx_uncore_mmio_init(void); int spr_uncore_pci_init(void); void spr_uncore_cpu_init(void); void spr_uncore_mmio_init(void); +int gnr_uncore_pci_init(void); +void gnr_uncore_cpu_init(void); +void gnr_uncore_mmio_init(void); /* uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 344319ab6dd5..ab31cda797df 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6584,3 +6584,90 @@ void spr_uncore_mmio_init(void) } /* end of SPR uncore support */ + +/* GNR uncore support */ + +#define UNCORE_GNR_NUM_UNCORE_TYPES 23 +#define UNCORE_GNR_TYPE_15 15 +#define UNCORE_GNR_B2UPI 18 +#define UNCORE_GNR_TYPE_21 21 +#define UNCORE_GNR_TYPE_22 22 + +int gnr_uncore_units_ignore[] = { + UNCORE_SPR_UPI, + UNCORE_GNR_TYPE_15, + UNCORE_GNR_B2UPI, + UNCORE_GNR_TYPE_21, + UNCORE_GNR_TYPE_22, + UNCORE_IGNORE_END +}; + +static struct intel_uncore_type gnr_uncore_ubox = { + .name = "ubox", + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type gnr_uncore_b2cmi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "b2cmi", +}; + +static struct intel_uncore_type gnr_uncore_b2cxl = { + SPR_UNCORE_MMIO_COMMON_FORMAT(), + .name = "b2cxl", +}; + +static struct intel_uncore_type gnr_uncore_mdf_sbo = { + .name = "mdf_sbo", + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = { + &spr_uncore_chabox, + &spr_uncore_iio, + &spr_uncore_irp, + NULL, + &spr_uncore_pcu, + &gnr_uncore_ubox, + &spr_uncore_imc, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + &gnr_uncore_b2cmi, + &gnr_uncore_b2cxl, + NULL, + NULL, + &gnr_uncore_mdf_sbo, + NULL, + NULL, +}; + +void gnr_uncore_cpu_init(void) +{ + uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, 0, NULL, + UNCORE_GNR_NUM_UNCORE_TYPES, + gnr_uncores); +} + +int gnr_uncore_pci_init(void) +{ + uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL, + UNCORE_GNR_NUM_UNCORE_TYPES, + gnr_uncores); + return 0; +} + +void gnr_uncore_mmio_init(void) +{ + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL, + UNCORE_GNR_NUM_UNCORE_TYPES, + gnr_uncores); +} + +/* end of GNR uncore support */ -- cgit From 388d76175bd9bbad52bbff25c88361d9e5c6615e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Nov 2023 08:39:38 -0800 Subject: perf/x86/intel/uncore: Support IIO free-running counters on GNR The free-running counters for IIO uncore blocks on Granite Rapids are similar to Sapphire Rapids. The key difference is the offset of the registers. The number of the IIO uncore blocks can also be retrieved from the discovery table. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ammy Yi Link: https://lore.kernel.org/r/20231117163939.2468007-4-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ab31cda797df..aeaa8efe3c62 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6648,11 +6648,21 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = { NULL, }; +static struct freerunning_counters gnr_iio_freerunning[] = { + [SPR_IIO_MSR_IOCLK] = { 0x290e, 0x01, 0x10, 1, 48 }, + [SPR_IIO_MSR_BW_IN] = { 0x360e, 0x10, 0x80, 8, 48 }, + [SPR_IIO_MSR_BW_OUT] = { 0x2e0e, 0x10, 0x80, 8, 48 }, +}; + void gnr_uncore_cpu_init(void) { - uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, 0, NULL, + uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, + UNCORE_SPR_MSR_EXTRA_UNCORES, + spr_msr_uncores, UNCORE_GNR_NUM_UNCORE_TYPES, gnr_uncores); + spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO); + spr_uncore_iio_free_running.freerunning = gnr_iio_freerunning; } int gnr_uncore_pci_init(void) -- cgit From cb4a6ccf35839895da63fcf6134d6fbd13224805 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Nov 2023 08:39:39 -0800 Subject: perf/x86/intel/uncore: Support Sierra Forest and Grand Ridge The same as Granite Rapids, the Sierra Forest and Grand Ridge also supports the discovery table feature and the same type of the uncore units. The difference of the available units and counters can be retrieved from the discovery table automatically. Just add the CPU model ID. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Ammy Yi Link: https://lore.kernel.org/r/20231117163939.2468007-5-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7fb1c54c9879..7927c0b832fa 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1877,6 +1877,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &gnr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &gnr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &gnr_uncore_init), {}, }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); -- cgit From 0f9e0d7928d8e88d57b1482effab70edb9741ce1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 30 Nov 2023 11:52:46 +0530 Subject: perf/x86/amd: Reject branch stack for IBS events The AMD IBS PMU doesn't handle branch stacks, so it should not accept events with brstack. Signed-off-by: Namhyung Kim Signed-off-by: Ravi Bangoria Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231130062246.290-1-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/events') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 6911c5399d02..e91970b01d62 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -287,6 +287,9 @@ static int perf_ibs_init(struct perf_event *event) if (config & ~perf_ibs->config_mask) return -EINVAL; + if (has_branch_stack(event)) + return -EOPNOTSUPP; + ret = validate_group(event); if (ret) return ret; -- cgit From 1692cf434ba13ee212495b5af795b6a07e986ce4 Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 27 Nov 2023 10:52:45 -0800 Subject: perf/x86/intel/uncore: Fix NULL pointer dereference issue in upi_fill_topology() Get logical socket id instead of physical id in discover_upi_topology() to avoid out-of-bound access on 'upi = &type->topology[nid][idx];' line that leads to NULL pointer dereference in upi_fill_topology() Fixes: f680b6e6062e ("perf/x86/intel/uncore: Enable UPI topology discovery for Icelake Server") Reported-by: Kyle Meyer Signed-off-by: Alexander Antonov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Tested-by: Kyle Meyer Link: https://lore.kernel.org/r/20231127185246.2371939-2-alexander.antonov@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index aeaa8efe3c62..1efbacfff47d 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5596,7 +5596,7 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i struct pci_dev *ubox = NULL; struct pci_dev *dev = NULL; u32 nid, gid; - int i, idx, ret = -EPERM; + int i, idx, lgc_pkg, ret = -EPERM; struct intel_uncore_topology *upi; unsigned int devfn; @@ -5614,8 +5614,13 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i for (i = 0; i < 8; i++) { if (nid != GIDNIDMAP(gid, i)) continue; + lgc_pkg = topology_phys_to_logical_pkg(i); + if (lgc_pkg < 0) { + ret = -EPERM; + goto err; + } for (idx = 0; idx < type->num_boxes; idx++) { - upi = &type->topology[nid][idx]; + upi = &type->topology[lgc_pkg][idx]; devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION); dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus), ubox->bus->number, @@ -5626,6 +5631,7 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i goto err; } } + break; } } err: -- cgit From fdd041028f2294228e10610b4fca6a1a83ac683d Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 27 Nov 2023 10:52:46 -0800 Subject: perf/x86/intel/uncore: Factor out topology_gidnid_map() The same code is used for retrieving package ID procedure from GIDNIDMAP register. Factor out topology_gidnid_map() to avoid code duplication. Signed-off-by: Alexander Antonov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20231127185246.2371939-3-alexander.antonov@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 77 +++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 37 deletions(-) (limited to 'arch/x86/events') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 1efbacfff47d..a96496bef678 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1396,6 +1396,29 @@ err: return ret; } +static int topology_gidnid_map(int nodeid, u32 gidnid) +{ + int i, die_id = -1; + + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == GIDNIDMAP(gidnid, i)) { + if (topology_max_die_per_package() > 1) + die_id = i; + else + die_id = topology_phys_to_logical_pkg(i); + if (die_id < 0) + die_id = -ENODEV; + break; + } + } + + return die_id; +} + /* * build pci bus to socket mapping */ @@ -1435,22 +1458,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool break; } - /* - * every three bits in the Node ID mapping register maps - * to a particular node. - */ - for (i = 0; i < 8; i++) { - if (nodeid == GIDNIDMAP(config, i)) { - if (topology_max_die_per_package() > 1) - die_id = i; - else - die_id = topology_phys_to_logical_pkg(i); - if (die_id < 0) - die_id = -ENODEV; - map->pbus_to_dieid[bus] = die_id; - break; - } - } + map->pbus_to_dieid[bus] = topology_gidnid_map(nodeid, config); raw_spin_unlock(&pci2phy_map_lock); } else { segment = pci_domain_nr(ubox_dev->bus); @@ -5596,7 +5604,7 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i struct pci_dev *ubox = NULL; struct pci_dev *dev = NULL; u32 nid, gid; - int i, idx, lgc_pkg, ret = -EPERM; + int idx, lgc_pkg, ret = -EPERM; struct intel_uncore_topology *upi; unsigned int devfn; @@ -5611,27 +5619,22 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i break; } - for (i = 0; i < 8; i++) { - if (nid != GIDNIDMAP(gid, i)) - continue; - lgc_pkg = topology_phys_to_logical_pkg(i); - if (lgc_pkg < 0) { - ret = -EPERM; - goto err; - } - for (idx = 0; idx < type->num_boxes; idx++) { - upi = &type->topology[lgc_pkg][idx]; - devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION); - dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus), - ubox->bus->number, - devfn); - if (dev) { - ret = upi_fill_topology(dev, upi, idx); - if (ret) - goto err; - } + lgc_pkg = topology_gidnid_map(nid, gid); + if (lgc_pkg < 0) { + ret = -EPERM; + goto err; + } + for (idx = 0; idx < type->num_boxes; idx++) { + upi = &type->topology[lgc_pkg][idx]; + devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION); + dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus), + ubox->bus->number, + devfn); + if (dev) { + ret = upi_fill_topology(dev, upi, idx); + if (ret) + goto err; } - break; } } err: -- cgit