summary | refs | log | tree | commit | diff
path: root/kernel/events
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2023-10-30 13:44:35 -1000
committer: Linus Torvalds <torvalds@linux-foundation.org> 2023-10-30 13:44:35 -1000
commit: bceb7accb7b60f9844807c7433af06493ed058b7 (patch)
tree: 1a0dfc1c85b3dabb2cac578df90a79d336b58dc9 /kernel/events
parent: cd063c8b9e1e95560e90bac7816234d8b2ee2897 (diff)
parent: 744940f1921c8feb90e3c4bcc1e153fdd6e10fe2 (diff)
Merge tag 'perf-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance event updates from Ingo Molnar:

 - Add AMD Unified Memory Controller (UMC) events introduced with Zen 4
 - Simplify & clean up the uncore management code
 - Fall back from RDPMC to RDMSR on certain uncore PMUs
 - Improve per-package and cstate event reading
 - Extend the Intel ref-cycles event to GP counters
 - Fix Intel MTL event constraints
 - Improve the Intel hybrid CPU handling code
 - Micro-optimize the RAPL code
 - Optimize perf_cgroup_switch()
 - Improve large AUX area error handling
 - Misc fixes and cleanups

* tag 'perf-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (26 commits)
  perf/x86/amd/uncore: Pass through error code for initialization failures, instead of -ENODEV
  perf/x86/amd/uncore: Fix uninitialized return value in amd_uncore_init()
  x86/cpu: Fix the AMD Fam 17h, Fam 19h, Zen2 and Zen4 MSR enumerations
  perf: Optimize perf_cgroup_switch()
  perf/x86/amd/uncore: Add memory controller support
  perf/x86/amd/uncore: Add group exclusivity
  perf/x86/amd/uncore: Use rdmsr if rdpmc is unavailable
  perf/x86/amd/uncore: Move discovery and registration
  perf/x86/amd/uncore: Refactor uncore management
  perf/core: Allow reading package events from perf_event_read_local
  perf/x86/cstate: Allow reading the package statistics from local CPU
  perf/x86/intel/pt: Fix kernel-doc comments
  perf/x86/rapl: Annotate 'struct rapl_pmus' with __counted_by
  perf/core: Rename perf_proc_update_handler() -> perf_event_max_sample_rate_handler(), for readability
  perf/x86/rapl: Fix "Using plain integer as NULL pointer" Sparse warning
  perf/x86/rapl: Use local64_try_cmpxchg in rapl_event_update()
  perf/x86/rapl: Stop doing cpu_relax() in the local64_cmpxchg() loop in rapl_event_update()
  perf/core: Bail out early if the request AUX area is out of bound
  perf/x86/intel: Extend the ref-cycles event to GP counters
  perf/x86/intel: Fix broken fixed event constraints extension
  ...
Diffstat (limited to 'kernel/events')
-rw-r--r-- kernel/events/core.c 137
-rw-r--r-- kernel/events/ring_buffer.c 6
2 files changed, 83 insertions, 60 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a2f2a9525d72..683dc086ef10 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -375,6 +375,7 @@ enum event_type_t {
EVENT_TIME = 0x4,
/* see ctx_resched() for details */
EVENT_CPU = 0x8,
+ EVENT_CGROUP = 0x10,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
@@ -449,8 +450,8 @@ static void update_perf_cpu_limits(void)
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
-int perf_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -684,20 +685,26 @@ do { \
___p; \
})
-static void perf_ctx_disable(struct perf_event_context *ctx)
+static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
perf_pmu_disable(pmu_ctx->pmu);
+ }
}
-static void perf_ctx_enable(struct perf_event_context *ctx)
+static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
perf_pmu_enable(pmu_ctx->pmu);
+ }
}
static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
@@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task)
return;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, true);
- ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
@@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
if (!is_cgroup_event(event))
return;
+ event->pmu_ctx->nr_cgroups++;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (!is_cgroup_event(event))
return;
+ event->pmu_ctx->nr_cgroups--;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -2679,9 +2690,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, false);
if (task_ctx) {
- perf_ctx_disable(task_ctx);
+ perf_ctx_disable(task_ctx, false);
task_ctx_sched_out(task_ctx, event_type);
}
@@ -2699,9 +2710,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
perf_event_sched_in(cpuctx, task_ctx);
- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, false);
if (task_ctx)
- perf_ctx_enable(task_ctx);
+ perf_ctx_enable(task_ctx, false);
}
void perf_pmu_resched(struct pmu *pmu)
@@ -3246,6 +3257,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -3292,8 +3306,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
is_active ^= ctx->is_active; /* changed bits */
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
__pmu_ctx_sched_out(pmu_ctx, is_active);
+ }
}
/*
@@ -3484,7 +3501,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
/* PMIs are disabled; ctx->nr_pending is stable. */
if (local_read(&ctx->nr_pending) ||
@@ -3504,7 +3521,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_sched_task_cb(ctx, false);
perf_event_swap_task_ctx_data(ctx, next_ctx);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3528,13 +3545,13 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
task_ctx_sched_out(ctx, EVENT_ALL);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
}
}
@@ -3820,47 +3837,32 @@ static int merge_sched_in(struct perf_event *event, void *data)
return 0;
}
-static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void pmu_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ struct pmu *pmu)
{
- struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
-
- if (pmu) {
- visit_groups_merge(ctx, &ctx->pinned_groups,
- smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
- } else {
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- can_add_hw = 1;
- visit_groups_merge(ctx, &ctx->pinned_groups,
- smp_processor_id(), pmu_ctx->pmu,
- merge_sched_in, &can_add_hw);
- }
- }
+ visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
}
-static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void ctx_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- int can_add_hw = 1;
- if (pmu) {
- visit_groups_merge(ctx, &ctx->flexible_groups,
- smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
- } else {
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- can_add_hw = 1;
- visit_groups_merge(ctx, &ctx->flexible_groups,
- smp_processor_id(), pmu_ctx->pmu,
- merge_sched_in, &can_add_hw);
- }
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
+ pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
}
}
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_flexible_sched_in(ctx, pmu);
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
}
static void
@@ -3868,6 +3870,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -3900,11 +3905,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, NULL);
+ ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, NULL);
+ ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
}
static void perf_event_context_sched_in(struct task_struct *task)
@@ -3919,11 +3924,11 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (cpuctx->task_ctx == ctx) {
perf_ctx_lock(cpuctx, ctx);
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
perf_ctx_sched_task_cb(ctx, true);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
perf_ctx_unlock(cpuctx, ctx);
goto rcu_unlock;
}
@@ -3936,7 +3941,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (!ctx->nr_events)
goto unlock;
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3946,7 +3951,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
* events, no need to flip the cpuctx's events around.
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, false);
ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
}
@@ -3955,9 +3960,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, false);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
unlock:
perf_ctx_unlock(cpuctx, ctx);
@@ -4427,6 +4432,9 @@ static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
u16 local_pkg, event_pkg;
+ if ((unsigned)event_cpu >= nr_cpu_ids)
+ return event_cpu;
+
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
int local_cpu = smp_processor_id();
@@ -4529,6 +4537,8 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
u64 *enabled, u64 *running)
{
unsigned long flags;
+ int event_oncpu;
+ int event_cpu;
int ret = 0;
/*
@@ -4553,15 +4563,22 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
goto out;
}
+ /*
+ * Get the event CPU numbers, and adjust them to local if the event is
+ * a per-package event that can be read locally
+ */
+ event_oncpu = __perf_event_read_cpu(event, event->oncpu);
+ event_cpu = __perf_event_read_cpu(event, event->cpu);
+
/* If this is a per-CPU event, it must be for this CPU */
if (!(event->attach_state & PERF_ATTACH_TASK) &&
- event->cpu != smp_processor_id()) {
+ event_cpu != smp_processor_id()) {
ret = -EINVAL;
goto out;
}
/* If this is a pinned event it must be running on this CPU */
- if (event->attr.pinned && event->oncpu != smp_processor_id()) {
+ if (event->attr.pinned && event_oncpu != smp_processor_id()) {
ret = -EBUSY;
goto out;
}
@@ -4571,7 +4588,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
* oncpu == -1).
*/
- if (event->oncpu == smp_processor_id())
+ if (event_oncpu == smp_processor_id())
event->pmu->read(event);
*value = local64_read(&event->count);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index fb1e180b5f0a..e8d82c2f07d0 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -700,6 +700,12 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
watermark = 0;
}
+ /*
+ * kcalloc_node() is unable to allocate buffer if the size is larger
+ * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case.
+ */
+ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
+ return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
node);
if (!rb->aux_pages)