Diffstat (limited to 'kernel/events/core.c'):
 -rw-r--r--  kernel/events/core.c  177
 1 file changed, 111 insertions(+), 66 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c72a41f11af..683dc086ef10 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -375,6 +375,7 @@ enum event_type_t {
EVENT_TIME = 0x4,
/* see ctx_resched() for details */
EVENT_CPU = 0x8,
+ EVENT_CGROUP = 0x10,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
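
EVENT_CGROUP is not a new scheduling class; it rides along in the event_type mask as a filter. Callers that only need to reschedule cgroup events OR it in, and ctx_sched_out()/ctx_sched_in() strip it into a local flag before looking at the remaining EVENT_* bits (see the hunks further down). A minimal user-space sketch of that masking pattern, reusing the bit values above but none of the kernel's surrounding code:

    #include <stdbool.h>
    #include <stdio.h>

    enum event_type_t {
            EVENT_FLEXIBLE = 0x1,
            EVENT_PINNED   = 0x2,
            EVENT_TIME     = 0x4,
            EVENT_CPU      = 0x8,
            EVENT_CGROUP   = 0x10,
            EVENT_ALL      = EVENT_FLEXIBLE | EVENT_PINNED,
    };

    /* Strip the filter bit first, then act on the remaining schedule bits. */
    static void sched_out(int event_type)
    {
            bool cgroup = event_type & EVENT_CGROUP;

            event_type &= ~EVENT_CGROUP;

            printf("cgroup-only=%d pinned=%d flexible=%d\n", cgroup,
                   !!(event_type & EVENT_PINNED),
                   !!(event_type & EVENT_FLEXIBLE));
    }

    int main(void)
    {
            sched_out(EVENT_ALL);                   /* ordinary reschedule */
            sched_out(EVENT_ALL | EVENT_CGROUP);    /* cgroup-switch path  */
            return 0;
    }
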
@@ -449,8 +450,8 @@ static void update_perf_cpu_limits(void)
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
-int perf_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int perf_cpu = sysctl_perf_cpu_time_max_percent;
@@ -684,20 +685,26 @@ do { \
___p; \
})
-static void perf_ctx_disable(struct perf_event_context *ctx)
+static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
perf_pmu_disable(pmu_ctx->pmu);
+ }
}
-static void perf_ctx_enable(struct perf_event_context *ctx)
+static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
perf_pmu_enable(pmu_ctx->pmu);
+ }
}
static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
@@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task)
return;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, true);
- ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
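
perf_cgroup_switch() only has work to do for cgroup-scoped counters, i.e. events opened against a perf_event cgroup directory rather than a task, which is why the sched-out/in above is now restricted to EVENT_CGROUP and to PMUs with a non-zero nr_cgroups. For reference, such an event is created by passing a cgroup directory fd as the pid argument together with PERF_FLAG_PID_CGROUP; a simplified user-space sketch (hypothetical helper name, minimal error handling):

    #include <fcntl.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/perf_event.h>

    /* Open a CPU-cycles counter scoped to one cgroup on one CPU. */
    static int open_cgroup_counter(const char *cgrp_dir, int cpu)
    {
            struct perf_event_attr attr;
            int cgrp_fd, fd;

            memset(&attr, 0, sizeof(attr));
            attr.size   = sizeof(attr);
            attr.type   = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;

            cgrp_fd = open(cgrp_dir, O_RDONLY);
            if (cgrp_fd < 0)
                    return -1;

            /* With PERF_FLAG_PID_CGROUP, the "pid" argument is a cgroup dir fd. */
            fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu, -1,
                         PERF_FLAG_PID_CGROUP);
            close(cgrp_fd);
            return fd;
    }
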
@@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
if (!is_cgroup_event(event))
return;
+ event->pmu_ctx->nr_cgroups++;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
if (!is_cgroup_event(event))
return;
+ event->pmu_ctx->nr_cgroups--;
+
/*
* Because cgroup events are always per-cpu events,
* @ctx == &cpuctx->ctx.
@@ -1954,6 +1965,7 @@ static void perf_group_attach(struct perf_event *event)
list_add_tail(&event->sibling_list, &group_leader->sibling_list);
group_leader->nr_siblings++;
+ group_leader->group_generation++;
perf_event__header_size(group_leader);
@@ -2144,6 +2156,7 @@ static void perf_group_detach(struct perf_event *event)
if (leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
+ event->group_leader->group_generation++;
goto out;
}
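
perf_group_attach() and perf_group_detach() now both bump a group_generation counter on the leader. The point is to let two groups that are supposed to mirror each other, a parent event group and its inherited child copy, be compared cheaply later on (see the __perf_read_group_add() hunk below) without matching both sibling lists entry by entry. A standalone sketch of the counter idea, using a hypothetical struct rather than the kernel's types:

    #include <stdbool.h>
    #include <stdint.h>

    struct group {                          /* hypothetical stand-in type */
            unsigned int nr_siblings;
            uint64_t group_generation;      /* bumped on attach and detach */
    };

    static void group_attach(struct group *g)
    {
            g->nr_siblings++;
            g->group_generation++;
    }

    static void group_detach(struct group *g)
    {
            g->nr_siblings--;
            g->group_generation++;
    }

    /* Two copies still mirror each other only if neither saw a change the
     * other missed; comparing counters avoids walking both lists. */
    static bool groups_match(const struct group *a, const struct group *b)
    {
            return a->group_generation == b->group_generation &&
                   a->nr_siblings == b->nr_siblings;
    }

    int main(void)
    {
            struct group parent = { 1, 1 }, child = parent;

            group_attach(&parent);          /* parent grows the group ... */
            group_detach(&parent);          /* ... and shrinks it again   */

            return groups_match(&parent, &child) ? 0 : 1;   /* exits 1: diverged */
    }
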
@@ -2677,9 +2690,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, false);
if (task_ctx) {
- perf_ctx_disable(task_ctx);
+ perf_ctx_disable(task_ctx, false);
task_ctx_sched_out(task_ctx, event_type);
}
@@ -2697,9 +2710,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
perf_event_sched_in(cpuctx, task_ctx);
- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, false);
if (task_ctx)
- perf_ctx_enable(task_ctx);
+ perf_ctx_enable(task_ctx, false);
}
void perf_pmu_resched(struct pmu *pmu)
@@ -3244,6 +3257,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -3290,8 +3306,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
is_active ^= ctx->is_active; /* changed bits */
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
__pmu_ctx_sched_out(pmu_ctx, is_active);
+ }
}
/*
@@ -3482,7 +3501,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
/* PMIs are disabled; ctx->nr_pending is stable. */
if (local_read(&ctx->nr_pending) ||
@@ -3502,7 +3521,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_sched_task_cb(ctx, false);
perf_event_swap_task_ctx_data(ctx, next_ctx);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3526,13 +3545,13 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
task_ctx_sched_out(ctx, EVENT_ALL);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
}
}
@@ -3818,47 +3837,32 @@ static int merge_sched_in(struct perf_event *event, void *data)
return 0;
}
-static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void pmu_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ struct pmu *pmu)
{
- struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
-
- if (pmu) {
- visit_groups_merge(ctx, &ctx->pinned_groups,
- smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
- } else {
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- can_add_hw = 1;
- visit_groups_merge(ctx, &ctx->pinned_groups,
- smp_processor_id(), pmu_ctx->pmu,
- merge_sched_in, &can_add_hw);
- }
- }
+ visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
}
-static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void ctx_groups_sched_in(struct perf_event_context *ctx,
+ struct perf_event_groups *groups,
+ bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- int can_add_hw = 1;
- if (pmu) {
- visit_groups_merge(ctx, &ctx->flexible_groups,
- smp_processor_id(), pmu,
- merge_sched_in, &can_add_hw);
- } else {
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- can_add_hw = 1;
- visit_groups_merge(ctx, &ctx->flexible_groups,
- smp_processor_id(), pmu_ctx->pmu,
- merge_sched_in, &can_add_hw);
- }
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (cgroup && !pmu_ctx->nr_cgroups)
+ continue;
+ pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
}
}
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_flexible_sched_in(ctx, pmu);
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
}
static void
@@ -3866,6 +3870,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
+ bool cgroup = event_type & EVENT_CGROUP;
+
+ event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
@@ -3898,11 +3905,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, NULL);
+ ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, NULL);
+ ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
}
static void perf_event_context_sched_in(struct task_struct *task)
@@ -3917,11 +3924,11 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (cpuctx->task_ctx == ctx) {
perf_ctx_lock(cpuctx, ctx);
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
perf_ctx_sched_task_cb(ctx, true);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
perf_ctx_unlock(cpuctx, ctx);
goto rcu_unlock;
}
@@ -3934,7 +3941,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
if (!ctx->nr_events)
goto unlock;
- perf_ctx_disable(ctx);
+ perf_ctx_disable(ctx, false);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3944,7 +3951,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
* events, no need to flip the cpuctx's events around.
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
- perf_ctx_disable(&cpuctx->ctx);
+ perf_ctx_disable(&cpuctx->ctx, false);
ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
}
@@ -3953,9 +3960,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_enable(&cpuctx->ctx, false);
- perf_ctx_enable(ctx);
+ perf_ctx_enable(ctx, false);
unlock:
perf_ctx_unlock(cpuctx, ctx);
@@ -4425,6 +4432,9 @@ static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
u16 local_pkg, event_pkg;
+ if ((unsigned)event_cpu >= nr_cpu_ids)
+ return event_cpu;
+
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
int local_cpu = smp_processor_id();
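
The new bounds check lets __perf_event_read_cpu() be handed event->oncpu, which is -1 whenever the event is not currently running; the cast to unsigned turns -1 into a huge value, so a single compare against nr_cpu_ids rejects both "no CPU" and genuinely out-of-range values before any per-CPU topology data is consulted. A small standalone illustration of the same guard in front of a hypothetical package lookup table:

    #include <stdio.h>

    static unsigned int nr_cpu_ids = 8;                     /* stand-in value   */
    static int cpu_to_pkg[8] = { 0, 0, 0, 0, 1, 1, 1, 1 };  /* hypothetical map */

    static int pkg_of(int cpu)
    {
            /* (unsigned)-1 wraps to UINT_MAX, so "no CPU" fails this check too. */
            if ((unsigned int)cpu >= nr_cpu_ids)
                    return -1;

            return cpu_to_pkg[cpu];
    }

    int main(void)
    {
            printf("pkg(3)=%d pkg(-1)=%d pkg(99)=%d\n",
                   pkg_of(3), pkg_of(-1), pkg_of(99));
            return 0;
    }
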
@@ -4527,6 +4537,8 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
u64 *enabled, u64 *running)
{
unsigned long flags;
+ int event_oncpu;
+ int event_cpu;
int ret = 0;
/*
@@ -4551,15 +4563,22 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
goto out;
}
+ /*
+ * Get the event CPU numbers, and adjust them to local if the event is
+ * a per-package event that can be read locally
+ */
+ event_oncpu = __perf_event_read_cpu(event, event->oncpu);
+ event_cpu = __perf_event_read_cpu(event, event->cpu);
+
/* If this is a per-CPU event, it must be for this CPU */
if (!(event->attach_state & PERF_ATTACH_TASK) &&
- event->cpu != smp_processor_id()) {
+ event_cpu != smp_processor_id()) {
ret = -EINVAL;
goto out;
}
/* If this is a pinned event it must be running on this CPU */
- if (event->attr.pinned && event->oncpu != smp_processor_id()) {
+ if (event->attr.pinned && event_oncpu != smp_processor_id()) {
ret = -EBUSY;
goto out;
}
@@ -4569,7 +4588,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
* oncpu == -1).
*/
- if (event->oncpu == smp_processor_id())
+ if (event_oncpu == smp_processor_id())
event->pmu->read(event);
*value = local64_read(&event->count);
@@ -5440,7 +5459,7 @@ static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
struct perf_event_context *ctx = leader->ctx;
- struct perf_event *sub;
+ struct perf_event *sub, *parent;
unsigned long flags;
int n = 1; /* skip @nr */
int ret;
@@ -5450,6 +5469,33 @@ static int __perf_read_group_add(struct perf_event *leader,
return ret;
raw_spin_lock_irqsave(&ctx->lock, flags);
+ /*
+ * Verify the grouping between the parent and child (inherited)
+ * events is still intact.
+ *
+ * Specifically:
+ * - leader->ctx->lock pins leader->sibling_list
+ * - parent->child_mutex pins parent->child_list
+ * - parent->ctx->mutex pins parent->sibling_list
+ *
+ * Because parent->ctx != leader->ctx (and child_list nests inside
+ * ctx->mutex), group destruction is not atomic between children, also
+ * see perf_event_release_kernel(). Additionally, parent can grow the
+ * group.
+ *
+ * Therefore it is possible to have parent and child groups in a
+ * different configuration and summing over such a beast makes no sense
+ * whatsoever.
+ *
+ * Reject this.
+ */
+ parent = leader->parent;
+ if (parent &&
+ (parent->group_generation != leader->group_generation ||
+ parent->nr_siblings != leader->nr_siblings)) {
+ ret = -ECHILD;
+ goto unlock;
+ }
/*
* Since we co-schedule groups, {enabled,running} times of siblings
@@ -5483,8 +5529,9 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] = atomic64_read(&sub->lost_samples);
}
+unlock:
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- return 0;
+ return ret;
}
static int perf_read_group(struct perf_event *event,
@@ -5503,10 +5550,6 @@ static int perf_read_group(struct perf_event *event,
values[0] = 1 + leader->nr_siblings;
- /*
- * By locking the child_mutex of the leader we effectively
- * lock the child list of all siblings.. XXX explain how.
- */
mutex_lock(&leader->child_mutex);
ret = __perf_read_group_add(leader, read_format, values);
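
With the generation check in place, a PERF_FORMAT_GROUP read of an inherited group whose layout no longer matches the parent's fails with -ECHILD instead of silently summing counters that do not line up. Assuming that error code is what the read() syscall ultimately reports, a user-space reader can treat it as "discard this sample" rather than as a fatal error; a hedged sketch:

    #include <errno.h>
    #include <stdint.h>
    #include <unistd.h>

    /*
     * Read a PERF_FORMAT_GROUP buffer.  Returns 0 on success, 1 if the kernel
     * reports ECHILD (inherited group layout diverged; the values would be
     * meaningless), and -1 on any other error.
     */
    static int read_group_values(int fd, uint64_t *buf, size_t len)
    {
            ssize_t n = read(fd, buf, len);

            if (n >= 0)
                    return 0;
            if (errno == ECHILD)
                    return 1;
            return -1;
    }
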
@@ -13346,6 +13389,8 @@ static int inherit_group(struct perf_event *parent_event,
!perf_get_aux_event(child_ctr, leader))
return -EINVAL;
}
+ if (leader)
+ leader->group_generation = parent_event->group_generation;
return 0;
}
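
Copying the parent's group_generation into the freshly inherited leader keeps a brand-new, still-identical child group from tripping the -ECHILD check above; only an attach or detach that one side misses afterwards makes the counters diverge. Continuing the hypothetical sketch from the perf_group_detach() hunk:

    #include <stdint.h>

    struct group {                          /* same hypothetical shape as above */
            unsigned int nr_siblings;
            uint64_t group_generation;
    };

    /* A new child copy starts from the parent's snapshot, so an unchanged
     * pair compares equal until one side attaches or detaches a sibling. */
    static void group_inherit(const struct group *parent, struct group *child)
    {
            child->nr_siblings      = parent->nr_siblings;
            child->group_generation = parent->group_generation;
    }
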