diff options
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r-- | kernel/events/core.c | 177 |
1 files changed, 111 insertions, 66 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index 4c72a41f11af..683dc086ef10 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -375,6 +375,7 @@ enum event_type_t { EVENT_TIME = 0x4, /* see ctx_resched() for details */ EVENT_CPU = 0x8, + EVENT_CGROUP = 0x10, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -449,8 +450,8 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); -int perf_proc_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +int perf_event_max_sample_rate_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -684,20 +685,26 @@ do { \ ___p; \ }) -static void perf_ctx_disable(struct perf_event_context *ctx) +static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; perf_pmu_disable(pmu_ctx->pmu); + } } -static void perf_ctx_enable(struct perf_event_context *ctx) +static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; perf_pmu_enable(pmu_ctx->pmu); + } } static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); @@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_ctx_disable(&cpuctx->ctx); + perf_ctx_disable(&cpuctx->ctx, true); - ctx_sched_out(&cpuctx->ctx, EVENT_ALL); + ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); /* * must not be done 
before ctxswout due * to update_cgrp_time_from_cpuctx() in @@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task) * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ - ctx_sched_in(&cpuctx->ctx, EVENT_ALL); + ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); - perf_ctx_enable(&cpuctx->ctx); + perf_ctx_enable(&cpuctx->ctx, true); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } @@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct if (!is_cgroup_event(event)) return; + event->pmu_ctx->nr_cgroups++; + /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. @@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c if (!is_cgroup_event(event)) return; + event->pmu_ctx->nr_cgroups--; + /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. @@ -1954,6 +1965,7 @@ static void perf_group_attach(struct perf_event *event) list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; + group_leader->group_generation++; perf_event__header_size(group_leader); @@ -2144,6 +2156,7 @@ static void perf_group_detach(struct perf_event *event) if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; + event->group_leader->group_generation++; goto out; } @@ -2677,9 +2690,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - perf_ctx_disable(&cpuctx->ctx); + perf_ctx_disable(&cpuctx->ctx, false); if (task_ctx) { - perf_ctx_disable(task_ctx); + perf_ctx_disable(task_ctx, false); task_ctx_sched_out(task_ctx, event_type); } @@ -2697,9 +2710,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, perf_event_sched_in(cpuctx, task_ctx); - perf_ctx_enable(&cpuctx->ctx); + perf_ctx_enable(&cpuctx->ctx, false); if (task_ctx) - perf_ctx_enable(task_ctx); + perf_ctx_enable(task_ctx, false); } void 
perf_pmu_resched(struct pmu *pmu) @@ -3244,6 +3257,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3290,8 +3306,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) is_active ^= ctx->is_active; /* changed bits */ - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; __pmu_ctx_sched_out(pmu_ctx, is_active); + } } /* @@ -3482,7 +3501,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); /* PMIs are disabled; ctx->nr_pending is stable. 
*/ if (local_read(&ctx->nr_pending) || @@ -3502,7 +3521,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_sched_task_cb(ctx, false); perf_event_swap_task_ctx_data(ctx, next_ctx); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not @@ -3526,13 +3545,13 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); inside_switch: perf_ctx_sched_task_cb(ctx, false); task_ctx_sched_out(ctx, EVENT_ALL); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); } } @@ -3818,47 +3837,32 @@ static int merge_sched_in(struct perf_event *event, void *data) return 0; } -static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +static void pmu_groups_sched_in(struct perf_event_context *ctx, + struct perf_event_groups *groups, + struct pmu *pmu) { - struct perf_event_pmu_context *pmu_ctx; int can_add_hw = 1; - - if (pmu) { - visit_groups_merge(ctx, &ctx->pinned_groups, - smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); - } else { - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - can_add_hw = 1; - visit_groups_merge(ctx, &ctx->pinned_groups, - smp_processor_id(), pmu_ctx->pmu, - merge_sched_in, &can_add_hw); - } - } + visit_groups_merge(ctx, groups, smp_processor_id(), pmu, + merge_sched_in, &can_add_hw); } -static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +static void ctx_groups_sched_in(struct perf_event_context *ctx, + struct perf_event_groups *groups, + bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - int can_add_hw = 1; - if (pmu) { - visit_groups_merge(ctx, &ctx->flexible_groups, - smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); - } else { - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - can_add_hw = 1; - visit_groups_merge(ctx, &ctx->flexible_groups, - smp_processor_id(), 
pmu_ctx->pmu, - merge_sched_in, &can_add_hw); - } + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; + pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); } } -static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +static void __pmu_ctx_sched_in(struct perf_event_context *ctx, + struct pmu *pmu) { - ctx_flexible_sched_in(ctx, pmu); + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); } static void @@ -3866,6 +3870,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); int is_active = ctx->is_active; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3898,11 +3905,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) - ctx_pinned_sched_in(ctx, NULL); + ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup); /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, NULL); + ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup); } static void perf_event_context_sched_in(struct task_struct *task) @@ -3917,11 +3924,11 @@ static void perf_event_context_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); perf_ctx_sched_task_cb(ctx, true); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } @@ -3934,7 +3941,7 @@ static void perf_event_context_sched_in(struct task_struct *task) if (!ctx->nr_events) goto unlock; - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3944,7 +3951,7 @@ static void 
perf_event_context_sched_in(struct task_struct *task) * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { - perf_ctx_disable(&cpuctx->ctx); + perf_ctx_disable(&cpuctx->ctx, false); ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); } @@ -3953,9 +3960,9 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_ctx_sched_task_cb(cpuctx->task_ctx, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - perf_ctx_enable(&cpuctx->ctx); + perf_ctx_enable(&cpuctx->ctx, false); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -4425,6 +4432,9 @@ static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { u16 local_pkg, event_pkg; + if ((unsigned)event_cpu >= nr_cpu_ids) + return event_cpu; + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { int local_cpu = smp_processor_id(); @@ -4527,6 +4537,8 @@ int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { unsigned long flags; + int event_oncpu; + int event_cpu; int ret = 0; /* @@ -4551,15 +4563,22 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } + /* + * Get the event CPU numbers, and adjust them to local if the event is + * a per-package event that can be read locally + */ + event_oncpu = __perf_event_read_cpu(event, event->oncpu); + event_cpu = __perf_event_read_cpu(event, event->cpu); + /* If this is a per-CPU event, it must be for this CPU */ if (!(event->attach_state & PERF_ATTACH_TASK) && - event->cpu != smp_processor_id()) { + event_cpu != smp_processor_id()) { ret = -EINVAL; goto out; } /* If this is a pinned event it must be running on this CPU */ - if (event->attr.pinned && event->oncpu != smp_processor_id()) { + if (event->attr.pinned && event_oncpu != smp_processor_id()) { ret = -EBUSY; goto out; } @@ -4569,7 +4588,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value, * or local to this CPU. 
Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) + if (event_oncpu == smp_processor_id()) event->pmu->read(event); *value = local64_read(&event->count); @@ -5440,7 +5459,7 @@ static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event_context *ctx = leader->ctx; - struct perf_event *sub; + struct perf_event *sub, *parent; unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -5450,6 +5469,33 @@ static int __perf_read_group_add(struct perf_event *leader, return ret; raw_spin_lock_irqsave(&ctx->lock, flags); + /* + * Verify the grouping between the parent and child (inherited) + * events is still intact. + * + * Specifically: + * - leader->ctx->lock pins leader->sibling_list + * - parent->child_mutex pins parent->child_list + * - parent->ctx->mutex pins parent->sibling_list + * + * Because parent->ctx != leader->ctx (and child_list nests inside + * ctx->mutex), group destruction is not atomic between children, also + * see perf_event_release_kernel(). Additionally, parent can grow the + * group. + * + * Therefore it is possible to have parent and child groups in a + * different configuration and summing over such a beast makes no sense + * whatsoever. + * + * Reject this. 
+ */ + parent = leader->parent; + if (parent && + (parent->group_generation != leader->group_generation || + parent->nr_siblings != leader->nr_siblings)) { + ret = -ECHILD; + goto unlock; + } /* * Since we co-schedule groups, {enabled,running} times of siblings @@ -5483,8 +5529,9 @@ static int __perf_read_group_add(struct perf_event *leader, values[n++] = atomic64_read(&sub->lost_samples); } +unlock: raw_spin_unlock_irqrestore(&ctx->lock, flags); - return 0; + return ret; } static int perf_read_group(struct perf_event *event, @@ -5503,10 +5550,6 @@ static int perf_read_group(struct perf_event *event, values[0] = 1 + leader->nr_siblings; - /* - * By locking the child_mutex of the leader we effectively - * lock the child list of all siblings.. XXX explain how. - */ mutex_lock(&leader->child_mutex); ret = __perf_read_group_add(leader, read_format, values); @@ -13346,6 +13389,8 @@ static int inherit_group(struct perf_event *parent_event, !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } + if (leader) + leader->group_generation = parent_event->group_generation; return 0; } |