Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--  kernel/events/core.c | 5269
1 file changed, 3213 insertions, 2056 deletions
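A recurring pattern in the diff below is the lock-free context-time scheme: __update_context_time() maintains ctx->timeoffset = ctx->time - ctx->timestamp under ctx->lock, and perf_event_time_now() later computes an up-to-date time from a single clock read plus that offset, without taking the lock (it has to work from NMI context). A minimal userspace sketch of the arithmetic follows; the struct and helper names are invented for illustration, and only the time'/time` rearrangement mirrors the patch:

#include <stdint.h>
#include <stdio.h>

/* Invented stand-in for the perf_event_context time fields. */
struct ctx_time {
	uint64_t time;       /* accumulated time, written under ctx->lock */
	uint64_t timestamp;  /* clock at last update, written under ctx->lock */
	uint64_t timeoffset; /* time - timestamp; the kernel publishes this with
	                        WRITE_ONCE()/__store_release() */
};

/* Writer, as in __update_context_time(): advance, then republish offset. */
static void update_time(struct ctx_time *c, uint64_t now)
{
	c->time += now - c->timestamp;
	c->timestamp = now;
	/* time' = time + (now - timestamp) == now + (time - timestamp) */
	c->timeoffset = c->time - c->timestamp;
}

/* Lockless reader, as in perf_event_time_now(): one clock read + offset.
 * The offset is "negative" here, but unsigned wrap-around makes the sum
 * come out right. */
static uint64_t time_now(const struct ctx_time *c, uint64_t now)
{
	return now + c->timeoffset;
}

int main(void)
{
	struct ctx_time c = { 0, 100, 0 };

	update_time(&c, 150);   /* 50 ns of active time accumulated */
	printf("%llu\n", (unsigned long long)time_now(&c, 170)); /* prints 70 */
	return 0;
}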
diff --git a/kernel/events/core.c b/kernel/events/core.c index 55d18791a72d..823aa0824916 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -53,6 +53,8 @@ #include <linux/min_heap.h> #include <linux/highmem.h> #include <linux/pgtable.h> +#include <linux/buildid.h> +#include <linux/task_work.h> #include "internal.h" @@ -131,6 +133,7 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) /** * cpu_function_call - call a function on the cpu + * @cpu: target cpu to queue this function * @func: the function to be called * @info: the function call argument * @@ -152,26 +155,55 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) +enum event_type_t { + EVENT_FLEXIBLE = 0x01, + EVENT_PINNED = 0x02, + EVENT_TIME = 0x04, + EVENT_FROZEN = 0x08, + /* see ctx_resched() for details */ + EVENT_CPU = 0x10, + EVENT_CGROUP = 0x20, + + /* compound helpers */ + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, + EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, +}; + +static inline void __perf_ctx_lock(struct perf_event_context *ctx) { - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); + raw_spin_lock(&ctx->lock); + WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN); } static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - raw_spin_lock(&cpuctx->ctx.lock); + __perf_ctx_lock(&cpuctx->ctx); if (ctx) - raw_spin_lock(&ctx->lock); + __perf_ctx_lock(ctx); +} + +static inline void __perf_ctx_unlock(struct perf_event_context *ctx) +{ + /* + * If ctx_sched_in() didn't again set any ALL flags, clean up + * after ctx_sched_out() by clearing is_active. + */ + if (ctx->is_active & EVENT_FROZEN) { + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + else + ctx->is_active &= ~EVENT_FROZEN; + } + raw_spin_unlock(&ctx->lock); } static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); + __perf_ctx_unlock(ctx); + __perf_ctx_unlock(&cpuctx->ctx); } #define TASK_TOMBSTONE ((void *)-1L) @@ -181,6 +213,14 @@ static bool is_kernel_event(struct perf_event *event) return READ_ONCE(event->owner) == TASK_TOMBSTONE; } +static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +struct perf_event_context *perf_cpu_task_ctx(void) +{ + lockdep_assert_irqs_disabled(); + return this_cpu_ptr(&perf_cpu_context)->task_ctx; +} + /* * On task ctx scheduling... * @@ -214,7 +254,7 @@ static int event_function(void *info) struct event_function_struct *efs = info; struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; @@ -259,6 +299,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da { struct perf_event_context *ctx = event->ctx; struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ + struct perf_cpu_context *cpuctx; struct event_function_struct efs = { .event = event, .func = func, @@ -268,7 +309,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to - * stabilize the the event->ctx relation. 
See + * stabilize the event->ctx relation. See * perf_event_ctx_lock(). */ lockdep_assert_held(&ctx->mutex); @@ -286,22 +327,25 @@ again: if (!task_function_call(task, event_function, &efs)) return; - raw_spin_lock_irq(&ctx->lock); + local_irq_disable(); + cpuctx = this_cpu_ptr(&perf_cpu_context); + perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; - if (task == TASK_TOMBSTONE) { - raw_spin_unlock_irq(&ctx->lock); - return; - } + if (task == TASK_TOMBSTONE) + goto unlock; if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); + perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); goto again; } func(event, NULL, ctx, data); - raw_spin_unlock_irq(&ctx->lock); +unlock: + perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); } /* @@ -311,7 +355,7 @@ again: static void event_function_local(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; @@ -364,18 +408,8 @@ unlock: (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV) -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_TIME = 0x4, - /* see ctx_resched() for details */ - EVENT_CPU = 0x8, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - /* * perf_sched_events : >0 events exist - * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ static void perf_sched_delayed(struct work_struct *work); @@ -384,7 +418,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; -static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -397,11 +430,18 @@ static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; +static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; +static cpumask_var_t perf_online_core_mask; +static cpumask_var_t perf_online_die_mask; +static cpumask_var_t perf_online_cluster_mask; +static cpumask_var_t perf_online_pkg_mask; +static cpumask_var_t perf_online_sys_mask; +static struct kmem_cache *perf_event_cache; /* * perf event paranoia level: @@ -442,10 +482,10 @@ static void update_perf_cpu_limits(void) WRITE_ONCE(perf_sample_allowed_ns, tmp); } -static bool perf_rotate_context(struct perf_cpu_context *cpuctx); +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); -int perf_proc_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -468,7 +508,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; -int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, +int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void 
*buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -528,7 +568,7 @@ void perf_sample_event_took(u64 sample_len_ns) __this_cpu_write(running_sample_length, running_len); /* - * Note: this will be biased artifically low until we have + * Note: this will be biased artificially low until we have * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ @@ -565,23 +605,11 @@ void perf_sample_event_took(u64 sample_len_ns) static atomic64_t perf_event_id; -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); - static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } -extern __weak const char *perf_pmu_name(void) -{ - return "pmu"; -} - static inline u64 perf_clock(void) { return local_clock(); @@ -602,10 +630,10 @@ static inline u64 perf_event_clock(struct perf_event *event) * * Event groups make things a little more complicated, but not terribly so. The * rules for a group are that if the group leader is OFF the entire group is - * OFF, irrespecive of what the group member states are. This results in + * OFF, irrespective of what the group member states are. This results in * __perf_effective_state(). * - * A futher ramification is that when a group leader flips between OFF and + * A further ramification is that when a group leader flips between OFF and * !OFF, we need to update all group member times. * * @@ -674,13 +702,56 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state) WRITE_ONCE(event->state, state); } +/* + * UP store-release, load-acquire + */ + +#define __store_release(ptr, val) \ +do { \ + barrier(); \ + WRITE_ONCE(*(ptr), (val)); \ +} while (0) + +#define __load_acquire(ptr) \ +({ \ + __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ + barrier(); \ + ___p; \ +}) + +#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ + list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ + if (_cgroup && !_epc->nr_cgroups) \ + continue; \ + else if (_pmu && _epc->pmu != _pmu) \ + continue; \ + else + +static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) +{ + struct perf_event_pmu_context *pmu_ctx; + + for_each_epc(pmu_ctx, ctx, NULL, cgroup) + perf_pmu_disable(pmu_ctx->pmu); +} + +static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) +{ + struct perf_event_pmu_context *pmu_ctx; + + for_each_epc(pmu_ctx, ctx, NULL, cgroup) + perf_pmu_enable(pmu_ctx->pmu); +} + +static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); +static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); + #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) @@ -719,35 +790,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return t->time; } -static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { - struct 
perf_cgroup_info *info; - u64 now; - - now = perf_clock(); + struct perf_cgroup_info *t; - info = this_cpu_ptr(cgrp->info); + t = per_cpu_ptr(event->cgrp->info, event->cpu); + if (!__load_acquire(&t->active)) + return t->time; + now += READ_ONCE(t->timeoffset); + return now; +} - info->time += now - info->timestamp; +static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +{ + if (adv) + info->time += now - info->timestamp; info->timestamp = now; + /* + * see update_context_time() + */ + WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; + struct perf_cgroup_info *info; if (cgrp) { + u64 now = perf_clock(); + for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); - __update_cgrp_time(cgrp); + info = this_cpu_ptr(cgrp->info); + + __update_cgrp_time(info, now, true); + if (final) + __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { - struct perf_cgroup *cgrp; + struct perf_cgroup_info *info; /* * ensure we access cgroup data only when needed and @@ -756,19 +843,19 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current, event->ctx); + info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - __update_cgrp_time(event->cgrp); + if (info->active) + __update_cgrp_time(info, perf_clock(), true); } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { - struct perf_cgroup *cgrp; + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; @@ -777,127 +864,59 @@ perf_cgroup_set_timestamp(struct task_struct *task, * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ - if (!task || !ctx->nr_cgroups) + if (!cgrp) return; - cgrp = perf_cgroup_from_task(task, ctx); + WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + __update_cgrp_time(info, ctx->timestamp, false); + __store_release(&info->active, 1); } } -static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); - -#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ -#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ - /* * reschedule events based on the cgroup constraint of task. - * - * mode SWOUT : schedule out everything - * mode SWIN : schedule in based on cgroup for next */ -static void perf_cgroup_switch(struct task_struct *task, int mode) +static void perf_cgroup_switch(struct task_struct *task) { - struct perf_cpu_context *cpuctx; - struct list_head *list; - unsigned long flags; - - /* - * Disable interrupts and preemption to avoid this CPU's - * cgrp_cpuctx_entry to change under us. 
- */ - local_irq_save(flags); - - list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { - WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } - - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* - * set cgrp before ctxsw in to allow - * event_filter_match() to not have to pass - * task around - * we pass the cpuctx->ctx to perf_cgroup_from_task() - * because cgorup events are only per-cpu - */ - cpuctx->cgrp = perf_cgroup_from_task(task, - &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } - - local_irq_restore(flags); -} - -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_cgroup *cgrp; - rcu_read_lock(); /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock + * cpuctx->cgrp is set when the first cgroup event enabled, + * and is cleared when the last cgroup event disabled. */ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(next, NULL); + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; - /* - * only schedule out current cgroup events if we know - * that we are switching to a different cgroup. Otherwise, - * do no touch the cgroup events. - */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - rcu_read_unlock(); -} + cgrp = perf_cgroup_from_task(task, NULL); + if (READ_ONCE(cpuctx->cgrp) == cgrp) + return; -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_ctx_disable(&cpuctx->ctx, true); - rcu_read_lock(); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock + * must not be done before ctxswout due + * to update_cgrp_time_from_cpuctx() in + * ctx_sched_out() */ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(prev, NULL); - + cpuctx->cgrp = cgrp; /* - * only need to schedule in cgroup events if we are changing - * cgroup during ctxsw. Cgroup events were not scheduled - * out of ctxsw out if that was not the case. 
+ * set cgrp before ctxsw in to allow + * perf_cgroup_set_timestamp() in ctx_sched_in() + * to not have to pass task around */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWIN); + ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); - rcu_read_unlock(); + perf_ctx_enable(&cpuctx->ctx, true); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -908,14 +927,14 @@ static int perf_cgroup_ensure_storage(struct perf_event *event, int cpu, heap_size, ret = 0; /* - * Allow storage to have sufficent space for an iterator for each + * Allow storage to have sufficient space for an iterator for each * possibly nested cgroup plus an iterator for events with no cgroup. */ for (heap_size = 1; css; css = css->parent) heap_size++; for_each_possible_cpu(cpu) { - cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; @@ -947,22 +966,20 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int ret = 0; - if (!f.file) + if (fd_empty(f)) return -EBADF; - css = css_tryget_online_from_dir(f.file->f_path.dentry, + css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry, &perf_event_cgrp_subsys); - if (IS_ERR(css)) { - ret = PTR_ERR(css); - goto out; - } + if (IS_ERR(css)) + return PTR_ERR(css); ret = perf_cgroup_ensure_storage(event, css); if (ret) - goto out; + return ret; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -976,20 +993,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, perf_detach_cgroup(event); ret = -EINVAL; } -out: - fdput(f); return ret; } static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ - struct perf_cgroup_info *t; - t = per_cpu_ptr(event->cgrp->info, event->cpu); - event->shadow_ctx_time = now - t->timestamp; -} - -static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; @@ -997,30 +1004,18 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct if (!is_cgroup_event(event)) return; + event->pmu_ctx->nr_cgroups++; + /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); - /* - * Since setting cpuctx->cgrp is conditional on the current @cgrp - * matching the event's cgroup, we must do this for every new event, - * because if the first would mismatch, the second would not try again - * and we would leave cpuctx->cgrp unset. - */ - if (ctx->is_active && !cpuctx->cgrp) { - struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); - - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - cpuctx->cgrp = cgrp; - } - if (ctx->nr_cgroups++) return; - list_add(&cpuctx->cgrp_cpuctx_entry, - per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); + cpuctx->cgrp = perf_cgroup_from_task(current, ctx); } static inline void @@ -1031,6 +1026,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c if (!is_cgroup_event(event)) return; + event->pmu_ctx->nr_cgroups--; + /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. 
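The rewritten perf_cgroup_switch() above drops the SWOUT/SWIN mode bits and the per-CPU cgrp_cpuctx_list in favour of one early-exit path keyed on the cached cpuctx->cgrp. A toy model of that control flow, with hypothetical simplified types (the real function reschedules via ctx_sched_out()/ctx_sched_in() under perf_ctx_lock()):

#include <stddef.h>
#include <stdio.h>

struct cgroup { int id; };             /* hypothetical stand-in */

struct cpu_context {
	struct cgroup *cgrp;           /* cgroup of scheduled-in cgroup events */
};

static void cgroup_switch(struct cpu_context *cpuctx, struct cgroup *next)
{
	if (cpuctx->cgrp == NULL)      /* no cgroup events enabled on this CPU */
		return;
	if (cpuctx->cgrp == next)      /* same cgroup: nothing to reschedule */
		return;

	printf("sched out cgroup %d events\n", cpuctx->cgrp->id);
	cpuctx->cgrp = next;           /* must flip before sched-in, so that
	                                  perf_cgroup_set_timestamp() sees it */
	printf("sched in  cgroup %d events\n", next->id);
}

int main(void)
{
	struct cgroup a = { 1 }, b = { 2 };
	struct cpu_context cpuctx = { &a };

	cgroup_switch(&cpuctx, &a);    /* no-op: same cgroup */
	cgroup_switch(&cpuctx, &b);    /* reschedules onto cgroup 2 */
	return 0;
}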
@@ -1040,10 +1037,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c if (--ctx->nr_cgroups) return; - if (ctx->is_active && cpuctx->cgrp) - cpuctx->cgrp = NULL; - - list_del(&cpuctx->cgrp_cpuctx_entry); + cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ @@ -1066,17 +1060,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) { } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) -{ -} - -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, + bool final) { } @@ -1088,22 +1073,16 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ -} - -static inline void -perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +static inline u64 perf_cgroup_event_time(struct perf_event *event) { + return 0; } -static inline u64 perf_cgroup_event_time(struct perf_event *event) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } @@ -1117,6 +1096,10 @@ static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } + +static void perf_cgroup_switch(struct task_struct *task) +{ +} #endif /* @@ -1129,34 +1112,30 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_pmu_context *cpc; bool rotations; lockdep_assert_irqs_disabled(); - cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); - rotations = perf_rotate_context(cpuctx); + cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); + rotations = perf_rotate_context(cpc); - raw_spin_lock(&cpuctx->hrtimer_lock); + raw_spin_lock(&cpc->hrtimer_lock); if (rotations) - hrtimer_forward_now(hr, cpuctx->hrtimer_interval); + hrtimer_forward_now(hr, cpc->hrtimer_interval); else - cpuctx->hrtimer_active = 0; - raw_spin_unlock(&cpuctx->hrtimer_lock); + cpc->hrtimer_active = 0; + raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? 
HRTIMER_RESTART : HRTIMER_NORESTART; } -static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { - struct hrtimer *timer = &cpuctx->hrtimer; - struct pmu *pmu = cpuctx->ctx.pmu; + struct hrtimer *timer = &cpc->hrtimer; + struct pmu *pmu = cpc->epc.pmu; u64 interval; - /* no multiplexing needed for SW PMU */ - if (pmu->task_ctx_nr == perf_sw_context) - return; - /* * check default is sane, if not set then force to * default interval (1/tick) @@ -1165,34 +1144,34 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); + cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); - raw_spin_lock_init(&cpuctx->hrtimer_lock); + raw_spin_lock_init(&cpc->hrtimer_lock); hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } -static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) +static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { - struct hrtimer *timer = &cpuctx->hrtimer; - struct pmu *pmu = cpuctx->ctx.pmu; + struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; - /* not for SW PMU */ - if (pmu->task_ctx_nr == perf_sw_context) - return 0; - - raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); - if (!cpuctx->hrtimer_active) { - cpuctx->hrtimer_active = 1; - hrtimer_forward_now(timer, cpuctx->hrtimer_interval); + raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); + if (!cpc->hrtimer_active) { + cpc->hrtimer_active = 1; + hrtimer_forward_now(timer, cpc->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } - raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); + raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } +static int perf_mux_hrtimer_restart_ipi(void *arg) +{ + return perf_mux_hrtimer_restart(arg); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -1207,32 +1186,9 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } -static DEFINE_PER_CPU(struct list_head, active_ctx_list); - -/* - * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and - * perf_event_task_tick() are fully serialized because they're strictly cpu - * affine and perf_event_ctx{activate,deactivate} are called with IRQs - * disabled, while perf_event_task_tick is called from IRQ context. - */ -static void perf_event_ctx_activate(struct perf_event_context *ctx) +static void perf_assert_pmu_disabled(struct pmu *pmu) { - struct list_head *head = this_cpu_ptr(&active_ctx_list); - - lockdep_assert_irqs_disabled(); - - WARN_ON(!list_empty(&ctx->active_ctx_list)); - - list_add(&ctx->active_ctx_list, head); -} - -static void perf_event_ctx_deactivate(struct perf_event_context *ctx) -{ - lockdep_assert_irqs_disabled(); - - WARN_ON(list_empty(&ctx->active_ctx_list)); - - list_del_init(&ctx->active_ctx_list); + WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); } static void get_ctx(struct perf_event_context *ctx) @@ -1259,7 +1215,6 @@ static void free_ctx(struct rcu_head *head) struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -1301,7 +1256,7 @@ static void put_ctx(struct perf_event_context *ctx) * life-time rules separate them. 
That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * - * But remember that that these are parent<->child context relations, and + * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. * @@ -1332,8 +1287,9 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock - * perf_event::mmap_mutex * mmap_lock + * perf_event::mmap_mutex + * perf_buffer::aux_mutex * perf_addr_filters_head::lock * * cpu_hotplug_lock @@ -1440,11 +1396,11 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. * - * This has to cope with with the fact that until it is locked, + * This has to cope with the fact that until it is locked, * the context could get moved to another task. */ static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) +perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; @@ -1460,7 +1416,7 @@ retry: */ local_irq_save(*flags); rcu_read_lock(); - ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); + ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might @@ -1473,7 +1429,7 @@ retry: * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); - if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { + if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); @@ -1500,12 +1456,12 @@ retry: * reference count so that the context can't get freed. */ static struct perf_event_context * -perf_pin_task_context(struct task_struct *task, int ctxn) +perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; - ctx = perf_lock_task_context(task, ctxn, &flags); + ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -1525,22 +1481,61 @@ static void perf_unpin_context(struct perf_event_context *ctx) /* * Update the record of the current time in a context. */ -static void update_context_time(struct perf_event_context *ctx) +static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); - ctx->time += now - ctx->timestamp; + lockdep_assert_held(&ctx->lock); + + if (adv) + ctx->time += now - ctx->timestamp; ctx->timestamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +} + +static void update_context_time(struct perf_event_context *ctx) +{ + __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + if (unlikely(!ctx)) + return 0; + if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx ? 
ctx->time : 0; + return ctx->time; +} + +static u64 perf_event_time_now(struct perf_event *event, u64 now) +{ + struct perf_event_context *ctx = event->ctx; + + if (unlikely(!ctx)) + return 0; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time_now(event, now); + + if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) + return ctx->time; + + now += READ_ONCE(ctx->timeoffset); + return now; } static enum event_type_t get_event_type(struct perf_event *event) @@ -1595,82 +1590,125 @@ static void perf_event_groups_init(struct perf_event_groups *groups) groups->index = 0; } +static inline struct cgroup *event_cgroup(const struct perf_event *event) +{ + struct cgroup *cgroup = NULL; + +#ifdef CONFIG_CGROUP_PERF + if (event->cgrp) + cgroup = event->cgrp->css.cgroup; +#endif + + return cgroup; +} + /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ -static bool -perf_event_groups_less(struct perf_event *left, struct perf_event *right) +static __always_inline int +perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, + const struct cgroup *left_cgroup, const u64 left_group_index, + const struct perf_event *right) { - if (left->cpu < right->cpu) - return true; - if (left->cpu > right->cpu) - return false; + if (left_cpu < right->cpu) + return -1; + if (left_cpu > right->cpu) + return 1; + + if (left_pmu) { + if (left_pmu < right->pmu_ctx->pmu) + return -1; + if (left_pmu > right->pmu_ctx->pmu) + return 1; + } #ifdef CONFIG_CGROUP_PERF - if (left->cgrp != right->cgrp) { - if (!left->cgrp || !left->cgrp->css.cgroup) { - /* - * Left has no cgroup but right does, no cgroups come - * first. - */ - return true; - } - if (!right->cgrp || !right->cgrp->css.cgroup) { - /* - * Right has no cgroup but left does, no cgroups come - * first. - */ - return false; - } - /* Two dissimilar cgroups, order by id. */ - if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id) - return true; + { + const struct cgroup *right_cgroup = event_cgroup(right); - return false; + if (left_cgroup != right_cgroup) { + if (!left_cgroup) { + /* + * Left has no cgroup but right does, no + * cgroups come first. + */ + return -1; + } + if (!right_cgroup) { + /* + * Right has no cgroup but left does, no + * cgroups come first. + */ + return 1; + } + /* Two dissimilar cgroups, order by id. 
*/ + if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) + return -1; + + return 1; + } } #endif - if (left->group_index < right->group_index) - return true; - if (left->group_index > right->group_index) - return false; + if (left_group_index < right->group_index) + return -1; + if (left_group_index > right->group_index) + return 1; - return false; + return 0; +} + +#define __node_2_pe(node) \ + rb_entry((node), struct perf_event, group_node) + +static inline bool __group_less(struct rb_node *a, const struct rb_node *b) +{ + struct perf_event *e = __node_2_pe(a); + return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), + e->group_index, __node_2_pe(b)) < 0; +} + +struct __group_key { + int cpu; + struct pmu *pmu; + struct cgroup *cgroup; +}; + +static inline int __group_cmp(const void *key, const struct rb_node *node) +{ + const struct __group_key *a = key; + const struct perf_event *b = __node_2_pe(node); + + /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ + return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); +} + +static inline int +__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) +{ + const struct __group_key *a = key; + const struct perf_event *b = __node_2_pe(node); + + /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ + return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), + b->group_index, b); } /* - * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for - * key (see perf_event_groups_less). This places it last inside the CPU - * subtree. + * Insert @event into @groups' tree; using + * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} + * as key. This places it last inside the {cpu,pmu,cgroup} subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { - struct perf_event *node_event; - struct rb_node *parent; - struct rb_node **node; - event->group_index = ++groups->index; - node = &groups->tree.rb_node; - parent = *node; - - while (*node) { - parent = *node; - node_event = container_of(*node, struct perf_event, group_node); - - if (perf_event_groups_less(event, node_event)) - node = &parent->rb_left; - else - node = &parent->rb_right; - } - - rb_link_node(&event->group_node, parent, node); - rb_insert_color(&event->group_node, &groups->tree); + rb_add(&event->group_node, &groups->tree, __group_less); } /* @@ -1712,82 +1750,47 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) } /* - * Get the leftmost event in the cpu/cgroup subtree. + * Get the leftmost event in the {cpu,pmu,cgroup} subtree. 
*/ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, - struct cgroup *cgrp) + struct pmu *pmu, struct cgroup *cgrp) { - struct perf_event *node_event = NULL, *match = NULL; - struct rb_node *node = groups->tree.rb_node; -#ifdef CONFIG_CGROUP_PERF - u64 node_cgrp_id, cgrp_id = 0; - - if (cgrp) - cgrp_id = cgrp->kn->id; -#endif - - while (node) { - node_event = container_of(node, struct perf_event, group_node); - - if (cpu < node_event->cpu) { - node = node->rb_left; - continue; - } - if (cpu > node_event->cpu) { - node = node->rb_right; - continue; - } -#ifdef CONFIG_CGROUP_PERF - node_cgrp_id = 0; - if (node_event->cgrp && node_event->cgrp->css.cgroup) - node_cgrp_id = node_event->cgrp->css.cgroup->kn->id; + struct __group_key key = { + .cpu = cpu, + .pmu = pmu, + .cgroup = cgrp, + }; + struct rb_node *node; - if (cgrp_id < node_cgrp_id) { - node = node->rb_left; - continue; - } - if (cgrp_id > node_cgrp_id) { - node = node->rb_right; - continue; - } -#endif - match = node_event; - node = node->rb_left; - } + node = rb_find_first(&key, &groups->tree, __group_cmp); + if (node) + return __node_2_pe(node); - return match; + return NULL; } -/* - * Like rb_entry_next_safe() for the @cpu subtree. - */ static struct perf_event * -perf_event_groups_next(struct perf_event *event) +perf_event_groups_next(struct perf_event *event, struct pmu *pmu) { - struct perf_event *next; -#ifdef CONFIG_CGROUP_PERF - u64 curr_cgrp_id = 0; - u64 next_cgrp_id = 0; -#endif - - next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); - if (next == NULL || next->cpu != event->cpu) - return NULL; - -#ifdef CONFIG_CGROUP_PERF - if (event->cgrp && event->cgrp->css.cgroup) - curr_cgrp_id = event->cgrp->css.cgroup->kn->id; + struct __group_key key = { + .cpu = event->cpu, + .pmu = pmu, + .cgroup = event_cgroup(event), + }; + struct rb_node *next; - if (next->cgrp && next->cgrp->css.cgroup) - next_cgrp_id = next->cgrp->css.cgroup->kn->id; + next = rb_next_match(&key, &event->group_node, __group_cmp); + if (next) + return __node_2_pe(next); - if (curr_cgrp_id != next_cgrp_id) - return NULL; -#endif - return next; + return NULL; } +#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ + for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ + event; event = perf_event_groups_next(event, pmu)) + /* * Iterate through the whole groups tree. */ @@ -1798,6 +1801,14 @@ perf_event_groups_next(struct perf_event *event) typeof(*event), group_node)) /* + * Does the event attribute request inherit with PERF_SAMPLE_READ + */ +static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) +{ + return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); +} + +/* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. 
*/ @@ -1823,13 +1834,18 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; + if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) + ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; + if (has_inherit_and_sample_read(&event->attr)) + local_inc(&ctx->nr_no_switch_fast); if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); ctx->generation++; + event->pmu_ctx->nr_events++; } /* @@ -1841,28 +1857,34 @@ static inline void perf_event__state_init(struct perf_event *event) PERF_EVENT_STATE_INACTIVE; } -static void __perf_event_read_size(struct perf_event *event, int nr_siblings) +static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_ID) + if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_GROUP) { + if (read_format & PERF_FORMAT_LOST) + entry += sizeof(u64); + + if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } - size += entry * nr; - event->read_size = size; + /* + * Since perf_event_validate_size() limits this to 16k and inhibits + * adding more siblings, this will never overflow. + */ + return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) @@ -1879,8 +1901,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); - if (sample_type & PERF_SAMPLE_WEIGHT) - size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; @@ -1912,8 +1934,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) */ static void perf_event__header_size(struct perf_event *event) { - __perf_event_read_size(event, - event->group_leader->nr_siblings); + event->read_size = + __perf_event_read_size(event->attr.read_format, + event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } @@ -1944,23 +1967,44 @@ static void perf_event__id_header_size(struct perf_event *event) event->id_header_size = size; } +/* + * Check that adding an event to the group does not result in anybody + * overflowing the 64k event limit imposed by the output buffer. + * + * Specifically, check that the read_size for the event does not exceed 16k, + * read_size being the one term that grows with groups size. Since read_size + * depends on per-event read_format, also (re)check the existing events. + * + * This leaves 48k for the constant size fields and things like callchains, + * branch stacks and register sets. + */ static bool perf_event_validate_size(struct perf_event *event) { - /* - * The values computed here will be over-written when we actually - * attach the event. 
- */ - __perf_event_read_size(event, event->group_leader->nr_siblings + 1); - __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); - perf_event__id_header_size(event); + struct perf_event *sibling, *group_leader = event->group_leader; + + if (__perf_event_read_size(event->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + + if (__perf_event_read_size(group_leader->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; /* - * Sum the lot; should not exceed the 64k limit we have on records. - * Conservative limit to allow for callchains and other variable fields. + * When creating a new group leader, group_leader->ctx is initialized + * after the size has been validated, but we cannot safely use + * for_each_sibling_event() until group_leader->ctx is set. A new group + * leader cannot have any siblings yet, so we can safely skip checking + * the non-existent siblings. */ - if (event->read_size + event->header_size + - event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) - return false; + if (event == group_leader) + return true; + + for_each_sibling_event(sibling, group_leader) { + if (__perf_event_read_size(sibling->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + } return true; } @@ -1972,7 +2016,8 @@ static void perf_group_attach(struct perf_event *event) lockdep_assert_held(&event->ctx->lock); /* - * We can have double attach due to group movement in perf_event_open. + * We can have double attach due to group movement (move_group) in + * perf_event_open(). */ if (event->attach_state & PERF_ATTACH_GROUP) return; @@ -1988,6 +2033,7 @@ static void perf_group_attach(struct perf_event *event) list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; + group_leader->group_generation++; perf_event__header_size(group_leader); @@ -2014,8 +2060,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; ctx->nr_events--; + if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) + ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; + if (has_inherit_and_sample_read(&event->attr)) + local_dec(&ctx->nr_no_switch_fast); list_del_rcu(&event->event_entry); @@ -2035,6 +2085,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) } ctx->generation++; + event->pmu_ctx->nr_events--; } static int @@ -2051,13 +2102,11 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) static void put_event(struct perf_event *event); static void event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, struct perf_event_context *ctx); static void perf_put_aux_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event *iter; /* @@ -2086,14 +2135,14 @@ static void perf_put_aux_event(struct perf_event *event) * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. 
*/ - event_sched_out(iter, cpuctx, ctx); + event_sched_out(iter, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } static bool perf_need_aux_event(struct perf_event *event) { - return !!event->attr.aux_output || !!event->attr.aux_sample_size; + return event->attr.aux_output || has_aux_action(event); } static int perf_get_aux_event(struct perf_event *event, @@ -2118,6 +2167,10 @@ static int perf_get_aux_event(struct perf_event *event, !perf_aux_output_match(event, group_leader)) return 0; + if ((event->attr.aux_pause || event->attr.aux_resume) && + !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return 0; + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; @@ -2137,8 +2190,8 @@ static int perf_get_aux_event(struct perf_event *event, static inline struct list_head *get_event_list(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; + return event->attr.pinned ? &event->pmu_ctx->pinned_active : + &event->pmu_ctx->flexible_active; } /* @@ -2149,10 +2202,7 @@ static inline struct list_head *get_event_list(struct perf_event *event) */ static inline void perf_remove_sibling_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, event->ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } @@ -2180,6 +2230,7 @@ static void perf_group_detach(struct perf_event *event) if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; + event->group_leader->group_generation++; goto out; } @@ -2199,7 +2250,7 @@ static void perf_group_detach(struct perf_event *event) /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; - if (!RB_EMPTY_NODE(&event->group_node)) { + if (sibling->attach_state & PERF_ATTACH_CONTEXT) { add_event_to_groups(sibling, event->ctx); if (sibling->state == PERF_EVENT_STATE_ACTIVE) @@ -2216,52 +2267,47 @@ out: perf_event__header_size(leader); } -static bool is_orphaned_event(struct perf_event *event) -{ - return event->state == PERF_EVENT_STATE_DEAD; -} +static void sync_child_event(struct perf_event *child_event); -static inline int __pmu_filter_match(struct perf_event *event) +static void perf_child_detach(struct perf_event *event) { - struct pmu *pmu = event->pmu; - return pmu->filter_match ? pmu->filter_match(event) : 1; -} + struct perf_event *parent_event = event->parent; -/* - * Check whether we should attempt to schedule an event group based on - * PMU-specific filtering. 
An event group can consist of HW and SW events, - * potentially with a SW leader, so we must check all the filters, to - * determine whether a group is schedulable: - */ -static inline int pmu_filter_match(struct perf_event *event) -{ - struct perf_event *sibling; + if (!(event->attach_state & PERF_ATTACH_CHILD)) + return; - if (!__pmu_filter_match(event)) - return 0; + event->attach_state &= ~PERF_ATTACH_CHILD; - for_each_sibling_event(sibling, event) { - if (!__pmu_filter_match(sibling)) - return 0; - } + if (WARN_ON_ONCE(!parent_event)) + return; - return 1; + lockdep_assert_held(&parent_event->child_mutex); + + sync_child_event(event); + list_del_init(&event->child_list); +} + +static bool is_orphaned_event(struct perf_event *event) +{ + return event->state == PERF_EVENT_STATE_DEAD; } static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && - perf_cgroup_match(event) && pmu_filter_match(event); + perf_cgroup_match(event); } static void -event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; + // XXX cpc serialization, probably per-cpu IRQ disabled + WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); @@ -2280,49 +2326,87 @@ event_sched_out(struct perf_event *event, event->pmu->del(event, 0); event->oncpu = -1; - if (READ_ONCE(event->pending_disable) >= 0) { - WRITE_ONCE(event->pending_disable, -1); + if (event->pending_disable) { + event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } + perf_event_set_state(event, state); if (!is_software_event(event)) - cpuctx->active_oncpu--; - if (!--ctx->nr_active) - perf_event_ctx_deactivate(ctx); - if (event->attr.freq && event->attr.sample_freq) + cpc->active_oncpu--; + if (event->attr.freq && event->attr.sample_freq) { ctx->nr_freq--; - if (event->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; + epc->nr_freq--; + } + if (event->attr.exclusive || !cpc->active_oncpu) + cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void -group_sched_out(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; - perf_pmu_disable(ctx->pmu); + perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); - event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, ctx); +} + +static inline void +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) +{ + if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, final); + } +} - perf_pmu_enable(ctx->pmu); +static inline void +ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) +{ + __ctx_time_update(cpuctx, ctx, false); +} + +/* + * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). 
+ */ +static inline void +ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) +{ + ctx_time_update(cpuctx, ctx); + if (ctx->is_active & EVENT_TIME) + ctx->is_active |= EVENT_FROZEN; +} + +static inline void +ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) +{ + if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; + update_context_time(ctx); + update_cgrp_time_from_event(event); + } } #define DETACH_GROUP 0x01UL +#define DETACH_CHILD 0x02UL +#define DETACH_DEAD 0x04UL /* * Cross CPU call to remove a performance event @@ -2336,21 +2420,43 @@ __perf_remove_from_context(struct perf_event *event, struct perf_event_context *ctx, void *info) { + struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info; - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - } + ctx_time_update(cpuctx, ctx); - event_sched_out(event, cpuctx, ctx); + /* + * Ensure event_sched_out() switches to OFF, at the very least + * this avoids raising perf_pending_task() at this time. + */ + if (flags & DETACH_DEAD) + event->pending_disable = 1; + event_sched_out(event, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); + if (flags & DETACH_CHILD) + perf_child_detach(event); list_del_event(event, ctx); + if (flags & DETACH_DEAD) + event->state = PERF_EVENT_STATE_DEAD; + + if (!pmu_ctx->nr_events) { + pmu_ctx->rotate_necessary = 0; + + if (ctx->task && ctx->is_active) { + struct perf_cpu_pmu_context *cpc; + + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = NULL; + } + } if (!ctx->nr_events && ctx->is_active) { + if (ctx == &cpuctx->ctx) + update_cgrp_time_from_cpuctx(cpuctx, true); + ctx->is_active = 0; - ctx->rotate_necessary = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; @@ -2374,25 +2480,21 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla lockdep_assert_held(&ctx->mutex); - event_function_call(event, __perf_remove_from_context, (void *)flags); - /* - * The above event_function_call() can NO-OP when it hits - * TASK_TOMBSTONE. In that case we must already have been detached - * from the context (by perf_event_exit_event()) but the grouping - * might still be in-tact. + * Because of perf_event_exit_task(), perf_remove_from_context() ought + * to work in the face of TASK_TOMBSTONE, unlike every other + * event_function_call() user. */ - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); - if ((flags & DETACH_GROUP) && - (event->attach_state & PERF_ATTACH_GROUP)) { - /* - * Since in that case we cannot possibly be scheduled, simply - * detach now. 
- */ - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); + raw_spin_lock_irq(&ctx->lock); + if (!ctx->is_active) { + __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), + ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); + return; } + raw_spin_unlock_irq(&ctx->lock); + + event_function_call(event, __perf_remove_from_context, (void *)flags); } /* @@ -2406,18 +2508,18 @@ static void __perf_event_disable(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return; - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + perf_pmu_disable(event->pmu_ctx->pmu); + ctx_time_update_event(ctx, event); if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); + group_sched_out(event, ctx); else - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); perf_cgroup_event_disable(event, ctx); + + perf_pmu_enable(event->pmu_ctx->pmu); } /* @@ -2430,7 +2532,7 @@ static void __perf_event_disable(struct perf_event *event, * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * - * When called from perf_pending_event it's OK because event->ctx + * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ @@ -2469,43 +2571,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { - WRITE_ONCE(event->pending_disable, smp_processor_id()); - /* can fail, see perf_pending_event_disable() */ - irq_work_queue(&event->pending); -} - -static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx) -{ - /* - * use the correct time source for the time snapshot - * - * We could get by without this by leveraging the - * fact that to get to this function, the caller - * has most likely already called update_context_time() - * and update_cgrp_time_xx() and thus both timestamp - * are identical (or very close). Given that tstamp is, - * already adjusted for cgroup, we could say that: - * tstamp - ctx->timestamp - * is equivalent to - * tstamp - cgrp->timestamp. - * - * Then, in perf_output_read(), the calculation would - * work with no changes because: - * - event is guaranteed scheduled in - * - no scheduled out in between - * - thus the timestamp would be the same - * - * But this is a bit hairy. - * - * So instead, we have an explicit cgroup call to remain - * within the time time source all along. We believe it - * is cleaner and simpler to understand. 
- */ - if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, event->tstamp); - else - event->shadow_ctx_time = event->tstamp - ctx->timestamp; + event->pending_disable = 1; + irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) @@ -2514,10 +2581,10 @@ static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static int -event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); @@ -2548,8 +2615,6 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx); - perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { @@ -2560,14 +2625,13 @@ event_sched_in(struct perf_event *event, } if (!is_software_event(event)) - cpuctx->active_oncpu++; - if (!ctx->nr_active++) - perf_event_ctx_activate(ctx); - if (event->attr.freq && event->attr.sample_freq) + cpc->active_oncpu++; + if (event->attr.freq && event->attr.sample_freq) { ctx->nr_freq++; - + epc->nr_freq++; + } if (event->attr.exclusive) - cpuctx->exclusive = 1; + cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); @@ -2576,26 +2640,24 @@ out: } static int -group_sched_in(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = ctx->pmu; + struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); - if (event_sched_in(group_event, cpuctx, ctx)) + if (event_sched_in(group_event, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { - if (event_sched_in(event, cpuctx, ctx)) { + if (event_sched_in(event, ctx)) { partial_group = event; goto group_error; } @@ -2614,9 +2676,9 @@ group_error: if (event == partial_group) break; - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, ctx); } - event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, ctx); error: pmu->cancel_txn(pmu); @@ -2626,10 +2688,11 @@ error: /* * Work out whether we can put this event group on the CPU now. */ -static int group_can_go_on(struct perf_event *event, - struct perf_cpu_context *cpuctx, - int can_add_hw) +static int group_can_go_on(struct perf_event *event, int can_add_hw) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + /* * Groups consisting entirely of software events can always go on. */ @@ -2639,7 +2702,7 @@ static int group_can_go_on(struct perf_event *event, * If an exclusive group is already on, no other hardware * events can go on. 
*/ - if (cpuctx->exclusive) + if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already @@ -2661,38 +2724,31 @@ static void add_event_to_ctx(struct perf_event *event, perf_group_attach(event); } -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type); -static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); - -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, +static void task_ctx_sched_out(struct perf_event_context *ctx, + struct pmu *pmu, enum event_type_t event_type) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, event_type); + ctx_sched_out(ctx, pmu, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, - struct task_struct *task) + struct pmu *pmu) { - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); } /* @@ -2712,10 +2768,10 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, - enum event_type_t event_type) + struct pmu *pmu, enum event_type_t event_type) { - enum event_type_t ctx_event_type; bool cpu_event = !!(event_type & EVENT_CPU); + struct perf_event_pmu_context *epc; /* * If pinned groups are involved, flexible groups also need to be @@ -2724,11 +2780,17 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; - ctx_event_type = event_type & EVENT_ALL; + event_type &= EVENT_ALL; - perf_pmu_disable(cpuctx->ctx.pmu); - if (task_ctx) - task_ctx_sched_out(cpuctx, task_ctx, event_type); + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_disable(epc->pmu); + + if (task_ctx) { + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_disable(epc->pmu); + + task_ctx_sched_out(task_ctx, pmu, event_type); + } /* * Decide which cpu ctx groups to schedule out based on the types @@ -2738,21 +2800,28 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, * - otherwise, do nothing more. 
*/ if (cpu_event) - cpu_ctx_sched_out(cpuctx, ctx_event_type); - else if (ctx_event_type & EVENT_PINNED) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, pmu, event_type); + else if (event_type & EVENT_PINNED) + ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, current); - perf_pmu_enable(cpuctx->ctx.pmu); + perf_event_sched_in(cpuctx, task_ctx, pmu); + + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_enable(epc->pmu); + + if (task_ctx) { + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_enable(epc->pmu); + } } void perf_pmu_resched(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); - ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); + ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } @@ -2766,7 +2835,7 @@ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; @@ -2808,9 +2877,10 @@ static int __perf_install_in_context(void *info) #endif if (reprogram) { - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); add_event_to_ctx(event, ctx); - ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, + get_event_type(event)); } else { add_event_to_ctx(event, ctx); } @@ -2841,7 +2911,7 @@ perf_install_in_context(struct perf_event_context *ctx, WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) - event->cpu = cpu; + WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx @@ -2857,7 +2927,8 @@ perf_install_in_context(struct perf_event_context *ctx, * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ - if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) { + if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && + ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); @@ -2952,8 +3023,7 @@ static void __perf_event_enable(struct perf_event *event, event->state <= PERF_EVENT_STATE_ERROR) return; - if (ctx->is_active) - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); @@ -2961,25 +3031,21 @@ static void __perf_event_enable(struct perf_event *event, if (!ctx->is_active) return; - if (!event_filter_match(event)) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + if (!event_filter_match(event)) return; - } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. 
*/ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return; - } task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); - ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } /* @@ -3189,27 +3255,116 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, return err; } +/* + * Copy event-type-independent attributes that may be modified. + */ +static void perf_event_modify_copy_attr(struct perf_event_attr *to, + const struct perf_event_attr *from) +{ + to->sig_data = from->sig_data; +} + static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { + int (*func)(struct perf_event *, struct perf_event_attr *); + struct perf_event *child; + int err; + if (event->attr.type != attr->type) return -EINVAL; switch (event->attr.type) { case PERF_TYPE_BREAKPOINT: - return perf_event_modify_breakpoint(event, attr); + func = perf_event_modify_breakpoint; + break; default: /* Place holder for future additions. */ return -EOPNOTSUPP; } + + WARN_ON_ONCE(event->ctx->parent_ctx); + + mutex_lock(&event->child_mutex); + /* + * Event-type-independent attributes must be copied before event-type + * modification, which will validate that final attributes match the + * source attributes after all relevant attributes have been copied. + */ + perf_event_modify_copy_attr(&event->attr, attr); + err = func(event, attr); + if (err) + goto out; + list_for_each_entry(child, &event->child_list, child_list) { + perf_event_modify_copy_attr(&child->attr, attr); + err = func(child, attr); + if (err) + goto out; + } +out: + mutex_unlock(&event->child_mutex); + return err; } -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type) +static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) { + struct perf_event_context *ctx = pmu_ctx->ctx; struct perf_event *event, *tmp; + struct pmu *pmu = pmu_ctx->pmu; + + if (ctx->task && !(ctx->is_active & EVENT_ALL)) { + struct perf_cpu_pmu_context *cpc; + + cpc = this_cpu_ptr(pmu->cpu_pmu_context); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = NULL; + } + + if (!(event_type & EVENT_ALL)) + return; + + perf_pmu_disable(pmu); + if (event_type & EVENT_PINNED) { + list_for_each_entry_safe(event, tmp, + &pmu_ctx->pinned_active, + active_list) + group_sched_out(event, ctx); + } + + if (event_type & EVENT_FLEXIBLE) { + list_for_each_entry_safe(event, tmp, + &pmu_ctx->flexible_active, + active_list) + group_sched_out(event, ctx); + /* + * Since we cleared EVENT_FLEXIBLE, also clear + * rotate_necessary, is will be reset by + * ctx_flexible_sched_in() when needed. + */ + pmu_ctx->rotate_necessary = 0; + } + perf_pmu_enable(pmu); +} + +/* + * Be very careful with the @pmu argument since this will change ctx state. + * The @pmu argument works for ctx_resched(), because that is symmetric in + * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. + * + * However, if you were to be asymmetrical, you could end up with messed up + * state, eg. ctx->is_active cleared even though most EPCs would still actually + * be active. 
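An illustrative user-space view of the perf_event_modify_attr() path above (a sketch, not taken from this patch; event_fd and new_attr are assumed names): PERF_EVENT_IOC_MODIFY_ATTRIBUTES re-programs a breakpoint event in place, and with the child_list walk added above the updated attributes are now also copied to events inherited by child tasks.

#include <sys/ioctl.h>
#include <stdio.h>
#include <linux/perf_event.h>

/* Sketch only: @event_fd is assumed to be a perf_event_open() fd for a
 * PERF_TYPE_BREAKPOINT event; @new_attr carries the updated bp_addr/bp_type/
 * bp_len (and sig_data, which perf_event_modify_copy_attr() now propagates). */
static int modify_breakpoint(int event_fd, struct perf_event_attr *new_attr)
{
	if (ioctl(event_fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, new_attr)) {
		perror("PERF_EVENT_IOC_MODIFY_ATTRIBUTES");
		return -1;
	}
	return 0;
}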
+ */ +static void +ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3223,16 +3378,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, return; } - ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - - if (ctx->task) { - WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) - cpuctx->task_ctx = NULL; - } - /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last @@ -3243,35 +3388,36 @@ static void ctx_sched_out(struct perf_event_context *ctx, * * would only update time for the pinned events. */ - if (is_active & EVENT_TIME) { - /* update (and stop) ctx time */ - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - } + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); - is_active ^= ctx->is_active; /* changed bits */ + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + ctx->is_active &= ~event_type; - if (!ctx->nr_active || !(is_active & EVENT_ALL)) - return; + if (!(ctx->is_active & EVENT_ALL)) { + /* + * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() + * does not observe a hole. perf_ctx_unlock() will clean up. + */ + if (ctx->is_active & EVENT_FROZEN) + ctx->is_active &= EVENT_TIME_FROZEN; + else + ctx->is_active = 0; + } - perf_pmu_disable(ctx->pmu); - if (is_active & EVENT_PINNED) { - list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) - group_sched_out(event, cpuctx, ctx); + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!(ctx->is_active & EVENT_ALL)) + cpuctx->task_ctx = NULL; } - if (is_active & EVENT_FLEXIBLE) { - list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) - group_sched_out(event, cpuctx, ctx); + is_active ^= ctx->is_active; /* changed bits */ - /* - * Since we cleared EVENT_FLEXIBLE, also clear - * rotate_necessary, is will be reset by - * ctx_flexible_sched_in() when needed. 
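A worked example of the is_active bookkeeping in the new ctx_sched_out() above: suppose ctx->is_active is EVENT_PINNED | EVENT_FLEXIBLE | EVENT_TIME and the function is called with event_type = EVENT_FLEXIBLE. After ctx->is_active &= ~event_type the context still carries EVENT_PINNED | EVENT_TIME, so it is not cleared to 0; is_active ^= ctx->is_active then leaves exactly EVENT_FLEXIBLE as the changed bits, which is all that each for_each_epc() iteration hands to __pmu_ctx_sched_out(), so pinned groups stay on the PMU and context time keeps running.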
- */ - ctx->rotate_necessary = 0; - } - perf_pmu_enable(ctx->pmu); + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_out(pmu_ctx, is_active); } /* @@ -3376,26 +3522,68 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -static void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) +#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ + for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ + pos2 = list_first_entry(head2, typeof(*pos2), member); \ + !list_entry_is_head(pos1, head1, member) && \ + !list_entry_is_head(pos2, head2, member); \ + pos1 = list_next_entry(pos1, member), \ + pos2 = list_next_entry(pos2, member)) + +static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, + struct perf_event_context *next_ctx) { - struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; + struct perf_event_pmu_context *prev_epc, *next_epc; + + if (!prev_ctx->nr_task_data) + return; + + double_list_for_each_entry(prev_epc, next_epc, + &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, + pmu_ctx_entry) { + + if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) + continue; + + /* + * PMU specific parts of task perf context can require + * additional synchronization. As an example of such + * synchronization see implementation details of Intel + * LBR call stack data profiling; + */ + if (prev_epc->pmu->swap_task_ctx) + prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); + else + swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); + } +} + +static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) +{ + struct perf_event_pmu_context *pmu_ctx; + struct perf_cpu_pmu_context *cpc; + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + + if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) + pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); + } +} + +static void +perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) +{ + struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; - struct perf_cpu_context *cpuctx; int do_switch = 1; - struct pmu *pmu; if (likely(!ctx)) return; - pmu = ctx->pmu; - cpuctx = __get_cpu_context(ctx); - if (!cpuctx->task_ctx) - return; - rcu_read_lock(); - next_ctx = next->perf_event_ctxp[ctxn]; + next_ctx = rcu_dereference(next->perf_event_ctxp); if (!next_ctx) goto unlock; @@ -3420,26 +3608,32 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + perf_ctx_disable(ctx, false); + + /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ + if (local_read(&ctx->nr_no_switch_fast) || + local_read(&next_ctx->nr_no_switch_fast)) { + /* + * Must not swap out ctx when there's pending + * events that rely on the ctx->task relation. + * + * Likewise, when a context contains inherit + + * SAMPLE_READ events they should be switched + * out using the slow path so that they are + * treated as if they were distinct contexts. 
+ */ + raw_spin_unlock(&next_ctx->lock); + rcu_read_unlock(); + goto inside_switch; + } + WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - perf_pmu_disable(pmu); + perf_ctx_sched_task_cb(ctx, false); + perf_event_swap_task_ctx_data(ctx, next_ctx); - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(ctx, false); - - /* - * PMU specific parts of task perf context can require - * additional synchronization. As an example of such - * synchronization see implementation details of Intel - * LBR call stack data profiling; - */ - if (pmu->swap_task_ctx) - pmu->swap_task_ctx(ctx, next_ctx); - else - swap(ctx->task_ctx_data, next_ctx->task_ctx_data); - - perf_pmu_enable(pmu); + perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not @@ -3448,8 +3642,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, * since those values are always verified under * ctx->lock which we're now holding. */ - RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); - RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp, ctx); do_switch = 0; @@ -3463,30 +3657,41 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - perf_pmu_disable(pmu); + perf_ctx_disable(ctx, false); - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(ctx, false); - task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); +inside_switch: + perf_ctx_sched_task_cb(ctx, false); + task_ctx_sched_out(ctx, NULL, EVENT_ALL); - perf_pmu_enable(pmu); + perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); } } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); + void perf_sched_cb_dec(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); - --cpuctx->sched_cb_usage; + this_cpu_dec(perf_sched_cb_usages); + barrier(); + + if (!--cpc->sched_cb_usage) + list_del(&cpc->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + + if (!cpc->sched_cb_usage++) + list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); - cpuctx->sched_cb_usage++; + barrier(); + this_cpu_inc(perf_sched_cb_usages); } /* @@ -3497,30 +3702,44 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. 
*/ -static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; - pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ + pmu = cpc->epc.pmu; + /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); - pmu->sched_task(cpuctx->task_ctx, sched_in); + pmu->sched_task(cpc->task_epc, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_cpu_pmu_context *cpc; + + /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ + if (prev == next || cpuctx->task_ctx) + return; + + list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) + __perf_pmu_sched_task(cpc, sched_in); +} + static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); -#define for_each_task_context_nr(ctxn) \ - for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) - /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. @@ -3535,33 +3754,23 @@ static void perf_event_switch(struct task_struct *task, void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { - int ctxn; + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); - for_each_task_context_nr(ctxn) - perf_event_context_sched_out(task, ctxn, next); + perf_event_context_sched_out(task, next); /* * if cgroup events exist on this CPU, then we need * to check if we have to switch out PMU state. 
* cgroup event are system-wide mode only */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_out(task, next); -} - -/* - * Called with IRQs disabled - */ -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); + perf_cgroup_switch(next); } -static bool perf_less_group_idx(const void *l, const void *r) +static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args) { const struct perf_event *le = *(const struct perf_event **)l; const struct perf_event *re = *(const struct perf_event **)r; @@ -3569,20 +3778,14 @@ static bool perf_less_group_idx(const void *l, const void *r) return le->group_index < re->group_index; } -static void swap_ptr(void *l, void *r) -{ - void **lp = l, **rp = r; - - swap(*lp, *rp); -} +DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap); static const struct min_heap_callbacks perf_min_heap = { - .elem_size = sizeof(struct perf_event *), .less = perf_less_group_idx, - .swp = swap_ptr, + .swp = NULL, }; -static void __heap_add(struct min_heap *heap, struct perf_event *event) +static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event) { struct perf_event **itrs = heap->data; @@ -3592,22 +3795,40 @@ static void __heap_add(struct min_heap *heap, struct perf_event *event) } } -static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, +static void __link_epc(struct perf_event_pmu_context *pmu_ctx) +{ + struct perf_cpu_pmu_context *cpc; + + if (!pmu_ctx->ctx->task) + return; + + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = pmu_ctx; +} + +static noinline int visit_groups_merge(struct perf_event_context *ctx, struct perf_event_groups *groups, int cpu, + struct pmu *pmu, int (*func)(struct perf_event *, void *), void *data) { #ifdef CONFIG_CGROUP_PERF struct cgroup_subsys_state *css = NULL; #endif + struct perf_cpu_context *cpuctx = NULL; /* Space for per CPU and/or any CPU event iterators. */ struct perf_event *itrs[2]; - struct min_heap event_heap; + struct perf_event_min_heap event_heap; struct perf_event **evt; int ret; - if (cpuctx) { - event_heap = (struct min_heap){ + if (pmu->filter && pmu->filter(pmu, cpu)) + return 0; + + if (!ctx->task) { + cpuctx = this_cpu_ptr(&perf_cpu_context); + event_heap = (struct perf_event_min_heap){ .data = cpuctx->heap, .nr = 0, .size = cpuctx->heap_size, @@ -3620,44 +3841,77 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, css = &cpuctx->cgrp->css; #endif } else { - event_heap = (struct min_heap){ + event_heap = (struct perf_event_min_heap){ .data = itrs, .nr = 0, .size = ARRAY_SIZE(itrs), }; /* Events not within a CPU context may be on any CPU. 
*/ - __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL)); + __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); } evt = event_heap.data; - __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL)); + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); #ifdef CONFIG_CGROUP_PERF for (; css; css = css->parent) - __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup)); + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); #endif - min_heapify_all(&event_heap, &perf_min_heap); + if (event_heap.nr) { + __link_epc((*evt)->pmu_ctx); + perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); + } + + min_heapify_all_inline(&event_heap, &perf_min_heap, NULL); while (event_heap.nr) { ret = func(*evt, data); if (ret) return ret; - *evt = perf_event_groups_next(*evt); + *evt = perf_event_groups_next(*evt, pmu); if (*evt) - min_heapify(&event_heap, 0, &perf_min_heap); + min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL); else - min_heap_pop(&event_heap, &perf_min_heap); + min_heap_pop_inline(&event_heap, &perf_min_heap, NULL); } return 0; } +/* + * Because the userpage is strictly per-event (there is no concept of context, + * so there cannot be a context indirection), every userpage must be updated + * when context time starts :-( + * + * IOW, we must not miss EVENT_TIME edges. + */ +static inline bool event_update_userpage(struct perf_event *event) +{ + if (likely(!atomic_read(&event->mmap_count))) + return false; + + perf_event_update_time(event); + perf_event_update_userpage(event); + + return true; +} + +static inline void group_update_userpage(struct perf_event *group_event) +{ + struct perf_event *event; + + if (!event_update_userpage(group_event)) + return; + + for_each_sibling_event(event, group_event) + event_update_userpage(event); +} + static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) @@ -3666,70 +3920,78 @@ static int merge_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, cpuctx, *can_add_hw)) { - if (!group_sched_in(event, cpuctx, ctx)) + if (group_can_go_on(event, *can_add_hw)) { + if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { + *can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); - } + } else { + struct perf_cpu_pmu_context *cpc; - *can_add_hw = 0; - ctx->rotate_necessary = 1; - perf_mux_hrtimer_restart(cpuctx); + event->pmu_ctx->rotate_necessary = 1; + cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); + perf_mux_hrtimer_restart(cpc); + group_update_userpage(event); + } } return 0; } -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static void pmu_groups_sched_in(struct perf_event_context *ctx, + struct perf_event_groups *groups, + struct pmu *pmu) { int can_add_hw = 1; - - if (ctx != &cpuctx->ctx) - cpuctx = NULL; - - visit_groups_merge(cpuctx, &ctx->pinned_groups, - smp_processor_id(), + visit_groups_merge(ctx, groups, smp_processor_id(), pmu, merge_sched_in, &can_add_hw); } -static void -ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) 
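The loop in visit_groups_merge() above is a k-way merge: each iterator walks one already-sorted source (the any-CPU sub-tree and/or the current-CPU sub-tree, plus one per cgroup ancestor), and the min-heap keyed on group_index always yields the lowest-numbered event next. A minimal sketch of that shape over plain sorted arrays, with invented names and a linear scan of the heads standing in for the heap:

#include <stddef.h>

/* One cursor per sorted source, analogous to one perf_event_groups sub-tree. */
struct cursor {
	const int *pos;
	const int *end;
};

/* Visit every element of the @n sorted sources in ascending order, calling
 * @func on each; stop early if @func returns non-zero, as the kernel loop
 * does with merge_sched_in(). */
static int merge_visit(struct cursor *c, size_t n,
		       int (*func)(int value, void *data), void *data)
{
	for (;;) {
		struct cursor *min = NULL;
		size_t i;
		int ret;

		for (i = 0; i < n; i++) {
			if (c[i].pos == c[i].end)
				continue;
			if (!min || *c[i].pos < *min->pos)
				min = &c[i];
		}
		if (!min)
			return 0;	/* all sources exhausted */

		ret = func(*min->pos++, data);
		if (ret)
			return ret;
	}
}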
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) { - int can_add_hw = 1; - - if (ctx != &cpuctx->ctx) - cpuctx = NULL; + struct perf_event_context *ctx = pmu_ctx->ctx; - visit_groups_merge(cpuctx, &ctx->flexible_groups, - smp_processor_id(), - merge_sched_in, &can_add_hw); + if (event_type & EVENT_PINNED) + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); + if (event_type & EVENT_FLEXIBLE) + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); } static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) +ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - u64 now; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; + if (!(is_active & EVENT_TIME)) { + /* start ctx time */ + __update_context_time(ctx, false); + perf_cgroup_set_timestamp(cpuctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { - if (!is_active) + if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); @@ -3737,45 +3999,41 @@ ctx_sched_in(struct perf_event_context *ctx, is_active ^= ctx->is_active; /* changed bits */ - if (is_active & EVENT_TIME) { - /* start ctx time */ - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); - } - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. 
*/ - if (is_active & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx); + if (is_active & EVENT_PINNED) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + } /* Then walk through the lower prio flexible groups */ - if (is_active & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx); + if (is_active & EVENT_FLEXIBLE) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + } } -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) +static void perf_event_context_sched_in(struct task_struct *task) { - struct perf_event_context *ctx = &cpuctx->ctx; - - ctx_sched_in(ctx, cpuctx, event_type, task); -} + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_context *ctx; -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu = ctx->pmu; + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp); + if (!ctx) + goto rcu_unlock; - cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) { - if (cpuctx->sched_cb_usage) - __perf_pmu_sched_task(cpuctx, true); - return; + perf_ctx_lock(cpuctx, ctx); + perf_ctx_disable(ctx, false); + + perf_ctx_sched_task_cb(ctx, true); + + perf_ctx_enable(ctx, false); + perf_ctx_unlock(cpuctx, ctx); + goto rcu_unlock; } perf_ctx_lock(cpuctx, ctx); @@ -3786,7 +4044,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!ctx->nr_events) goto unlock; - perf_pmu_disable(pmu); + perf_ctx_disable(ctx, false); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3795,17 +4053,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ - if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, task); + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { + perf_ctx_disable(&cpuctx->ctx, false); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); + } - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(cpuctx->task_ctx, true); + perf_event_sched_in(cpuctx, ctx, NULL); - perf_pmu_enable(pmu); + perf_ctx_sched_task_cb(cpuctx->task_ctx, true); + + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) + perf_ctx_enable(&cpuctx->ctx, false); + + perf_ctx_enable(ctx, false); unlock: perf_ctx_unlock(cpuctx, ctx); +rcu_unlock: + rcu_read_unlock(); } /* @@ -3822,29 +4087,13 @@ unlock: void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - struct perf_event_context *ctx; - int ctxn; - - /* - * If cgroup events exist on this CPU, then we need to check if we have - * to switch in PMU state; cgroup event are system-wide mode only. - * - * Since cgroup events are CPU events, we must schedule these in before - * we schedule in the task events. 
- */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (likely(!ctx)) - continue; - - perf_event_context_sched_in(ctx, task); - } + perf_event_context_sched_in(task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); + + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -3932,7 +4181,11 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ + if (delta >= 0) + delta += 7; + else + delta -= 7; + delta /= 8; /* low pass filter */ sample_period = hwc->sample_period + delta; @@ -3952,49 +4205,32 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo } } -/* - * combine freq adjustment with unthrottling to avoid two passes over the - * events. At the same time, make sure, having freq events does not change - * the rate of unthrottling as that would introduce bias. - */ -static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, - int needs_unthr) +static void perf_adjust_freq_unthr_events(struct list_head *event_list) { struct perf_event *event; struct hw_perf_event *hwc; u64 now, period = TICK_NSEC; s64 delta; - /* - * only need to iterate over all events iff: - * - context have events in frequency mode (needs freq adjust) - * - there are events to unthrottle on this cpu - */ - if (!(ctx->nr_freq || needs_unthr)) - return; - - raw_spin_lock(&ctx->lock); - perf_pmu_disable(ctx->pmu); - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + list_for_each_entry(event, event_list, active_list) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; + // XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue; - perf_pmu_disable(event->pmu); - hwc = &event->hw; if (hwc->interrupts == MAX_INTERRUPTS) { hwc->interrupts = 0; perf_log_throttle(event, 1); - event->pmu->start(event, 0); + if (!event->attr.freq || !event->attr.sample_freq) + event->pmu->start(event, 0); } if (!event->attr.freq || !event->attr.sample_freq) - goto next; + continue; /* * stop the event and update event->count @@ -4016,11 +4252,43 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); - next: - perf_pmu_enable(event->pmu); + } +} + +/* + * combine freq adjustment with unthrottling to avoid two passes over the + * events. At the same time, make sure, having freq events does not change + * the rate of unthrottling as that would introduce bias. 
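A worked example of the rounding change in perf_adjust_period() above: with C division truncating toward zero, the old delta = (delta + 7) / 8 maps every delta in -7..-1 to 0, so a sample period that was only slightly too long was never corrected downward, while a small positive delta such as +3 still became (3 + 7) / 8 = 1. The new form rounds the magnitude up for both signs, e.g. delta = -3 now gives (-3 - 7) / 8 = -1, so small negative adjustments survive the low-pass filter.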
+ */ +static void +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) +{ + struct perf_event_pmu_context *pmu_ctx; + + /* + * only need to iterate over all events iff: + * - context have events in frequency mode (needs freq adjust) + * - there are events to unthrottle on this cpu + */ + if (!(ctx->nr_freq || unthrottle)) + return; + + raw_spin_lock(&ctx->lock); + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (!(pmu_ctx->nr_freq || unthrottle)) + continue; + if (!perf_pmu_ctx_is_active(pmu_ctx)) + continue; + if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) + continue; + + perf_pmu_disable(pmu_ctx->pmu); + perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active); + perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active); + perf_pmu_enable(pmu_ctx->pmu); } - perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -4042,72 +4310,109 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) /* pick an event from the flexible_groups to rotate */ static inline struct perf_event * -ctx_event_to_rotate(struct perf_event_context *ctx) +ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) { struct perf_event *event; + struct rb_node *node; + struct rb_root *tree; + struct __group_key key = { + .pmu = pmu_ctx->pmu, + }; /* pick the first active flexible event */ - event = list_first_entry_or_null(&ctx->flexible_active, + event = list_first_entry_or_null(&pmu_ctx->flexible_active, struct perf_event, active_list); + if (event) + goto out; /* if no active flexible event, pick the first event */ - if (!event) { - event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree), - typeof(*event), group_node); + tree = &pmu_ctx->ctx->flexible_groups.tree; + + if (!pmu_ctx->ctx->task) { + key.cpu = smp_processor_id(); + + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) + event = __node_2_pe(node); + goto out; } + key.cpu = -1; + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) { + event = __node_2_pe(node); + goto out; + } + + key.cpu = smp_processor_id(); + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) + event = __node_2_pe(node); + +out: /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again. */ - ctx->rotate_necessary = 0; + pmu_ctx->rotate_necessary = 0; return event; } -static bool perf_rotate_context(struct perf_cpu_context *cpuctx) +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; struct perf_event *cpu_event = NULL, *task_event = NULL; - struct perf_event_context *task_ctx = NULL; int cpu_rotate, task_rotate; + struct pmu *pmu; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ - cpu_rotate = cpuctx->ctx.rotate_necessary; - task_ctx = cpuctx->task_ctx; - task_rotate = task_ctx ? task_ctx->rotate_necessary : 0; + cpu_epc = &cpc->epc; + pmu = cpu_epc->pmu; + task_epc = cpc->task_epc; + + cpu_rotate = cpu_epc->rotate_necessary; + task_rotate = task_epc ? 
task_epc->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); + perf_pmu_disable(pmu); if (task_rotate) - task_event = ctx_event_to_rotate(task_ctx); + task_event = ctx_event_to_rotate(task_epc); if (cpu_rotate) - cpu_event = ctx_event_to_rotate(&cpuctx->ctx); + cpu_event = ctx_event_to_rotate(cpu_epc); /* * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. */ - if (task_event || (task_ctx && cpu_event)) - ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE); - if (cpu_event) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (task_event || (task_epc && cpu_event)) { + update_context_time(task_epc->ctx); + __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE); + } - if (task_event) - rotate_ctx(task_ctx, task_event); - if (cpu_event) + if (cpu_event) { + update_context_time(&cpuctx->ctx); + __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); + __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE); + } - perf_event_sched_in(cpuctx, task_ctx, current); + if (task_event) + rotate_ctx(task_epc->ctx, task_event); - perf_pmu_enable(cpuctx->ctx.pmu); + if (task_event || (task_epc && cpu_event)) + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE); + + perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); return true; @@ -4115,8 +4420,8 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) void perf_event_task_tick(void) { - struct list_head *head = this_cpu_ptr(&active_ctx_list); - struct perf_event_context *ctx, *tmp; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_context *ctx; int throttled; lockdep_assert_irqs_disabled(); @@ -4125,8 +4430,13 @@ void perf_event_task_tick(void) throttled = __this_cpu_xchg(perf_throttled_count, 0); tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) - perf_adjust_freq_unthr_context(ctx, throttled); + perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled); + + rcu_read_lock(); + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_adjust_freq_unthr_context(ctx, !!throttled); + rcu_read_unlock(); } static int event_enable_on_exec(struct perf_event *event, @@ -4148,9 +4458,9 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. 
*/ -static void perf_event_enable_on_exec(int ctxn) +static void perf_event_enable_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event_context *clone_ctx = NULL; enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; @@ -4158,13 +4468,16 @@ static void perf_event_enable_on_exec(int ctxn) int enabled = 0; local_irq_save(flags); - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx || !ctx->nr_events) + if (WARN_ON_ONCE(current->perf_event_ctxp != ctx)) goto out; - cpuctx = __get_cpu_context(ctx); + if (!ctx->nr_events) + goto out; + + cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); + list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); @@ -4175,9 +4488,7 @@ static void perf_event_enable_on_exec(int ctxn) */ if (enabled) { clone_ctx = unclone_ctx(ctx); - ctx_resched(cpuctx, ctx, event_type); - } else { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_resched(cpuctx, ctx, NULL, event_type); } perf_ctx_unlock(cpuctx, ctx); @@ -4188,19 +4499,74 @@ out: put_ctx(clone_ctx); } +static void perf_remove_from_owner(struct perf_event *event); +static void perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx); + +/* + * Removes all events from the current task that have been marked + * remove-on-exec, and feeds their values back to parent events. + */ +static void perf_event_remove_on_exec(struct perf_event_context *ctx) +{ + struct perf_event_context *clone_ctx = NULL; + struct perf_event *event, *next; + unsigned long flags; + bool modified = false; + + mutex_lock(&ctx->mutex); + + if (WARN_ON_ONCE(ctx->task != current)) + goto unlock; + + list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { + if (!event->attr.remove_on_exec) + continue; + + if (!is_kernel_event(event)) + perf_remove_from_owner(event); + + modified = true; + + perf_event_exit_event(event, ctx); + } + + raw_spin_lock_irqsave(&ctx->lock, flags); + if (modified) + clone_ctx = unclone_ctx(ctx); + raw_spin_unlock_irqrestore(&ctx->lock, flags); + +unlock: + mutex_unlock(&ctx->mutex); + + if (clone_ctx) + put_ctx(clone_ctx); +} + struct perf_read_data { struct perf_event *event; bool group; int ret; }; +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); + static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { + int local_cpu = smp_processor_id(); u16 local_pkg, event_pkg; - if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { - int local_cpu = smp_processor_id(); + if ((unsigned)event_cpu >= nr_cpu_ids) + return event_cpu; + + if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); + if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) + return local_cpu; + } + + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { event_pkg = topology_physical_package_id(event_cpu); local_pkg = topology_physical_package_id(local_cpu); @@ -4219,7 +4585,7 @@ static void __perf_event_read(void *info) struct perf_read_data *data = info; struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu = 
event->pmu; /* @@ -4233,10 +4599,7 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event); perf_event_update_time(event); if (data->group) @@ -4271,11 +4634,26 @@ unlock: raw_spin_unlock(&ctx->lock); } -static inline u64 perf_event_count(struct perf_event *event) +static inline u64 perf_event_count(struct perf_event *event, bool self) { + if (self) + return local64_read(&event->count); + return local64_read(&event->count) + atomic64_read(&event->child_count); } +static void calc_timer_values(struct perf_event *event, + u64 *now, + u64 *enabled, + u64 *running) +{ + u64 ctx_time; + + *now = perf_clock(); + ctx_time = perf_event_time_now(event, *now); + __perf_update_times(event, ctx_time, enabled, running); +} + /* * NMI-safe method to read a local event, that is an event that * is: @@ -4288,6 +4666,8 @@ int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { unsigned long flags; + int event_oncpu; + int event_cpu; int ret = 0; /* @@ -4312,15 +4692,22 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } + /* + * Get the event CPU numbers, and adjust them to local if the event is + * a per-package event that can be read locally + */ + event_oncpu = __perf_event_read_cpu(event, event->oncpu); + event_cpu = __perf_event_read_cpu(event, event->cpu); + /* If this is a per-CPU event, it must be for this CPU */ if (!(event->attach_state & PERF_ATTACH_TASK) && - event->cpu != smp_processor_id()) { + event_cpu != smp_processor_id()) { ret = -EINVAL; goto out; } /* If this is a pinned event it must be running on this CPU */ - if (event->attr.pinned && event->oncpu != smp_processor_id()) { + if (event->attr.pinned && event_oncpu != smp_processor_id()) { ret = -EBUSY; goto out; } @@ -4330,15 +4717,14 @@ int perf_event_read_local(struct perf_event *event, u64 *value, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). 
*/ - if (event->oncpu == smp_processor_id()) + if (event_oncpu == smp_processor_id()) event->pmu->read(event); *value = local64_read(&event->count); if (enabled || running) { - u64 now = event->shadow_ctx_time + perf_clock(); - u64 __enabled, __running; + u64 __enabled, __running, __now; - __perf_update_times(event, now, &__enabled, &__running); + calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) @@ -4413,10 +4799,7 @@ again: * May read while context is not active (e.g., thread is * blocked), in that case we cannot update context time */ - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event); perf_event_update_time(event); if (group) @@ -4434,17 +4817,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->active_ctx_list); + INIT_LIST_HEAD(&ctx->pmu_ctx_list); perf_event_groups_init(&ctx->pinned_groups); perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); - INIT_LIST_HEAD(&ctx->pinned_active); - INIT_LIST_HEAD(&ctx->flexible_active); refcount_set(&ctx->refcount, 1); } +static void +__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu) +{ + epc->pmu = pmu; + INIT_LIST_HEAD(&epc->pmu_ctx_entry); + INIT_LIST_HEAD(&epc->pinned_active); + INIT_LIST_HEAD(&epc->flexible_active); + atomic_set(&epc->refcount, 1); +} + static struct perf_event_context * -alloc_perf_context(struct pmu *pmu, struct task_struct *task) +alloc_perf_context(struct task_struct *task) { struct perf_event_context *ctx; @@ -4455,7 +4846,6 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task) __perf_event_init_context(ctx); if (task) ctx->task = get_task_struct(task); - ctx->pmu = pmu; return ctx; } @@ -4484,15 +4874,12 @@ find_lively_task_by_vpid(pid_t vpid) * Returns a matching context with refcount and pincount. 
*/ static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, - struct perf_event *event) +find_get_context(struct task_struct *task, struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; - void *task_ctx_data = NULL; unsigned long flags; - int ctxn, err; - int cpu = event->cpu; + int err; if (!task) { /* Must be root to operate on a CPU event: */ @@ -4500,52 +4887,33 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (err) return ERR_PTR(err); - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); ctx = &cpuctx->ctx; get_ctx(ctx); + raw_spin_lock_irqsave(&ctx->lock, flags); ++ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); return ctx; } err = -EINVAL; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto errout; - - if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = alloc_task_ctx_data(pmu); - if (!task_ctx_data) { - err = -ENOMEM; - goto errout; - } - } - retry: - ctx = perf_lock_task_context(task, ctxn, &flags); + ctx = perf_lock_task_context(task, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; - if (task_ctx_data && !ctx->task_ctx_data) { - ctx->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - } raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); } else { - ctx = alloc_perf_context(pmu, task); + ctx = alloc_perf_context(task); err = -ENOMEM; if (!ctx) goto errout; - if (task_ctx_data) { - ctx->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - } - err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -4554,12 +4922,12 @@ retry: */ if (task->flags & PF_EXITING) err = -ESRCH; - else if (task->perf_event_ctxp[ctxn]) + else if (task->perf_event_ctxp) err = -EAGAIN; else { get_ctx(ctx); ++ctx->pin_count; - rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + rcu_assign_pointer(task->perf_event_ctxp, ctx); } mutex_unlock(&task->perf_event_mutex); @@ -4572,26 +4940,157 @@ retry: } } - free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } +static struct perf_event_pmu_context * +find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, + struct perf_event *event) +{ + struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; + void *task_ctx_data = NULL; + + if (!ctx->task) { + /* + * perf_pmu_migrate_context() / __perf_pmu_install_event() + * relies on the fact that find_get_pmu_context() cannot fail + * for CPU contexts. + */ + struct perf_cpu_pmu_context *cpc; + + cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); + epc = &cpc->epc; + raw_spin_lock_irq(&ctx->lock); + if (!epc->ctx) { + atomic_set(&epc->refcount, 1); + epc->embedded = 1; + list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + epc->ctx = ctx; + } else { + WARN_ON_ONCE(epc->ctx != ctx); + atomic_inc(&epc->refcount); + } + raw_spin_unlock_irq(&ctx->lock); + return epc; + } + + new = kzalloc(sizeof(*epc), GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + task_ctx_data = alloc_task_ctx_data(pmu); + if (!task_ctx_data) { + kfree(new); + return ERR_PTR(-ENOMEM); + } + } + + __perf_init_event_pmu_context(new, pmu); + + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because perf_event_init_task() doesn't actually hold the + * child_ctx->mutex. 
+ */ + + raw_spin_lock_irq(&ctx->lock); + list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (epc->pmu == pmu) { + WARN_ON_ONCE(epc->ctx != ctx); + atomic_inc(&epc->refcount); + goto found_epc; + } + /* Make sure the pmu_ctx_list is sorted by PMU type: */ + if (!pos && epc->pmu->type > pmu->type) + pos = epc; + } + + epc = new; + new = NULL; + + if (!pos) + list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + else + list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev); + + epc->ctx = ctx; + +found_epc: + if (task_ctx_data && !epc->task_ctx_data) { + epc->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + ctx->nr_task_data++; + } + raw_spin_unlock_irq(&ctx->lock); + + free_task_ctx_data(pmu, task_ctx_data); + kfree(new); + + return epc; +} + +static void get_pmu_ctx(struct perf_event_pmu_context *epc) +{ + WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); +} + +static void free_epc_rcu(struct rcu_head *head) +{ + struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); + + kfree(epc->task_ctx_data); + kfree(epc); +} + +static void put_pmu_ctx(struct perf_event_pmu_context *epc) +{ + struct perf_event_context *ctx = epc->ctx; + unsigned long flags; + + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because of the call-site in _free_event()/put_event() + * which isn't always called under ctx->mutex. + */ + if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) + return; + + WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); + + list_del_init(&epc->pmu_ctx_entry); + epc->ctx = NULL; + + WARN_ON_ONCE(!list_empty(&epc->pinned_active)); + WARN_ON_ONCE(!list_empty(&epc->flexible_active)); + + raw_spin_unlock_irqrestore(&ctx->lock, flags); + + if (epc->embedded) + return; + + call_rcu(&epc->rcu_head, free_epc_rcu); +} + static void perf_event_free_filter(struct perf_event *event); -static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { - struct perf_event *event; + struct perf_event *event = container_of(head, typeof(*event), rcu_head); - event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); - kfree(event); + kmem_cache_free(perf_event_cache, event); } static void ring_buffer_attach(struct perf_event *event, @@ -4631,15 +5130,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event) detach_sb_event(event); } -static void unaccount_event_cpu(struct perf_event *event, int cpu) -{ - if (event->parent) - return; - - if (is_cgroup_event(event)) - atomic_dec(&per_cpu(perf_cgroup_events, cpu)); -} - #ifdef CONFIG_NO_HZ_FULL static DEFINE_SPINLOCK(nr_freq_lock); #endif @@ -4669,10 +5159,12 @@ static void unaccount_event(struct perf_event *event) if (event->parent) return; - if (event->attach_state & PERF_ATTACH_TASK) + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); + if (event->attr.build_id) + atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) @@ -4703,8 +5195,6 @@ static void unaccount_event(struct perf_event *event) schedule_delayed_work(&perf_sched_work, HZ); } - unaccount_event_cpu(event, event->cpu); - unaccount_pmu_sb_event(event); } @@ -4723,7 +5213,7 @@ static void perf_sched_delayed(struct work_struct *work) * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence 
of cpu-wide events, - * 3) two matching events on the same context. + * 3) two matching events on the same perf_event_context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). @@ -4805,9 +5295,35 @@ static bool exclusive_event_installable(struct perf_event *event, static void perf_addr_filters_splice(struct perf_event *event, struct list_head *head); +static void perf_pending_task_sync(struct perf_event *event) +{ + struct callback_head *head = &event->pending_task; + + if (!event->pending_work) + return; + /* + * If the task is queued to the current task's queue, we + * obviously can't wait for it to complete. Simply cancel it. + */ + if (task_work_cancel(current, head)) { + event->pending_work = 0; + local_dec(&event->ctx->nr_no_switch_fast); + return; + } + + /* + * All accesses related to the event are within the same RCU section in + * perf_pending_task(). The RCU grace period before the event is freed + * will make sure all those accesses are complete by then. + */ + rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); +} + static void _free_event(struct perf_event *event) { - irq_work_sync(&event->pending); + irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); + perf_pending_task_sync(event); unaccount_event(event); @@ -4847,6 +5363,9 @@ static void _free_event(struct perf_event *event) if (event->hw.target) put_task_struct(event->hw.target); + if (event->pmu_ctx) + put_pmu_ctx(event->pmu_ctx); + /* * perf_event_free_task() relies on put_ctx() being 'last', in particular * all task references must be cleaned up. @@ -4947,8 +5466,8 @@ int perf_event_release_kernel(struct perf_event *event) LIST_HEAD(free_list); /* - * If we got here through err_file: fput(event_file); we will not have - * attached to a context yet. + * If we got here through err_alloc: free_event(event); we will not + * have attached to a context yet. */ if (!ctx) { WARN_ON_ONCE(event->attach_state & @@ -4961,9 +5480,7 @@ int perf_event_release_kernel(struct perf_event *event) ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, DETACH_GROUP); - raw_spin_lock_irq(&ctx->lock); /* * Mark this event as STATE_DEAD, there is no external reference to it * anymore. @@ -4975,14 +5492,14 @@ int perf_event_release_kernel(struct perf_event *event) * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ - event->state = PERF_EVENT_STATE_DEAD; - raw_spin_unlock_irq(&ctx->lock); + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { + void *var = NULL; /* * Cannot change, child events are not migrated, see the @@ -5023,11 +5540,23 @@ again: * this can't be the last reference. */ put_event(event); + } else { + var = &ctx->refcount; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); put_ctx(ctx); + + if (var) { + /* + * If perf_event_free_task() has deleted all events from the + * ctx while the child_mutex got released above, make sure to + * notify about the preceding put_ctx(). 
+ */ + smp_mb(); /* pairs with wait_var_event() */ + wake_up_var(var); + } goto again; } mutex_unlock(&event->child_mutex); @@ -5072,7 +5601,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 * mutex_lock(&event->child_mutex); (void)perf_event_read(event, false); - total += perf_event_count(event); + total += perf_event_count(event, false); *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -5081,7 +5610,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 * list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); - total += perf_event_count(child); + total += perf_event_count(child, false); *enabled += child->total_time_enabled; *running += child->total_time_running; } @@ -5107,7 +5636,7 @@ static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event_context *ctx = leader->ctx; - struct perf_event *sub; + struct perf_event *sub, *parent; unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -5117,6 +5646,33 @@ static int __perf_read_group_add(struct perf_event *leader, return ret; raw_spin_lock_irqsave(&ctx->lock, flags); + /* + * Verify the grouping between the parent and child (inherited) + * events is still in tact. + * + * Specifically: + * - leader->ctx->lock pins leader->sibling_list + * - parent->child_mutex pins parent->child_list + * - parent->ctx->mutex pins parent->sibling_list + * + * Because parent->ctx != leader->ctx (and child_list nests inside + * ctx->mutex), group destruction is not atomic between children, also + * see perf_event_release_kernel(). Additionally, parent can grow the + * group. + * + * Therefore it is possible to have parent and child groups in a + * different configuration and summing over such a beast makes no sense + * what so ever. + * + * Reject this. + */ + parent = leader->parent; + if (parent && + (parent->group_generation != leader->group_generation || + parent->nr_siblings != leader->nr_siblings)) { + ret = -ECHILD; + goto unlock; + } /* * Since we co-schedule groups, {enabled,running} times of siblings @@ -5136,18 +5692,23 @@ static int __perf_read_group_add(struct perf_event *leader, /* * Write {count,id} tuples for every sibling. */ - values[n++] += perf_event_count(leader); + values[n++] += perf_event_count(leader, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { - values[n++] += perf_event_count(sub); + values[n++] += perf_event_count(sub, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); } +unlock: raw_spin_unlock_irqrestore(&ctx->lock, flags); - return 0; + return ret; } static int perf_read_group(struct perf_event *event, @@ -5166,10 +5727,6 @@ static int perf_read_group(struct perf_event *event, values[0] = 1 + leader->nr_siblings; - /* - * By locking the child_mutex of the leader we effectively - * lock the child list of all siblings.. XXX explain how. 
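For reference, the group read layout that __perf_read_group_add() produces above, including the new PERF_FORMAT_LOST entries, can be consumed from userspace roughly as follows. This is a minimal sketch, not part of the patch; it assumes read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID | PERF_FORMAT_LOST, and the helper and struct names are illustrative.

#include <stdint.h>
#include <unistd.h>

/* One {value, id, lost} tuple per group member, in the order the kernel writes them. */
struct group_entry {
	uint64_t value;
	uint64_t id;	/* PERF_FORMAT_ID */
	uint64_t lost;	/* PERF_FORMAT_LOST */
};

static int read_group_counts(int group_fd, struct group_entry *out, uint64_t max)
{
	uint64_t buf[3 + 3 * 64];	/* nr + enabled + running, then up to 64 members */
	uint64_t nr, enabled, running, i, n = 0;

	/*
	 * read() fails with ECHILD when the parent and child group layouts
	 * have diverged (the group_generation/nr_siblings check added above).
	 */
	if (read(group_fd, buf, sizeof(buf)) <= 0)
		return -1;

	nr      = buf[n++];	/* 1 + nr_siblings */
	enabled = buf[n++];	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
	running = buf[n++];	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
	(void)enabled;
	(void)running;

	for (i = 0; i < nr && i < max; i++) {
		out[i].value = buf[n++];
		out[i].id    = buf[n++];
		out[i].lost  = buf[n++];
	}

	return (int)i;
}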
- */ mutex_lock(&leader->child_mutex); ret = __perf_read_group_add(leader, read_format, values); @@ -5200,7 +5757,7 @@ static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; - u64 values[4]; + u64 values[5]; int n = 0; values[n++] = __perf_event_read_value(event, &enabled, &running); @@ -5210,6 +5767,8 @@ static int perf_read_one(struct perf_event *event, values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; @@ -5377,7 +5936,7 @@ static void __perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { - perf_pmu_disable(ctx->pmu); + perf_pmu_disable(event->pmu); /* * We could be throttled; unthrottle now to avoid the tick * trying to unthrottle while we already re-started the event. @@ -5393,7 +5952,7 @@ static void __perf_event_period(struct perf_event *event, if (active) { event->pmu->start(event, PERF_EF_RELOAD); - perf_pmu_enable(ctx->pmu); + perf_pmu_enable(event->pmu); } } @@ -5410,14 +5969,15 @@ static int _perf_event_period(struct perf_event *event, u64 value) if (!value) return -EINVAL; - if (event->attr.freq && value > sysctl_perf_event_sample_rate) - return -EINVAL; - - if (perf_event_check_period(event, value)) - return -EINVAL; - - if (!event->attr.freq && (value & (1ULL << 63))) - return -EINVAL; + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) + return -EINVAL; + } else { + if (perf_event_check_period(event, value)) + return -EINVAL; + if (value & (1ULL << 63)) + return -EINVAL; + } event_function_call(event, __perf_event_period, &value); @@ -5439,24 +5999,14 @@ EXPORT_SYMBOL_GPL(perf_event_period); static const struct file_operations perf_fops; -static inline int perf_fget_light(int fd, struct fd *p) +static inline bool is_perf_file(struct fd f) { - struct fd f = fdget(fd); - if (!f.file) - return -EBADF; - - if (f.file->f_op != &perf_fops) { - fdput(f); - return -EBADF; - } - *p = f; - return 0; + return !fd_empty(f) && fd_file(f)->f_op == &perf_fops; } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); @@ -5499,27 +6049,36 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_OUTPUT: { - int ret; + CLASS(fd, output)(arg); // arg == -1 => empty + struct perf_event *output_event = NULL; if (arg != -1) { - struct perf_event *output_event; - struct fd output; - ret = perf_fget_light(arg, &output); - if (ret) - return ret; - output_event = output.file->private_data; - ret = perf_event_set_output(event, output_event); - fdput(output); - } else { - ret = perf_event_set_output(event, NULL); + if (!is_perf_file(output)) + return -EBADF; + output_event = fd_file(output)->private_data; } - return ret; + return perf_event_set_output(event, output_event); } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); case PERF_EVENT_IOC_SET_BPF: - return perf_event_set_bpf_prog(event, arg); + { + struct bpf_prog *prog; + int err; + + prog = bpf_prog_get(arg); + if (IS_ERR(prog)) + return 
PTR_ERR(prog); + + err = perf_event_set_bpf_prog(event, prog, 0); + if (err) { + bpf_prog_put(prog); + return err; + } + + return 0; + } case PERF_EVENT_IOC_PAUSE_OUTPUT: { struct perf_buffer *rb; @@ -5643,18 +6202,6 @@ static int perf_event_index(struct perf_event *event) return event->pmu->event_idx(event); } -static void calc_timer_values(struct perf_event *event, - u64 *now, - u64 *enabled, - u64 *running) -{ - u64 ctx_time; - - *now = perf_clock(); - ctx_time = event->shadow_ctx_time + *now; - __perf_update_times(event, ctx_time, enabled, running); -} - static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; @@ -5718,7 +6265,7 @@ void perf_event_update_userpage(struct perf_event *event) ++userpg->lock; barrier(); userpg->index = perf_event_index(event); - userpg->offset = perf_event_count(event); + userpg->offset = perf_event_count(event, false); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); @@ -5738,47 +6285,14 @@ unlock: } EXPORT_SYMBOL_GPL(perf_event_update_userpage); -static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) -{ - struct perf_event *event = vmf->vma->vm_file->private_data; - struct perf_buffer *rb; - vm_fault_t ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) - goto unlock; - - vmf->page = perf_mmap_to_page(rb, vmf->pgoff); - if (!vmf->page) - goto unlock; - - get_page(vmf->page); - vmf->page->mapping = vmf->vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb) { struct perf_buffer *old_rb = NULL; unsigned long flags; + WARN_ON_ONCE(event->parent); + if (event->rb) { /* * Should be impossible, we set this when removing @@ -5836,6 +6350,9 @@ static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -5849,6 +6366,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -5907,12 +6427,11 @@ static void perf_mmap_close(struct vm_area_struct *vma) event->pmu->event_unmapped(event, vma->vm_mm); /* - * rb->aux_mmap_count will always drop before rb->mmap_count and - * event->mmap_count, so it is ok to use event->mmap_mutex to - * serialize with perf_mmap here. + * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex + * to avoid complications. 
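From userspace, the two ioctls touched above keep their existing calling convention: PERF_EVENT_IOC_SET_OUTPUT takes another perf fd (or -1 to clear the redirection) and PERF_EVENT_IOC_SET_BPF still takes a BPF program fd, which the kernel now resolves with bpf_prog_get() itself. A rough sketch, error handling omitted:

#include <linux/perf_event.h>
#include <sys/ioctl.h>

/* Redirect this event's output into another event's ring buffer; -1 clears it. */
static int set_output(int perf_fd, int target_fd)
{
	return ioctl(perf_fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd);
}

/* Attach an already-loaded BPF_PROG_TYPE_PERF_EVENT program to the event. */
static int set_bpf(int perf_fd, int bpf_prog_fd)
{
	return ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd);
}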
*/ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -5929,7 +6448,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) rb_free_aux(rb); WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); - mutex_unlock(&event->mmap_mutex); + mutex_unlock(&rb->aux_mutex); } if (atomic_dec_and_test(&rb->mmap_count)) @@ -6005,18 +6524,93 @@ out_put: ring_buffer_put(rb); /* could be last */ } +static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) +{ + /* The first page is the user control page, others are read-only. */ + return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS; +} + static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, + .pfn_mkwrite = perf_mmap_pfn_mkwrite, }; +static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) +{ + unsigned long nr_pages = vma_pages(vma); + int err = 0; + unsigned long pagenum; + + /* + * We map this as a VM_PFNMAP VMA. + * + * This is not ideal as this is designed broadly for mappings of PFNs + * referencing memory-mapped I/O ranges or non-system RAM i.e. for which + * !pfn_valid(pfn). + * + * We are mapping kernel-allocated memory (memory we manage ourselves) + * which would more ideally be mapped using vm_insert_page() or a + * similar mechanism, that is as a VM_MIXEDMAP mapping. + * + * However this won't work here, because: + * + * 1. It uses vma->vm_page_prot, but this field has not been completely + * setup at the point of the f_op->mmap() hook, so we are unable to + * indicate that this should be mapped CoW in order that the + * mkwrite() hook can be invoked to make the first page R/W and the + * rest R/O as desired. + * + * 2. Anything other than a VM_PFNMAP of valid PFNs will result in + * vm_normal_page() returning a struct page * pointer, which means + * vm_ops->page_mkwrite() will be invoked rather than + * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping + * to work around retry logic in the fault handler, however this + * field is no longer allowed to be used within struct page. + * + * 3. Having a struct page * made available in the fault logic also + * means that the page gets put on the rmap and becomes + * inappropriately accessible and subject to map and ref counting. + * + * Ideally we would have a mechanism that could explicitly express our + * desires, but this is not currently the case, so we instead use + * VM_PFNMAP. + * + * We manage the lifetime of these mappings with internal refcounts (see + * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of + * this mapping is maintained correctly. + */ + for (pagenum = 0; pagenum < nr_pages; pagenum++) { + unsigned long va = vma->vm_start + PAGE_SIZE * pagenum; + struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum); + + if (page == NULL) { + err = -EINVAL; + break; + } + + /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */ + err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE, + vm_get_page_prot(vma->vm_flags & ~VM_SHARED)); + if (err) + break; + } + +#ifdef CONFIG_MMU + /* Clear any partial mappings on error.
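The userspace side of this mapping is unchanged by populating the pages up front: the perf fd is mmap()ed with one control page plus a power-of-two number of data pages, and the only write userspace performs (data_tail) lands on the first page, which is exactly what perf_mmap_pfn_mkwrite() above makes writable. A minimal sketch, assuming <linux/perf_event.h> for struct perf_event_mmap_page:

#include <linux/perf_event.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

static struct perf_event_mmap_page *map_ring(int perf_fd, size_t data_pages)
{
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	/* One control page + a power-of-two number of data pages. */
	size_t len = (1 + data_pages) * page;
	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, perf_fd, 0);

	return base == MAP_FAILED ? NULL : base;
}

static void consume_all(struct perf_event_mmap_page *up)
{
	uint64_t head = __atomic_load_n(&up->data_head, __ATOMIC_ACQUIRE);

	/* ... walk records between up->data_tail and head ... */

	/* The only write userspace does lands on the control page. */
	__atomic_store_n(&up->data_tail, head, __ATOMIC_RELEASE);
}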
*/ + if (err) + zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); +#endif + + return err; +} + static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); + struct mutex *aux_mutex = NULL; struct perf_buffer *rb = NULL; unsigned long locked, lock_limit; unsigned long vma_size; @@ -6055,6 +6649,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; nr_pages = vma_size / PAGE_SIZE; + if (nr_pages > INT_MAX) + return -ENOMEM; mutex_lock(&event->mmap_mutex); ret = -EINVAL; @@ -6063,6 +6659,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!rb) goto aux_unlock; + aux_mutex = &rb->aux_mutex; + mutex_lock(aux_mutex); + aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); @@ -6115,21 +6714,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) { + if (data_page_nr(event->rb) != nr_pages) { ret = -EINVAL; goto unlock; } if (!atomic_inc_not_zero(&event->rb->mmap_count)) { /* - * Raced against perf_mmap_close() through - * perf_event_set_output(). Try again, hope for better - * luck. + * Raced against perf_mmap_close(); remove the + * event and try again. */ + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); goto again; } + /* We need the rb to map pages. */ + rb = event->rb; goto unlock; } @@ -6193,6 +6794,7 @@ accounting: ring_buffer_attach(event, rb); + perf_event_update_time(event); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { @@ -6212,15 +6814,20 @@ unlock: atomic_dec(&rb->mmap_count); } aux_unlock: + if (aux_mutex) + mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; + if (!ret) + ret = map_range(rb, vma); + if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); @@ -6244,7 +6851,6 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { - .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, @@ -6261,14 +6867,6 @@ static const struct file_operations perf_fops = { * to user-space before waking everybody up. */ -static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) -{ - /* only the parent has fasync state */ - if (event->parent) - event = event->parent; - return &event->fasync; -} - void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); @@ -6279,16 +6877,49 @@ void perf_event_wakeup(struct perf_event *event) } } -static void perf_pending_event_disable(struct perf_event *event) +static void perf_sigtrap(struct perf_event *event) +{ + /* + * We'd expect this to only occur if the irq_work is delayed and either + * ctx->task or current has changed in the meantime. This can be the + * case on architectures that do not implement arch_irq_work_raise(). + */ + if (WARN_ON_ONCE(event->ctx->task != current)) + return; + + /* + * Both perf_pending_task() and perf_pending_irq() can race with the + * task exiting. 
+ */ + if (current->flags & PF_EXITING) + return; + + send_sig_perf((void __user *)event->pending_addr, + event->orig_type, event->attr.sig_data); +} + +/* + * Deliver the pending work in-event-context or follow the context. + */ +static void __perf_pending_disable(struct perf_event *event) { - int cpu = READ_ONCE(event->pending_disable); + int cpu = READ_ONCE(event->oncpu); + /* + * If the event isn't running, we're done; event_sched_out() will have + * taken care of things. + */ if (cpu < 0) return; + /* + * Yay, we hit home and are in the context of the event. + */ if (cpu == smp_processor_id()) { - WRITE_ONCE(event->pending_disable, -1); - perf_event_disable_local(event); + if (event->pending_disable) { + event->pending_disable = 0; + perf_event_disable_local(event); + } return; } @@ -6308,26 +6939,43 @@ static void perf_pending_event_disable(struct perf_event *event) * irq_work_queue(); // FAILS * * irq_work_run() - * perf_pending_event() + * perf_pending_disable() * * But the event runs on CPU-B and wants disabling there. */ - irq_work_queue_on(&event->pending, cpu); + irq_work_queue_on(&event->pending_disable_irq, cpu); } -static void perf_pending_event(struct irq_work *entry) +static void perf_pending_disable(struct irq_work *entry) { - struct perf_event *event = container_of(entry, struct perf_event, pending); + struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); int rctx; - rctx = perf_swevent_get_recursion_context(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ + rctx = perf_swevent_get_recursion_context(); + __perf_pending_disable(event); + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); +} + +static void perf_pending_irq(struct irq_work *entry) +{ + struct perf_event *event = container_of(entry, struct perf_event, pending_irq); + int rctx; - perf_pending_event_disable(event); + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'.
+ */ + rctx = perf_swevent_get_recursion_context(); -int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) + if (event->pending_work) { + event->pending_work = 0; + perf_sigtrap(event); + local_dec(&event->ctx->nr_no_switch_fast); + rcuwait_wake_up(&event->pending_work_wait); + } + rcu_read_unlock(); + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); +} + +#ifdef CONFIG_GUEST_PERF_EVENTS +struct perf_guest_info_callbacks __rcu *perf_guest_cbs; + +DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); +DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); + +void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = cbs; - return 0; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) + return; + + rcu_assign_pointer(perf_guest_cbs, cbs); + static_call_update(__perf_guest_state, cbs->state); + static_call_update(__perf_guest_get_ip, cbs->get_ip); + + /* Implementing ->handle_intel_pt_intr is optional. */ + if (cbs->handle_intel_pt_intr) + static_call_update(__perf_guest_handle_intel_pt_intr, + cbs->handle_intel_pt_intr); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); -int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = NULL; - return 0; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) + return; + + rcu_assign_pointer(perf_guest_cbs, NULL); + static_call_update(__perf_guest_state, (void *)&__static_call_return0); + static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, + (void *)&__static_call_return0); + synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +#endif + +static bool should_sample_guest(struct perf_event *event) +{ + return !event->attr.exclude_guest && perf_guest_state(); +} + +unsigned long perf_misc_flags(struct perf_event *event, + struct pt_regs *regs) +{ + if (should_sample_guest(event)) + return perf_arch_guest_misc_flags(regs); + + return perf_arch_misc_flags(regs); +} + +unsigned long perf_instruction_pointer(struct perf_event *event, + struct pt_regs *regs) +{ + if (should_sample_guest(event)) + return perf_guest_get_ip(); + + return perf_arch_instruction_pointer(regs); +} static void perf_output_sample_regs(struct perf_output_handle *handle, @@ -6464,7 +7181,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, unsigned long sp; unsigned int rem; u64 dyn_size; - mm_segment_t fs; /* * We dump: @@ -6482,9 +7198,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); - fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); - force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); @@ -6512,7 +7226,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event, if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; - rb = ring_buffer_get(sampler->parent ? 
sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) goto out; @@ -6532,10 +7246,10 @@ out: return data->aux_size; } -long perf_pmu_snapshot_aux(struct perf_buffer *rb, - struct perf_event *event, - struct perf_output_handle *handle, - unsigned long size) +static long perf_pmu_snapshot_aux(struct perf_buffer *rb, + struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size) { unsigned long flags; long ret; @@ -6578,7 +7292,7 @@ static void perf_aux_sample_output(struct perf_event *event, if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; - rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) return; @@ -6610,14 +7324,20 @@ out_put: ring_buffer_put(rb); } -static void __perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) -{ - u64 sample_type = event->attr.sample_type; +/* + * A set of common sample data types saved even for non-sample records + * when event->attr.sample_id_all is set. + */ +#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \ + PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER) - data->type = sample_type; - header->size += event->id_header_size; +static void __perf_event_header__init_id(struct perf_sample_data *data, + struct perf_event *event, + u64 sample_type) +{ + data->type = event->attr.sample_type; + data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL; if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ @@ -6644,8 +7364,10 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { - if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); + if (event->attr.sample_id_all) { + header->size += event->id_header_size; + __perf_event_header__init_id(data, event, event->attr.sample_type); + } } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -6685,10 +7407,10 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 enabled, u64 running) { u64 read_format = event->attr.read_format; - u64 values[4]; + u64 values[5]; int n = 0; - values[n++] = perf_event_count(event); + values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr)); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); @@ -6699,18 +7421,28 @@ static void perf_output_read_one(struct perf_output_handle *handle, } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; - u64 values[5]; + unsigned long flags; + u64 values[6]; int n = 0; + bool self = has_inherit_and_sample_read(&event->attr); + + /* + * Disabling interrupts avoids all counter scheduling + * (context switches, timer based rotation and IPIs). 
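The PERF_SAMPLE_ID_ALL mask defined above corresponds to the optional trailer that attr.sample_id_all appends to non-sample records. Viewed from userspace it looks roughly like the struct below, where each field is present only if the matching bit is set in attr.sample_type; the layout is illustrative and follows the fill order used by __perf_event_header__init_id():

#include <stdint.h>

struct sample_id_trailer {
	uint32_t pid, tid;	/* PERF_SAMPLE_TID */
	uint64_t time;		/* PERF_SAMPLE_TIME */
	uint64_t id;		/* PERF_SAMPLE_ID */
	uint64_t stream_id;	/* PERF_SAMPLE_STREAM_ID */
	uint32_t cpu, res;	/* PERF_SAMPLE_CPU */
	uint64_t identifier;	/* PERF_SAMPLE_IDENTIFIER */
};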
+ */ + local_irq_save(flags); values[n++] = 1 + leader->nr_siblings; @@ -6724,9 +7456,11 @@ static void perf_output_read_group(struct perf_output_handle *handle, (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); - values[n++] = perf_event_count(leader); + values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); @@ -6737,12 +7471,16 @@ static void perf_output_read_group(struct perf_output_handle *handle, (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); - values[n++] = perf_event_count(sub); + values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } + + local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ @@ -6754,6 +7492,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, * The problem is that its both hard and excessively expensive to iterate the * child list, not to mention that its impossible to IPI the children running * on another CPU, from interrupt/NMI context. + * + * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread + * counts rather than attempting to accumulate some value across all children on + * all cores. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) @@ -6779,11 +7521,6 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } -static inline bool perf_sample_save_hw_index(struct perf_event *event) -{ - return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; -} - void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -6872,9 +7609,17 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); - if (perf_sample_save_hw_index(event)) + if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); + /* + * Add the extension space which is appended + * right after the struct perf_branch_stack. + */ + if (data->br_stack_cntr) { + size = data->br_stack->nr * sizeof(u64); + perf_output_copy(handle, data->br_stack_cntr, size); + } } else { /* * we always store at least the value of nr @@ -6907,8 +7652,8 @@ void perf_output_sample(struct perf_output_handle *handle, data->regs_user.regs); } - if (sample_type & PERF_SAMPLE_WEIGHT) - perf_output_put(handle, data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); @@ -6970,7 +7715,6 @@ void perf_output_sample(struct perf_output_handle *handle, static u64 perf_virt_to_phys(u64 virt) { u64 phys_addr = 0; - struct page *p = NULL; if (!virt) return 0; @@ -6989,14 +7733,15 @@ static u64 perf_virt_to_phys(u64 virt) * If failed, leave phys_addr as 0. 
*/ if (current->mm != NULL) { + struct page *p; + pagefault_disable(); - if (get_user_page_fast_only(virt, 0, &p)) + if (get_user_page_fast_only(virt, 0, &p)) { phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + put_page(p); + } pagefault_enable(); } - - if (p) - put_page(p); } return phys_addr; @@ -7009,7 +7754,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) { u64 size = 0; -#ifdef CONFIG_HAVE_FAST_GUP +#ifdef CONFIG_HAVE_GUP_FAST pgd_t *pgdp, pgd; p4d_t *p4dp, p4d; pud_t *pudp, pud; @@ -7041,7 +7786,8 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) return pud_leaf_size(pud); pmdp = pmd_offset_lockless(pudp, pud, addr); - pmd = READ_ONCE(*pmdp); +again: + pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) return 0; @@ -7049,11 +7795,14 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) return pmd_leaf_size(pmd); ptep = pte_offset_map(&pmd, addr); + if (!ptep) + goto again; + pte = ptep_get_lockless(ptep); if (pte_present(pte)) - size = pte_leaf_size(pte); + size = __pte_leaf_size(pmd, pte); pte_unmap(ptep); -#endif /* CONFIG_HAVE_FAST_GUP */ +#endif /* CONFIG_HAVE_GUP_FAST */ return size; } @@ -7109,76 +7858,68 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) return callchain ?: &__empty_callchain; } -void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, +static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) +{ + return d * !!(flags & s); +} + +void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; + u64 filtered_sample_type; - header->type = PERF_RECORD_SAMPLE; - header->size = sizeof(*header) + event->header_size; - - header->misc = 0; - header->misc |= perf_misc_flags(regs); - - __perf_event_header__init_id(header, data, event); - - if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) - data->ip = perf_instruction_pointer(regs); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - int size = 1; - - if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) - data->callchain = perf_callchain(event, regs); - - size += data->callchain->nr; + /* + * Add the sample flags that are dependent to others. And clear the + * sample flags that have already been done by the PMU driver. 
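To make the flag derivation below concrete: __cond_set(flags, s, d) evaluates to d when any bit of s is set in flags and to 0 otherwise, so for example a request for PERF_SAMPLE_PHYS_ADDR implicitly pulls in PERF_SAMPLE_ADDR unless the PMU driver already provided it. A standalone illustration (helper renamed to avoid clashing with the kernel symbol; assumes reasonably recent UAPI headers):

#include <assert.h>
#include <stdint.h>
#include <linux/perf_event.h>

/* Same branchless trick as __cond_set(): "d if (flags & s) else 0". */
static uint64_t cond_set(uint64_t flags, uint64_t s, uint64_t d)
{
	return d * !!(flags & s);
}

int main(void)
{
	uint64_t sample_type = PERF_SAMPLE_PHYS_ADDR;	/* user asked for PHYS_ADDR only */
	uint64_t done_by_pmu = PERF_SAMPLE_ADDR;	/* PMU driver already filled data->addr */
	uint64_t filtered = sample_type;

	/* PHYS_ADDR and DATA_PAGE_SIZE both need data->addr to be available. */
	filtered |= cond_set(sample_type,
			     PERF_SAMPLE_DATA_PAGE_SIZE | PERF_SAMPLE_PHYS_ADDR,
			     PERF_SAMPLE_ADDR);
	/* Drop whatever has already been produced. */
	filtered &= ~done_by_pmu;

	assert(filtered == PERF_SAMPLE_PHYS_ADDR);
	return 0;
}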
+ */ + filtered_sample_type = sample_type; + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, + PERF_SAMPLE_IP); + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE | + PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR); + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, + PERF_SAMPLE_REGS_USER); + filtered_sample_type &= ~data->sample_flags; - header->size += size * sizeof(u64); + if (filtered_sample_type == 0) { + /* Make sure it has the correct data->type for output */ + data->type = event->attr.sample_type; + return; } - if (sample_type & PERF_SAMPLE_RAW) { - struct perf_raw_record *raw = data->raw; - int size; - - if (raw) { - struct perf_raw_frag *frag = &raw->frag; - u32 sum = 0; + __perf_event_header__init_id(data, event, filtered_sample_type); - do { - sum += frag->size; - if (perf_raw_frag_last(frag)) - break; - frag = frag->next; - } while (1); + if (filtered_sample_type & PERF_SAMPLE_IP) { + data->ip = perf_instruction_pointer(event, regs); + data->sample_flags |= PERF_SAMPLE_IP; + } - size = round_up(sum + sizeof(u32), sizeof(u64)); - raw->size = size - sizeof(u32); - frag->pad = raw->size - sum; - } else { - size = sizeof(u64); - } + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) + perf_sample_save_callchain(data, event, regs); - header->size += size; + if (filtered_sample_type & PERF_SAMPLE_RAW) { + data->raw = NULL; + data->dyn_size += sizeof(u64); + data->sample_flags |= PERF_SAMPLE_RAW; } - if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - int size = sizeof(u64); /* nr */ - if (data->br_stack) { - if (perf_sample_save_hw_index(event)) - size += sizeof(u64); - - size += data->br_stack->nr - * sizeof(struct perf_branch_entry); - } - header->size += size; + if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) { + data->br_stack = NULL; + data->dyn_size += sizeof(u64); + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } - if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) + if (filtered_sample_type & PERF_SAMPLE_REGS_USER) perf_sample_regs_user(&data->regs_user, regs); - if (sample_type & PERF_SAMPLE_REGS_USER) { + /* + * It cannot use the filtered_sample_type here as REGS_USER can be set + * by STACK_USER (using __cond_set() above) and we don't want to update + * the dyn_size if it's not requested by users. + */ + if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); @@ -7187,10 +7928,11 @@ void perf_prepare_sample(struct perf_event_header *header, size += hweight64(mask) * sizeof(u64); } - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_REGS_USER; } - if (sample_type & PERF_SAMPLE_STACK_USER) { + if (filtered_sample_type & PERF_SAMPLE_STACK_USER) { /* * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added @@ -7198,9 +7940,10 @@ void perf_prepare_sample(struct perf_event_header *header, * up the rest of the sample size. 
*/ u16 stack_size = event->attr.sample_stack_user; + u16 header_size = perf_sample_data_size(data, event); u16 size = sizeof(u64); - stack_size = perf_sample_ustack_size(stack_size, header->size, + stack_size = perf_sample_ustack_size(stack_size, header_size, data->regs_user.regs); /* @@ -7212,10 +7955,31 @@ void perf_prepare_sample(struct perf_event_header *header, size += sizeof(u64) + stack_size; data->stack_user_size = stack_size; - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_STACK_USER; } - if (sample_type & PERF_SAMPLE_REGS_INTR) { + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + data->weight.full = 0; + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) { + data->data_src.val = PERF_MEM_NA; + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } + + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) { + data->txn = 0; + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } + + if (filtered_sample_type & PERF_SAMPLE_ADDR) { + data->addr = 0; + data->sample_flags |= PERF_SAMPLE_ADDR; + } + + if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); @@ -7227,19 +7991,23 @@ void perf_prepare_sample(struct perf_event_header *header, size += hweight64(mask) * sizeof(u64); } - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_REGS_INTR; } - if (sample_type & PERF_SAMPLE_PHYS_ADDR) + if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) { data->phys_addr = perf_virt_to_phys(data->addr); + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; + } #ifdef CONFIG_CGROUP_PERF - if (sample_type & PERF_SAMPLE_CGROUP) { + if (filtered_sample_type & PERF_SAMPLE_CGROUP) { struct cgroup *cgrp; /* protected by RCU */ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; data->cgroup = cgroup_id(cgrp); + data->sample_flags |= PERF_SAMPLE_CGROUP; } #endif @@ -7248,16 +8016,21 @@ void perf_prepare_sample(struct perf_event_header *header, * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, * but the value will not dump to the userspace. */ - if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) { data->data_page_size = perf_get_page_size(data->addr); + data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE; + } - if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) { data->code_page_size = perf_get_page_size(data->ip); + data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE; + } - if (sample_type & PERF_SAMPLE_AUX) { + if (filtered_sample_type & PERF_SAMPLE_AUX) { u64 size; + u16 header_size = perf_sample_data_size(data, event); - header->size += sizeof(u64); /* size */ + header_size += sizeof(u64); /* size */ /* * Given the 16bit nature of header::size, an AUX sample can @@ -7265,14 +8038,26 @@ void perf_prepare_sample(struct perf_event_header *header, * Make sure this doesn't happen by using up to U16_MAX bytes * per sample in total (rounded down to 8 byte boundary). 
*/ - size = min_t(size_t, U16_MAX - header->size, + size = min_t(size_t, U16_MAX - header_size, event->attr.aux_sample_size); size = rounddown(size, 8); size = perf_prepare_sample_aux(event, data, size); - WARN_ON_ONCE(size + header->size > U16_MAX); - header->size += size; + WARN_ON_ONCE(size + header_size > U16_MAX); + data->dyn_size += size + sizeof(u64); /* size above */ + data->sample_flags |= PERF_SAMPLE_AUX; } +} + +void perf_prepare_header(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + header->type = PERF_RECORD_SAMPLE; + header->size = perf_sample_data_size(data, event); + header->misc = perf_misc_flags(event, regs); + /* * If you're adding more sample types here, you likely need to do * something about the overflowing header::size, like repurpose the @@ -7284,6 +8069,49 @@ void perf_prepare_sample(struct perf_event_header *header, WARN_ON_ONCE(header->size & 7); } +static void __perf_event_aux_pause(struct perf_event *event, bool pause) +{ + if (pause) { + if (!event->hw.aux_paused) { + event->hw.aux_paused = 1; + event->pmu->stop(event, PERF_EF_PAUSE); + } + } else { + if (event->hw.aux_paused) { + event->hw.aux_paused = 0; + event->pmu->start(event, PERF_EF_RESUME); + } + } +} + +static void perf_event_aux_pause(struct perf_event *event, bool pause) +{ + struct perf_buffer *rb; + + if (WARN_ON_ONCE(!event)) + return; + + rb = ring_buffer_get(event); + if (!rb) + return; + + scoped_guard (irqsave) { + /* + * Guard against self-recursion here. Another event could trip + * this same from NMI context. + */ + if (READ_ONCE(rb->aux_in_pause_resume)) + break; + + WRITE_ONCE(rb->aux_in_pause_resume, 1); + barrier(); + __perf_event_aux_pause(event, pause); + barrier(); + WRITE_ONCE(rb->aux_in_pause_resume, 0); + } + ring_buffer_put(rb); +} + static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, @@ -7300,7 +8128,8 @@ __perf_event_output(struct perf_event *event, /* protect the callchain buffers */ rcu_read_lock(); - perf_prepare_sample(&header, data, event, regs); + perf_prepare_sample(data, event, regs); + perf_prepare_header(&header, data, event, regs); err = output_begin(&handle, data, event, header.size); if (err) @@ -7433,7 +8262,6 @@ perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; - int ctxn; rcu_read_lock(); preempt_disable(); @@ -7450,11 +8278,9 @@ perf_iterate_sb(perf_iterate_f output, void *data, perf_iterate_sb_cpu(output, data); - for_each_task_context_nr(ctxn) { - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_iterate_ctx(ctx, output, data, false); - } + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_iterate_ctx(ctx, output, data, false); done: preempt_enable(); rcu_read_unlock(); @@ -7496,20 +8322,18 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) void perf_event_exec(void) { struct perf_event_context *ctx; - int ctxn; - rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx) - continue; + ctx = perf_pin_task_context(current); + if (!ctx) + return; - perf_event_enable_on_exec(ctxn); + perf_event_enable_on_exec(ctx); + perf_event_remove_on_exec(ctx); + scoped_guard(rcu) + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, - true); - } - rcu_read_unlock(); + 
perf_unpin_context(ctx); + put_ctx(ctx); } struct remote_output { @@ -7549,8 +8373,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; - struct pmu *pmu = event->ctx->pmu; - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct remote_output ro = { .rb = event->rb, }; @@ -7766,7 +8589,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) unsigned int size; memset(comm, 0, sizeof(comm)); - strlcpy(comm, comm_event->task->comm, sizeof(comm)); + strscpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -8046,6 +8869,8 @@ struct perf_mmap_event { u64 ino; u64 ino_generation; u32 prot, flags; + u8 build_id[BUILD_ID_SIZE_MAX]; + u32 build_id_size; struct { struct perf_event_header header; @@ -8077,6 +8902,7 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; + bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) @@ -8101,13 +8927,25 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); + use_build_id = event->attr.build_id && mmap_event->build_id_size; + + if (event->attr.mmap2 && use_build_id) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; + perf_output_put(&handle, mmap_event->event_id); if (event->attr.mmap2) { - perf_output_put(&handle, mmap_event->maj); - perf_output_put(&handle, mmap_event->min); - perf_output_put(&handle, mmap_event->ino); - perf_output_put(&handle, mmap_event->ino_generation); + if (use_build_id) { + u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; + + __output_copy(&handle, size, 4); + __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); + } else { + perf_output_put(&handle, mmap_event->maj); + perf_output_put(&handle, mmap_event->min); + perf_output_put(&handle, mmap_event->ino); + perf_output_put(&handle, mmap_event->ino_generation); + } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } @@ -8133,7 +8971,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) unsigned int size; char tmp[16]; char *buf = NULL; - char *name; + char *name = NULL; if (vma->vm_flags & VM_READ) prot |= PROT_READ; @@ -8147,10 +8985,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) else flags = MAP_PRIVATE; - if (vma->vm_flags & VM_DENYWRITE) - flags |= MAP_DENYWRITE; - if (vma->vm_flags & VM_MAYEXEC) - flags |= MAP_EXECUTABLE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) @@ -8184,33 +9018,22 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } else { - if (vma->vm_ops && vma->vm_ops->name) { + if (vma->vm_ops && vma->vm_ops->name) name = (char *) vma->vm_ops->name(vma); - if (name) - goto cpy_name; - } - - name = (char *)arch_vma_name(vma); - if (name) - goto cpy_name; - - if (vma->vm_start <= vma->vm_mm->start_brk && - vma->vm_end >= vma->vm_mm->brk) { - name = "[heap]"; - goto cpy_name; - } - if (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack) { - name = "[stack]"; - goto cpy_name; + if (!name) + 
name = (char *)arch_vma_name(vma); + if (!name) { + if (vma_is_initial_heap(vma)) + name = "[heap]"; + else if (vma_is_initial_stack(vma)) + name = "[stack]"; + else + name = "//anon"; } - - name = "//anon"; - goto cpy_name; } cpy_name: - strlcpy(tmp, name, sizeof(tmp)); + strscpy(tmp, name, sizeof(tmp)); name = tmp; got_name: /* @@ -8236,6 +9059,9 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + if (atomic_read(&nr_build_id_events)) + build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size); + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -8325,7 +9151,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) static void perf_addr_filters_adjust(struct vm_area_struct *vma) { struct perf_event_context *ctx; - int ctxn; /* * Data tracing isn't supported yet and as such there is no need @@ -8335,13 +9160,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) return; rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (!ctx) - continue; - + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); - } rcu_read_unlock(); } @@ -8527,13 +9348,12 @@ static void perf_event_switch(struct task_struct *task, }, }; - if (!sched_in && task->state == TASK_RUNNING) + if (!sched_in && task_is_runnable(task)) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; + } - perf_iterate_sb(perf_event_switch_output, - &switch_event, - NULL); + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } /* @@ -8637,7 +9457,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; - strlcpy(name, sym, KSYM_NAME_LEN); + strscpy(name, sym, KSYM_NAME_LEN); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; @@ -8700,7 +9520,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data) perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, data, event, + ret = perf_output_begin(&handle, &sample, event, bpf_event->event_id.header.size); if (ret) return; @@ -8717,21 +9537,19 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; int i; - if (prog->aux->func_cnt == 0) { - perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, - (u64)(unsigned long)prog->bpf_func, - prog->jited_len, unregister, - prog->aux->ksym.name); - } else { - for (i = 0; i < prog->aux->func_cnt; i++) { - struct bpf_prog *subprog = prog->aux->func[i]; - - perf_event_ksymbol( - PERF_RECORD_KSYMBOL_TYPE_BPF, - (u64)(unsigned long)subprog->bpf_func, - subprog->jited_len, unregister, - prog->aux->ksym.name); - } + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)prog->bpf_func, + prog->jited_len, unregister, + prog->aux->ksym.name); + + for (i = 1; i < prog->aux->func_cnt; i++) { + struct bpf_prog *subprog = prog->aux->func[i]; + + perf_event_ksymbol( + PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)subprog->bpf_func, + subprog->jited_len, unregister, + subprog->aux->ksym.name); } } @@ -8741,10 +9559,6 @@ void perf_event_bpf_event(struct bpf_prog *prog, { struct perf_bpf_event bpf_event; - if (type <= PERF_BPF_EVENT_UNKNOWN || - type >= PERF_BPF_EVENT_MAX) - return; - switch (type) { case PERF_BPF_EVENT_PROG_LOAD: case 
PERF_BPF_EVENT_PROG_UNLOAD: @@ -8752,7 +9566,7 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_event_bpf_emit_ksymbols(prog, type); break; default: - break; + return; } if (!atomic_read(&nr_bpf_events)) @@ -8902,6 +9716,37 @@ static void perf_log_itrace_start(struct perf_event *event) perf_output_end(&handle); } +void perf_report_aux_output_id(struct perf_event *event, u64 hw_id) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u64 hw_id; + } rec; + int ret; + + if (event->parent) + event = event->parent; + + rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID; + rec.header.misc = 0; + rec.header.size = sizeof(rec); + rec.hw_id = hw_id; + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} +EXPORT_SYMBOL_GPL(perf_report_aux_output_id); + static int __perf_event_account_interrupt(struct perf_event *event, int throttle) { @@ -8915,8 +9760,8 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle) hwc->interrupts = 1; } else { hwc->interrupts++; - if (unlikely(throttle - && hwc->interrupts >= max_samples_per_tick)) { + if (unlikely(throttle && + hwc->interrupts > max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; @@ -8943,13 +9788,120 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } +static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) +{ + /* + * Due to interrupt latency (AKA "skid"), we may enter the + * kernel before taking an overflow, even if the PMU is only + * counting user events. + */ + if (event->attr.exclude_kernel && !user_mode(regs)) + return false; + + return true; +} + +#ifdef CONFIG_BPF_SYSCALL +static int bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct bpf_perf_event_data_kern ctx = { + .data = data, + .event = event, + }; + struct bpf_prog *prog; + int ret = 0; + + ctx.regs = perf_arch_bpf_user_pt_regs(regs); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) + goto out; + rcu_read_lock(); + prog = READ_ONCE(event->prog); + if (prog) { + perf_prepare_sample(data, event, regs); + ret = bpf_prog_run(prog, &ctx); + } + rcu_read_unlock(); +out: + __this_cpu_dec(bpf_prog_active); + + return ret; +} + +static inline int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + if (event->overflow_handler_context) + /* hw breakpoint or kernel counter */ + return -EINVAL; + + if (event->prog) + return -EEXIST; + + if (prog->type != BPF_PROG_TYPE_PERF_EVENT) + return -EINVAL; + + if (event->attr.precise_ip && + prog->call_get_stack && + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || + event->attr.exclude_callchain_kernel || + event->attr.exclude_callchain_user)) { + /* + * On perf_event with precise_ip, calling bpf_get_stack() + * may trigger unwinder warnings and occasional crashes. + * bpf_get_[stack|stackid] works around this issue by using + * callchain attached to perf_sample_data. 
If the + * perf_event does not full (kernel and user) callchain + * attached to perf_sample_data, do not allow attaching BPF + * program that calls bpf_get_[stack|stackid]. + */ + return -EPROTO; + } + + event->prog = prog; + event->bpf_cookie = bpf_cookie; + return 0; +} + +static inline void perf_event_free_bpf_handler(struct perf_event *event) +{ + struct bpf_prog *prog = event->prog; + + if (!prog) + return; + + event->prog = NULL; + bpf_prog_put(prog); +} +#else +static inline int bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return 1; +} + +static inline int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + return -EOPNOTSUPP; +} + +static inline void perf_event_free_bpf_handler(struct perf_event *event) +{ +} +#endif + /* * Generic event overflow handling, sampling. */ static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; @@ -8963,6 +9915,13 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); + if (event->attr.aux_pause) + perf_event_aux_pause(event->aux_event, true); + + if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && + !bpf_overflow_handler(event, data, regs)) + goto out; + /* * XXX event_limit might not quite work as expected on inherited * events @@ -8972,23 +9931,67 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - perf_event_disable_inatomic(event); } + if (event->attr.sigtrap) { + /* + * The desired behaviour of sigtrap vs invalid samples is a bit + * tricky; on the one hand, one should not loose the SIGTRAP if + * it is the first event, on the other hand, we should also not + * trigger the WARN or override the data address. + */ + bool valid_sample = sample_is_allowed(event, regs); + unsigned int pending_id = 1; + enum task_work_notify_mode notify_mode; + + if (regs) + pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; + + notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME; + + if (!event->pending_work && + !task_work_add(current, &event->pending_task, notify_mode)) { + event->pending_work = pending_id; + local_inc(&event->ctx->nr_no_switch_fast); + + event->pending_addr = 0; + if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) + event->pending_addr = data->addr; + + } else if (event->attr.exclude_kernel && valid_sample) { + /* + * Should not be able to return to user space without + * consuming pending_work; with exceptions: + * + * 1. Where !exclude_kernel, events can overflow again + * in the kernel without returning to user space. + * + * 2. Events that can overflow again before the IRQ- + * work without user space progress (e.g. hrtimer). + * To approximate progress (with false negatives), + * check 32-bit hash of the current IP. 
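The consumer side of the sigtrap path above is an ordinary SIGTRAP handler in the monitored task: attr.sig_data comes back as si_perf_data and event->pending_addr as si_addr. A minimal sketch, assuming a libc that exposes the si_perf_* fields and a perf_event_open() call elsewhere that sets attr.sigtrap = 1 and attr.sig_data:

#define _GNU_SOURCE
#include <signal.h>
#include <stdint.h>
#include <unistd.h>

static volatile sig_atomic_t traps;
static volatile uint64_t last_sig_data;
static volatile uintptr_t last_addr;

static void on_sigtrap(int sig, siginfo_t *info, void *ucontext)
{
	(void)sig;
	(void)ucontext;
	/* Async-signal-safe: just record what perf_sigtrap() sent us. */
	last_sig_data = info->si_perf_data;	/* attr.sig_data */
	last_addr = (uintptr_t)info->si_addr;	/* event->pending_addr */
	traps++;
}

int main(void)
{
	struct sigaction sa = {
		.sa_sigaction = on_sigtrap,
		.sa_flags = SA_SIGINFO,
	};

	sigaction(SIGTRAP, &sa, NULL);
	/* ... perf_event_open() with attr.sigtrap = 1 and attr.sig_data set ... */
	pause();
	return 0;
}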
+ */ + WARN_ON_ONCE(event->pending_work != pending_id); + } + } + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; - irq_work_queue(&event->pending); + irq_work_queue(&event->pending_irq); } +out: + if (event->attr.aux_resume) + perf_event_aux_pause(event->aux_event, false); return ret; } int perf_event_overflow(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } @@ -9001,11 +10004,7 @@ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; - - /* Recursion avoidance in each contexts */ - int recursion[PERF_NR_CONTEXTS]; }; - static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* @@ -9024,16 +10023,16 @@ u64 perf_swevent_set_period(struct perf_event *event) hwc->last_period = hwc->sample_period; -again: - old = val = local64_read(&hwc->period_left); - if (val < 0) - return 0; + old = local64_read(&hwc->period_left); + do { + val = old; + if (val < 0) + return 0; - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (local64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + } while (!local64_try_cmpxchg(&hwc->period_left, &old, val)); return nr; } @@ -9093,8 +10092,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, data, regs); } -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) +int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; @@ -9203,17 +10201,13 @@ DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - return get_recursion_context(swhash->recursion); + return get_recursion_context(current->perf_recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - put_recursion_context(swhash->recursion, rctx); + put_recursion_context(current->perf_recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) @@ -9384,6 +10378,9 @@ static void sw_perf_event_destroy(struct perf_event *event) swevent_hlist_put(); } +static struct pmu perf_cpu_clock; /* fwd declaration */ +static struct pmu perf_task_clock; + static int perf_swevent_init(struct perf_event *event) { u64 event_id = event->attr.config; @@ -9399,7 +10396,10 @@ static int perf_swevent_init(struct perf_event *event) switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: + event->attr.type = perf_cpu_clock.type; + return -ENOENT; case PERF_COUNT_SW_TASK_CLOCK: + event->attr.type = perf_task_clock.type; return -ENOENT; default: @@ -9438,10 +10438,48 @@ static struct pmu perf_swevent = { #ifdef CONFIG_EVENT_TRACING +static void tp_perf_event_destroy(struct perf_event *event) +{ + perf_trace_destroy(event); +} + +static int perf_tp_event_init(struct perf_event *event) +{ + int err; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + err = perf_trace_init(event); + if (err) + return err; + 
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
 static int perf_tp_filter_match(struct perf_event *event,
- struct perf_sample_data *data)
+ struct perf_raw_record *raw)
 {
- void *record = data->raw->frag.data;
+ void *record = raw->frag.data;
 
 /* only top level events have filters set */
 if (event->parent)
@@ -9453,7 +10491,7 @@ static int perf_tp_filter_match(struct perf_event *event,
 }
 
 static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data,
+ struct perf_raw_record *raw,
 struct pt_regs *regs)
 {
 if (event->hw.state & PERF_HES_STOPPED)
@@ -9464,7 +10502,7 @@ static int perf_tp_event_match(struct perf_event *event,
 if (event->attr.exclude_kernel && !user_mode(regs))
 return 0;
 
- if (!perf_tp_filter_match(event, data))
+ if (!perf_tp_filter_match(event, raw))
 return 0;
 
 return 1;
@@ -9487,6 +10525,49 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 }
 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
 
+static void __perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_raw_record *raw,
+ struct perf_event *event)
+{
+ struct trace_entry *entry = record;
+
+ if (event->attr.config != entry->type)
+ return;
+ /* Cannot deliver synchronous signal to other task. */
+ if (event->attr.sigtrap)
+ return;
+ if (perf_tp_event_match(event, raw, regs)) {
+ perf_sample_data_init(data, 0, 0);
+ perf_sample_save_raw_data(data, event, raw);
+ perf_swevent_event(event, count, data, regs);
+ }
+}
+
+static void perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_raw_record *raw,
+ struct perf_event_context *ctx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct pmu *pmu = &perf_tracepoint;
+ struct perf_event *event, *sibling;
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, raw, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, raw, sibling);
+ }
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, raw, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, raw, sibling);
+ }
+}
+
 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 struct pt_regs *regs, struct hlist_head *head, int rctx,
 struct task_struct *task)
@@ -9501,14 +10582,22 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 },
 };
 
- perf_sample_data_init(&data, 0, 0);
- data.raw = &raw;
-
 perf_trace_buf_update(record, event_type);
 
 hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
+ if (perf_tp_event_match(event, &raw, regs)) {
+ /*
+ * Here we use the same on-stack perf_sample_data,
+ * but some members in data are event-specific and
+ * need to be re-computed for different sw events.
+ * Re-initialize data->sample_flags safely to avoid
+ * the problem that the next event skips preparing data
+ * because data->sample_flags is set.
+ */ + perf_sample_data_init(&data, 0, 0); + perf_sample_save_raw_data(&data, event, &raw); perf_swevent_event(event, count, &data, regs); + } } /* @@ -9517,23 +10606,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, */ if (task && task != current) { struct perf_event_context *ctx; - struct trace_entry *entry = record; rcu_read_lock(); - ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto unlock; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->cpu != smp_processor_id()) - continue; - if (event->attr.type != PERF_TYPE_TRACEPOINT) - continue; - if (event->attr.config != entry->type) - continue; - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } + raw_spin_lock(&ctx->lock); + perf_tp_event_target_task(count, record, regs, &data, &raw, ctx); + raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); } @@ -9542,44 +10623,6 @@ unlock: } EXPORT_SYMBOL_GPL(perf_tp_event); -static void tp_perf_event_destroy(struct perf_event *event) -{ - perf_trace_destroy(event); -} - -static int perf_tp_event_init(struct perf_event *event) -{ - int err; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -ENOENT; - - /* - * no branch sampling for tracepoint events - */ - if (has_branch_stack(event)) - return -EOPNOTSUPP; - - err = perf_trace_init(event); - if (err) - return err; - - event->destroy = tp_perf_event_destroy; - - return 0; -} - -static struct pmu perf_tracepoint = { - .task_ctx_nr = perf_sw_context, - - .event_init = perf_tp_event_init, - .add = perf_trace_add, - .del = perf_trace_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe @@ -9737,91 +10780,6 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } -#ifdef CONFIG_BPF_SYSCALL -static void bpf_overflow_handler(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct bpf_perf_event_data_kern ctx = { - .data = data, - .event = event, - }; - int ret = 0; - - ctx.regs = perf_arch_bpf_user_pt_regs(regs); - if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) - goto out; - rcu_read_lock(); - ret = BPF_PROG_RUN(event->prog, &ctx); - rcu_read_unlock(); -out: - __this_cpu_dec(bpf_prog_active); - if (!ret) - return; - - event->orig_overflow_handler(event, data, regs); -} - -static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) -{ - struct bpf_prog *prog; - - if (event->overflow_handler_context) - /* hw breakpoint or kernel counter */ - return -EINVAL; - - if (event->prog) - return -EEXIST; - - prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); - if (IS_ERR(prog)) - return PTR_ERR(prog); - - if (event->attr.precise_ip && - prog->call_get_stack && - (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || - event->attr.exclude_callchain_kernel || - event->attr.exclude_callchain_user)) { - /* - * On perf_event with precise_ip, calling bpf_get_stack() - * may trigger unwinder warnings and occasional crashes. - * bpf_get_[stack|stackid] works around this issue by using - * callchain attached to perf_sample_data. 
If the
- perf_event does not have full (kernel and user) callchain
- * attached to perf_sample_data, do not allow attaching BPF
- * program that calls bpf_get_[stack|stackid].
- */
- bpf_prog_put(prog);
- return -EPROTO;
- }
-
- event->prog = prog;
- event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
- WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
- return 0;
-}
-
-static void perf_event_free_bpf_handler(struct perf_event *event)
-{
- struct bpf_prog *prog = event->prog;
-
- if (!prog)
- return;
-
- WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
- event->prog = NULL;
- bpf_prog_put(prog);
-}
-#else
-static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
-{
- return -EOPNOTSUPP;
-}
-static void perf_event_free_bpf_handler(struct perf_event *event)
-{
-}
-#endif
-
 /*
 * returns true if the event is a tracepoint, or a kprobe/uprobe created
 * with perf_event_open()
@@ -9841,57 +10799,46 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
 return false;
 }
 
-static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+ u64 bpf_cookie)
 {
- bool is_kprobe, is_tracepoint, is_syscall_tp;
- struct bpf_prog *prog;
- int ret;
+ bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
 
 if (!perf_event_is_tracing(event))
- return perf_event_set_bpf_handler(event, prog_fd);
+ return perf_event_set_bpf_handler(event, prog, bpf_cookie);
 
- is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
+ is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
+ is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
 is_syscall_tp = is_syscall_trace_event(event->tp_event);
- if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
+ if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
 /* bpf programs can only be attached to u/kprobe or tracepoint */
 return -EINVAL;
 
- prog = bpf_prog_get(prog_fd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
- if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
+ if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
- (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
- /* valid fd, but invalid bpf program type */
- bpf_prog_put(prog);
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
+ return -EINVAL;
+
+ if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
+ /* only uprobe programs are allowed to be sleepable */
 return -EINVAL;
- }
 
 /* Kprobe override only works for kprobes, not uprobes.
*/ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { - bpf_prog_put(prog); + if (prog->kprobe_override && !is_kprobe) return -EINVAL; - } if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); - if (prog->aux->max_ctx_offset > off) { - bpf_prog_put(prog); + if (prog->aux->max_ctx_offset > off) return -EACCES; - } } - ret = perf_event_attach_bpf_prog(event, prog); - if (ret) - bpf_prog_put(prog); - return ret; + return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); @@ -9910,12 +10857,13 @@ static void perf_event_free_filter(struct perf_event *event) { } -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, + u64 bpf_cookie) { return -ENOENT; } -static void perf_event_free_bpf_prog(struct perf_event *event) +void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ @@ -10000,8 +10948,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; @@ -10031,7 +10980,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) return; if (ifh->nr_file_filters) { - mm = get_task_mm(event->ctx->task); + mm = get_task_mm(task); if (!mm) goto restart; @@ -10212,8 +11161,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ if (state == IF_STATE_END) { ret = -EINVAL; - if (kernel && event->attr.exclude_kernel) - goto fail; /* * ACTION "filter" must have a non-zero length region @@ -10255,8 +11202,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, } /* ready to consume more filters */ + kfree(filename); + filename = NULL; state = IF_STATE_ACTION; filter = NULL; + kernel = 0; } } @@ -10491,7 +11441,7 @@ static void cpu_clock_event_read(struct perf_event *event) static int cpu_clock_event_init(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_SOFTWARE) + if (event->attr.type != perf_cpu_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) @@ -10512,6 +11462,7 @@ static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, + .dev = PMU_NULL_DEV, .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, @@ -10572,7 +11523,7 @@ static void task_clock_event_read(struct perf_event *event) static int task_clock_event_init(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_SOFTWARE) + if (event->attr.type != perf_task_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) @@ -10593,6 +11544,7 @@ static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, + .dev = PMU_NULL_DEV, .event_init = task_clock_event_init, .add = task_clock_event_add, @@ -10662,36 +11614,9 @@ static int perf_event_idx_default(struct perf_event *event) return 0; } -/* - * Ensures all contexts with the same task_ctx_nr have the same - * pmu_cpu_context too. 
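The perf_event_parse_addr_filter() hunk above fixes two subtle loop bugs: the previous iteration's filename was leaked, and the kernel flag carried over into the next filter. A stand-alone sketch of the per-iteration reset pattern the patch adopts (the token format here is made up for illustration):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *tokens[] = { "filter:/bin/true", "stop:/bin/false" };
	char *filename = NULL;

	for (size_t i = 0; i < sizeof(tokens) / sizeof(tokens[0]); i++) {
		filename = strdup(strchr(tokens[i], ':') + 1);
		if (!filename)
			return 1;
		printf("parsed filter on %s\n", filename);

		/* ready to consume more filters: free and reset the
		 * per-filter state before the next iteration, as the
		 * hunk above now does in the kernel loop */
		free(filename);
		filename = NULL;
	}
	return 0;
}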
- */ -static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) -{ - struct pmu *pmu; - - if (ctxn < 0) - return NULL; - - list_for_each_entry(pmu, &pmus, entry) { - if (pmu->task_ctx_nr == ctxn) - return pmu->pmu_cpu_context; - } - - return NULL; -} - static void free_pmu_context(struct pmu *pmu) { - /* - * Static contexts such as perf_sw_context have a global lifetime - * and may be shared between different PMUs. Avoid freeing them - * when a single PMU is going away. - */ - if (pmu->task_ctx_nr > perf_invalid_context) - return; - - free_percpu(pmu->pmu_cpu_context); + free_percpu(pmu->cpu_pmu_context); } /* @@ -10703,7 +11628,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -10714,7 +11639,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -10725,7 +11650,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); @@ -10755,12 +11680,11 @@ perf_event_mux_interval_ms_store(struct device *dev, /* update all cpuctx for this PMU */ cpus_read_lock(); for_each_online_cpu(cpu) { - struct perf_cpu_context *cpuctx; - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + struct perf_cpu_pmu_context *cpc; + cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - cpu_function_call(cpu, - (remote_function_f)perf_mux_hrtimer_restart, cpuctx); + cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); } cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); @@ -10769,12 +11693,87 @@ perf_event_mux_interval_ms_store(struct device *dev, } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return topology_sibling_cpumask(cpu); + case PERF_PMU_SCOPE_DIE: + return topology_die_cpumask(cpu); + case PERF_PMU_SCOPE_CLUSTER: + return topology_cluster_cpumask(cpu); + case PERF_PMU_SCOPE_PKG: + return topology_core_cpumask(cpu); + case PERF_PMU_SCOPE_SYS_WIDE: + return cpu_online_mask; + } + + return NULL; +} + +static inline struct cpumask *perf_scope_cpumask(unsigned int scope) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return perf_online_core_mask; + case PERF_PMU_SCOPE_DIE: + return perf_online_die_mask; + case PERF_PMU_SCOPE_CLUSTER: + return perf_online_cluster_mask; + case PERF_PMU_SCOPE_PKG: + return perf_online_pkg_mask; + case PERF_PMU_SCOPE_SYS_WIDE: + return perf_online_sys_mask; + } + + return NULL; +} + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct cpumask *mask = perf_scope_cpumask(pmu->scope); + + if (mask) + return cpumap_print_to_pagebuf(true, buf, mask); + return 0; +} + +static DEVICE_ATTR_RO(cpumask); + static struct attribute 
*pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, + &dev_attr_nr_addr_filters.attr, + &dev_attr_cpumask.attr, + NULL, +}; + +static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pmu *pmu = dev_get_drvdata(dev); + + if (n == 2 && !pmu->nr_addr_filters) + return 0; + + /* cpumask */ + if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) + return 0; + + return a->mode; +} + +static struct attribute_group pmu_dev_attr_group = { + .is_visible = pmu_dev_is_visible, + .attrs = pmu_dev_attrs, +}; + +static const struct attribute_group *pmu_dev_groups[] = { + &pmu_dev_attr_group, NULL, }; -ATTRIBUTE_GROUPS(pmu_dev); static int pmu_bus_running; static struct bus_type pmu_bus = { @@ -10797,29 +11796,25 @@ static int pmu_dev_alloc(struct pmu *pmu) pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); - ret = dev_set_name(pmu->dev, "%s", pmu->name); - if (ret) - goto free_dev; dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; + pmu->dev->parent = pmu->parent; pmu->dev->release = pmu_dev_release; - ret = device_add(pmu->dev); + + ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) goto free_dev; - /* For PMUs with address filters, throw in an extra attribute: */ - if (pmu->nr_addr_filters) - ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); - + ret = device_add(pmu->dev); if (ret) - goto del_dev; + goto free_dev; - if (pmu->attr_update) + if (pmu->attr_update) { ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); - - if (ret) - goto del_dev; + if (ret) + goto del_dev; + } out: return ret; @@ -10835,6 +11830,21 @@ free_dev: static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; +static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) +{ + void *tmp, *val = idr_find(idr, id); + + if (val != old) + return false; + + tmp = idr_replace(idr, new, id); + if (IS_ERR(tmp)) + return false; + + WARN_ON_ONCE(tmp != val); + return true; +} + int perf_pmu_register(struct pmu *pmu, const char *name, int type) { int cpu, ret, max = PERF_TYPE_MAX; @@ -10846,72 +11856,50 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) goto unlock; pmu->type = -1; - if (!name) - goto skip_type; + if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { + ret = -EINVAL; + goto free_pdc; + } + + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { + ret = -EINVAL; + goto free_pdc; + } + pmu->name = name; - if (type != PERF_TYPE_SOFTWARE) { - if (type >= 0) - max = type; + if (type >= 0) + max = type; - ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); - if (ret < 0) - goto free_pdc; + ret = idr_alloc(&pmu_idr, NULL, max, 0, GFP_KERNEL); + if (ret < 0) + goto free_pdc; - WARN_ON(type >= 0 && ret != type); + WARN_ON(type >= 0 && ret != type); - type = ret; - } + type = ret; pmu->type = type; + atomic_set(&pmu->exclusive_cnt, 0); - if (pmu_bus_running) { + if (pmu_bus_running && !pmu->dev) { ret = pmu_dev_alloc(pmu); if (ret) goto free_idr; } -skip_type: - if (pmu->task_ctx_nr == perf_hw_context) { - static int hw_context_taken = 0; - - /* - * Other than systems with heterogeneous CPUs, it never makes - * sense for two PMUs to share perf_hw_context. PMUs which are - * uncore must use perf_invalid_context. 
- */ - if (WARN_ON_ONCE(hw_context_taken && - !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) - pmu->task_ctx_nr = perf_invalid_context; - - hw_context_taken = 1; - } - - pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); - if (pmu->pmu_cpu_context) - goto got_cpu_context; - ret = -ENOMEM; - pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); - if (!pmu->pmu_cpu_context) + pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); + if (!pmu->cpu_pmu_context) goto free_dev; for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_pmu_context *cpc; - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx); - lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); - lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.pmu = pmu; - cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); - - __perf_mux_hrtimer_init(cpuctx, cpu); - - cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); - cpuctx->heap = cpuctx->heap_default; + cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + __perf_init_event_pmu_context(&cpc->epc, pmu); + __perf_mux_hrtimer_init(cpc, cpu); } -got_cpu_context: if (!pmu->start_txn) { if (pmu->pmu_enable) { /* @@ -10941,29 +11929,29 @@ got_cpu_context: pmu->event_idx = perf_event_idx_default; /* - * Ensure the TYPE_SOFTWARE PMUs are at the head of the list, - * since these cannot be in the IDR. This way the linear search - * is fast, provided a valid software event is provided. + * Now that the PMU is complete, make it visible to perf_try_init_event(). */ - if (type == PERF_TYPE_SOFTWARE || !name) - list_add_rcu(&pmu->entry, &pmus); - else - list_add_tail_rcu(&pmu->entry, &pmus); + if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) + goto free_context; + list_add_rcu(&pmu->entry, &pmus); - atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: mutex_unlock(&pmus_lock); return ret; +free_context: + free_percpu(pmu->cpu_pmu_context); + free_dev: - device_del(pmu->dev); - put_device(pmu->dev); + if (pmu->dev && pmu->dev != PMU_NULL_DEV) { + device_del(pmu->dev); + put_device(pmu->dev); + } free_idr: - if (pmu->type != PERF_TYPE_SOFTWARE) - idr_remove(&pmu_idr, pmu->type); + idr_remove(&pmu_idr, pmu->type); free_pdc: free_percpu(pmu->pmu_disable_count); @@ -10975,6 +11963,8 @@ void perf_pmu_unregister(struct pmu *pmu) { mutex_lock(&pmus_lock); list_del_rcu(&pmu->entry); + idr_remove(&pmu_idr, pmu->type); + mutex_unlock(&pmus_lock); /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -10984,16 +11974,13 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); - if (pmu->type != PERF_TYPE_SOFTWARE) - idr_remove(&pmu_idr, pmu->type); - if (pmu_bus_running) { + if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); - mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -11042,6 +12029,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) event_has_any_exclude_flag(event)) ret = -EINVAL; + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); + int cpu; + + if (pmu_cpumask && cpumask) { + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + ret = 
-ENODEV; + else + event->event_caps |= PERF_EV_CAP_READ_SCOPE; + } else { + ret = -ENODEV; + } + } + if (ret && event->destroy) event->destroy(event); } @@ -11054,11 +12057,18 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { + bool extended_type = false; int idx, type, ret; struct pmu *pmu; idx = srcu_read_lock(&pmus_srcu); + /* + * Save original type before calling pmu->event_init() since certain + * pmus overwrites event->attr.type to forward event to another pmu. + */ + event->orig_type = event->attr.type; + /* Try parent's PMU first: */ if (event->parent && event->parent->pmu) { pmu = event->parent->pmu; @@ -11072,16 +12082,27 @@ static struct pmu *perf_init_event(struct perf_event *event) * are often aliases for PERF_TYPE_RAW. */ type = event->attr.type; - if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) - type = PERF_TYPE_RAW; + if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { + type = event->attr.config >> PERF_PMU_TYPE_SHIFT; + if (!type) { + type = PERF_TYPE_RAW; + } else { + extended_type = true; + event->attr.config &= PERF_HW_EVENT_MASK; + } + } again: rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { + if (event->attr.type != type && type != PERF_TYPE_RAW && + !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) + goto fail; + ret = perf_try_init_event(pmu, event); - if (ret == -ENOENT && event->attr.type != type) { + if (ret == -ENOENT && event->attr.type != type && !extended_type) { type = event->attr.type; goto again; } @@ -11102,6 +12123,7 @@ again: goto unlock; } } +fail: pmu = ERR_PTR(-ENOENT); unlock: srcu_read_unlock(&pmus_srcu, idx); @@ -11131,15 +12153,6 @@ static void account_pmu_sb_event(struct perf_event *event) attach_sb_event(event); } -static void account_event_cpu(struct perf_event *event, int cpu) -{ - if (event->parent) - return; - - if (is_cgroup_event(event)) - atomic_inc(&per_cpu(perf_cgroup_events, cpu)); -} - /* Freq events need the tick to stay alive (see perf_event_task_tick). */ static void account_freq_event_nohz(void) { @@ -11168,10 +12181,12 @@ static void account_event(struct perf_event *event) if (event->parent) return; - if (event->attach_state & PERF_ATTACH_TASK) + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); + if (event->attr.build_id) + atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) @@ -11225,8 +12240,6 @@ static void account_event(struct perf_event *event) } enabled: - account_event_cpu(event, event->cpu); - account_pmu_sb_event(event); } @@ -11245,13 +12258,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; + int node; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) return ERR_PTR(-EINVAL); } + if (attr->sigtrap && !task) { + /* Requires a task: avoid signalling random tasks. */ + return ERR_PTR(-EINVAL); + } - event = kzalloc(sizeof(*event), GFP_KERNEL); + node = (cpu >= 0) ? 
cpu_to_node(cpu) : -1; + event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, + node); if (!event) return ERR_PTR(-ENOMEM); @@ -11276,8 +12296,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); - event->pending_disable = -1; - init_irq_work(&event->pending, perf_pending_event); + init_irq_work(&event->pending_irq, perf_pending_irq); + event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); + init_task_work(&event->pending_task, perf_pending_task); + rcuwait_init(&event->pending_work_wait); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -11296,6 +12318,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->state = PERF_EVENT_STATE_INACTIVE; + if (parent_event) + event->event_caps = parent_event->event_caps; + if (task) { event->attach_state = PERF_ATTACH_TASK; /* @@ -11314,13 +12339,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) - if (overflow_handler == bpf_overflow_handler) { + if (parent_event->prog) { struct bpf_prog *prog = parent_event->prog; bpf_prog_inc(prog); event->prog = prog; - event->orig_overflow_handler = - parent_event->orig_overflow_handler; } #endif } @@ -11349,10 +12372,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, local64_set(&hwc->period_left, hwc->sample_period); /* - * We currently do not support PERF_SAMPLE_READ on inherited events. + * We do not support PERF_SAMPLE_READ on inherited events unless + * PERF_SAMPLE_TID is also selected, which allows inherited events to + * collect per-thread samples. * See perf_output_read(). */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) + if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) goto err_ns; if (!has_branch_stack(event)) @@ -11365,20 +12390,35 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } /* - * Disallow uncore-cgroup events, they don't make sense as the cgroup will - * be different on other CPUs in the uncore mask. + * Disallow uncore-task events. Similarly, disallow uncore-cgroup + * events (they don't make sense as the cgroup will be different + * on other CPUs in the uncore mask). 
*/ - if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { + if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { err = -EINVAL; goto err_pmu; } if (event->attr.aux_output && - !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { + (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || + event->attr.aux_pause || event->attr.aux_resume)) { err = -EOPNOTSUPP; goto err_pmu; } + if (event->attr.aux_pause && event->attr.aux_resume) { + err = -EINVAL; + goto err_pmu; + } + + if (event->attr.aux_start_paused) { + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { + err = -EOPNOTSUPP; + goto err_pmu; + } + event->hw.aux_paused = 1; + } + if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) @@ -11451,11 +12491,9 @@ err_pmu: event->destroy(event); module_put(pmu->module); err_ns: - if (event->ns) - put_pid_ns(event->ns); if (event->hw.target) put_task_struct(event->hw.target); - kfree(event); + call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } @@ -11564,6 +12602,18 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif + if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && + (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) + return -EINVAL; + + if (!attr->inherit && attr->inherit_thread) + return -EINVAL; + + if (attr->remove_on_exec && attr->enable_on_exec) + return -EINVAL; + + if (attr->sigtrap && !attr->remove_on_exec) + return -EINVAL; out: return ret; @@ -11574,14 +12624,25 @@ err_size: goto out; } +static void mutex_lock_double(struct mutex *a, struct mutex *b) +{ + if (b < a) + swap(a, b); + + mutex_lock(a); + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); +} + static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { struct perf_buffer *rb = NULL; int ret = -EINVAL; - if (!output_event) + if (!output_event) { + mutex_lock(&event->mmap_mutex); goto set; + } /* don't allow circular references */ if (event == output_event) @@ -11596,7 +12657,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) /* * If its not a per-cpu rb, it must be the same task. */ - if (output_event->cpu == -1 && output_event->ctx != event->ctx) + if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) goto out; /* @@ -11619,8 +12680,15 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) event->pmu != output_event->pmu) goto out; + /* + * Hold both mmap_mutex to serialize against perf_mmap_close(). Since + * output_event is already on rb->event_list, and the list iteration + * restarts after every removal, it is guaranteed this new event is + * observed *OR* if output_event is already removed, it's guaranteed we + * observe !rb->mmap_count. 
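mutex_lock_double(), moved above its first user in this hunk, avoids ABBA deadlock by always acquiring the lower-addressed mutex first. A user-space rendition of the same ordering trick with pthreads (the kernel version additionally uses mutex_lock_nested() for lockdep's benefit, which has no pthreads equivalent):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static void lock_double(pthread_mutex_t *a, pthread_mutex_t *b)
{
	/* order by address so concurrent callers agree on lock order */
	if ((uintptr_t)b < (uintptr_t)a) {
		pthread_mutex_t *t = a;
		a = b;
		b = t;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_double(&m1, &m2);	/* locks in the same order as lock_double(&m2, &m1) */
	puts("both held, no ABBA possible");
	pthread_mutex_unlock(&m2);
	pthread_mutex_unlock(&m1);
	return 0;
}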
+ */ + mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: - mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ if (atomic_read(&event->mmap_count)) goto unlock; @@ -11630,6 +12698,12 @@ set: rb = ring_buffer_get(output_event); if (!rb) goto unlock; + + /* did we race against perf_mmap_close() */ + if (!atomic_read(&rb->mmap_count)) { + ring_buffer_put(rb); + goto unlock; + } } ring_buffer_attach(event, rb); @@ -11637,20 +12711,13 @@ set: ret = 0; unlock: mutex_unlock(&event->mmap_mutex); + if (output_event) + mutex_unlock(&output_event->mmap_mutex); out: return ret; } -static void mutex_lock_double(struct mutex *a, struct mutex *b) -{ - if (b < a) - swap(a, b); - - mutex_lock(a); - mutex_lock_nested(b, SINGLE_DEPTH_NESTING); -} - static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; @@ -11688,35 +12755,35 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) return 0; } -/* - * Variation on perf_event_ctx_lock_nested(), except we take two context - * mutexes. - */ -static struct perf_event_context * -__perf_event_ctx_lock_double(struct perf_event *group_leader, - struct perf_event_context *ctx) +static bool +perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) { - struct perf_event_context *gctx; + unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS; + bool is_capable = perfmon_capable(); -again: - rcu_read_lock(); - gctx = READ_ONCE(group_leader->ctx); - if (!refcount_inc_not_zero(&gctx->refcount)) { + if (attr->sigtrap) { + /* + * perf_event_attr::sigtrap sends signals to the other task. + * Require the current task to also have CAP_KILL. + */ + rcu_read_lock(); + is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL); rcu_read_unlock(); - goto again; - } - rcu_read_unlock(); - mutex_lock_double(&gctx->mutex, &ctx->mutex); - - if (group_leader->ctx != gctx) { - mutex_unlock(&ctx->mutex); - mutex_unlock(&gctx->mutex); - put_ctx(gctx); - goto again; + /* + * If the required capabilities aren't available, checks for + * ptrace permissions: upgrade to ATTACH, since sending signals + * can effectively change the target task. + */ + ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS; } - return gctx; + /* + * Preserve ptrace permission check for backwards compatibility. The + * ptrace check also includes checks that the current task and other + * task have matching uids, and is therefore not done here explicitly. + */ + return is_capable || ptrace_may_access(task, ptrace_mode); } /** @@ -11726,17 +12793,18 @@ again: * @pid: target pid * @cpu: target cpu * @group_fd: group leader event fd + * @flags: perf event open flags */ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event_pmu_context *pmu_ctx; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx, *gctx; + struct perf_event_context *ctx; struct file *event_file = NULL; - struct fd group = {NULL, 0}; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; @@ -11749,12 +12817,12 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; - /* Do we allow access to perf_event_open(2) ? 
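perf_check_permission() above encodes the rule that sigtrap events need both CAP_PERFMON-equivalent privilege and CAP_KILL, or failing that a ptrace ATTACH-level (not merely READ-level) relationship with the target. A boolean model of that decision tree, with the kernel predicates stubbed out (all names here are illustrative, not kernel APIs):

#include <stdbool.h>
#include <stdio.h>

static bool check_permission(bool sigtrap, bool perfmon_capable,
			     bool cap_kill, bool ptrace_read_ok,
			     bool ptrace_attach_ok)
{
	bool is_capable = perfmon_capable;
	bool need_attach = false;

	if (sigtrap) {
		/* sending signals can change the target task, so demand
		 * CAP_KILL and upgrade the ptrace fallback to ATTACH */
		is_capable = is_capable && cap_kill;
		need_attach = true;
	}

	return is_capable || (need_attach ? ptrace_attach_ok : ptrace_read_ok);
}

int main(void)
{
	/* unprivileged caller asking for sigtrap with only read access */
	printf("%s\n", check_permission(true, false, false, true, false) ?
	       "allowed" : "denied");
	return 0;
}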
*/ - err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; - err = perf_copy_attr(attr_uptr, &attr); + /* Do we allow access to perf_event_open(2) ? */ + err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); if (err) return err; @@ -11784,12 +12852,12 @@ SYSCALL_DEFINE5(perf_event_open, return err; } - err = security_locked_down(LOCKDOWN_PERF); - if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) - /* REGS_INTR can leak data, lockdown must prevent this */ - return err; - - err = 0; + /* REGS_INTR can leak data, lockdown must prevent this */ + if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { + err = security_locked_down(LOCKDOWN_PERF); + if (err) + return err; + } /* * In cgroup mode, the pid argument is used to pass the fd @@ -11807,11 +12875,13 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { - err = perf_fget_light(group_fd, &group); - if (err) + if (!is_perf_file(group)) { + err = -EBADF; goto err_fd; - group_leader = group.file->private_data; + } + group_leader = fd_file(group)->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -11822,7 +12892,7 @@ SYSCALL_DEFINE5(perf_event_open, task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); - goto err_group_fd; + goto err_fd; } } @@ -11864,42 +12934,53 @@ SYSCALL_DEFINE5(perf_event_open, if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; - if (group_leader) { - if (is_software_event(event) && - !in_software_context(group_leader)) { - /* - * If the event is a sw event, but the group_leader - * is on hw context. - * - * Allow the addition of software events to hw - * groups, this is safe because software events - * never fail to schedule. - */ - pmu = group_leader->ctx->pmu; - } else if (!is_software_event(event) && - is_software_event(group_leader) && - (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { - /* - * In case the group is a pure software group, and we - * try to add a hardware event, move the whole group to - * the hardware context. - */ - move_group = 1; - } + if (task) { + err = down_read_interruptible(&task->signal->exec_update_lock); + if (err) + goto err_alloc; + + /* + * We must hold exec_update_lock across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). + */ + err = -EACCES; + if (!perf_check_permission(&attr, task)) + goto err_cred; } /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, event); + ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_alloc; + goto err_cred; + } + + mutex_lock(&ctx->mutex); + + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
+ */ + struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); + + if (!cpuctx->online) { + err = -ENODEV; + goto err_locked; + } } - /* - * Look up the group leader (we will attach this event to it): - */ if (group_leader) { err = -EINVAL; @@ -11908,11 +12989,11 @@ SYSCALL_DEFINE5(perf_event_open, * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) - goto err_context; + goto err_locked; /* All events in a group should have the same clock */ if (group_leader->clock != event->clock) - goto err_context; + goto err_locked; /* * Make sure we're both events for the same CPU; @@ -11920,133 +13001,76 @@ SYSCALL_DEFINE5(perf_event_open, * you can never concurrently schedule them anyhow. */ if (group_leader->cpu != event->cpu) - goto err_context; - - /* - * Make sure we're both on the same task, or both - * per-CPU events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + goto err_locked; /* - * Do not allow to attach to a group in a different task - * or CPU context. If we're moving SW events, we'll fix - * this up later, so allow that. + * Make sure we're both on the same context; either task or cpu. */ - if (!move_group && group_leader->ctx != ctx) - goto err_context; + if (group_leader->ctx != ctx) + goto err_locked; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) - goto err_context; - } - - if (output_event) { - err = perf_event_set_output(event, output_event); - if (err) - goto err_context; - } - - event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, - f_flags); - if (IS_ERR(event_file)) { - err = PTR_ERR(event_file); - event_file = NULL; - goto err_context; - } - - if (task) { - err = down_read_interruptible(&task->signal->exec_update_lock); - if (err) - goto err_file; - - /* - * Preserve ptrace permission check for backwards compatibility. - * - * We must hold exec_update_lock across this and any potential - * perf_install_in_context() call for this new event to - * serialize against exec() altering our credentials (and the - * perf_event_exit_task() that could imply). - */ - err = -EACCES; - if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) - goto err_cred; - } - - if (move_group) { - gctx = __perf_event_ctx_lock_double(group_leader, ctx); - - if (gctx->task == TASK_TOMBSTONE) { - err = -ESRCH; goto err_locked; - } - /* - * Check if we raced against another sys_perf_event_open() call - * moving the software group underneath us. - */ - if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + if (is_software_event(event) && + !in_software_context(group_leader)) { /* - * If someone moved the group out from under us, check - * if this new event wound up on the same ctx, if so - * its the regular !move_group case, otherwise fail. + * If the event is a sw event, but the group_leader + * is on hw context. + * + * Allow the addition of software events to hw + * groups, this is safe because software events + * never fail to schedule. + * + * Note the comment that goes with struct + * perf_event_pmu_context. 
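The group-leader checks above (same clock, same CPU, same context) are what a caller of perf_event_open(2) runs into when building an event group. A minimal user-space example that satisfies them by opening a software leader and attaching one sibling via group_fd (error handling trimmed for brevity):

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>

static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
			       int cpu, int group_fd, unsigned long flags)
{
	return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	int leader, sibling;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.disabled = 1;	/* only the leader needs to start disabled */

	/* pid 0 / cpu -1: measure this task on any CPU */
	leader = sys_perf_event_open(&attr, 0, -1, -1, 0);

	/* sibling: same task, same CPU spec, leader's fd as group_fd */
	attr.disabled = 0;
	attr.config = PERF_COUNT_SW_PAGE_FAULTS;
	sibling = sys_perf_event_open(&attr, 0, -1, leader, 0);

	printf("leader fd=%d, sibling fd=%d\n", leader, sibling);
	return 0;
}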
*/ - if (gctx != ctx) { - err = -EINVAL; - goto err_locked; - } else { - perf_event_ctx_unlock(group_leader, gctx); - move_group = 0; + pmu = group_leader->pmu_ctx->pmu; + } else if (!is_software_event(event)) { + if (is_software_event(group_leader) && + (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. + */ + move_group = 1; } - } - /* - * Failure to create exclusive events returns -EBUSY. - */ - err = -EBUSY; - if (!exclusive_event_installable(group_leader, ctx)) - goto err_locked; - - for_each_sibling_event(sibling, group_leader) { - if (!exclusive_event_installable(sibling, ctx)) + /* Don't allow group of multiple hw events from different pmus */ + if (!in_software_context(group_leader) && + group_leader->pmu_ctx->pmu != pmu) goto err_locked; } - } else { - mutex_lock(&ctx->mutex); } - if (ctx->task == TASK_TOMBSTONE) { - err = -ESRCH; + /* + * Now that we're certain of the pmu; find the pmu_ctx. + */ + pmu_ctx = find_get_pmu_context(pmu, ctx, event); + if (IS_ERR(pmu_ctx)) { + err = PTR_ERR(pmu_ctx); goto err_locked; } + event->pmu_ctx = pmu_ctx; - if (!perf_event_validate_size(event)) { - err = -E2BIG; - goto err_locked; + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_context; } - if (!task) { - /* - * Check if the @cpu we're creating an event for is online. - * - * We use the perf_cpu_context::ctx::mutex to serialize against - * the hotplug notifiers. See perf_event_{init,exit}_cpu(). - */ - struct perf_cpu_context *cpuctx = - container_of(ctx, struct perf_cpu_context, ctx); - - if (!cpuctx->online) { - err = -ENODEV; - goto err_locked; - } + if (!perf_event_validate_size(event)) { + err = -E2BIG; + goto err_context; } if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { err = -EINVAL; - goto err_locked; + goto err_context; } /* @@ -12055,36 +13079,33 @@ SYSCALL_DEFINE5(perf_event_open, */ if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; - goto err_locked; + goto err_context; } WARN_ON_ONCE(ctx->parent_ctx); + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); + if (IS_ERR(event_file)) { + err = PTR_ERR(event_file); + event_file = NULL; + goto err_context; + } + /* * This is the point on no return; we cannot fail hereafter. This is * where we start modifying current state. */ if (move_group) { - /* - * See perf_event_ctx_lock() for comments on the details - * of swizzling perf_event::ctx. - */ perf_remove_from_context(group_leader, 0); - put_ctx(gctx); + put_pmu_ctx(group_leader->pmu_ctx); for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); - put_ctx(gctx); + put_pmu_ctx(sibling->pmu_ctx); } /* - * Wait for everybody to stop referencing the events through - * the old lists, before installing it on new lists. - */ - synchronize_rcu(); - - /* * Install the group siblings before the group leader. * * Because a group leader will try and install the entire group @@ -12095,9 +13116,10 @@ SYSCALL_DEFINE5(perf_event_open, * reachable through the group lists. */ for_each_sibling_event(sibling, group_leader) { + sibling->pmu_ctx = pmu_ctx; + get_pmu_ctx(pmu_ctx); perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); - get_ctx(ctx); } /* @@ -12105,9 +13127,10 @@ SYSCALL_DEFINE5(perf_event_open, * event. 
What we want here is the event in its initial
 * startup state, ready to be added into a new context.
 */
+ group_leader->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
 perf_event__state_init(group_leader);
 perf_install_in_context(ctx, group_leader, group_leader->cpu);
- get_ctx(ctx);
 }
 
 /*
@@ -12124,8 +13147,6 @@ SYSCALL_DEFINE5(perf_event_open,
 perf_install_in_context(ctx, event, event->cpu);
 perf_unpin_context(ctx);
 
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
 mutex_unlock(&ctx->mutex);
 
 if (task) {
@@ -12138,39 +13159,29 @@ SYSCALL_DEFINE5(perf_event_open,
 mutex_unlock(&current->perf_event_mutex);
 
 /*
- * Drop the reference on the group_event after placing the
- * new event on the sibling_list. This ensures destruction
- * of the group leader will find the pointer to itself in
- * perf_group_detach().
+ * File reference in group guarantees that group_leader has been
+ * kept alive until we place the new event on the sibling_list.
+ * This ensures destruction of the group leader will find
+ * the pointer to itself in perf_group_detach().
 */
- fdput(group);
 fd_install(event_fd, event_file);
 return event_fd;
 
+err_context:
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL; /* _free_event() */
 err_locked:
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
 mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
 err_cred:
 if (task)
 up_read(&task->signal->exec_update_lock);
-err_file:
- fput(event_file);
-err_context:
- perf_unpin_context(ctx);
- put_ctx(ctx);
 err_alloc:
- /*
- * If event_file is set, the fput() above will have called ->release()
- * and that will take care of freeing the event.
- */
- if (!event_file)
- free_event(event);
+ free_event(event);
 err_task:
 if (task)
 put_task_struct(task);
-err_group_fd:
- fdput(group);
 err_fd:
 put_unused_fd(event_fd);
 return err;
@@ -12182,6 +13193,8 @@ err_fd:
 * @attr: attributes of the counter to create
 * @cpu: cpu in which the counter is bound
 * @task: task to profile (NULL for percpu)
+ * @overflow_handler: callback to trigger when we hit the event
+ * @context: context data that can be used in the overflow_handler callback
 */
 struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
@@ -12189,15 +13202,17 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 perf_overflow_handler_t overflow_handler,
 void *context)
 {
+ struct perf_event_pmu_context *pmu_ctx;
 struct perf_event_context *ctx;
 struct perf_event *event;
+ struct pmu *pmu;
 int err;
 
 /*
 * Grouping is not supported for kernel events, neither is 'AUX',
 * make sure the caller's intentions are adjusted.
 */
- if (attr->aux_output)
+ if (attr->aux_output || attr->aux_action)
 return ERR_PTR(-EINVAL);
 
 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
@@ -12209,14 +13224,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 /* Mark owner so we can distinguish it from user events.
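For the in-kernel counter API whose setup path is reworked above, usage looks roughly like the following module-style sketch. This assumes a GPL module and the current signatures of perf_event_create_kernel_counter(), perf_event_read_value() and perf_event_release_kernel(); the module name is made up:

#include <linux/err.h>
#include <linux/module.h>
#include <linux/perf_event.h>

static struct perf_event *demo_event;

static int __init demo_init(void)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_SOFTWARE,
		.config	= PERF_COUNT_SW_CPU_CLOCK,
		.size	= sizeof(attr),
	};

	/* percpu counter on CPU 0: no task, no overflow handler */
	demo_event = perf_event_create_kernel_counter(&attr, 0, NULL,
						      NULL, NULL);
	return IS_ERR(demo_event) ? PTR_ERR(demo_event) : 0;
}

static void __exit demo_exit(void)
{
	u64 enabled, running;

	pr_info("count=%llu\n",
		perf_event_read_value(demo_event, &enabled, &running));
	perf_event_release_kernel(demo_event);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");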
*/ event->owner = TASK_TOMBSTONE; + pmu = event->pmu; + + if (pmu->task_ctx_nr == perf_sw_context) + event->event_caps |= PERF_EV_CAP_SOFTWARE; /* * Get the target context (task or percpu): */ - ctx = find_get_context(event->pmu, task, event); + ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_free; + goto err_alloc; } WARN_ON_ONCE(ctx->parent_ctx); @@ -12226,6 +13245,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } + pmu_ctx = find_get_pmu_context(pmu, ctx, event); + if (IS_ERR(pmu_ctx)) { + err = PTR_ERR(pmu_ctx); + goto err_unlock; + } + event->pmu_ctx = pmu_ctx; + if (!task) { /* * Check if the @cpu we're creating an event for is online. @@ -12237,13 +13263,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; - goto err_unlock; + goto err_pmu_ctx; } } if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; - goto err_unlock; + goto err_pmu_ctx; } perf_install_in_context(ctx, event, event->cpu); @@ -12252,44 +13278,67 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_pmu_ctx: + put_pmu_ctx(pmu_ctx); + event->pmu_ctx = NULL; /* _free_event() */ err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); -err_free: +err_alloc: free_event(event); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); -void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +static void __perf_pmu_remove(struct perf_event_context *ctx, + int cpu, struct pmu *pmu, + struct perf_event_groups *groups, + struct list_head *events) { - struct perf_event_context *src_ctx; - struct perf_event_context *dst_ctx; - struct perf_event *event, *tmp; - LIST_HEAD(events); - - src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; - dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + struct perf_event *event, *sibling; - /* - * See perf_event_ctx_lock() for comments on the details - * of swizzling perf_event::ctx. - */ - mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); - list_for_each_entry_safe(event, tmp, &src_ctx->event_list, - event_entry) { + perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); - unaccount_event_cpu(event, src_cpu); - put_ctx(src_ctx); - list_add(&event->migrate_entry, &events); + put_pmu_ctx(event->pmu_ctx); + list_add(&event->migrate_entry, events); + + for_each_sibling_event(sibling, event) { + perf_remove_from_context(sibling, 0); + put_pmu_ctx(sibling->pmu_ctx); + list_add(&sibling->migrate_entry, events); + } } +} + +static void __perf_pmu_install_event(struct pmu *pmu, + struct perf_event_context *ctx, + int cpu, struct perf_event *event) +{ + struct perf_event_pmu_context *epc; + struct perf_event_context *old_ctx = event->ctx; + + get_ctx(ctx); /* normally find_get_context() */ + + event->cpu = cpu; + epc = find_get_pmu_context(pmu, ctx, event); + event->pmu_ctx = epc; + + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + perf_install_in_context(ctx, event, cpu); /* - * Wait for the events to quiesce before re-instating them. + * Now that event->ctx is updated and visible, put the old ctx. 
*/ - synchronize_rcu(); + put_ctx(old_ctx); +} + +static void __perf_pmu_install(struct perf_event_context *ctx, + int cpu, struct pmu *pmu, struct list_head *events) +{ + struct perf_event *event, *tmp; /* * Re-instate events in 2 passes. @@ -12299,45 +13348,72 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) * leader will enable its siblings, even if those are still on the old * context. */ - list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_for_each_entry_safe(event, tmp, events, migrate_entry) { if (event->group_leader == event) continue; list_del(&event->migrate_entry); - if (event->state >= PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, dst_cpu); - perf_install_in_context(dst_ctx, event, dst_cpu); - get_ctx(dst_ctx); + __perf_pmu_install_event(pmu, ctx, cpu, event); } /* * Once all the siblings are setup properly, install the group leaders * to make it go. */ - list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_for_each_entry_safe(event, tmp, events, migrate_entry) { list_del(&event->migrate_entry); - if (event->state >= PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, dst_cpu); - perf_install_in_context(dst_ctx, event, dst_cpu); - get_ctx(dst_ctx); + __perf_pmu_install_event(pmu, ctx, cpu, event); } +} + +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx, *dst_ctx; + LIST_HEAD(events); + + /* + * Since per-cpu context is persistent, no need to grab an extra + * reference. + */ + src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; + + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); + + __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events); + __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events); + + if (!list_empty(&events)) { + /* + * Wait for the events to quiesce before re-instating them. 
+ */ + synchronize_rcu(); + + __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events); + } + mutex_unlock(&dst_ctx->mutex); mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); -static void sync_child_event(struct perf_event *child_event, - struct task_struct *child) +static void sync_child_event(struct perf_event *child_event) { struct perf_event *parent_event = child_event->parent; u64 child_val; - if (child_event->attr.inherit_stat) - perf_event_read_event(child_event, child); + if (child_event->attr.inherit_stat) { + struct task_struct *task = child_event->ctx->task; - child_val = perf_event_count(child_event); + if (task && task != TASK_TOMBSTONE) + perf_event_read_event(child_event, task); + } + + child_val = perf_event_count(child_event, false); /* * Add back the child's count to the parent's count: @@ -12350,70 +13426,63 @@ static void sync_child_event(struct perf_event *child_event, } static void -perf_event_exit_event(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) +perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *parent_event = child_event->parent; + struct perf_event *parent_event = event->parent; + unsigned long detach_flags = 0; - /* - * Do not destroy the 'original' grouping; because of the context - * switch optimization the original events could've ended up in a - * random child task. - * - * If we were to destroy the original group, all group related - * operations would cease to function properly after this random - * child dies. - * - * Do destroy all inherited groups, we don't care about those - * and being thorough is better. - */ - raw_spin_lock_irq(&child_ctx->lock); - WARN_ON_ONCE(child_ctx->is_active); + if (parent_event) { + /* + * Do not destroy the 'original' grouping; because of the + * context switch optimization the original events could've + * ended up in a random child task. + * + * If we were to destroy the original group, all group related + * operations would cease to function properly after this + * random child dies. + * + * Do destroy all inherited groups, we don't care about those + * and being thorough is better. + */ + detach_flags = DETACH_GROUP | DETACH_CHILD; + mutex_lock(&parent_event->child_mutex); + } - if (parent_event) - perf_group_detach(child_event); - list_del_event(child_event, child_ctx); - perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */ - raw_spin_unlock_irq(&child_ctx->lock); + perf_remove_from_context(event, detach_flags); + + raw_spin_lock_irq(&ctx->lock); + if (event->state > PERF_EVENT_STATE_EXIT) + perf_event_set_state(event, PERF_EVENT_STATE_EXIT); + raw_spin_unlock_irq(&ctx->lock); /* - * Parent events are governed by their filedesc, retain them. + * Child events can be freed. */ - if (!parent_event) { - perf_event_wakeup(child_event); + if (parent_event) { + mutex_unlock(&parent_event->child_mutex); + /* + * Kick perf_poll() for is_event_hup(); + */ + perf_event_wakeup(parent_event); + free_event(event); + put_event(parent_event); return; } - /* - * Child events can be cleaned up. - */ - - sync_child_event(child_event, child); /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Kick perf_poll() for is_event_hup(). + * Parent events are governed by their filedesc, retain them. 
*/ - perf_event_wakeup(parent_event); - free_event(child_event); - put_event(parent_event); + perf_event_wakeup(event); } -static void perf_event_exit_task_context(struct task_struct *child, int ctxn) +static void perf_event_exit_task_context(struct task_struct *child) { struct perf_event_context *child_ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; WARN_ON_ONCE(child != current); - child_ctx = perf_pin_task_context(child, ctxn); + child_ctx = perf_pin_task_context(child); if (!child_ctx) return; @@ -12435,13 +13504,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL); + task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ - RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); + RCU_INIT_POINTER(child->perf_event_ctxp, NULL); put_ctx(child_ctx); /* cannot be last */ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); put_task_struct(current); /* cannot be last */ @@ -12460,7 +13529,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) perf_event_task(child, child_ctx, 0); list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - perf_event_exit_event(child_event, child_ctx, child); + perf_event_exit_event(child_event, child_ctx); mutex_unlock(&child_ctx->mutex); @@ -12476,7 +13545,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) void perf_event_exit_task(struct task_struct *child) { struct perf_event *event, *tmp; - int ctxn; mutex_lock(&child->perf_event_mutex); list_for_each_entry_safe(event, tmp, &child->perf_event_list, @@ -12492,8 +13560,7 @@ void perf_event_exit_task(struct task_struct *child) } mutex_unlock(&child->perf_event_mutex); - for_each_task_context_nr(ctxn) - perf_event_exit_task_context(child, ctxn); + perf_event_exit_task_context(child); /* * The perf_event_exit_task_context calls perf_event_task @@ -12536,56 +13603,51 @@ void perf_event_free_task(struct task_struct *task) { struct perf_event_context *ctx; struct perf_event *event, *tmp; - int ctxn; - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; + ctx = rcu_access_pointer(task->perf_event_ctxp); + if (!ctx) + return; - mutex_lock(&ctx->mutex); - raw_spin_lock_irq(&ctx->lock); - /* - * Destroy the task <-> ctx relation and mark the context dead. - * - * This is important because even though the task hasn't been - * exposed yet the context has been (through child_list). - */ - RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL); - WRITE_ONCE(ctx->task, TASK_TOMBSTONE); - put_task_struct(task); /* cannot be last */ - raw_spin_unlock_irq(&ctx->lock); + mutex_lock(&ctx->mutex); + raw_spin_lock_irq(&ctx->lock); + /* + * Destroy the task <-> ctx relation and mark the context dead. + * + * This is important because even though the task hasn't been + * exposed yet the context has been (through child_list). 
+ */ + RCU_INIT_POINTER(task->perf_event_ctxp, NULL); + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ + raw_spin_unlock_irq(&ctx->lock); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) - perf_free_event(event, ctx); - mutex_unlock(&ctx->mutex); + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) + perf_free_event(event, ctx); - /* - * perf_event_release_kernel() could've stolen some of our - * child events and still have them on its free_list. In that - * case we must wait for these events to have been freed (in - * particular all their references to this task must've been - * dropped). - * - * Without this copy_process() will unconditionally free this - * task (irrespective of its reference count) and - * _free_event()'s put_task_struct(event->hw.target) will be a - * use-after-free. - * - * Wait for all events to drop their context reference. - */ - wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); - put_ctx(ctx); /* must be last */ - } + mutex_unlock(&ctx->mutex); + + /* + * perf_event_release_kernel() could've stolen some of our + * child events and still have them on its free_list. In that + * case we must wait for these events to have been freed (in + * particular all their references to this task must've been + * dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); + put_ctx(ctx); /* must be last */ } void perf_event_delayed_put(struct task_struct *task) { - int ctxn; - - for_each_task_context_nr(ctxn) - WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); + WARN_ON_ONCE(task->perf_event_ctxp); } struct file *perf_event_get(unsigned int fd) @@ -12618,6 +13680,15 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) return &event->attr; } +int perf_allow_kernel(struct perf_event_attr *attr) +{ + if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) + return -EACCES; + + return security_perf_event_open(attr, PERF_SECURITY_KERNEL); +} +EXPORT_SYMBOL_GPL(perf_allow_kernel); + /* * Inherit an event from parent task to child task. 
* @@ -12635,6 +13706,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event_context *child_ctx) { enum perf_event_state parent_state = parent_event->state; + struct perf_event_pmu_context *pmu_ctx; struct perf_event *child_event; unsigned long flags; @@ -12655,17 +13727,12 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; - - if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) && - !child_ctx->task_ctx_data) { - struct pmu *pmu = child_event->pmu; - - child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); - if (!child_ctx->task_ctx_data) { - free_event(child_event); - return ERR_PTR(-ENOMEM); - } + pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); + if (IS_ERR(pmu_ctx)) { + free_event(child_event); + return ERR_CAST(pmu_ctx); } + child_event->pmu_ctx = pmu_ctx; /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) @@ -12720,6 +13787,7 @@ inherit_event(struct perf_event *parent_event, */ raw_spin_lock_irqsave(&child_ctx->lock, flags); add_event_to_ctx(child_event, child_ctx); + child_event->attach_state |= PERF_ATTACH_CHILD; raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* @@ -12770,6 +13838,8 @@ static int inherit_group(struct perf_event *parent_event, !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } + if (leader) + leader->group_generation = parent_event->group_generation; return 0; } @@ -12787,18 +13857,21 @@ static int inherit_group(struct perf_event *parent_event, static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, - struct task_struct *child, int ctxn, - int *inherited_all) + struct task_struct *child, + u64 clone_flags, int *inherited_all) { - int ret; struct perf_event_context *child_ctx; + int ret; - if (!event->attr.inherit) { + if (!event->attr.inherit || + (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || + /* Do not inherit if sigtrap and signal handlers were cleared. */ + (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) { *inherited_all = 0; return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -12806,16 +13879,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * First allocate and initialize a context for the * child. */ - child_ctx = alloc_perf_context(parent_ctx->pmu, child); + child_ctx = alloc_perf_context(child); if (!child_ctx) return -ENOMEM; - child->perf_event_ctxp[ctxn] = child_ctx; + child->perf_event_ctxp = child_ctx; } - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) *inherited_all = 0; @@ -12825,7 +13896,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, /* * Initialize the perf_event context in task_struct */ -static int perf_event_init_context(struct task_struct *child, int ctxn) +static int perf_event_init_context(struct task_struct *child, u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -12835,14 +13906,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) unsigned long flags; int ret = 0; - if (likely(!parent->perf_event_ctxp[ctxn])) + if (likely(!parent->perf_event_ctxp)) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. 
*/ - parent_ctx = perf_pin_task_context(parent, ctxn); + parent_ctx = perf_pin_task_context(parent); if (!parent_ctx) return 0; @@ -12865,7 +13936,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); + child, clone_flags, &inherited_all); if (ret) goto out_unlock; } @@ -12881,7 +13952,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); + child, clone_flags, &inherited_all); if (ret) goto out_unlock; } @@ -12889,7 +13960,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp; if (child_ctx && inherited_all) { /* @@ -12923,20 +13994,19 @@ out_unlock: /* * Initialize the perf_event context in task_struct */ -int perf_event_init_task(struct task_struct *child) +int perf_event_init_task(struct task_struct *child, u64 clone_flags) { - int ctxn, ret; + int ret; - memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + memset(child->perf_recursion, 0, sizeof(child->perf_recursion)); + child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); - for_each_task_context_nr(ctxn) { - ret = perf_event_init_context(child, ctxn); - if (ret) { - perf_event_free_task(child); - return ret; - } + ret = perf_event_init_context(child, clone_flags); + if (ret) { + perf_event_free_task(child); + return ret; } return 0; @@ -12945,21 +14015,33 @@ int perf_event_init_task(struct task_struct *child) static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; + struct perf_cpu_context *cpuctx; int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); -#ifdef CONFIG_CGROUP_PERF - INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); -#endif + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); + + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); + lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); + cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); + cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); + cpuctx->heap = cpuctx->heap_default; } } @@ -12981,34 +14063,70 @@ static void perf_swevent_init_cpu(unsigned int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx = __info; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event *event; raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, 
cpuctx, EVENT_TIME); + ctx_sched_out(ctx, NULL, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } +static void perf_event_clear_cpumask(unsigned int cpu) +{ + int target[PERF_PMU_MAX_SCOPE]; + unsigned int scope; + struct pmu *pmu; + + cpumask_clear_cpu(cpu, perf_online_mask); + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); + + target[scope] = -1; + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) + continue; + target[scope] = cpumask_any_but(cpumask, cpu); + if (target[scope] < nr_cpu_ids) + cpumask_set_cpu(target[scope], pmu_cpumask); + } + + /* migrate */ + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->scope == PERF_PMU_SCOPE_NONE || + WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) + continue; + + if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) + perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); + } +} + static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; + // XXX simplify cpuctx->online mutex_lock(&pmus_lock); - list_for_each_entry(pmu, &pmus, entry) { - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; + /* + * Clear the cpumasks, and migrate to other CPUs if possible. + * Must be invoked before the __perf_event_exit_context. + */ + perf_event_clear_cpumask(cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); - cpuctx->online = 0; - mutex_unlock(&ctx->mutex); - } - cpumask_clear_cpu(cpu, perf_online_mask); + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; + mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); } #else @@ -13017,24 +14135,57 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +static void perf_event_setup_cpumask(unsigned int cpu) +{ + struct cpumask *pmu_cpumask; + unsigned int scope; + + /* + * Early boot stage, the cpumask hasn't been set yet. + * The perf_online_<domain>_masks includes the first CPU of each domain. + * Always unconditionally set the boot CPU for the perf_online_<domain>_masks. 
+ */ + if (cpumask_empty(perf_online_mask)) { + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + pmu_cpumask = perf_scope_cpumask(scope); + if (WARN_ON_ONCE(!pmu_cpumask)) + continue; + cpumask_set_cpu(cpu, pmu_cpumask); + } + goto end; + } + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + + pmu_cpumask = perf_scope_cpumask(scope); + + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_empty(cpumask) && + cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) + cpumask_set_cpu(cpu, pmu_cpumask); + } +end: + cpumask_set_cpu(cpu, perf_online_mask); +} + int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); - cpumask_set_cpu(cpu, perf_online_mask); - list_for_each_entry(pmu, &pmus, entry) { - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; + perf_event_setup_cpumask(cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; - mutex_lock(&ctx->mutex); - cpuctx->online = 1; - mutex_unlock(&ctx->mutex); - } + mutex_lock(&ctx->mutex); + cpuctx->online = 1; + mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); return 0; @@ -13075,8 +14226,8 @@ void __init perf_event_init(void) perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); - perf_pmu_register(&perf_cpu_clock, NULL, -1); - perf_pmu_register(&perf_task_clock, NULL, -1); + perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1); + perf_pmu_register(&perf_task_clock, "task_clock", -1); perf_tp_register(); perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); @@ -13084,6 +14235,8 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); + perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC); + /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. @@ -13117,7 +14270,7 @@ static int __init perf_event_sysfs_init(void) goto unlock; list_for_each_entry(pmu, &pmus, entry) { - if (!pmu->name || pmu->type < 0) + if (pmu->dev) continue; ret = pmu_dev_alloc(pmu); @@ -13169,9 +14322,11 @@ static int perf_cgroup_css_online(struct cgroup_subsys_state *css) static int __perf_cgroup_move(void *info) { struct task_struct *task = info; - rcu_read_lock(); - perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); - rcu_read_unlock(); + + preempt_disable(); + perf_cgroup_switch(task); + preempt_enable(); + return 0; } @@ -13198,3 +14353,5 @@ struct cgroup_subsys perf_event_cgrp_subsys = { .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ + +DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); |
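
Illustrative note (not part of the patch): the newly exported perf_allow_kernel() in the hunks above gates kernel-mode counting on perf_event_paranoid > 1 for callers without the needed capability. The minimal userspace sketch below shows that policy from the caller's side; the program itself, its variable names, and the measured workload are hypothetical additions, but the perf_event_open() syscall, the perf_event_attr fields, and the ioctls are the standard ones from <linux/perf_event.h>.

	/*
	 * Hypothetical example: open a per-task CPU-cycles counter that also
	 * counts kernel mode (exclude_kernel = 0). Run unprivileged with
	 * kernel.perf_event_paranoid set to 2, the open typically fails with
	 * EACCES, matching the -EACCES path in perf_allow_kernel() above.
	 */
	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <sys/ioctl.h>
	#include <sys/types.h>
	#include <unistd.h>
	#include <string.h>
	#include <stdint.h>
	#include <stdio.h>

	static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
				   int cpu, int group_fd, unsigned long flags)
	{
		/* glibc provides no wrapper; go through syscall(2) */
		return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
	}

	int main(void)
	{
		struct perf_event_attr attr;
		uint64_t count;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.disabled = 1;
		attr.exclude_kernel = 0;	/* request kernel-mode counting */

		fd = perf_event_open(&attr, 0, -1, -1, 0);	/* this task, any CPU */
		if (fd < 0) {
			/* EACCES here is the policy perf_allow_kernel() enforces */
			perror("perf_event_open");
			return 1;
		}

		ioctl(fd, PERF_EVENT_IOC_RESET, 0);
		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
		/* ... run the workload being measured ... */
		ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

		if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
			printf("cycles: %llu\n", (unsigned long long)count);

		close(fd);
		return 0;
	}

The same program succeeds once the caller is granted CAP_PERFMON (or perf_event_paranoid is lowered), or when exclude_kernel is set to 1, since perf_allow_kernel() is only consulted for events that ask to measure the kernel.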