Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--	kernel/events/core.c	1423
1 file changed, 930 insertions(+), 493 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 724e6d7e128f..bcb09e011e9e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
 	return data.ret;
 }
 
+enum event_type_t {
+	EVENT_FLEXIBLE	= 0x01,
+	EVENT_PINNED	= 0x02,
+	EVENT_TIME	= 0x04,
+	EVENT_FROZEN	= 0x08,
+	/* see ctx_resched() for details */
+	EVENT_CPU	= 0x10,
+	EVENT_CGROUP	= 0x20,
+
+	/* compound helpers */
+	EVENT_ALL	= EVENT_FLEXIBLE | EVENT_PINNED,
+	EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
+};
+
+static inline void __perf_ctx_lock(struct perf_event_context *ctx)
+{
+	raw_spin_lock(&ctx->lock);
+	WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
+}
+
 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 			  struct perf_event_context *ctx)
 {
-	raw_spin_lock(&cpuctx->ctx.lock);
+	__perf_ctx_lock(&cpuctx->ctx);
 	if (ctx)
-		raw_spin_lock(&ctx->lock);
+		__perf_ctx_lock(ctx);
+}
+
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+	/*
+	 * If ctx_sched_in() didn't again set any ALL flags, clean up
+	 * after ctx_sched_out() by clearing is_active.
+	 */
+	if (ctx->is_active & EVENT_FROZEN) {
+		if (!(ctx->is_active & EVENT_ALL))
+			ctx->is_active = 0;
+		else
+			ctx->is_active &= ~EVENT_FROZEN;
+	}
+	raw_spin_unlock(&ctx->lock);
 }
 
 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 			    struct perf_event_context *ctx)
 {
 	if (ctx)
-		raw_spin_unlock(&ctx->lock);
-	raw_spin_unlock(&cpuctx->ctx.lock);
+		__perf_ctx_unlock(ctx);
+	__perf_ctx_unlock(&cpuctx->ctx);
 }
 
 #define TASK_TOMBSTONE ((void *)-1L)
@@ -264,6 +299,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
 	struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+	struct perf_cpu_context *cpuctx;
 	struct event_function_struct efs = {
 		.event = event,
 		.func = func,
@@ -291,22 +327,25 @@ again:
 	if (!task_function_call(task, event_function, &efs))
 		return;
 
-	raw_spin_lock_irq(&ctx->lock);
+	local_irq_disable();
+	cpuctx = this_cpu_ptr(&perf_cpu_context);
+	perf_ctx_lock(cpuctx, ctx);
 	/*
 	 * Reload the task pointer, it might have been changed by
 	 * a concurrent perf_event_context_sched_out().
 	 */
 	task = ctx->task;
-	if (task == TASK_TOMBSTONE) {
-		raw_spin_unlock_irq(&ctx->lock);
-		return;
-	}
+	if (task == TASK_TOMBSTONE)
+		goto unlock;
 	if (ctx->is_active) {
-		raw_spin_unlock_irq(&ctx->lock);
+		perf_ctx_unlock(cpuctx, ctx);
+		local_irq_enable();
 		goto again;
 	}
 	func(event, NULL, ctx, data);
-	raw_spin_unlock_irq(&ctx->lock);
+unlock:
+	perf_ctx_unlock(cpuctx, ctx);
+	local_irq_enable();
 }
 
 /*
@@ -369,16 +408,6 @@ unlock:
 	(PERF_SAMPLE_BRANCH_KERNEL |\
 	 PERF_SAMPLE_BRANCH_HV)
 
-enum event_type_t {
-	EVENT_FLEXIBLE = 0x1,
-	EVENT_PINNED = 0x2,
-	EVENT_TIME = 0x4,
-	/* see ctx_resched() for details */
-	EVENT_CPU = 0x8,
-	EVENT_CGROUP = 0x10,
-	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
 /*
  * perf_sched_events : >0 events exist
  */
@@ -407,6 +436,11 @@ static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
 static struct srcu_struct pmus_srcu;
 static cpumask_var_t perf_online_mask;
+static cpumask_var_t perf_online_core_mask;
+static cpumask_var_t perf_online_die_mask;
+static cpumask_var_t perf_online_cluster_mask;
+static cpumask_var_t perf_online_pkg_mask;
+static cpumask_var_t perf_online_sys_mask;
 static struct kmem_cache *perf_event_cache;
 
 /*
@@ -450,7 +484,7 @@ static void update_perf_cpu_limits(void)
 
 static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
 
-int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
+int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
 				       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
@@ -474,7 +508,7 @@ int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
 
 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 
-int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
 				      void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -534,7 +568,7 @@ void perf_sample_event_took(u64 sample_len_ns)
 	__this_cpu_write(running_sample_length, running_len);
 
 	/*
-	 * Note: this will be biased artifically low until we have
+	 * Note: this will be biased artificially low until we have
	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
	 * from having to maintain a count.
	 */
@@ -596,10 +630,10 @@ static inline u64 perf_event_clock(struct perf_event *event)
 *
 * Event groups make things a little more complicated, but not terribly so. The
 * rules for a group are that if the group leader is OFF the entire group is
- * OFF, irrespecive of what the group member states are. This results in
+ * OFF, irrespective of what the group member states are. This results in
 * __perf_effective_state().
 *
- * A futher ramification is that when a group leader flips between OFF and
+ * A further ramification is that when a group leader flips between OFF and
 * !OFF, we need to update all group member times.
 *
 *
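The helper named in that comment is not part of this hunk; for reference it reduces to a sketch like the following (a paraphrase of __perf_effective_state(), under the stated group rules):

	/* A group member's effective state is capped by its leader's state. */
	static enum perf_event_state
	effective_state(struct perf_event *event)
	{
		struct perf_event *leader = event->group_leader;

		if (leader->state <= PERF_EVENT_STATE_OFF)
			return leader->state;	/* leader OFF => whole group OFF */

		return event->state;
	}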
@@ -685,30 +719,32 @@ do {					\
 	___p;				\
 })
 
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup)			\
+	list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+		if (_cgroup && !_epc->nr_cgroups)		\
+			continue;				\
+		else if (_pmu && _epc->pmu != _pmu)		\
+			continue;				\
+		else
+
 static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
+	for_each_epc(pmu_ctx, ctx, NULL, cgroup)
 		perf_pmu_disable(pmu_ctx->pmu);
-	}
 }
 
 static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
+	for_each_epc(pmu_ctx, ctx, NULL, cgroup)
 		perf_pmu_enable(pmu_ctx->pmu);
-	}
 }
 
-static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
-static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
 
 #ifdef CONFIG_CGROUP_PERF
 
@@ -865,7 +901,7 @@ static void perf_cgroup_switch(struct task_struct *task)
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 	perf_ctx_disable(&cpuctx->ctx, true);
 
-	ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
 	/*
	 * must not be done before ctxswout due
	 * to update_cgrp_time_from_cpuctx() in
@@ -877,7 +913,7 @@ static void perf_cgroup_switch(struct task_struct *task)
	 * perf_cgroup_set_timestamp() in ctx_sched_in()
	 * to not have to pass task around
	 */
-	ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+	ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
 
 	perf_ctx_enable(&cpuctx->ctx, true);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
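The for_each_epc() iterator introduced above centralizes the cgroup and PMU skip conditions used throughout this patch. A minimal usage sketch (hypothetical caller, mirroring perf_ctx_disable()):

	/* Disable only the event-PMU-contexts belonging to one PMU. */
	static void disable_one_pmu(struct perf_event_context *ctx, struct pmu *pmu)
	{
		struct perf_event_pmu_context *epc;

		for_each_epc(epc, ctx, pmu, false)	/* PMU filter, no cgroup filter */
			perf_pmu_disable(epc->pmu);
	}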
@@ -891,7 +927,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event,
 	int cpu, heap_size, ret = 0;
 
 	/*
-	 * Allow storage to have sufficent space for an iterator for each
+	 * Allow storage to have sufficient space for an iterator for each
	 * possibly nested cgroup plus an iterator for events with no cgroup.
	 */
 	for (heap_size = 1; css; css = css->parent)
@@ -930,22 +966,20 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 {
 	struct perf_cgroup *cgrp;
 	struct cgroup_subsys_state *css;
-	struct fd f = fdget(fd);
+	CLASS(fd, f)(fd);
 	int ret = 0;
 
-	if (!f.file)
+	if (fd_empty(f))
 		return -EBADF;
 
-	css = css_tryget_online_from_dir(f.file->f_path.dentry,
+	css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
 					 &perf_event_cgrp_subsys);
-	if (IS_ERR(css)) {
-		ret = PTR_ERR(css);
-		goto out;
-	}
+	if (IS_ERR(css))
+		return PTR_ERR(css);
 
 	ret = perf_cgroup_ensure_storage(event, css);
 	if (ret)
-		goto out;
+		return ret;
 
 	cgrp = container_of(css, struct perf_cgroup, css);
 	event->cgrp = cgrp;
@@ -959,8 +993,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 		perf_detach_cgroup(event);
 		ret = -EINVAL;
 	}
-out:
-	fdput(f);
 	return ret;
 }
 
@@ -1255,8 +1287,9 @@ static void put_ctx(struct perf_event_context *ctx)
 *	  perf_event_context::mutex
 *	    perf_event::child_mutex;
 *	      perf_event_context::lock
- *	      perf_event::mmap_mutex
 *	      mmap_lock
+ *	        perf_event::mmap_mutex
+ *	          perf_buffer::aux_mutex
 *	      perf_addr_filters_head::lock
 *
 *    cpu_hotplug_lock
@@ -1768,6 +1801,14 @@ perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
 				typeof(*event), group_node))
 
 /*
+ * Does the event attribute request inherit with PERF_SAMPLE_READ
+ */
+static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
+{
+	return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
+}
+
+/*
 * Add an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
@@ -1797,6 +1838,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		ctx->nr_user++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
+	if (has_inherit_and_sample_read(&event->attr))
+		local_inc(&ctx->nr_no_switch_fast);
 
 	if (event->state > PERF_EVENT_STATE_OFF)
 		perf_cgroup_event_enable(event, ctx);
@@ -2021,6 +2064,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 		ctx->nr_user--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
+	if (has_inherit_and_sample_read(&event->attr))
+		local_dec(&ctx->nr_no_switch_fast);
 
 	list_del_rcu(&event->event_entry);
 
@@ -2097,7 +2142,7 @@ static void perf_put_aux_event(struct perf_event *event)
 
 static bool perf_need_aux_event(struct perf_event *event)
 {
-	return !!event->attr.aux_output || !!event->attr.aux_sample_size;
+	return event->attr.aux_output || has_aux_action(event);
 }
 
 static int perf_get_aux_event(struct perf_event *event,
@@ -2122,6 +2167,10 @@ static int perf_get_aux_event(struct perf_event *event,
 	    !perf_aux_output_match(event, group_leader))
 		return 0;
 
+	if ((event->attr.aux_pause || event->attr.aux_resume) &&
+	    !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+		return 0;
+
 	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
 		return 0;
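The inherit + PERF_SAMPLE_READ combination that has_inherit_and_sample_read() tests for is requested from user space roughly as below (a sketch of standard perf_event_open(2) attribute setup, not taken from this patch):

	#include <string.h>
	#include <linux/perf_event.h>

	static void init_attr(struct perf_event_attr *attr)
	{
		memset(attr, 0, sizeof(*attr));
		attr->size = sizeof(*attr);
		attr->type = PERF_TYPE_HARDWARE;
		attr->config = PERF_COUNT_HW_INSTRUCTIONS;
		attr->sample_period = 100000;
		attr->inherit = 1;			/* children inherit the event */
		attr->sample_type = PERF_SAMPLE_READ;	/* samples carry the count */
	}

Such events pin their context to the slow switch path via ctx->nr_no_switch_fast, as the list_add_event()/list_del_event() hunks above show.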
@@ -2283,27 +2332,14 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
 		state = PERF_EVENT_STATE_OFF;
 	}
 
-	if (event->pending_sigtrap) {
-		bool dec = true;
-
-		event->pending_sigtrap = 0;
-		if (state != PERF_EVENT_STATE_OFF &&
-		    !event->pending_work) {
-			event->pending_work = 1;
-			dec = false;
-			WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
-			task_work_add(current, &event->pending_task, TWA_RESUME);
-		}
-		if (dec)
-			local_dec(&event->ctx->nr_pending);
-	}
-
 	perf_event_set_state(event, state);
 
 	if (!is_software_event(event))
 		cpc->active_oncpu--;
-	if (event->attr.freq && event->attr.sample_freq)
+	if (event->attr.freq && event->attr.sample_freq) {
 		ctx->nr_freq--;
+		epc->nr_freq--;
+	}
 	if (event->attr.exclusive || !cpc->active_oncpu)
 		cpc->exclusive = 0;
 
@@ -2329,6 +2365,45 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
 		event_sched_out(event, ctx);
 }
 
+static inline void
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
+{
+	if (ctx->is_active & EVENT_TIME) {
+		if (ctx->is_active & EVENT_FROZEN)
+			return;
+		update_context_time(ctx);
+		update_cgrp_time_from_cpuctx(cpuctx, final);
+	}
+}
+
+static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+	__ctx_time_update(cpuctx, ctx, false);
+}
+
+/*
+ * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
+ */
+static inline void
+ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+	ctx_time_update(cpuctx, ctx);
+	if (ctx->is_active & EVENT_TIME)
+		ctx->is_active |= EVENT_FROZEN;
+}
+
+static inline void
+ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
+{
+	if (ctx->is_active & EVENT_TIME) {
+		if (ctx->is_active & EVENT_FROZEN)
+			return;
+		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
+}
+
 #define DETACH_GROUP	0x01UL
 #define DETACH_CHILD	0x02UL
 #define DETACH_DEAD	0x04UL
@@ -2348,10 +2423,7 @@ __perf_remove_from_context(struct perf_event *event,
 	struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
 	unsigned long flags = (unsigned long)info;
 
-	if (ctx->is_active & EVENT_TIME) {
-		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, false);
-	}
+	ctx_time_update(cpuctx, ctx);
 
 	/*
	 * Ensure event_sched_out() switches to OFF, at the very least
@@ -2436,12 +2508,8 @@ static void __perf_event_disable(struct perf_event *event,
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return;
 
-	if (ctx->is_active & EVENT_TIME) {
-		update_context_time(ctx);
-		update_cgrp_time_from_event(event);
-	}
-
 	perf_pmu_disable(event->pmu_ctx->pmu);
+	ctx_time_update_event(ctx, event);
 
 	if (event == event->group_leader)
 		group_sched_out(event, ctx);
@@ -2464,7 +2532,7 @@ static void __perf_event_disable(struct perf_event *event,
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in perf_event_exit_event().
 *
- * When called from perf_pending_irq it's OK because event->ctx
+ * When called from perf_pending_disable it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
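Taken together, the helpers above implement a freeze protocol that replaces the old ctx_sched_out(ctx, EVENT_TIME)/ctx_sched_in(ctx, EVENT_TIME) pairs. A sketch of the intended calling sequence (an assumption drawn from the helpers; the cleanup itself lives in __perf_ctx_unlock()):

	static void example_ctx_update(struct perf_cpu_context *cpuctx,
				       struct perf_event_context *ctx)
	{
		perf_ctx_lock(cpuctx, ctx);
		ctx_time_freeze(cpuctx, ctx);	/* stop ctx time, set EVENT_TIME|EVENT_FROZEN */
		/* ... add/enable events while time appears stopped ... */
		perf_ctx_unlock(cpuctx, ctx);	/* drops EVENT_FROZEN, see __perf_ctx_unlock() */
	}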
@@ -2504,7 +2572,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
 void perf_event_disable_inatomic(struct perf_event *event)
 {
 	event->pending_disable = 1;
-	irq_work_queue(&event->pending_irq);
+	irq_work_queue(&event->pending_disable_irq);
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -2558,9 +2626,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
 
 	if (!is_software_event(event))
 		cpc->active_oncpu++;
-	if (event->attr.freq && event->attr.sample_freq)
+	if (event->attr.freq && event->attr.sample_freq) {
 		ctx->nr_freq++;
-
+		epc->nr_freq++;
+	}
 	if (event->attr.exclusive)
 		cpc->exclusive = 1;
 
@@ -2656,7 +2725,8 @@ static void add_event_to_ctx(struct perf_event *event,
 }
 
 static void task_ctx_sched_out(struct perf_event_context *ctx,
-			       enum event_type_t event_type)
+			       struct pmu *pmu,
+			       enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 
@@ -2666,18 +2736,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
 	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 		return;
 
-	ctx_sched_out(ctx, event_type);
+	ctx_sched_out(ctx, pmu, event_type);
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
-				struct perf_event_context *ctx)
+				struct perf_event_context *ctx,
+				struct pmu *pmu)
 {
-	ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
+	ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
 	if (ctx)
-		ctx_sched_in(ctx, EVENT_PINNED);
-	ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
+		ctx_sched_in(ctx, pmu, EVENT_PINNED);
+	ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
 	if (ctx)
-		ctx_sched_in(ctx, EVENT_FLEXIBLE);
+		ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
 }
 
 /*
@@ -2695,16 +2766,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 * event_type is a bit mask of the types of events involved. For CPU events,
 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
 */
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
 static void ctx_resched(struct perf_cpu_context *cpuctx,
 			struct perf_event_context *task_ctx,
-			enum event_type_t event_type)
+			struct pmu *pmu, enum event_type_t event_type)
 {
 	bool cpu_event = !!(event_type & EVENT_CPU);
+	struct perf_event_pmu_context *epc;
 
 	/*
	 * If pinned groups are involved, flexible groups also need to be
@@ -2715,10 +2782,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 
 	event_type &= EVENT_ALL;
 
-	perf_ctx_disable(&cpuctx->ctx, false);
+	for_each_epc(epc, &cpuctx->ctx, pmu, false)
+		perf_pmu_disable(epc->pmu);
+
 	if (task_ctx) {
-		perf_ctx_disable(task_ctx, false);
-		task_ctx_sched_out(task_ctx, event_type);
+		for_each_epc(epc, task_ctx, pmu, false)
+			perf_pmu_disable(epc->pmu);
+
+		task_ctx_sched_out(task_ctx, pmu, event_type);
 	}
 
 	/*
@@ -2729,15 +2800,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
	 *  - otherwise, do nothing more.
	 */
 	if (cpu_event)
-		ctx_sched_out(&cpuctx->ctx, event_type);
+		ctx_sched_out(&cpuctx->ctx, pmu, event_type);
 	else if (event_type & EVENT_PINNED)
-		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+		ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+
+	perf_event_sched_in(cpuctx, task_ctx, pmu);
 
-	perf_event_sched_in(cpuctx, task_ctx);
+	for_each_epc(epc, &cpuctx->ctx, pmu, false)
+		perf_pmu_enable(epc->pmu);
 
-	perf_ctx_enable(&cpuctx->ctx, false);
-	if (task_ctx)
-		perf_ctx_enable(task_ctx, false);
+	if (task_ctx) {
+		for_each_epc(epc, task_ctx, pmu, false)
+			perf_pmu_enable(epc->pmu);
+	}
 }
 
 void perf_pmu_resched(struct pmu *pmu)
@@ -2746,7 +2821,7 @@ void perf_pmu_resched(struct pmu *pmu)
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
 
 	perf_ctx_lock(cpuctx, task_ctx);
-	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+	ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
 	perf_ctx_unlock(cpuctx, task_ctx);
 }
 
@@ -2802,9 +2877,10 @@ static int __perf_install_in_context(void *info)
 #endif
 
 	if (reprogram) {
-		ctx_sched_out(ctx, EVENT_TIME);
+		ctx_time_freeze(cpuctx, ctx);
 		add_event_to_ctx(event, ctx);
-		ctx_resched(cpuctx, task_ctx, get_event_type(event));
+		ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+			    get_event_type(event));
 	} else {
 		add_event_to_ctx(event, ctx);
 	}
@@ -2947,8 +3023,7 @@ static void __perf_event_enable(struct perf_event *event,
 	    event->state <= PERF_EVENT_STATE_ERROR)
 		return;
 
-	if (ctx->is_active)
-		ctx_sched_out(ctx, EVENT_TIME);
+	ctx_time_freeze(cpuctx, ctx);
 
 	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 	perf_cgroup_event_enable(event, ctx);
@@ -2956,25 +3031,21 @@ static void __perf_event_enable(struct perf_event *event,
 	if (!ctx->is_active)
 		return;
 
-	if (!event_filter_match(event)) {
-		ctx_sched_in(ctx, EVENT_TIME);
+	if (!event_filter_match(event))
 		return;
-	}
 
 	/*
	 * If the event is in a group and isn't the group leader,
	 * then don't put it on unless the group is on.
	 */
-	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-		ctx_sched_in(ctx, EVENT_TIME);
+	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
 		return;
-	}
 
 	task_ctx = cpuctx->task_ctx;
 	if (ctx->task)
 		WARN_ON_ONCE(task_ctx != ctx);
 
-	ctx_resched(cpuctx, task_ctx, get_event_type(event));
+	ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
 }
 
 /*
@@ -3242,7 +3313,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
 	struct perf_event *event, *tmp;
 	struct pmu *pmu = pmu_ctx->pmu;
 
-	if (ctx->task && !ctx->is_active) {
+	if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
 		struct perf_cpu_pmu_context *cpc;
 
 		cpc = this_cpu_ptr(pmu->cpu_pmu_context);
@@ -3250,7 +3321,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
 		cpc->task_epc = NULL;
 	}
 
-	if (!event_type)
+	if (!(event_type & EVENT_ALL))
 		return;
 
 	perf_pmu_disable(pmu);
@@ -3276,8 +3347,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
 	perf_pmu_enable(pmu);
 }
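A concrete instance of the hazard the comment below warns about (illustration only, not code from this patch): scheduling out with a PMU filter and never symmetrically scheduling back in leaves ctx->is_active understating what is still running:

	/* BAD: asymmetric use of the @pmu argument. */
	ctx_sched_out(ctx, pmu_a, EVENT_FLEXIBLE);
	/* ctx->is_active lost EVENT_FLEXIBLE even though every other
	 * PMU's event-PMU-context still has flexible events scheduled. */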
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
 static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	struct perf_event_pmu_context *pmu_ctx;
@@ -3308,34 +3388,36 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
	 *
	 * would only update time for the pinned events.
	 */
-	if (is_active & EVENT_TIME) {
-		/* update (and stop) ctx time */
-		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+	__ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+
+	/*
+	 * CPU-release for the below ->is_active store,
+	 * see __load_acquire() in perf_event_time_now()
+	 */
+	barrier();
+	ctx->is_active &= ~event_type;
+
+	if (!(ctx->is_active & EVENT_ALL)) {
 		/*
-		 * CPU-release for the below ->is_active store,
-		 * see __load_acquire() in perf_event_time_now()
+		 * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
+		 * does not observe a hole. perf_ctx_unlock() will clean up.
		 */
-		barrier();
+		if (ctx->is_active & EVENT_FROZEN)
+			ctx->is_active &= EVENT_TIME_FROZEN;
+		else
+			ctx->is_active = 0;
 	}
 
-	ctx->is_active &= ~event_type;
-	if (!(ctx->is_active & EVENT_ALL))
-		ctx->is_active = 0;
-
 	if (ctx->task) {
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
-		if (!ctx->is_active)
+		if (!(ctx->is_active & EVENT_ALL))
 			cpuctx->task_ctx = NULL;
 	}
 
 	is_active ^= ctx->is_active; /* changed bits */
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
+	for_each_epc(pmu_ctx, ctx, pmu, cgroup)
 		__pmu_ctx_sched_out(pmu_ctx, is_active);
-	}
 }
 
 /*
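The barrier() above pairs with a load-acquire on ctx->is_active in the lock-free time read path; schematically the reader does something like this (a paraphrase of the perf_event_time_now() logic referenced in the comment, not the verbatim function):

	/* Only trust a live timestamp while EVENT_TIME is visible. */
	static u64 ctx_time_now(struct perf_event_context *ctx, u64 now)
	{
		if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
			return ctx->time;			/* time is stopped/frozen */

		return now + READ_ONCE(ctx->timeoffset);	/* time is running */
	}

Preserving EVENT_TIME|EVENT_FROZEN in ctx_sched_out() is what keeps such a reader from observing a hole while a context is frozen.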
@@ -3528,12 +3610,17 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 
 			perf_ctx_disable(ctx, false);
 
-			/* PMIs are disabled; ctx->nr_pending is stable. */
-			if (local_read(&ctx->nr_pending) ||
-			    local_read(&next_ctx->nr_pending)) {
+			/* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
+			if (local_read(&ctx->nr_no_switch_fast) ||
+			    local_read(&next_ctx->nr_no_switch_fast)) {
 				/*
				 * Must not swap out ctx when there's pending
				 * events that rely on the ctx->task relation.
+				 *
+				 * Likewise, when a context contains inherit +
+				 * SAMPLE_READ events they should be switched
+				 * out using the slow path so that they are
+				 * treated as if they were distinct contexts.
				 */
 				raw_spin_unlock(&next_ctx->lock);
 				rcu_read_unlock();
@@ -3574,7 +3661,7 @@ unlock:
 
 inside_switch:
 	perf_ctx_sched_task_cb(ctx, false);
-	task_ctx_sched_out(ctx, EVENT_ALL);
+	task_ctx_sched_out(ctx, NULL, EVENT_ALL);
 
 	perf_ctx_enable(ctx, false);
 	raw_spin_unlock(&ctx->lock);
@@ -3683,7 +3770,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
 		perf_cgroup_switch(next);
 }
 
-static bool perf_less_group_idx(const void *l, const void *r)
+static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args)
 {
 	const struct perf_event *le = *(const struct perf_event **)l;
 	const struct perf_event *re = *(const struct perf_event **)r;
@@ -3691,20 +3778,14 @@ static bool perf_less_group_idx(const void *l, const void *r)
 	return le->group_index < re->group_index;
 }
 
-static void swap_ptr(void *l, void *r)
-{
-	void **lp = l, **rp = r;
-
-	swap(*lp, *rp);
-}
+DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap);
 
 static const struct min_heap_callbacks perf_min_heap = {
-	.elem_size = sizeof(struct perf_event *),
 	.less = perf_less_group_idx,
-	.swp = swap_ptr,
+	.swp = NULL,
 };
 
-static void __heap_add(struct min_heap *heap, struct perf_event *event)
+static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event)
 {
 	struct perf_event **itrs = heap->data;
 
@@ -3738,7 +3819,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
 	struct perf_cpu_context *cpuctx = NULL;
 	/* Space for per CPU and/or any CPU event iterators. */
 	struct perf_event *itrs[2];
-	struct min_heap event_heap;
+	struct perf_event_min_heap event_heap;
 	struct perf_event **evt;
 	int ret;
 
@@ -3747,7 +3828,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
 	if (!ctx->task) {
 		cpuctx = this_cpu_ptr(&perf_cpu_context);
-		event_heap = (struct min_heap){
+		event_heap = (struct perf_event_min_heap){
 			.data = cpuctx->heap,
 			.nr = 0,
 			.size = cpuctx->heap_size,
@@ -3760,7 +3841,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
 		css = &cpuctx->cgrp->css;
 #endif
 	} else {
-		event_heap = (struct min_heap){
+		event_heap = (struct perf_event_min_heap){
 			.data = itrs,
 			.nr = 0,
 			.size = ARRAY_SIZE(itrs),
@@ -3782,7 +3863,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
 		perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
 	}
 
-	min_heapify_all(&event_heap, &perf_min_heap);
+	min_heapify_all_inline(&event_heap, &perf_min_heap, NULL);
 
 	while (event_heap.nr) {
 		ret = func(*evt, data);
@@ -3791,9 +3872,9 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
 
 		*evt = perf_event_groups_next(*evt, pmu);
 		if (*evt)
-			min_heapify(&event_heap, 0, &perf_min_heap);
+			min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL);
 		else
-			min_heap_pop(&event_heap, &perf_min_heap);
+			min_heap_pop_inline(&event_heap, &perf_min_heap, NULL);
 	}
 
 	return 0;
 }
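DEFINE_MIN_HEAP() generates a typed heap, the *_inline variants take the callbacks plus an optional args pointer, and .swp == NULL selects the default element swap. A self-contained sketch of the same pattern (assuming the <linux/min_heap.h> API as used above):

	#include <linux/min_heap.h>

	DEFINE_MIN_HEAP(int, int_min_heap);

	static bool int_less(const void *l, const void *r, void __always_unused *args)
	{
		return *(const int *)l < *(const int *)r;
	}

	static const struct min_heap_callbacks int_heap_cb = {
		.less = int_less,
		.swp  = NULL,		/* use the default swap */
	};

	static int pop_smallest(struct int_min_heap *heap)
	{
		int smallest = heap->data[0];

		min_heap_pop_inline(heap, &int_heap_cb, NULL);
		return smallest;
	}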
@@ -3871,29 +3952,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx,
 			   merge_sched_in, &can_add_hw);
 }
 
-static void ctx_groups_sched_in(struct perf_event_context *ctx,
-				struct perf_event_groups *groups,
-				bool cgroup)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+			       enum event_type_t event_type)
 {
-	struct perf_event_pmu_context *pmu_ctx;
-
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-		if (cgroup && !pmu_ctx->nr_cgroups)
-			continue;
-		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
-	}
-}
+	struct perf_event_context *ctx = pmu_ctx->ctx;
 
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
-			       struct pmu *pmu)
-{
-	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+	if (event_type & EVENT_PINNED)
+		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+	if (event_type & EVENT_FLEXIBLE)
+		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
 }
 
 static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+	struct perf_event_pmu_context *pmu_ctx;
 	int is_active = ctx->is_active;
 	bool cgroup = event_type & EVENT_CGROUP;
 
@@ -3917,7 +3991,7 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 	ctx->is_active |= (event_type | EVENT_TIME);
 	if (ctx->task) {
-		if (!is_active)
+		if (!(is_active & EVENT_ALL))
 			cpuctx->task_ctx = ctx;
 		else
 			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
@@ -3929,12 +4003,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
	 * First go through the list and put on any pinned groups
	 * in order to give them the best chance of going on.
	 */
-	if (is_active & EVENT_PINNED)
-		ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+	if (is_active & EVENT_PINNED) {
+		for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+			__pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+	}
 
 	/* Then walk through the lower prio flexible groups */
-	if (is_active & EVENT_FLEXIBLE)
-		ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+	if (is_active & EVENT_FLEXIBLE) {
+		for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+			__pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+	}
 }
 
 static void perf_event_context_sched_in(struct task_struct *task)
@@ -3977,10 +4055,10 @@ static void perf_event_context_sched_in(struct task_struct *task)
	 */
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
 		perf_ctx_disable(&cpuctx->ctx, false);
-		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+		ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
 	}
 
-	perf_event_sched_in(cpuctx, ctx);
+	perf_event_sched_in(cpuctx, ctx, NULL);
 
 	perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
 
@@ -4103,7 +4181,7 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
 	period = perf_calculate_period(event, nsec, count);
 
 	delta = (s64)(period - hwc->sample_period);
-	delta = (delta + 7) / 8; /* low pass filter */
+	if (delta >= 0)
+		delta += 7;
+	else
+		delta -= 7;
+	delta /= 8; /* low pass filter */
 
 	sample_period = hwc->sample_period + delta;
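The old expression (delta + 7) / 8 rounded negative deltas toward zero, so downward period adjustments were systematically under-applied; the replacement rounds away from zero symmetrically. Worked example: for delta == +4 both versions yield (+4 + 7) / 8 == +1, but for delta == -4 the old code yields (-4 + 7) / 8 == 0 while the new code yields (-4 - 7) / 8 == -1, since C division truncates toward zero.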
@@ -4123,30 +4205,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
 	}
 }
 
-/*
- * combine freq adjustment with unthrottling to avoid two passes over the
- * events. At the same time, make sure, having freq events does not change
- * the rate of unthrottling as that would introduce bias.
- */
-static void
-perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+static void perf_adjust_freq_unthr_events(struct list_head *event_list)
 {
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
 	u64 now, period = TICK_NSEC;
 	s64 delta;
 
-	/*
-	 * only need to iterate over all events iff:
-	 * - context have events in frequency mode (needs freq adjust)
-	 * - there are events to unthrottle on this cpu
-	 */
-	if (!(ctx->nr_freq || unthrottle))
-		return;
-
-	raw_spin_lock(&ctx->lock);
-
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+	list_for_each_entry(event, event_list, active_list) {
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
@@ -4154,18 +4220,17 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
 		if (!event_filter_match(event))
 			continue;
 
-		perf_pmu_disable(event->pmu);
-
 		hwc = &event->hw;
 
 		if (hwc->interrupts == MAX_INTERRUPTS) {
 			hwc->interrupts = 0;
 			perf_log_throttle(event, 1);
-			event->pmu->start(event, 0);
+			if (!event->attr.freq || !event->attr.sample_freq)
+				event->pmu->start(event, 0);
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
-			goto next;
+			continue;
 
 		/*
		 * stop the event and update event->count
@@ -4187,8 +4252,41 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
 			perf_adjust_period(event, period, delta, false);
 
 		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
-	next:
-		perf_pmu_enable(event->pmu);
	}
+}
+
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
+{
+	struct perf_event_pmu_context *pmu_ctx;
+
+	/*
+	 * only need to iterate over all events iff:
+	 * - context have events in frequency mode (needs freq adjust)
+	 * - there are events to unthrottle on this cpu
+	 */
+	if (!(ctx->nr_freq || unthrottle))
+		return;
+
+	raw_spin_lock(&ctx->lock);
+
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (!(pmu_ctx->nr_freq || unthrottle))
+			continue;
+		if (!perf_pmu_ctx_is_active(pmu_ctx))
+			continue;
+		if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
+			continue;
+
+		perf_pmu_disable(pmu_ctx->pmu);
+		perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
+		perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
+		perf_pmu_enable(pmu_ctx->pmu);
 	}
 
 	raw_spin_unlock(&ctx->lock);
@@ -4305,14 +4403,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
 		update_context_time(&cpuctx->ctx);
 		__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
 		rotate_ctx(&cpuctx->ctx, cpu_event);
-		__pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+		__pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
 	}
 
 	if (task_event)
 		rotate_ctx(task_epc->ctx, task_event);
 
 	if (task_event || (task_epc && cpu_event))
-		__pmu_ctx_sched_in(task_epc->ctx, pmu);
+		__pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
 
 	perf_pmu_enable(pmu);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4378,7 +4476,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 	cpuctx = this_cpu_ptr(&perf_cpu_context);
 	perf_ctx_lock(cpuctx, ctx);
-	ctx_sched_out(ctx, EVENT_TIME);
+	ctx_time_freeze(cpuctx, ctx);
 
 	list_for_each_entry(event, &ctx->event_list, event_entry) {
 		enabled |= event_enable_on_exec(event, ctx);
@@ -4390,9 +4488,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
	 */
 	if (enabled) {
 		clone_ctx = unclone_ctx(ctx);
-		ctx_resched(cpuctx, ctx, event_type);
-	} else {
-		ctx_sched_in(ctx, EVENT_TIME);
+		ctx_resched(cpuctx, ctx, NULL, event_type);
 	}
 	perf_ctx_unlock(cpuctx, ctx);
 
@@ -4453,16 +4549,24 @@ struct perf_read_data {
 	int ret;
 };
 
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
+
 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
 {
+	int local_cpu = smp_processor_id();
 	u16 local_pkg, event_pkg;
 
 	if ((unsigned)event_cpu >= nr_cpu_ids)
 		return event_cpu;
 
-	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
-		int local_cpu = smp_processor_id();
+	if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
+		const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
+
+		if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
+			return local_cpu;
+	}
 
+	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
 		event_pkg = topology_physical_package_id(event_cpu);
 		local_pkg = topology_physical_package_id(local_cpu);
 
@@ -4495,10 +4599,7 @@ static void __perf_event_read(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	if (ctx->is_active & EVENT_TIME) {
-		update_context_time(ctx);
-		update_cgrp_time_from_event(event);
-	}
+	ctx_time_update_event(ctx, event);
 
 	perf_event_update_time(event);
 	if (data->group)
@@ -4533,8 +4634,11 @@ unlock:
 	raw_spin_unlock(&ctx->lock);
 }
 
-static inline u64 perf_event_count(struct perf_event *event)
+static inline u64 perf_event_count(struct perf_event *event, bool self)
 {
+	if (self)
+		return local64_read(&event->count);
+
 	return local64_read(&event->count) + atomic64_read(&event->child_count);
 }
 
@@ -4695,10 +4799,7 @@ again:
	 * May read while context is not active (e.g., thread is
	 * blocked), in that case we cannot update context time
	 */
-	if (ctx->is_active & EVENT_TIME) {
-		update_context_time(ctx);
-		update_cgrp_time_from_event(event);
-	}
+	ctx_time_update_event(ctx, event);
 
 	perf_event_update_time(event);
 	if (group)
@@ -5187,9 +5288,35 @@ static bool exclusive_event_installable(struct perf_event *event,
 static void perf_addr_filters_splice(struct perf_event *event,
 				       struct list_head *head);
 
+static void perf_pending_task_sync(struct perf_event *event)
+{
+	struct callback_head *head = &event->pending_task;
+
+	if (!event->pending_work)
+		return;
+	/*
+	 * If the task is queued to the current task's queue, we
+	 * obviously can't wait for it to complete. Simply cancel it.
+	 */
+	if (task_work_cancel(current, head)) {
+		event->pending_work = 0;
+		local_dec(&event->ctx->nr_no_switch_fast);
+		return;
+	}
+
+	/*
+	 * All accesses related to the event are within the same RCU section in
+	 * perf_pending_task(). The RCU grace period before the event is freed
+	 * will make sure all those accesses are complete by then.
+	 */
+	rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
+}
+
 static void _free_event(struct perf_event *event)
 {
 	irq_work_sync(&event->pending_irq);
+	irq_work_sync(&event->pending_disable_irq);
+	perf_pending_task_sync(event);
 
 	unaccount_event(event);
 
@@ -5365,6 +5492,7 @@ int perf_event_release_kernel(struct perf_event *event)
again:
 	mutex_lock(&event->child_mutex);
 	list_for_each_entry(child, &event->child_list, child_list) {
+		void *var = NULL;
 
 		/*
		 * Cannot change, child events are not migrated, see the
@@ -5405,11 +5533,23 @@ again:
			 * this can't be the last reference.
			 */
 			put_event(event);
+		} else {
+			var = &ctx->refcount;
 		}
 
 		mutex_unlock(&event->child_mutex);
 		mutex_unlock(&ctx->mutex);
 		put_ctx(ctx);
+
+		if (var) {
+			/*
+			 * If perf_event_free_task() has deleted all events from the
+			 * ctx while the child_mutex got released above, make sure to
+			 * notify about the preceding put_ctx().
+			 */
+			smp_mb(); /* pairs with wait_var_event() */
+			wake_up_var(var);
+		}
 		goto again;
 	}
 	mutex_unlock(&event->child_mutex);
@@ -5454,7 +5594,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 	mutex_lock(&event->child_mutex);
 
 	(void)perf_event_read(event, false);
-	total += perf_event_count(event);
+	total += perf_event_count(event, false);
 
 	*enabled += event->total_time_enabled +
 			atomic64_read(&event->child_total_time_enabled);
@@ -5463,7 +5603,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 
 	list_for_each_entry(child, &event->child_list, child_list) {
 		(void)perf_event_read(child, false);
-		total += perf_event_count(child);
+		total += perf_event_count(child, false);
 		*enabled += child->total_time_enabled;
 		*running += child->total_time_running;
 	}
@@ -5545,14 +5685,14 @@ static int __perf_read_group_add(struct perf_event *leader,
 	/*
	 * Write {count,id} tuples for every sibling.
	 */
-	values[n++] += perf_event_count(leader);
+	values[n++] += perf_event_count(leader, false);
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
 	if (read_format & PERF_FORMAT_LOST)
 		values[n++] = atomic64_read(&leader->lost_samples);
 
 	for_each_sibling_event(sub, leader) {
-		values[n++] += perf_event_count(sub);
+		values[n++] += perf_event_count(sub, false);
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
 		if (read_format & PERF_FORMAT_LOST)
@@ -5851,18 +5991,9 @@ EXPORT_SYMBOL_GPL(perf_event_period);
 
 static const struct file_operations perf_fops;
 
-static inline int perf_fget_light(int fd, struct fd *p)
+static inline bool is_perf_file(struct fd f)
 {
-	struct fd f = fdget(fd);
-	if (!f.file)
-		return -EBADF;
-
-	if (f.file->f_op != &perf_fops) {
-		fdput(f);
-		return -EBADF;
-	}
-	*p = f;
-	return 0;
+	return !fd_empty(f) && fd_file(f)->f_op == &perf_fops;
 }
 
 static int perf_event_set_output(struct perf_event *event,
@@ -5910,20 +6041,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 
 	case PERF_EVENT_IOC_SET_OUTPUT:
 	{
-		int ret;
+		CLASS(fd, output)(arg);	     // arg == -1 => empty
+		struct perf_event *output_event = NULL;
 		if (arg != -1) {
-			struct perf_event *output_event;
-			struct fd output;
-			ret = perf_fget_light(arg, &output);
-			if (ret)
-				return ret;
-			output_event = output.file->private_data;
-			ret = perf_event_set_output(event, output_event);
-			fdput(output);
-		} else {
-			ret = perf_event_set_output(event, NULL);
+			if (!is_perf_file(output))
+				return -EBADF;
+			output_event = fd_file(output)->private_data;
 		}
-		return ret;
+		return perf_event_set_output(event, output_event);
 	}
 
 	case PERF_EVENT_IOC_SET_FILTER:
@@ -6132,7 +6257,7 @@ void perf_event_update_userpage(struct perf_event *event)
 	++userpg->lock;
 	barrier();
 	userpg->index = perf_event_index(event);
-	userpg->offset = perf_event_count(event);
+	userpg->offset = perf_event_count(event, false);
 	if (userpg->index)
 		userpg->offset -= local64_read(&event->hw.prev_count);
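Both this conversion and the perf_cgroup_connect() one earlier rely on the scope-based file guard from <linux/cleanup.h>/<linux/file.h>; the shape of the pattern is (sketch; do_something() is a hypothetical helper):

	static int with_fd(int fd)
	{
		CLASS(fd, f)(fd);	/* paired fdput() runs at scope exit */

		if (fd_empty(f))
			return -EBADF;

		return do_something(fd_file(f));	/* hypothetical */
	}

Because the put is implicit, error paths can simply return, which is what lets the out:/fdput() labels above be deleted.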
@@ -6152,41 +6277,6 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
 
-static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
-{
-	struct perf_event *event = vmf->vma->vm_file->private_data;
-	struct perf_buffer *rb;
-	vm_fault_t ret = VM_FAULT_SIGBUS;
-
-	if (vmf->flags & FAULT_FLAG_MKWRITE) {
-		if (vmf->pgoff == 0)
-			ret = 0;
-		return ret;
-	}
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
-	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
-		goto unlock;
-
-	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
-	if (!vmf->page)
-		goto unlock;
-
-	get_page(vmf->page);
-	vmf->page->mapping = vmf->vma->vm_file->f_mapping;
-	vmf->page->index   = vmf->pgoff;
-
-	ret = 0;
-unlock:
-	rcu_read_unlock();
-
-	return ret;
-}
-
 static void ring_buffer_attach(struct perf_event *event,
 			       struct perf_buffer *rb)
 {
@@ -6329,12 +6419,11 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 		event->pmu->event_unmapped(event, vma->vm_mm);
 
 	/*
-	 * rb->aux_mmap_count will always drop before rb->mmap_count and
-	 * event->mmap_count, so it is ok to use event->mmap_mutex to
-	 * serialize with perf_mmap here.
+	 * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
+	 * to avoid complications.
	 */
 	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
-	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
 		/*
		 * Stop all AUX events that are writing to this buffer,
		 * so that we can free its AUX pages and corresponding PMU
@@ -6351,7 +6440,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 		rb_free_aux(rb);
 		WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
 
-		mutex_unlock(&event->mmap_mutex);
+		mutex_unlock(&rb->aux_mutex);
 	}
 
 	if (atomic_dec_and_test(&rb->mmap_count))
@@ -6427,18 +6516,93 @@ out_put:
 	ring_buffer_put(rb); /* could be last */
 }
 
+static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
+{
+	/* The first page is the user control page, others are read-only. */
+	return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
+}
+
 static const struct vm_operations_struct perf_mmap_vmops = {
 	.open		= perf_mmap_open,
 	.close		= perf_mmap_close, /* non mergeable */
-	.fault		= perf_mmap_fault,
-	.page_mkwrite	= perf_mmap_fault,
+	.pfn_mkwrite	= perf_mmap_pfn_mkwrite,
 };
 
+static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
+{
+	unsigned long nr_pages = vma_pages(vma);
+	int err = 0;
+	unsigned long pagenum;
+
+	/*
+	 * We map this as a VM_PFNMAP VMA.
+	 *
+	 * This is not ideal as this is designed broadly for mappings of PFNs
+	 * referencing memory-mapped I/O ranges or non-system RAM i.e. for which
+	 * !pfn_valid(pfn).
+	 *
+	 * We are mapping kernel-allocated memory (memory we manage ourselves)
+	 * which would more ideally be mapped using vm_insert_page() or a
+	 * similar mechanism, that is as a VM_MIXEDMAP mapping.
+	 *
+	 * However this won't work here, because:
+	 *
+	 * 1. It uses vma->vm_page_prot, but this field has not been completely
+	 *    setup at the point of the f_op->mmap() hook, so we are unable to
+	 *    indicate that this should be mapped CoW in order that the
+	 *    mkwrite() hook can be invoked to make the first page R/W and the
+	 *    rest R/O as desired.
+	 *
+	 * 2. Anything other than a VM_PFNMAP of valid PFNs will result in
+	 *    vm_normal_page() returning a struct page * pointer, which means
+	 *    vm_ops->page_mkwrite() will be invoked rather than
+	 *    vm_ops->pfn_mkwrite(), and this means we have to set page->mapping
+	 *    to work around retry logic in the fault handler, however this
+	 *    field is no longer allowed to be used within struct page.
+	 *
+	 * 3. Having a struct page * made available in the fault logic also
+	 *    means that the page gets put on the rmap and becomes
+	 *    inappropriately accessible and subject to map and ref counting.
+	 *
+	 * Ideally we would have a mechanism that could explicitly express our
+	 * desires, but this is not currently the case, so we instead use
+	 * VM_PFNMAP.
+	 *
+	 * We manage the lifetime of these mappings with internal refcounts (see
+	 * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of
+	 * this mapping is maintained correctly.
+	 */
+	for (pagenum = 0; pagenum < nr_pages; pagenum++) {
+		unsigned long va = vma->vm_start + PAGE_SIZE * pagenum;
+		struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);
+
+		if (page == NULL) {
+			err = -EINVAL;
+			break;
+		}
+
+		/* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
+		err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
+				      vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
+		if (err)
+			break;
+	}
+
+#ifdef CONFIG_MMU
+	/* Clear any partial mappings on error. */
+	if (err)
+		zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
+#endif
+
+	return err;
+}
+
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_event *event = file->private_data;
 	unsigned long user_locked, user_lock_limit;
 	struct user_struct *user = current_user();
+	struct mutex *aux_mutex = NULL;
 	struct perf_buffer *rb = NULL;
 	unsigned long locked, lock_limit;
 	unsigned long vma_size;
@@ -6477,6 +6641,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	nr_pages = vma_size / PAGE_SIZE;
+	if (nr_pages > INT_MAX)
+		return -ENOMEM;
 
 	mutex_lock(&event->mmap_mutex);
 	ret = -EINVAL;
@@ -6485,6 +6651,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		if (!rb)
 			goto aux_unlock;
 
+		aux_mutex = &rb->aux_mutex;
+		mutex_lock(aux_mutex);
+
 		aux_offset = READ_ONCE(rb->user_page->aux_offset);
 		aux_size = READ_ONCE(rb->user_page->aux_size);
 
@@ -6552,6 +6721,8 @@ again:
 			goto again;
 		}
 
+		/* We need the rb to map pages. */
+		rb = event->rb;
 		goto unlock;
 	}
 
@@ -6635,6 +6806,8 @@ unlock:
 		atomic_dec(&rb->mmap_count);
 	}
aux_unlock:
+	if (aux_mutex)
+		mutex_unlock(aux_mutex);
 	mutex_unlock(&event->mmap_mutex);
 
 	/*
@@ -6644,6 +6817,9 @@ aux_unlock:
 	vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
 	vma->vm_ops = &perf_mmap_vmops;
 
+	if (!ret)
+		ret = map_range(rb, vma);
+
 	if (event->pmu->event_mapped)
 		event->pmu->event_mapped(event, vma->vm_mm);
 
@@ -6667,7 +6843,6 @@ static int perf_fasync(int fd, struct file *filp, int on)
 }
 
 static const struct file_operations perf_fops = {
-	.llseek			= no_llseek,
 	.release		= perf_release,
 	.read			= perf_read,
 	.poll			= perf_poll,
@@ -6684,14 +6859,6 @@ static const struct file_operations perf_fops = {
 * to user-space before waking everybody up.
 */
 
-static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
-{
-	/* only the parent has fasync state */
-	if (event->parent)
-		event = event->parent;
-	return &event->fasync;
-}
-
 void perf_event_wakeup(struct perf_event *event)
 {
 	ring_buffer_wakeup(event);
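From user space the mapping rules are unchanged by the switch to up-front VM_PFNMAP mapping: page 0 (struct perf_event_mmap_page) is the only writable page, so the reader can still update data_tail. Typical setup (sketch of standard perf_event_open(2) usage; a 4 KiB page size is assumed):

	#include <sys/mman.h>
	#include <linux/perf_event.h>

	/* Map 1 control page + 2^n data pages, as perf requires. */
	static struct perf_event_mmap_page *map_ring(int perf_fd, size_t data_pages)
	{
		size_t len = (1 + data_pages) * 4096;
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_SHARED, perf_fd, 0);

		if (p == MAP_FAILED)
			return NULL;
		/* Writes to page 0 (e.g. data_tail) are allowed via
		 * perf_mmap_pfn_mkwrite(); writes elsewhere fault. */
		return p;
	}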
@@ -6726,7 +6893,7 @@ static void perf_sigtrap(struct perf_event *event)
 /*
 * Deliver the pending work in-event-context or follow the context.
 */
-static void __perf_pending_irq(struct perf_event *event)
+static void __perf_pending_disable(struct perf_event *event)
 {
 	int cpu = READ_ONCE(event->oncpu);
 
@@ -6741,11 +6908,6 @@ static void __perf_pending_irq(struct perf_event *event)
	 * Yay, we hit home and are in the context of the event.
	 */
 	if (cpu == smp_processor_id()) {
-		if (event->pending_sigtrap) {
-			event->pending_sigtrap = 0;
-			perf_sigtrap(event);
-			local_dec(&event->ctx->nr_pending);
-		}
 		if (event->pending_disable) {
 			event->pending_disable = 0;
 			perf_event_disable_local(event);
@@ -6769,11 +6931,26 @@ static void __perf_pending_irq(struct perf_event *event)
	 *				  irq_work_queue(); // FAILS
	 *
	 *  irq_work_run()
-	 *    perf_pending_irq()
+	 *    perf_pending_disable()
	 *
	 * But the event runs on CPU-B and wants disabling there.
	 */
-	irq_work_queue_on(&event->pending_irq, cpu);
+	irq_work_queue_on(&event->pending_disable_irq, cpu);
+}
+
+static void perf_pending_disable(struct irq_work *entry)
+{
+	struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq);
+	int rctx;
+
+	/*
+	 * If we 'fail' here, that's OK, it means recursion is already disabled
+	 * and we won't recurse 'further'.
+	 */
+	rctx = perf_swevent_get_recursion_context();
+	__perf_pending_disable(event);
+	if (rctx >= 0)
+		perf_swevent_put_recursion_context(rctx);
 }
 
 static void perf_pending_irq(struct irq_work *entry)
@@ -6796,8 +6973,6 @@ static void perf_pending_irq(struct irq_work *entry)
 		perf_event_wakeup(event);
 	}
 
-	__perf_pending_irq(event);
-
 	if (rctx >= 0)
 		perf_swevent_put_recursion_context(rctx);
 }
@@ -6808,23 +6983,27 @@ static void perf_pending_task(struct callback_head *head)
 	int rctx;
 
 	/*
+	 * All accesses to the event must belong to the same implicit RCU read-side
+	 * critical section as the ->pending_work reset. See comment in
+	 * perf_pending_task_sync().
+	 */
+	rcu_read_lock();
+	/*
	 * If we 'fail' here, that's OK, it means recursion is already disabled
	 * and we won't recurse 'further'.
	 */
-	preempt_disable_notrace();
 	rctx = perf_swevent_get_recursion_context();
 	if (event->pending_work) {
 		event->pending_work = 0;
 		perf_sigtrap(event);
-		local_dec(&event->ctx->nr_pending);
+		local_dec(&event->ctx->nr_no_switch_fast);
+		rcuwait_wake_up(&event->pending_work_wait);
 	}
+	rcu_read_unlock();
 
 	if (rctx >= 0)
 		perf_swevent_put_recursion_context(rctx);
-	preempt_enable_notrace();
-
-	put_event(event);
 }
 
 #ifdef CONFIG_GUEST_PERF_EVENTS
@@ -6865,6 +7044,29 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
 #endif
 
+static bool should_sample_guest(struct perf_event *event)
+{
+	return !event->attr.exclude_guest && perf_guest_state();
+}
+
+unsigned long perf_misc_flags(struct perf_event *event,
+			      struct pt_regs *regs)
+{
+	if (should_sample_guest(event))
+		return perf_arch_guest_misc_flags(regs);
+
+	return perf_arch_misc_flags(regs);
+}
+
+unsigned long perf_instruction_pointer(struct perf_event *event,
+				       struct pt_regs *regs)
+{
+	if (should_sample_guest(event))
+		return perf_guest_get_ip();
+
+	return perf_arch_instruction_pointer(regs);
+}
+
 static void perf_output_sample_regs(struct perf_output_handle *handle,
				    struct pt_regs *regs, u64 mask)
@@ -7200,7 +7402,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
 	u64 values[5];
 	int n = 0;
 
-	values[n++] = perf_event_count(event);
+	values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
 		values[n++] = enabled +
 			atomic64_read(&event->child_total_time_enabled);
@@ -7218,14 +7420,15 @@ static void perf_output_read_one(struct perf_output_handle *handle,
 }
 
 static void perf_output_read_group(struct perf_output_handle *handle,
-					struct perf_event *event,
-					u64 enabled, u64 running)
+				   struct perf_event *event,
+				   u64 enabled, u64 running)
 {
 	struct perf_event *leader = event->group_leader, *sub;
 	u64 read_format = event->attr.read_format;
 	unsigned long flags;
 	u64 values[6];
 	int n = 0;
+	bool self = has_inherit_and_sample_read(&event->attr);
 
 	/*
	 * Disabling interrupts avoids all counter scheduling
@@ -7245,7 +7448,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
	    (leader->state == PERF_EVENT_STATE_ACTIVE))
 		leader->pmu->read(leader);
 
-	values[n++] = perf_event_count(leader);
+	values[n++] = perf_event_count(leader, self);
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
 	if (read_format & PERF_FORMAT_LOST)
@@ -7260,7 +7463,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
		    (sub->state == PERF_EVENT_STATE_ACTIVE))
 			sub->pmu->read(sub);
 
-		values[n++] = perf_event_count(sub);
+		values[n++] = perf_event_count(sub, self);
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
 		if (read_format & PERF_FORMAT_LOST)
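The values[] tuples written above follow the PERF_FORMAT_GROUP layout documented in perf_event_open(2); schematically:

	struct read_format_group {
		__u64 nr;		/* number of events in the group */
		__u64 time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
		__u64 time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
		struct {
			__u64 value;	/* self-only when inherit + PERF_SAMPLE_READ */
			__u64 id;	/* PERF_FORMAT_ID */
			__u64 lost;	/* PERF_FORMAT_LOST */
		} values[];
	};

The new `self` flag selects local64_read(&event->count) without the accumulated child_count, which is what enables the per-thread semantics described in the comment that follows.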
*/ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) @@ -7539,7 +7746,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) { u64 size = 0; -#ifdef CONFIG_HAVE_FAST_GUP +#ifdef CONFIG_HAVE_GUP_FAST pgd_t *pgdp, pgd; p4d_t *p4dp, p4d; pud_t *pudp, pud; @@ -7585,9 +7792,9 @@ again: pte = ptep_get_lockless(ptep); if (pte_present(pte)) - size = pte_leaf_size(pte); + size = __pte_leaf_size(pmd, pte); pte_unmap(ptep); -#endif /* CONFIG_HAVE_FAST_GUP */ +#endif /* CONFIG_HAVE_GUP_FAST */ return size; } @@ -7677,7 +7884,7 @@ void perf_prepare_sample(struct perf_sample_data *data, __perf_event_header__init_id(data, event, filtered_sample_type); if (filtered_sample_type & PERF_SAMPLE_IP) { - data->ip = perf_instruction_pointer(regs); + data->ip = perf_instruction_pointer(event, regs); data->sample_flags |= PERF_SAMPLE_IP; } @@ -7841,7 +8048,7 @@ void perf_prepare_header(struct perf_event_header *header, { header->type = PERF_RECORD_SAMPLE; header->size = perf_sample_data_size(data, event); - header->misc = perf_misc_flags(regs); + header->misc = perf_misc_flags(event, regs); /* * If you're adding more sample types here, you likely need to do @@ -7854,6 +8061,49 @@ void perf_prepare_header(struct perf_event_header *header, WARN_ON_ONCE(header->size & 7); } +static void __perf_event_aux_pause(struct perf_event *event, bool pause) +{ + if (pause) { + if (!event->hw.aux_paused) { + event->hw.aux_paused = 1; + event->pmu->stop(event, PERF_EF_PAUSE); + } + } else { + if (event->hw.aux_paused) { + event->hw.aux_paused = 0; + event->pmu->start(event, PERF_EF_RESUME); + } + } +} + +static void perf_event_aux_pause(struct perf_event *event, bool pause) +{ + struct perf_buffer *rb; + + if (WARN_ON_ONCE(!event)) + return; + + rb = ring_buffer_get(event); + if (!rb) + return; + + scoped_guard (irqsave) { + /* + * Guard against self-recursion here. Another event could trip + * this same from NMI context. 
+ */ + if (READ_ONCE(rb->aux_in_pause_resume)) + break; + + WRITE_ONCE(rb->aux_in_pause_resume, 1); + barrier(); + __perf_event_aux_pause(event, pause); + barrier(); + WRITE_ONCE(rb->aux_in_pause_resume, 0); + } + ring_buffer_put(rb); +} + static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, @@ -8801,7 +9051,7 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; if (atomic_read(&nr_build_id_events)) - build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size); + build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size); perf_iterate_sb(perf_event_mmap_output, mmap_event, @@ -9089,7 +9339,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - if (!sched_in && task->on_rq) { + if (!sched_in && task_is_runnable(task)) { switch_event.event_id.header.misc |= PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; } @@ -9278,21 +9528,19 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; int i; - if (prog->aux->func_cnt == 0) { - perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, - (u64)(unsigned long)prog->bpf_func, - prog->jited_len, unregister, - prog->aux->ksym.name); - } else { - for (i = 0; i < prog->aux->func_cnt; i++) { - struct bpf_prog *subprog = prog->aux->func[i]; - - perf_event_ksymbol( - PERF_RECORD_KSYMBOL_TYPE_BPF, - (u64)(unsigned long)subprog->bpf_func, - subprog->jited_len, unregister, - subprog->aux->ksym.name); - } + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)prog->bpf_func, + prog->jited_len, unregister, + prog->aux->ksym.name); + + for (i = 1; i < prog->aux->func_cnt; i++) { + struct bpf_prog *subprog = prog->aux->func[i]; + + perf_event_ksymbol( + PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)subprog->bpf_func, + subprog->jited_len, unregister, + subprog->aux->ksym.name); } } @@ -9544,6 +9792,100 @@ static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *r return true; } +#ifdef CONFIG_BPF_SYSCALL +static int bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct bpf_perf_event_data_kern ctx = { + .data = data, + .event = event, + }; + struct bpf_prog *prog; + int ret = 0; + + ctx.regs = perf_arch_bpf_user_pt_regs(regs); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) + goto out; + rcu_read_lock(); + prog = READ_ONCE(event->prog); + if (prog) { + perf_prepare_sample(data, event, regs); + ret = bpf_prog_run(prog, &ctx); + } + rcu_read_unlock(); +out: + __this_cpu_dec(bpf_prog_active); + + return ret; +} + +static inline int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + if (event->overflow_handler_context) + /* hw breakpoint or kernel counter */ + return -EINVAL; + + if (event->prog) + return -EEXIST; + + if (prog->type != BPF_PROG_TYPE_PERF_EVENT) + return -EINVAL; + + if (event->attr.precise_ip && + prog->call_get_stack && + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || + event->attr.exclude_callchain_kernel || + event->attr.exclude_callchain_user)) { + /* + * On perf_event with precise_ip, calling bpf_get_stack() + * may trigger unwinder warnings and occasional crashes. + * bpf_get_[stack|stackid] works around this issue by using + * callchain attached to perf_sample_data. 
If the + * perf_event does not have a full (kernel and user) callchain + * attached to perf_sample_data, do not allow attaching a BPF + * program that calls bpf_get_[stack|stackid]. + */ + return -EPROTO; + } + + event->prog = prog; + event->bpf_cookie = bpf_cookie; + return 0; +} + +static inline void perf_event_free_bpf_handler(struct perf_event *event) +{ + struct bpf_prog *prog = event->prog; + + if (!prog) + return; + + event->prog = NULL; + bpf_prog_put(prog); +} +#else +static inline int bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return 1; +} + +static inline int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + return -EOPNOTSUPP; +} + +static inline void perf_event_free_bpf_handler(struct perf_event *event) +{ +} +#endif + /* * Generic event overflow handling, sampling. */ @@ -9564,6 +9906,13 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); + if (event->attr.aux_pause) + perf_event_aux_pause(event->aux_event, true); + + if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && + !bpf_overflow_handler(event, data, regs)) + goto out; + /* * XXX event_limit might not quite work as expected on inherited * events @@ -9585,16 +9934,26 @@ static int __perf_event_overflow(struct perf_event *event, */ bool valid_sample = sample_is_allowed(event, regs); unsigned int pending_id = 1; + enum task_work_notify_mode notify_mode; if (regs) pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; - if (!event->pending_sigtrap) { - event->pending_sigtrap = pending_id; - local_inc(&event->ctx->nr_pending); + + notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME; + + if (!event->pending_work && + !task_work_add(current, &event->pending_task, notify_mode)) { + event->pending_work = pending_id; + local_inc(&event->ctx->nr_no_switch_fast); + + event->pending_addr = 0; + if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) + event->pending_addr = data->addr; + } else if (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without - * consuming pending_sigtrap; with exceptions: + * consuming pending_work; with exceptions: * * 1. Where !exclude_kernel, events can overflow again * in the kernel without returning to user space. @@ -9604,13 +9963,8 @@ static int __perf_event_overflow(struct perf_event *event, * To approximate progress (with false negatives), * check 32-bit hash of the current IP. */ - WARN_ON_ONCE(event->pending_sigtrap != pending_id); + WARN_ON_ONCE(event->pending_work != pending_id); } - - event->pending_addr = 0; - if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) - event->pending_addr = data->addr; - irq_work_queue(&event->pending_irq); } READ_ONCE(event->overflow_handler)(event, data, regs); @@ -9619,6 +9973,9 @@ static int __perf_event_overflow(struct perf_event *event, event->pending_wakeup = 1; irq_work_queue(&event->pending_irq); } +out: + if (event->attr.aux_resume) + perf_event_aux_pause(event->aux_event, false); return ret; }
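The switch from pending_sigtrap/irq_work to pending_work/task_work in the hunk above (with TWA_NMI_CURRENT when the overflow fires in NMI context) serves the attr.sigtrap facility: the overflow surfaces in the target task as a synchronous SIGTRAP once it returns to user space. A self-contained userspace sketch of the consumer side — assuming a libc recent enough to expose si_perf_data in siginfo_t; the event and period choices are arbitrary:

/* sigtrap_self.c: receive SIGTRAP on counter overflow (sketch). */
#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static volatile sig_atomic_t hits;

static void on_trap(int sig, siginfo_t *info, void *ucontext)
{
	/* attr.sig_data is delivered back as info->si_perf_data */
	hits++;
}

int main(void)
{
	struct perf_event_attr attr = { 0 };
	struct sigaction sa = { 0 };
	volatile char *buf;
	int fd, i;

	sa.sa_sigaction = on_trap;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGTRAP, &sa, NULL);

	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_PAGE_FAULTS;
	attr.sample_period = 10;
	attr.sigtrap = 1;		/* synchronous SIGTRAP to this task */
	attr.remove_on_exec = 1;	/* required when sigtrap is set */
	attr.sig_data = 0xabcd;		/* round-trips into si_perf_data */

	fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	buf = malloc(1 << 22);
	for (i = 0; i < (1 << 22); i += 4096)
		buf[i] = 1;	/* fault in pages to generate overflows */

	printf("SIGTRAPs delivered: %d\n", (int)hits);
	close(fd);
	return 0;
}

The pending_id hash exists only so the kernel can detect, with false negatives, whether the task made user-space progress between two overflows; TWA_NMI_CURRENT is what makes queuing the task_work safe from an NMI-context overflow.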
@@ -9638,11 +9995,7 @@ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; - - /* Recursion avoidance in each contexts */ - int recursion[PERF_NR_CONTEXTS]; }; - static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* @@ -9730,8 +10083,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, data, regs); } -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) +int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; @@ -9840,17 +10192,13 @@ DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - return get_recursion_context(swhash->recursion); + return get_recursion_context(current->perf_recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - put_recursion_context(swhash->recursion, rctx); + put_recursion_context(current->perf_recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) @@ -10120,9 +10468,9 @@ static struct pmu perf_tracepoint = { }; static int perf_tp_filter_match(struct perf_event *event, - struct perf_sample_data *data) + struct perf_raw_record *raw) { - void *record = data->raw->frag.data; + void *record = raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -10134,7 +10482,7 @@ static int perf_tp_filter_match(struct perf_event *event, } static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data, + struct perf_raw_record *raw, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) @@ -10145,7 +10493,7 @@ static int perf_tp_event_match(struct perf_event *event, if (event->attr.exclude_kernel && !user_mode(regs)) return 0; - if (!perf_tp_filter_match(event, data)) + if (!perf_tp_filter_match(event, raw)) return 0; return 1; @@ -10171,6 +10519,7 @@ EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); static void __perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, + struct perf_raw_record *raw, struct perf_event *event) { struct trace_entry *entry = record; @@ -10180,13 +10529,17 @@ static void __perf_tp_event_target_task(u64 count, void *record, /* Cannot deliver synchronous signal to other task. 
*/ if (event->attr.sigtrap) return; - if (perf_tp_event_match(event, data, regs)) + if (perf_tp_event_match(event, raw, regs)) { + perf_sample_data_init(data, 0, 0); + perf_sample_save_raw_data(data, event, raw); perf_swevent_event(event, count, data, regs); + } } static void perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, + struct perf_raw_record *raw, struct perf_event_context *ctx) { unsigned int cpu = smp_processor_id(); @@ -10194,15 +10547,15 @@ static void perf_tp_event_target_task(u64 count, void *record, struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { - __perf_tp_event_target_task(count, record, regs, data, event); + __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) - __perf_tp_event_target_task(count, record, regs, data, sibling); + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { - __perf_tp_event_target_task(count, record, regs, data, event); + __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) - __perf_tp_event_target_task(count, record, regs, data, sibling); + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } } @@ -10220,15 +10573,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, }, }; - perf_sample_data_init(&data, 0, 0); - perf_sample_save_raw_data(&data, &raw); - perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) { - perf_swevent_event(event, count, &data, regs); - + if (perf_tp_event_match(event, &raw, regs)) { /* * Here use the same on-stack perf_sample_data, * some members in data are event-specific and @@ -10238,7 +10586,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, * because data->sample_flags is set. 
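The re-init inside the loop matters because perf_sample_save_raw_data() is now event-aware and records what it saved in data->sample_flags; carrying those flags from one event to the next would make later events silently skip saving their raw data. The hazard, reduced to a standalone toy (names are illustrative, not kernel API):

/* stale_flags.c: toy model of the stale-flag hazard described above. */
#include <stdint.h>
#include <stdio.h>

struct sample {
	uint64_t flags;		/* which fields are already populated */
	uint64_t raw;
};

#define SAMPLE_RAW 0x1

/* mimics perf_sample_save_raw_data(): skips work if RAW already set */
static void save_raw(struct sample *s, int want_raw, uint64_t payload)
{
	if (!want_raw || (s->flags & SAMPLE_RAW))
		return;
	s->raw = payload;
	s->flags |= SAMPLE_RAW;
}

int main(void)
{
	struct sample s = { 0, 0 };
	int wants_raw[3] = { 1, 0, 1 };	/* per-event sample_type differs */

	for (int i = 0; i < 3; i++) {
		s.flags = 0;	/* the per-event re-init the loop now does */
		save_raw(&s, wants_raw[i], 0x1000 + i);
		printf("event %d: flags=%llx\n", i, (unsigned long long)s.flags);
	}
	return 0;
}

Resetting the flags each iteration, as the hunk does with perf_sample_data_init(), keeps the shared on-stack sample usable across all matching events.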
*/ perf_sample_data_init(&data, 0, 0); - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); + perf_swevent_event(event, count, &data, regs); } } @@ -10255,7 +10604,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; raw_spin_lock(&ctx->lock); - perf_tp_event_target_task(count, record, regs, &data, ctx); + perf_tp_event_target_task(count, record, regs, &data, &raw, ctx); raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); @@ -10422,97 +10771,6 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } -#ifdef CONFIG_BPF_SYSCALL -static void bpf_overflow_handler(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct bpf_perf_event_data_kern ctx = { - .data = data, - .event = event, - }; - struct bpf_prog *prog; - int ret = 0; - - ctx.regs = perf_arch_bpf_user_pt_regs(regs); - if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) - goto out; - rcu_read_lock(); - prog = READ_ONCE(event->prog); - if (prog) { - perf_prepare_sample(data, event, regs); - ret = bpf_prog_run(prog, &ctx); - } - rcu_read_unlock(); -out: - __this_cpu_dec(bpf_prog_active); - if (!ret) - return; - - event->orig_overflow_handler(event, data, regs); -} - -static int perf_event_set_bpf_handler(struct perf_event *event, - struct bpf_prog *prog, - u64 bpf_cookie) -{ - if (event->overflow_handler_context) - /* hw breakpoint or kernel counter */ - return -EINVAL; - - if (event->prog) - return -EEXIST; - - if (prog->type != BPF_PROG_TYPE_PERF_EVENT) - return -EINVAL; - - if (event->attr.precise_ip && - prog->call_get_stack && - (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || - event->attr.exclude_callchain_kernel || - event->attr.exclude_callchain_user)) { - /* - * On perf_event with precise_ip, calling bpf_get_stack() - * may trigger unwinder warnings and occasional crashes. - * bpf_get_[stack|stackid] works around this issue by using - * callchain attached to perf_sample_data. If the - * perf_event does not full (kernel and user) callchain - * attached to perf_sample_data, do not allow attaching BPF - * program that calls bpf_get_[stack|stackid]. 
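The block removed here is the old indirection: attaching a BPF program used to swap event->overflow_handler and stash the original in orig_overflow_handler. With bpf_overflow_handler() now called explicitly from __perf_event_overflow() (see the earlier hunk), the handler pointer never changes, which is also why the perf_event_alloc() hunk below only needs to copy parent_event->prog for inherited events. The two dispatch shapes, reduced to plain C (illustrative only, not kernel code):

/* dispatch.c: explicit BPF-filter test vs. handler-pointer swapping. */
#include <stdio.h>

struct event;
typedef void (*handler_t)(struct event *);

struct event {
	handler_t overflow_handler;	/* always the generic path now */
	int (*prog)(struct event *);	/* BPF filter, may be NULL */
};

static int prog_drop(struct event *e) { return 0; }	/* filter says drop */
static void generic(struct event *e) { puts("deliver sample"); }

/* new shape: one explicit test, no pointer swap to undo on detach */
static void overflow(struct event *e)
{
	if (e->prog && !e->prog(e))
		return;			/* BPF consumed/filtered the event */
	e->overflow_handler(e);
}

int main(void)
{
	struct event plain = { generic, NULL };
	struct event filtered = { generic, prog_drop };

	overflow(&plain);	/* prints */
	overflow(&filtered);	/* filtered out */
	return 0;
}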
- */ - return -EPROTO; - } - - event->prog = prog; - event->bpf_cookie = bpf_cookie; - event->orig_overflow_handler = READ_ONCE(event->overflow_handler); - WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); - return 0; -} - -static void perf_event_free_bpf_handler(struct perf_event *event) -{ - struct bpf_prog *prog = event->prog; - - if (!prog) - return; - - WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); - event->prog = NULL; - bpf_prog_put(prog); -} -#else -static int perf_event_set_bpf_handler(struct perf_event *event, - struct bpf_prog *prog, - u64 bpf_cookie) -{ - return -EOPNOTSUPP; -} -static void perf_event_free_bpf_handler(struct perf_event *event) -{ -} -#endif - /* * returns true if the event is a tracepoint, or a kprobe/upprobe created * with perf_event_open() @@ -11426,10 +11684,60 @@ perf_event_mux_interval_ms_store(struct device *dev, } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return topology_sibling_cpumask(cpu); + case PERF_PMU_SCOPE_DIE: + return topology_die_cpumask(cpu); + case PERF_PMU_SCOPE_CLUSTER: + return topology_cluster_cpumask(cpu); + case PERF_PMU_SCOPE_PKG: + return topology_core_cpumask(cpu); + case PERF_PMU_SCOPE_SYS_WIDE: + return cpu_online_mask; + } + + return NULL; +} + +static inline struct cpumask *perf_scope_cpumask(unsigned int scope) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return perf_online_core_mask; + case PERF_PMU_SCOPE_DIE: + return perf_online_die_mask; + case PERF_PMU_SCOPE_CLUSTER: + return perf_online_cluster_mask; + case PERF_PMU_SCOPE_PKG: + return perf_online_pkg_mask; + case PERF_PMU_SCOPE_SYS_WIDE: + return perf_online_sys_mask; + } + + return NULL; +} + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct cpumask *mask = perf_scope_cpumask(pmu->scope); + + if (mask) + return cpumap_print_to_pagebuf(true, buf, mask); + return 0; +} + +static DEVICE_ATTR_RO(cpumask); + static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, &dev_attr_nr_addr_filters.attr, + &dev_attr_cpumask.attr, NULL, }; @@ -11441,6 +11749,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int if (n == 2 && !pmu->nr_addr_filters) return 0; + /* cpumask */ + if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) + return 0; + return a->mode; } @@ -11525,6 +11837,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) goto free_pdc; } + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { + ret = -EINVAL; + goto free_pdc; + } + pmu->name = name; if (type >= 0) @@ -11679,6 +11996,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) event_has_any_exclude_flag(event)) ret = -EINVAL; + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); + int cpu; + + if (pmu_cpumask && cpumask) { + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + ret = -ENODEV; + else + event->event_caps |= PERF_EV_CAP_READ_SCOPE; + } else { + ret = -ENODEV; + } + } + if (ret && event->destroy) event->destroy(event); } @@ -11931,7 +12264,9 @@ perf_event_alloc(struct 
perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); + event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); + rcuwait_init(&event->pending_work_wait); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -11971,13 +12306,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) - if (overflow_handler == bpf_overflow_handler) { + if (parent_event->prog) { struct bpf_prog *prog = parent_event->prog; bpf_prog_inc(prog); event->prog = prog; - event->orig_overflow_handler = - parent_event->orig_overflow_handler; } #endif } @@ -12006,10 +12339,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, local64_set(&hwc->period_left, hwc->sample_period); /* - * We currently do not support PERF_SAMPLE_READ on inherited events. + * We do not support PERF_SAMPLE_READ on inherited events unless + * PERF_SAMPLE_TID is also selected, which allows inherited events to + * collect per-thread samples. * See perf_output_read(). */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) + if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) goto err_ns; if (!has_branch_stack(event)) @@ -12032,11 +12367,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } if (event->attr.aux_output && - !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { + (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || + event->attr.aux_pause || event->attr.aux_resume)) { err = -EOPNOTSUPP; goto err_pmu; } + if (event->attr.aux_pause && event->attr.aux_resume) { + err = -EINVAL; + goto err_pmu; + } + + if (event->attr.aux_start_paused) { + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { + err = -EOPNOTSUPP; + goto err_pmu; + } + event->hw.aux_paused = 1; + } + if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) @@ -12423,7 +12772,6 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; - struct fd group = {NULL, 0}; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; @@ -12494,11 +12842,13 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { - err = perf_fget_light(group_fd, &group); - if (err) + if (!is_perf_file(group)) { + err = -EBADF; goto err_fd; - group_leader = group.file->private_data; + } + group_leader = fd_file(group)->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -12509,7 +12859,7 @@ SYSCALL_DEFINE5(perf_event_open, task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); - goto err_group_fd; + goto err_fd; } } @@ -12776,12 +13126,11 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(¤t->perf_event_mutex); /* - * Drop the reference on the group_event after placing the - * new event on the sibling_list. This ensures destruction - * of the group leader will find the pointer to itself in - * perf_group_detach(). + * File reference in group guarantees that group_leader has been + * kept alive until we place the new event on the sibling_list. 
+ * This ensures destruction of the group leader will find + * the pointer to itself in perf_group_detach(). */ - fdput(group); fd_install(event_fd, event_file); return event_fd; @@ -12800,8 +13149,6 @@ err_alloc: err_task: if (task) put_task_struct(task); -err_group_fd: - fdput(group); err_fd: put_unused_fd(event_fd); return err; @@ -12832,7 +13179,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, * Grouping is not supported for kernel events, neither is 'AUX', * make sure the caller's intentions are adjusted. */ - if (attr->aux_output) + if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, @@ -13033,7 +13380,7 @@ static void sync_child_event(struct perf_event *child_event) perf_event_read_event(child_event, task); } - child_val = perf_event_count(child_event); + child_val = perf_event_count(child_event, false); /* * Add back the child's count to the parent's count: @@ -13124,7 +13471,7 @@ static void perf_event_exit_task_context(struct task_struct *child) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(child_ctx, EVENT_ALL); + task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation @@ -13300,6 +13647,15 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) return &event->attr; } +int perf_allow_kernel(struct perf_event_attr *attr) +{ + if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) + return -EACCES; + + return security_perf_event_open(attr, PERF_SECURITY_KERNEL); +} +EXPORT_SYMBOL_GPL(perf_allow_kernel); + /* * Inherit an event from parent task to child task. * @@ -13609,6 +13965,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { int ret; + memset(child->perf_recursion, 0, sizeof(child->perf_recursion)); child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); @@ -13629,6 +13986,12 @@ static void __init perf_event_init_all_cpus(void) int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); @@ -13672,12 +14035,46 @@ static void __perf_event_exit_context(void *__info) struct perf_event *event; raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, EVENT_TIME); + ctx_sched_out(ctx, NULL, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } +static void perf_event_clear_cpumask(unsigned int cpu) +{ + int target[PERF_PMU_MAX_SCOPE]; + unsigned int scope; + struct pmu *pmu; + + cpumask_clear_cpu(cpu, perf_online_mask); + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); + + target[scope] = -1; + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) + continue; + target[scope] = cpumask_any_but(cpumask, cpu); + if (target[scope] < nr_cpu_ids) + cpumask_set_cpu(target[scope], pmu_cpumask); + } + + /* migrate */ + 
list_for_each_entry(pmu, &pmus, entry) { + if (pmu->scope == PERF_PMU_SCOPE_NONE || + WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) + continue; + + if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) + perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); + } +} + static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; @@ -13685,6 +14082,11 @@ static void perf_event_exit_cpu_context(int cpu) // XXX simplify cpuctx->online mutex_lock(&pmus_lock); + /* + * Clear the cpumasks, and migrate to other CPUs if possible. + * Must be invoked before __perf_event_exit_context(). + */ + perf_event_clear_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; @@ -13692,7 +14094,6 @@ static void perf_event_exit_cpu_context(int cpu) smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); - cpumask_clear_cpu(cpu, perf_online_mask); mutex_unlock(&pmus_lock); } #else @@ -13701,6 +14102,42 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +static void perf_event_setup_cpumask(unsigned int cpu) +{ + struct cpumask *pmu_cpumask; + unsigned int scope; + + /* + * Early boot stage, the cpumask hasn't been set yet. + * The perf_online_<domain>_masks includes the first CPU of each domain. + * Always unconditionally set the boot CPU for the perf_online_<domain>_masks. + */ + if (cpumask_empty(perf_online_mask)) { + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + pmu_cpumask = perf_scope_cpumask(scope); + if (WARN_ON_ONCE(!pmu_cpumask)) + continue; + cpumask_set_cpu(cpu, pmu_cpumask); + } + goto end; + } + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + pmu_cpumask = perf_scope_cpumask(scope); + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + if (!cpumask_empty(cpumask) && + cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) + cpumask_set_cpu(cpu, pmu_cpumask); + } end: + cpumask_set_cpu(cpu, perf_online_mask); +} + int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; @@ -13709,7 +14146,7 @@ int perf_event_init_cpu(unsigned int cpu) perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); - cpumask_set_cpu(cpu, perf_online_mask); + perf_event_setup_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx;
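With PMU scopes in place, a registered PMU advertises its currently usable CPUs through the new sysfs cpumask attribute added earlier in this patch. A trivial reader — the PMU name is only an example; any PMU registered with a PERF_PMU_SCOPE_* value exposes the file:

/* read_cpumask.c: query a scoped PMU's usable CPUs (path is an example). */
#include <stdio.h>

int main(void)
{
	char buf[256];
	/* "power" is illustrative; substitute any scoped PMU's directory */
	FILE *f = fopen("/sys/bus/event_source/devices/power/cpumask", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("usable CPUs: %s", buf);
	fclose(f);
	return 0;
}

perf_event_clear_cpumask() above is the other half of the contract: on hotplug it repoints each scope mask at a surviving CPU and migrates the affected events there, so the mask a tool reads stays valid.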
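The perf_event_open() hunk further up likewise drops the manual fdput() paths in favor of CLASS(fd, group), the kernel's scope-based cleanup guard, so the group leader's file reference is released automatically on every return. The same shape can be modeled in plain C with the compiler's cleanup attribute (GCC/Clang; illustrative, not the kernel's implementation):

/* scoped_fd.c: scope-based cleanup mirroring what CLASS(fd, ...) provides. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void fd_release(int *fdp)
{
	if (*fdp >= 0)
		close(*fdp);	/* runs on every return path */
}

#define SCOPED_FD __attribute__((cleanup(fd_release))) int

int main(void)
{
	SCOPED_FD fd = open("/dev/null", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;	/* no leak: fd_release() still runs */
	}
	printf("fd %d auto-closes on scope exit\n", fd);
	return 0;		/* ditto here */
}

In the kernel the guard wraps a struct fd and calls fdput() rather than close(), but the control-flow guarantee is the same: no reference can leak on an early error return, which is why the err_group_fd label could be deleted.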