diff options
Diffstat (limited to 'kernel/events/core.c')
| -rw-r--r-- | kernel/events/core.c | 9070 |
1 files changed, 6441 insertions, 2629 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..ece716879cbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events core code: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING */ #include <linux/fs.h> @@ -29,6 +28,7 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> +#include <linux/hugetlb.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/syscalls.h> @@ -50,6 +50,13 @@ #include <linux/sched/mm.h> #include <linux/proc_ns.h> #include <linux/mount.h> +#include <linux/min_heap.h> +#include <linux/highmem.h> +#include <linux/pgtable.h> +#include <linux/buildid.h> +#include <linux/task_work.h> +#include <linux/percpu-rwsem.h> +#include <linux/unwind_deferred.h> #include "internal.h" @@ -94,11 +101,11 @@ static void remote_function(void *data) * @info: the function call argument * * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly + * be on the current CPU, which just calls the function directly. This will + * retry due to any failures in smp_call_function_single(), such as if the + * task_cpu() goes offline concurrently. 
* - * returns: @func return value, or - * -ESRCH - when the process isn't running - * -EAGAIN - when the process moved away + * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -111,17 +118,24 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) }; int ret; - do { - ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); + for (;;) { + ret = smp_call_function_single(task_cpu(p), remote_function, + &data, 1); if (!ret) ret = data.ret; - } while (ret == -EAGAIN); + + if (ret != -EAGAIN) + break; + + cond_resched(); + } return ret; } /** * cpu_function_call - call a function on the cpu + * @cpu: target cpu to queue this function * @func: the function to be called * @info: the function call argument * @@ -143,28 +157,70 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) +enum event_type_t { + EVENT_FLEXIBLE = 0x01, + EVENT_PINNED = 0x02, + EVENT_TIME = 0x04, + EVENT_FROZEN = 0x08, + /* see ctx_resched() for details */ + EVENT_CPU = 0x10, + EVENT_CGROUP = 0x20, + + /* compound helpers */ + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, + EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, +}; + +static inline void __perf_ctx_lock(struct perf_event_context *ctx) { - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); + raw_spin_lock(&ctx->lock); + WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN); } static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - raw_spin_lock(&cpuctx->ctx.lock); + __perf_ctx_lock(&cpuctx->ctx); if (ctx) - raw_spin_lock(&ctx->lock); + __perf_ctx_lock(ctx); +} + +static inline void __perf_ctx_unlock(struct perf_event_context *ctx) +{ + /* + * If ctx_sched_in() didn't again set any ALL flags, clean up + * after ctx_sched_out() by 
clearing is_active. + */ + if (ctx->is_active & EVENT_FROZEN) { + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + else + ctx->is_active &= ~EVENT_FROZEN; + } + raw_spin_unlock(&ctx->lock); } static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); + __perf_ctx_unlock(ctx); + __perf_ctx_unlock(&cpuctx->ctx); } +typedef struct { + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; +} class_perf_ctx_lock_t; + +static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T) +{ perf_ctx_unlock(_T->cpuctx, _T->ctx); } + +static inline class_perf_ctx_lock_t +class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; } + #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) @@ -172,6 +228,14 @@ static bool is_kernel_event(struct perf_event *event) return READ_ONCE(event->owner) == TASK_TOMBSTONE; } +static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +struct perf_event_context *perf_cpu_task_ctx(void) +{ + lockdep_assert_irqs_disabled(); + return this_cpu_ptr(&perf_cpu_context)->task_ctx; +} + /* * On task ctx scheduling... 
* @@ -205,11 +269,11 @@ static int event_function(void *info) struct event_function_struct *efs = info; struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); perf_ctx_lock(cpuctx, task_ctx); /* @@ -250,6 +314,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da { struct perf_event_context *ctx = event->ctx; struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ + struct perf_cpu_context *cpuctx; struct event_function_struct efs = { .event = event, .func = func, @@ -259,7 +324,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to - * stabilize the the event->ctx relation. See + * stabilize the event->ctx relation. See * perf_event_ctx_lock(). */ lockdep_assert_held(&ctx->mutex); @@ -277,22 +342,25 @@ again: if (!task_function_call(task, event_function, &efs)) return; - raw_spin_lock_irq(&ctx->lock); + local_irq_disable(); + cpuctx = this_cpu_ptr(&perf_cpu_context); + perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). 
*/ task = ctx->task; - if (task == TASK_TOMBSTONE) { - raw_spin_unlock_irq(&ctx->lock); - return; - } + if (task == TASK_TOMBSTONE) + goto unlock; if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); + perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); goto again; } func(event, NULL, ctx, data); - raw_spin_unlock_irq(&ctx->lock); +unlock: + perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); } /* @@ -302,11 +370,11 @@ again: static void event_function_local(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); if (task) { if (task == TASK_TOMBSTONE) @@ -355,18 +423,8 @@ unlock: (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV) -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_TIME = 0x4, - /* see ctx_resched() for details */ - EVENT_CPU = 0x8, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - /* * perf_sched_events : >0 events exist - * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ static void perf_sched_delayed(struct work_struct *work); @@ -375,8 +433,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; -static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -385,11 +441,22 @@ static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; +static atomic_t nr_ksymbol_events __read_mostly; +static atomic_t 
nr_bpf_events __read_mostly; +static atomic_t nr_cgroup_events __read_mostly; +static atomic_t nr_text_poke_events __read_mostly; +static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; +static cpumask_var_t perf_online_core_mask; +static cpumask_var_t perf_online_die_mask; +static cpumask_var_t perf_online_cluster_mask; +static cpumask_var_t perf_online_pkg_mask; +static cpumask_var_t perf_online_sys_mask; +static struct kmem_cache *perf_event_cache; /* * perf event paranoia level: @@ -400,8 +467,8 @@ static cpumask_var_t perf_online_mask; */ int sysctl_perf_event_paranoid __read_mostly = 2; -/* Minimum for 512 kiB + 1 user control page */ -int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ +/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */ +static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* * max perf event sample rate @@ -411,6 +478,7 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; @@ -430,24 +498,23 @@ static void update_perf_cpu_limits(void) WRITE_ONCE(perf_sample_allowed_ns, tmp); } -static int perf_rotate_context(struct perf_cpu_context *cpuctx); +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); -int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, + void *buffer, size_t 
*lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - if (ret || !write) - return ret; - + int ret; + int perf_cpu = sysctl_perf_cpu_time_max_percent; /* * If throttling is disabled don't allow the write: */ - if (sysctl_perf_cpu_time_max_percent == 100 || - sysctl_perf_cpu_time_max_percent == 0) + if (write && (perf_cpu == 100 || perf_cpu == 0)) return -EINVAL; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); @@ -455,11 +522,8 @@ int perf_proc_update_handler(struct ctl_table *table, int write, return 0; } -int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; - -int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -478,6 +542,52 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, return 0; } +static const struct ctl_table events_core_sysctl_table[] = { + /* + * User-space relies on this file as a feature check for + * perf_events being enabled. It's an ABI, do not remove! 
+ */ + { + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), + .mode = 0644, + .proc_handler = perf_event_max_sample_rate_handler, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +}; + +static int __init init_events_core_sysctls(void) +{ + register_sysctl_init("kernel", events_core_sysctl_table); + return 0; +} +core_initcall(init_events_core_sysctls); + + /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not @@ -518,7 +628,7 @@ void perf_sample_event_took(u64 sample_len_ns) __this_cpu_write(running_sample_length, running_len); /* - * Note: this will be biased artifically low until we have + * Note: this will be biased artificially low until we have * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. 
*/ @@ -555,23 +665,11 @@ void perf_sample_event_took(u64 sample_len_ns) static atomic64_t perf_event_id; -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); - static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } -extern __weak const char *perf_pmu_name(void) -{ - return "pmu"; -} - static inline u64 perf_clock(void) { return local_clock(); @@ -582,13 +680,138 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } +/* + * State based event timekeeping... + * + * The basic idea is to use event->state to determine which (if any) time + * fields to increment with the current delta. This means we only need to + * update timestamps when we change state or when they are explicitly requested + * (read). + * + * Event groups make things a little more complicated, but not terribly so. The + * rules for a group are that if the group leader is OFF the entire group is + * OFF, irrespective of what the group member states are. This results in + * __perf_effective_state(). + * + * A further ramification is that when a group leader flips between OFF and + * !OFF, we need to update all group member times. + * + * + * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we + * need to make sure the relevant context time is updated before we try and + * update our timestamps. 
+ */ + +static __always_inline enum perf_event_state +__perf_effective_state(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + + if (leader->state <= PERF_EVENT_STATE_OFF) + return leader->state; + + return event->state; +} + +static __always_inline void +__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) +{ + enum perf_event_state state = __perf_effective_state(event); + u64 delta = now - event->tstamp; + + *enabled = event->total_time_enabled; + if (state >= PERF_EVENT_STATE_INACTIVE) + *enabled += delta; + + *running = event->total_time_running; + if (state >= PERF_EVENT_STATE_ACTIVE) + *running += delta; +} + +static void perf_event_update_time(struct perf_event *event) +{ + u64 now = perf_event_time(event); + + __perf_update_times(event, now, &event->total_time_enabled, + &event->total_time_running); + event->tstamp = now; +} + +static void perf_event_update_sibling_time(struct perf_event *leader) +{ + struct perf_event *sibling; + + for_each_sibling_event(sibling, leader) + perf_event_update_time(sibling); +} + +static void +perf_event_set_state(struct perf_event *event, enum perf_event_state state) +{ + if (event->state == state) + return; + + perf_event_update_time(event); + /* + * If a group leader gets enabled/disabled all its siblings + * are affected too. 
+ */ + if ((event->state < 0) ^ (state < 0)) + perf_event_update_sibling_time(event); + + WRITE_ONCE(event->state, state); +} + +/* + * UP store-release, load-acquire + */ + +#define __store_release(ptr, val) \ +do { \ + barrier(); \ + WRITE_ONCE(*(ptr), (val)); \ +} while (0) + +#define __load_acquire(ptr) \ +({ \ + __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ + barrier(); \ + ___p; \ +}) + +#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ + list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ + if (_cgroup && !_epc->nr_cgroups) \ + continue; \ + else if (_pmu && _epc->pmu != _pmu) \ + continue; \ + else + +static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) +{ + struct perf_event_pmu_context *pmu_ctx; + + for_each_epc(pmu_ctx, ctx, NULL, cgroup) + perf_pmu_disable(pmu_ctx->pmu); +} + +static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) +{ + struct perf_event_pmu_context *pmu_ctx; + + for_each_epc(pmu_ctx, ctx, NULL, cgroup) + perf_pmu_enable(pmu_ctx->pmu); +} + +static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); +static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); + #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) @@ -627,29 +850,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return t->time; } -static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { - struct perf_cgroup_info *info; - u64 now; - - now = perf_clock(); + struct perf_cgroup_info *t; - info = this_cpu_ptr(cgrp->info); + t = 
per_cpu_ptr(event->cgrp->info, event->cpu); + if (!__load_acquire(&t->active)) + return t->time; + now += READ_ONCE(t->timeoffset); + return now; +} - info->time += now - info->timestamp; +static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +{ + if (adv) + info->time += now - info->timestamp; info->timestamp = now; + /* + * see update_context_time() + */ + WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { - struct perf_cgroup *cgrp_out = cpuctx->cgrp; - if (cgrp_out) - __update_cgrp_time(cgrp_out); + struct perf_cgroup *cgrp = cpuctx->cgrp; + struct cgroup_subsys_state *css; + struct perf_cgroup_info *info; + + if (cgrp) { + u64 now = perf_clock(); + + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + info = this_cpu_ptr(cgrp->info); + + __update_cgrp_time(info, now, true); + if (final) + __store_release(&info->active, 0); + } + } } static inline void update_cgrp_time_from_event(struct perf_event *event) { - struct perf_cgroup *cgrp; + struct perf_cgroup_info *info; /* * ensure we access cgroup data only when needed and @@ -658,143 +903,126 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current, event->ctx); + info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ - if (cgrp == event->cgrp) - __update_cgrp_time(event->cgrp); + if (info->active) + __update_cgrp_time(info, perf_clock(), true); } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { - struct perf_cgroup *cgrp; + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_cgroup 
*cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; + struct cgroup_subsys_state *css; /* * ctx->lock held by caller * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ - if (!task || !ctx->nr_cgroups) + if (!cgrp) return; - cgrp = perf_cgroup_from_task(task, ctx); - info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; -} - -static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); + WARN_ON_ONCE(!ctx->nr_cgroups); -#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ -#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + info = this_cpu_ptr(cgrp->info); + __update_cgrp_time(info, ctx->timestamp, false); + __store_release(&info->active, 1); + } +} /* * reschedule events based on the cgroup constraint of task. - * - * mode SWOUT : schedule out everything - * mode SWIN : schedule in based on cgroup for next */ -static void perf_cgroup_switch(struct task_struct *task, int mode) +static void perf_cgroup_switch(struct task_struct *task) { - struct perf_cpu_context *cpuctx; - struct list_head *list; - unsigned long flags; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_cgroup *cgrp; /* - * Disable interrupts and preemption to avoid this CPU's - * cgrp_cpuctx_entry to change under us. + * cpuctx->cgrp is set when the first cgroup event enabled, + * and is cleared when the last cgroup event disabled. 
*/ - local_irq_save(flags); - - list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { - WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } + cgrp = perf_cgroup_from_task(task, NULL); + if (READ_ONCE(cpuctx->cgrp) == cgrp) + return; - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* - * set cgrp before ctxsw in to allow - * event_filter_match() to not have to pass - * task around - * we pass the cpuctx->ctx to perf_cgroup_from_task() - * because cgorup events are only per-cpu - */ - cpuctx->cgrp = perf_cgroup_from_task(task, - &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + /* + * Re-check, could've raced vs perf_remove_from_context(). 
+ */ + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; - local_irq_restore(flags); -} + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; + perf_ctx_disable(&cpuctx->ctx, true); - rcu_read_lock(); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock + * must not be done before ctxswout due + * to update_cgrp_time_from_cpuctx() in + * ctx_sched_out() */ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(next, NULL); - + cpuctx->cgrp = cgrp; /* - * only schedule out current cgroup events if we know - * that we are switching to a different cgroup. Otherwise, - * do no touch the cgroup events. + * set cgrp before ctxsw in to allow + * perf_cgroup_set_timestamp() in ctx_sched_in() + * to not have to pass task around */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); - rcu_read_unlock(); + perf_ctx_enable(&cpuctx->ctx, true); } -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) +static int perf_cgroup_ensure_storage(struct perf_event *event, + struct cgroup_subsys_state *css) { - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; + struct perf_cpu_context *cpuctx; + struct perf_event **storage; + int cpu, heap_size, ret = 0; - rcu_read_lock(); /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock + * Allow storage to have sufficient space for an iterator for each + * possibly nested cgroup plus an iterator for events with no cgroup. 
*/ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(prev, NULL); + for (heap_size = 1; css; css = css->parent) + heap_size++; - /* - * only need to schedule in cgroup events if we are changing - * cgroup during ctxsw. Cgroup events were not scheduled - * out of ctxsw out if that was not the case. - */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWIN); + for_each_possible_cpu(cpu) { + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + if (heap_size <= cpuctx->heap_size) + continue; - rcu_read_unlock(); + storage = kmalloc_node(heap_size * sizeof(struct perf_event *), + GFP_KERNEL, cpu_to_node(cpu)); + if (!storage) { + ret = -ENOMEM; + break; + } + + raw_spin_lock_irq(&cpuctx->ctx.lock); + if (cpuctx->heap_size < heap_size) { + swap(cpuctx->heap, storage); + if (storage == cpuctx->heap_default) + storage = NULL; + cpuctx->heap_size = heap_size; + } + raw_spin_unlock_irq(&cpuctx->ctx.lock); + + kfree(storage); + } + + return ret; } static inline int perf_cgroup_connect(int fd, struct perf_event *event, @@ -803,18 +1031,20 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int ret = 0; - if (!f.file) + if (fd_empty(f)) return -EBADF; - css = css_tryget_online_from_dir(f.file->f_path.dentry, + css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry, &perf_event_cgrp_subsys); - if (IS_ERR(css)) { - ret = PTR_ERR(css); - goto out; - } + if (IS_ERR(css)) + return PTR_ERR(css); + + ret = perf_cgroup_ensure_storage(event, css); + if (ret) + return ret; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -828,86 +1058,51 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, perf_detach_cgroup(event); ret = -EINVAL; } -out: - fdput(f); return ret; } static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) 
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_cgroup_info *t; - t = per_cpu_ptr(event->cgrp->info, event->cpu); - event->shadow_ctx_time = now - t->timestamp; -} + struct perf_cpu_context *cpuctx; + + if (!is_cgroup_event(event)) + return; + + event->pmu_ctx->nr_cgroups++; -static inline void -perf_cgroup_defer_enabled(struct perf_event *event) -{ /* - * when the current task's perf cgroup does not match - * the event's, we need to remember to call the - * perf_mark_enable() function the first time a task with - * a matching perf cgroup is scheduled in. + * Because cgroup events are always per-cpu events, + * @ctx == &cpuctx->ctx. */ - if (is_cgroup_event(event) && !perf_cgroup_match(event)) - event->cgrp_defer_enabled = 1; -} + cpuctx = container_of(ctx, struct perf_cpu_context, ctx); -static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - if (!event->cgrp_defer_enabled) + if (ctx->nr_cgroups++) return; - event->cgrp_defer_enabled = 0; - - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - sub->cgrp_defer_enabled = 0; - } - } + cpuctx->cgrp = perf_cgroup_from_task(current, ctx); } -/* - * Update cpuctx->cgrp so that it is set when first cgroup event is added and - * cleared when last cgroup event is removed. 
- */ static inline void -list_update_cgroup_event(struct perf_event *event, - struct perf_event_context *ctx, bool add) +perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; - struct list_head *cpuctx_entry; if (!is_cgroup_event(event)) return; - if (add && ctx->nr_cgroups++) - return; - else if (!add && --ctx->nr_cgroups) - return; + event->pmu_ctx->nr_cgroups--; + /* * Because cgroup events are always per-cpu events, - * this will always be called from the right CPU. - */ - cpuctx = __get_cpu_context(ctx); - cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; - /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ - if (add) { - list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); - if (perf_cgroup_from_task(current, ctx) == event->cgrp) - cpuctx->cgrp = event->cgrp; - } else { - list_del(cpuctx_entry); - cpuctx->cgrp = NULL; - } + * @ctx == &cpuctx->ctx. + */ + cpuctx = container_of(ctx, struct perf_cpu_context, ctx); + + if (--ctx->nr_cgroups) + return; + + cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ @@ -930,17 +1125,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) { } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) -{ -} - -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, + bool final) { } @@ -952,43 +1138,33 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ -} - -void -perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } -static inline void 
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +static inline u64 perf_cgroup_event_time(struct perf_event *event) { + return 0; } -static inline u64 perf_cgroup_event_time(struct perf_event *event) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } static inline void -perf_cgroup_defer_enabled(struct perf_event *event) +perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { } static inline void -perf_cgroup_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) +perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } -static inline void -list_update_cgroup_event(struct perf_event *event, - struct perf_event_context *ctx, bool add) +static void perf_cgroup_switch(struct task_struct *task) { } - #endif /* @@ -1001,34 +1177,30 @@ list_update_cgroup_event(struct perf_event *event, */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { - struct perf_cpu_context *cpuctx; - int rotations = 0; + struct perf_cpu_pmu_context *cpc; + bool rotations; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); - cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); - rotations = perf_rotate_context(cpuctx); + cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); + rotations = perf_rotate_context(cpc); - raw_spin_lock(&cpuctx->hrtimer_lock); + raw_spin_lock(&cpc->hrtimer_lock); if (rotations) - hrtimer_forward_now(hr, cpuctx->hrtimer_interval); + hrtimer_forward_now(hr, cpc->hrtimer_interval); else - cpuctx->hrtimer_active = 0; - raw_spin_unlock(&cpuctx->hrtimer_lock); + cpc->hrtimer_active = 0; + raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? 
HRTIMER_RESTART : HRTIMER_NORESTART; } -static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { - struct hrtimer *timer = &cpuctx->hrtimer; - struct pmu *pmu = cpuctx->ctx.pmu; + struct hrtimer *timer = &cpc->hrtimer; + struct pmu *pmu = cpc->epc.pmu; u64 interval; - /* no multiplexing needed for SW PMU */ - if (pmu->task_ctx_nr == perf_sw_context) - return; - /* * check default is sane, if not set then force to * default interval (1/tick) @@ -1037,79 +1209,68 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); + cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); - raw_spin_lock_init(&cpuctx->hrtimer_lock); - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - timer->function = perf_mux_hrtimer_handler; + raw_spin_lock_init(&cpc->hrtimer_lock); + hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_HARD); } -static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) +static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { - struct hrtimer *timer = &cpuctx->hrtimer; - struct pmu *pmu = cpuctx->ctx.pmu; + struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; - /* not for SW PMU */ - if (pmu->task_ctx_nr == perf_sw_context) - return 0; - - raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); - if (!cpuctx->hrtimer_active) { - cpuctx->hrtimer_active = 1; - hrtimer_forward_now(timer, cpuctx->hrtimer_interval); - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); + if (!cpc->hrtimer_active) { + cpc->hrtimer_active = 1; + hrtimer_forward_now(timer, cpc->hrtimer_interval); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } - 
raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); + raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } +static int perf_mux_hrtimer_restart_ipi(void *arg) +{ + return perf_mux_hrtimer_restart(arg); +} + +static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu) +{ + return *this_cpu_ptr(pmu->cpu_pmu_context); +} + void perf_pmu_disable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!--(*count)) pmu->pmu_enable(pmu); } -static DEFINE_PER_CPU(struct list_head, active_ctx_list); - -/* - * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and - * perf_event_task_tick() are fully serialized because they're strictly cpu - * affine and perf_event_ctx{activate,deactivate} are called with IRQs - * disabled, while perf_event_task_tick is called from IRQ context. 
- */ -static void perf_event_ctx_activate(struct perf_event_context *ctx) +static void perf_assert_pmu_disabled(struct pmu *pmu) { - struct list_head *head = this_cpu_ptr(&active_ctx_list); - - WARN_ON(!irqs_disabled()); - - WARN_ON(!list_empty(&ctx->active_ctx_list)); - - list_add(&ctx->active_ctx_list, head); + int *count = &this_cpc(pmu)->pmu_disable_count; + WARN_ON_ONCE(*count == 0); } -static void perf_event_ctx_deactivate(struct perf_event_context *ctx) +static inline void perf_pmu_read(struct perf_event *event) { - WARN_ON(!irqs_disabled()); - - WARN_ON(list_empty(&ctx->active_ctx_list)); - - list_del_init(&ctx->active_ctx_list); + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); } static void get_ctx(struct perf_event_context *ctx) { - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); + refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) @@ -1117,18 +1278,21 @@ static void free_ctx(struct rcu_head *head) struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - kfree(ctx->task_ctx_data); kfree(ctx); } static void put_ctx(struct perf_event_context *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) { + if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); + } else { + smp_mb__after_atomic(); /* pairs with wait_var_event() */ + if (ctx->task == TASK_TOMBSTONE) + wake_up_var(&ctx->refcount); } } @@ -1159,7 +1323,7 @@ static void put_ctx(struct perf_event_context *ctx) * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * - * But remember that that these are parent<->child context relations, and + * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. 
* @@ -1185,13 +1349,19 @@ static void put_ctx(struct perf_event_context *ctx) * function. * * Lock order: - * cred_guard_mutex + * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock - * perf_event::mmap_mutex - * mmap_sem + * mmap_lock + * perf_event::mmap_mutex + * perf_buffer::aux_mutex + * perf_addr_filters_head::lock + * + * cpu_hotplug_lock + * pmus_lock + * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) @@ -1200,8 +1370,8 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting) again: rcu_read_lock(); - ctx = ACCESS_ONCE(event->ctx); - if (!atomic_inc_not_zero(&ctx->refcount)) { + ctx = READ_ONCE(event->ctx); + if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } @@ -1249,26 +1419,31 @@ unclone_ctx(struct perf_event_context *ctx) return parent_ctx; } -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, + enum pid_type type) { + u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; - return task_tgid_nr_ns(p, event->ns); + nr = __task_pid_nr_ns(p, type, event->ns); + /* avoid -1 if it is idle thread or runs in another ns */ + if (!nr && !pid_alive(p)) + nr = -1; + return nr; } -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; + return perf_event_pid_type(event, p, PIDTYPE_TGID); +} - return task_pid_nr_ns(p, event->ns); +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + return perf_event_pid_type(event, p, PIDTYPE_PID); } 
/* @@ -1288,11 +1463,11 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. * - * This has to cope with with the fact that until it is locked, + * This has to cope with the fact that until it is locked, * the context could get moved to another task. */ static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) +perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; @@ -1308,7 +1483,7 @@ retry: */ local_irq_save(*flags); rcu_read_lock(); - ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); + ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might @@ -1321,7 +1496,7 @@ retry: * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); - if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { + if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); @@ -1329,7 +1504,7 @@ retry: } if (ctx->task == TASK_TOMBSTONE || - !atomic_inc_not_zero(&ctx->refcount)) { + !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { @@ -1348,12 +1523,12 @@ retry: * reference count so that the context can't get freed. */ static struct perf_event_context * -perf_pin_task_context(struct task_struct *task, int ctxn) +perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; - ctx = perf_lock_task_context(task, ctxn, &flags); + ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -1373,76 +1548,61 @@ static void perf_unpin_context(struct perf_event_context *ctx) /* * Update the record of the current time in a context. 
*/ -static void update_context_time(struct perf_event_context *ctx) +static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); - ctx->time += now - ctx->timestamp; + lockdep_assert_held(&ctx->lock); + + if (adv) + ctx->time += now - ctx->timestamp; ctx->timestamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +} + +static void update_context_time(struct perf_event_context *ctx) +{ + __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + if (unlikely(!ctx)) + return 0; + if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx ? ctx->time : 0; + return ctx->time; } -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) +static u64 perf_event_time_now(struct perf_event *event, u64 now) { struct perf_event_context *ctx = event->ctx; - u64 run_end; - - lockdep_assert_held(&ctx->lock); - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; + if (unlikely(!ctx)) + return 0; - /* - * in cgroup mode, time_enabled represents - * the time the event was enabled AND active - * tasks were in the monitored cgroup. This is - * independent of the activity of the context as - * there may be a mix of cgroup and non-cgroup events. - * - * That is why we treat cgroup events differently - * here. 
- */ if (is_cgroup_event(event)) - run_end = perf_cgroup_event_time(event); - else if (ctx->is_active) - run_end = ctx->time; - else - run_end = event->tstamp_stopped; - - event->total_time_enabled = run_end - event->tstamp_enabled; + return perf_cgroup_event_time_now(event, now); - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = perf_event_time(event); - - event->total_time_running = run_end - event->tstamp_running; - -} - -/* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; + if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) + return ctx->time; - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); + now += READ_ONCE(ctx->timeoffset); + return now; } static enum event_type_t get_event_type(struct perf_event *event) @@ -1466,8 +1626,21 @@ static enum event_type_t get_event_type(struct perf_event *event) return event_type; } -static struct list_head * -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +/* + * Helper function to initialize event group nodes. + */ +static void init_event_group(struct perf_event *event) +{ + RB_CLEAR_NODE(&event->group_node); + event->group_index = 0; +} + +/* + * Extract pinned or flexible groups from the context + * based on event attrs bits. + */ +static struct perf_event_groups * +get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; @@ -1476,7 +1649,234 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) } /* - * Add a event from the lists for its context. + * Helper function to initialize perf_event_group trees.
+ */ +static void perf_event_groups_init(struct perf_event_groups *groups) +{ + groups->tree = RB_ROOT; + groups->index = 0; +} + +static inline struct cgroup *event_cgroup(const struct perf_event *event) +{ + struct cgroup *cgroup = NULL; + +#ifdef CONFIG_CGROUP_PERF + if (event->cgrp) + cgroup = event->cgrp->css.cgroup; +#endif + + return cgroup; +} + +/* + * Compare function for event groups; + * + * Implements complex key that first sorts by CPU and then by virtual index + * which provides ordering when rotating groups for the same CPU. + */ +static __always_inline int +perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, + const struct cgroup *left_cgroup, const u64 left_group_index, + const struct perf_event *right) +{ + if (left_cpu < right->cpu) + return -1; + if (left_cpu > right->cpu) + return 1; + + if (left_pmu) { + if (left_pmu < right->pmu_ctx->pmu) + return -1; + if (left_pmu > right->pmu_ctx->pmu) + return 1; + } + +#ifdef CONFIG_CGROUP_PERF + { + const struct cgroup *right_cgroup = event_cgroup(right); + + if (left_cgroup != right_cgroup) { + if (!left_cgroup) { + /* + * Left has no cgroup but right does, no + * cgroups come first. + */ + return -1; + } + if (!right_cgroup) { + /* + * Right has no cgroup but left does, no + * cgroups come first. + */ + return 1; + } + /* Two dissimilar cgroups, order by id. 
*/ + if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) + return -1; + + return 1; + } + } +#endif + + if (left_group_index < right->group_index) + return -1; + if (left_group_index > right->group_index) + return 1; + + return 0; +} + +#define __node_2_pe(node) \ + rb_entry((node), struct perf_event, group_node) + +static inline bool __group_less(struct rb_node *a, const struct rb_node *b) +{ + struct perf_event *e = __node_2_pe(a); + return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), + e->group_index, __node_2_pe(b)) < 0; +} + +struct __group_key { + int cpu; + struct pmu *pmu; + struct cgroup *cgroup; +}; + +static inline int __group_cmp(const void *key, const struct rb_node *node) +{ + const struct __group_key *a = key; + const struct perf_event *b = __node_2_pe(node); + + /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ + return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); +} + +static inline int +__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) +{ + const struct __group_key *a = key; + const struct perf_event *b = __node_2_pe(node); + + /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ + return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), + b->group_index, b); +} + +/* + * Insert @event into @groups' tree; using + * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} + * as key. This places it last inside the {cpu,pmu,cgroup} subtree. + */ +static void +perf_event_groups_insert(struct perf_event_groups *groups, + struct perf_event *event) +{ + event->group_index = ++groups->index; + + rb_add(&event->group_node, &groups->tree, __group_less); +} + +/* + * Helper function to insert event into the pinned or flexible groups. 
+ */ +static void +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_insert(groups, event); +} + +/* + * Delete a group from a tree. + */ +static void +perf_event_groups_delete(struct perf_event_groups *groups, + struct perf_event *event) +{ + WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || + RB_EMPTY_ROOT(&groups->tree)); + + rb_erase(&event->group_node, &groups->tree); + init_event_group(event); +} + +/* + * Helper function to delete event from its groups. + */ +static void +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event_groups *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_delete(groups, event); +} + +/* + * Get the leftmost event in the {cpu,pmu,cgroup} subtree. + */ +static struct perf_event * +perf_event_groups_first(struct perf_event_groups *groups, int cpu, + struct pmu *pmu, struct cgroup *cgrp) +{ + struct __group_key key = { + .cpu = cpu, + .pmu = pmu, + .cgroup = cgrp, + }; + struct rb_node *node; + + node = rb_find_first(&key, &groups->tree, __group_cmp); + if (node) + return __node_2_pe(node); + + return NULL; +} + +static struct perf_event * +perf_event_groups_next(struct perf_event *event, struct pmu *pmu) +{ + struct __group_key key = { + .cpu = event->cpu, + .pmu = pmu, + .cgroup = event_cgroup(event), + }; + struct rb_node *next; + + next = rb_next_match(&key, &event->group_node, __group_cmp); + if (next) + return __node_2_pe(next); + + return NULL; +} + +#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ + for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ + event; event = perf_event_groups_next(event, pmu)) + +/* + * Iterate through the whole groups tree. 
+ */ +#define perf_event_groups_for_each(event, groups) \ + for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ + typeof(*event), group_node); event; \ + event = rb_entry_safe(rb_next(&event->group_node), \ + typeof(*event), group_node)) + +/* + * Does the event attribute request inherit with PERF_SAMPLE_READ? + */ +static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) +{ + return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); +} + +/* + * Add an event to the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void @@ -1487,28 +1887,32 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; + event->tstamp = perf_event_time(event); + /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that * perf_group_detach can, at all times, locate all siblings.
*/ if (event->group_leader == event) { - struct list_head *list; - event->group_caps = event->event_caps; - - list = ctx_group_list(event, ctx); - list_add_tail(&event->group_entry, list); + add_event_to_groups(event, ctx); } - list_update_cgroup_event(event, ctx, true); - list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; + if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) + ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; + if (has_inherit_and_sample_read(&event->attr)) + local_inc(&ctx->nr_no_switch_fast); + + if (event->state > PERF_EVENT_STATE_OFF) + perf_cgroup_event_enable(event, ctx); ctx->generation++; + event->pmu_ctx->nr_events++; } /* @@ -1520,28 +1924,34 @@ static inline void perf_event__state_init(struct perf_event *event) PERF_EVENT_STATE_INACTIVE; } -static void __perf_event_read_size(struct perf_event *event, int nr_siblings) +static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_ID) + if (read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (read_format & PERF_FORMAT_LOST) entry += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_GROUP) { + if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } - size += entry * nr; - event->read_size = size; + /* + * Since perf_event_validate_size() limits this to 16k and inhibits + * adding more siblings, this will never overflow. 
+ */ + return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) @@ -1558,8 +1968,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); - if (sample_type & PERF_SAMPLE_WEIGHT) - size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; @@ -1570,6 +1980,18 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + size += sizeof(data->phys_addr); + + if (sample_type & PERF_SAMPLE_CGROUP) + size += sizeof(data->cgroup); + + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + size += sizeof(data->data_page_size); + + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + size += sizeof(data->code_page_size); + event->header_size = size; } @@ -1579,8 +2001,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) */ static void perf_event__header_size(struct perf_event *event) { - __perf_event_read_size(event, - event->group_leader->nr_siblings); + event->read_size = + __perf_event_read_size(event->attr.read_format, + event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } @@ -1611,23 +2034,44 @@ static void perf_event__id_header_size(struct perf_event *event) event->id_header_size = size; } +/* + * Check that adding an event to the group does not result in anybody + * overflowing the 64k event limit imposed by the output buffer. + * + * Specifically, check that the read_size for the event does not exceed 16k, + * read_size being the one term that grows with groups size. Since read_size + * depends on per-event read_format, also (re)check the existing events. 
+ * + * This leaves 48k for the constant size fields and things like callchains, + * branch stacks and register sets. + */ static bool perf_event_validate_size(struct perf_event *event) { - /* - * The values computed here will be over-written when we actually - * attach the event. - */ - __perf_event_read_size(event, event->group_leader->nr_siblings + 1); - __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); - perf_event__id_header_size(event); + struct perf_event *sibling, *group_leader = event->group_leader; + + if (__perf_event_read_size(event->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + + if (__perf_event_read_size(group_leader->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; /* - * Sum the lot; should not exceed the 64k limit we have on records. - * Conservative limit to allow for callchains and other variable fields. + * When creating a new group leader, group_leader->ctx is initialized + * after the size has been validated, but we cannot safely use + * for_each_sibling_event() until group_leader->ctx is set. A new group + * leader cannot have any siblings yet, so we can safely skip checking + * the non-existent siblings. */ - if (event->read_size + event->header_size + - event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) - return false; + if (event == group_leader) + return true; + + for_each_sibling_event(sibling, group_leader) { + if (__perf_event_read_size(sibling->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + } return true; } @@ -1639,7 +2083,8 @@ static void perf_group_attach(struct perf_event *event) lockdep_assert_held(&event->ctx->lock); /* - * We can have double attach due to group movement in perf_event_open. + * We can have double attach due to group movement (move_group) in + * perf_event_open(). 
*/ if (event->attach_state & PERF_ATTACH_GROUP) return; @@ -1653,17 +2098,18 @@ static void perf_group_attach(struct perf_event *event) group_leader->group_caps &= event->event_caps; - list_add_tail(&event->group_entry, &group_leader->sibling_list); + list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; + group_leader->group_generation++; perf_event__header_size(group_leader); - list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } /* - * Remove a event from the lists for its context. + * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void @@ -1680,38 +2126,136 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - list_update_cgroup_event(event, ctx, false); - ctx->nr_events--; + if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) + ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; + if (has_inherit_and_sample_read(&event->attr)) + local_dec(&ctx->nr_no_switch_fast); list_del_rcu(&event->event_entry); if (event->group_leader == event) - list_del_init(&event->group_entry); + del_event_from_groups(event, ctx); + + ctx->generation++; + event->pmu_ctx->nr_events--; +} + +static int +perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) +{ + if (!has_aux(aux_event)) + return 0; + + if (!event->pmu->aux_output_match) + return 0; + + return event->pmu->aux_output_match(aux_event); +} - update_group_times(event); +static void put_event(struct perf_event *event); +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state); + +static void perf_put_aux_event(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *iter; /* - * If event was in error state, then keep it - * that way, 
otherwise bogus counts will be - * returned on read(). The only way to get out - * of error state is by explicit re-enabling - * of the event + * If event uses aux_event tear down the link */ - if (event->state > PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_OFF; + if (event->aux_event) { + iter = event->aux_event; + event->aux_event = NULL; + put_event(iter); + return; + } - ctx->generation++; + /* + * If the event is an aux_event, tear down all links to + * it from other events. + */ + for_each_sibling_event(iter, event) { + if (iter->aux_event != event) + continue; + + iter->aux_event = NULL; + put_event(event); + + /* + * If it's ACTIVE, schedule it out and put it into ERROR + * state so that we don't try to schedule it again. Note + * that perf_event_enable() will clear the ERROR status. + */ + __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR); + } +} + +static bool perf_need_aux_event(struct perf_event *event) +{ + return event->attr.aux_output || has_aux_action(event); +} + +static int perf_get_aux_event(struct perf_event *event, + struct perf_event *group_leader) +{ + /* + * Our group leader must be an aux event if we want to be + * an aux_output. This way, the aux event will precede its + * aux_output events in the group, and therefore will always + * schedule first. + */ + if (!group_leader) + return 0; + + /* + * aux_output and aux_sample_size are mutually exclusive. + */ + if (event->attr.aux_output && event->attr.aux_sample_size) + return 0; + + if (event->attr.aux_output && + !perf_aux_output_match(event, group_leader)) + return 0; + + if ((event->attr.aux_pause || event->attr.aux_resume) && + !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return 0; + + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) + return 0; + + if (!atomic_long_inc_not_zero(&group_leader->refcount)) + return 0; + + /* + * Link aux_outputs to their aux event; this is undone in + * perf_group_detach() by perf_put_aux_event(). 
When the + * group is torn down, the aux_output events lose their + * link to the aux_event and can't schedule any more. + */ + event->aux_event = group_leader; + + return 1; +} + +static inline struct list_head *get_event_list(struct perf_event *event) +{ + return event->attr.pinned ? &event->pmu_ctx->pinned_active : + &event->pmu_ctx->flexible_active; } static void perf_group_detach(struct perf_event *event) { + struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; - struct list_head *list = NULL; + struct perf_event_context *ctx = event->ctx; - lockdep_assert_held(&event->ctx->lock); + lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. @@ -1721,155 +2265,209 @@ static void perf_group_detach(struct perf_event *event) event->attach_state &= ~PERF_ATTACH_GROUP; + perf_put_aux_event(event); + /* * If this is a sibling, remove it from its group. */ - if (event->group_leader != event) { - list_del_init(&event->group_entry); + if (leader != event) { + list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; + event->group_leader->group_generation++; goto out; } - if (!list_empty(&event->group_entry)) - list = &event->group_entry; - /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ - list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - if (list) - list_move_tail(&sibling->group_entry, list); + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + + /* + * Events that have PERF_EV_CAP_SIBLING require being part of + * a group and cannot exist on their own, schedule them out + * and move them into the ERROR state. Also see + * _perf_event_enable(), it will not be able to recover this + * ERROR state.
+ */ + if (sibling->event_caps & PERF_EV_CAP_SIBLING) + __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR); + sibling->group_leader = sibling; + list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; + if (sibling->attach_state & PERF_ATTACH_CONTEXT) { + add_event_to_groups(sibling, event->ctx); + + if (sibling->state == PERF_EVENT_STATE_ACTIVE) + list_add_tail(&sibling->active_list, get_event_list(sibling)); + } + WARN_ON_ONCE(sibling->ctx != event->ctx); } out: - perf_event__header_size(event->group_leader); - - list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); -} -static bool is_orphaned_event(struct perf_event *event) -{ - return event->state == PERF_EVENT_STATE_DEAD; + perf_event__header_size(leader); } -static inline int __pmu_filter_match(struct perf_event *event) -{ - struct pmu *pmu = event->pmu; - return pmu->filter_match ? pmu->filter_match(event) : 1; -} +static void sync_child_event(struct perf_event *child_event); -/* - * Check whether we should attempt to schedule an event group based on - * PMU-specific filtering. An event group can consist of HW and SW events, - * potentially with a SW leader, so we must check all the filters, to - * determine whether a group is schedulable: - */ -static inline int pmu_filter_match(struct perf_event *event) +static void perf_child_detach(struct perf_event *event) { - struct perf_event *child; + struct perf_event *parent_event = event->parent; - if (!__pmu_filter_match(event)) - return 0; + if (!(event->attach_state & PERF_ATTACH_CHILD)) + return; - list_for_each_entry(child, &event->sibling_list, group_entry) { - if (!__pmu_filter_match(child)) - return 0; - } + event->attach_state &= ~PERF_ATTACH_CHILD; - return 1; + if (WARN_ON_ONCE(!parent_event)) + return; + + /* + * Can't check this from an IPI, the holder is likely another CPU.
+ * + lockdep_assert_held(&parent_event->child_mutex); + */ + + sync_child_event(event); + list_del_init(&event->child_list); +} + +static bool is_orphaned_event(struct perf_event *event) +{ + return event->state == PERF_EVENT_STATE_DEAD; } static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && - perf_cgroup_match(event) && pmu_filter_match(event); + perf_cgroup_match(event); +} + +static inline bool is_event_in_freq_mode(struct perf_event *event) +{ + return event->attr.freq && event->attr.sample_freq; } static void -event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - u64 delta; + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); + enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; + + // XXX cpc serialization, probably per-cpu IRQ disabled WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); - /* - * An event which could not be activated because of - * filter mismatch still needs to have its timings - * maintained, otherwise bogus information is return - * via read() for time_enabled, time_running: - */ - if (event->state == PERF_EVENT_STATE_INACTIVE && - !event_filter_match(event)) { - delta = tstamp - event->tstamp_stopped; - event->tstamp_running += delta; - event->tstamp_stopped = tstamp; - } - if (event->state != PERF_EVENT_STATE_ACTIVE) return; + /* + * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but + * we can schedule events _OUT_ individually through things like + * __perf_remove_from_context(). 
+ */ + list_del_init(&event->active_list); + perf_pmu_disable(event->pmu); - event->tstamp_stopped = tstamp; event->pmu->del(event, 0); event->oncpu = -1; - event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { event->pending_disable = 0; - event->state = PERF_EVENT_STATE_OFF; + perf_cgroup_event_disable(event, ctx); + state = PERF_EVENT_STATE_OFF; } + perf_event_set_state(event, state); + if (!is_software_event(event)) - cpuctx->active_oncpu--; - if (!--ctx->nr_active) - perf_event_ctx_deactivate(ctx); - if (event->attr.freq && event->attr.sample_freq) + cpc->active_oncpu--; + if (is_event_in_freq_mode(event)) { ctx->nr_freq--; - if (event->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; + epc->nr_freq--; + } + if (event->attr.exclusive || !cpc->active_oncpu) + cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void -group_sched_out(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; - int state = group_event->state; - perf_pmu_disable(ctx->pmu); + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; - event_sched_out(group_event, cpuctx, ctx); + perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); + + event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) - event_sched_out(event, cpuctx, ctx); + for_each_sibling_event(event, group_event) + event_sched_out(event, ctx); +} + +static inline void +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) +{ + if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, final); + } +} - perf_pmu_enable(ctx->pmu); +static inline void +ctx_time_update(struct perf_cpu_context *cpuctx, struct 
perf_event_context *ctx) +{ + __ctx_time_update(cpuctx, ctx, false); +} - if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) - cpuctx->exclusive = 0; +/* + * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). + */ +static inline void +ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) +{ + ctx_time_update(cpuctx, ctx); + if (ctx->is_active & EVENT_TIME) + ctx->is_active |= EVENT_FROZEN; +} + +static inline void +ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) +{ + if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; + update_context_time(ctx); + update_cgrp_time_from_event(event); + } } #define DETACH_GROUP 0x01UL +#define DETACH_CHILD 0x02UL +#define DETACH_EXIT 0x04UL +#define DETACH_REVOKE 0x08UL +#define DETACH_DEAD 0x10UL /* * Cross CPU call to remove a performance event @@ -1883,14 +2481,51 @@ __perf_remove_from_context(struct perf_event *event, struct perf_event_context *ctx, void *info) { + struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; + enum perf_event_state state = PERF_EVENT_STATE_OFF; unsigned long flags = (unsigned long)info; - event_sched_out(event, cpuctx, ctx); + ctx_time_update(cpuctx, ctx); + + /* + * Ensure event_sched_out() switches to OFF, at the very least + * this avoids raising perf_pending_task() at this time. 
+ */ + if (flags & DETACH_EXIT) + state = PERF_EVENT_STATE_EXIT; + if (flags & DETACH_REVOKE) + state = PERF_EVENT_STATE_REVOKED; + if (flags & DETACH_DEAD) + state = PERF_EVENT_STATE_DEAD; + + event_sched_out(event, ctx); + + if (event->state > PERF_EVENT_STATE_OFF) + perf_cgroup_event_disable(event, ctx); + + perf_event_set_state(event, min(event->state, state)); + if (flags & DETACH_GROUP) perf_group_detach(event); + if (flags & DETACH_CHILD) + perf_child_detach(event); list_del_event(event, ctx); + if (!pmu_ctx->nr_events) { + pmu_ctx->rotate_necessary = 0; + + if (ctx->task && ctx->is_active) { + struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu); + + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = NULL; + } + } + if (!ctx->nr_events && ctx->is_active) { + if (ctx == &cpuctx->ctx) + update_cgrp_time_from_cpuctx(cpuctx, true); + ctx->is_active = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); @@ -1915,25 +2550,30 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla lockdep_assert_held(&ctx->mutex); - event_function_call(event, __perf_remove_from_context, (void *)flags); - /* - * The above event_function_call() can NO-OP when it hits - * TASK_TOMBSTONE. In that case we must already have been detached - * from the context (by perf_event_exit_event()) but the grouping - * might still be in-tact. + * Because of perf_event_exit_task(), perf_remove_from_context() ought + * to work in the face of TASK_TOMBSTONE, unlike every other + * event_function_call() user. */ - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); - if ((flags & DETACH_GROUP) && - (event->attach_state & PERF_ATTACH_GROUP)) { - /* - * Since in that case we cannot possibly be scheduled, simply - * detach now. 
- */ - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); + raw_spin_lock_irq(&ctx->lock); + if (!ctx->is_active) { + __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), + ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); + return; } + raw_spin_unlock_irq(&ctx->lock); + + event_function_call(event, __perf_remove_from_context, (void *)flags); +} + +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state) +{ + event_sched_out(event, ctx); + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, state); } /* @@ -1947,27 +2587,36 @@ static void __perf_event_disable(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return; - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); + perf_pmu_disable(event->pmu_ctx->pmu); + ctx_time_update_event(ctx, event); + + /* + * When disabling a group leader, the whole group becomes ineligible + * to run, so schedule out the full group. + */ if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; + group_sched_out(event, ctx); + + /* + * But only mark the leader OFF; the siblings will remain + * INACTIVE. + */ + __event_disable(event, ctx, PERF_EVENT_STATE_OFF); + + perf_pmu_enable(event->pmu_ctx->pmu); } /* - * Disable a event. + * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through + * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). 
* - * When called from perf_pending_event it's OK because event->ctx + * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ @@ -2007,42 +2656,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; - irq_work_queue(&event->pending); -} - -static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx, - u64 tstamp) -{ - /* - * use the correct time source for the time snapshot - * - * We could get by without this by leveraging the - * fact that to get to this function, the caller - * has most likely already called update_context_time() - * and update_cgrp_time_xx() and thus both timestamp - * are identical (or very close). Given that tstamp is, - * already adjusted for cgroup, we could say that: - * tstamp - ctx->timestamp - * is equivalent to - * tstamp - cgrp->timestamp. - * - * Then, in perf_output_read(), the calculation would - * work with no changes because: - * - event is guaranteed scheduled in - * - no scheduled out in between - * - thus the timestamp would be the same - * - * But this is a bit hairy. - * - * So instead, we have an explicit cgroup call to remain - * within the time time source all along. We believe it - * is cleaner and simpler to understand. 
- */ - if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, tstamp); - else - event->shadow_ctx_time = tstamp - ctx->timestamp; + irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) @@ -2050,14 +2664,56 @@ static void perf_set_shadow_time(struct perf_event *event, static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); +static void perf_event_unthrottle(struct perf_event *event, bool start) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->hw.interrupts = 0; + if (start) + event->pmu->start(event, 0); + if (event == event->group_leader) + perf_log_throttle(event, 1); +} + +static void perf_event_throttle(struct perf_event *event) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->hw.interrupts = MAX_INTERRUPTS; + event->pmu->stop(event, 0); + if (event == event->group_leader) + perf_log_throttle(event, 0); +} + +static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event) +{ + struct perf_event *sibling, *leader = event->group_leader; + + perf_event_unthrottle(leader, skip_start_event ? leader != event : true); + for_each_sibling_event(sibling, leader) + perf_event_unthrottle(sibling, skip_start_event ? 
sibling != event : true); +} + +static void perf_event_throttle_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + + perf_event_throttle(leader); + for_each_sibling_event(sibling, leader) + perf_event_throttle(sibling); +} + static int -event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); int ret = 0; + WARN_ON_ONCE(event->ctx != ctx); + lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) @@ -2065,51 +2721,40 @@ event_sched_in(struct perf_event *event, WRITE_ONCE(event->oncpu, smp_processor_id()); /* - * Order event::oncpu write to happen before the ACTIVE state - * is visible. + * Order event::oncpu write to happen before the ACTIVE state is + * visible. This allows perf_event_{stop,read}() to observe the correct + * ->oncpu if it sees ACTIVE. */ smp_wmb(); - WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); + perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. 
*/ - if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { - perf_log_throttle(event, 1); - event->hw.interrupts = 0; - } - - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) + perf_event_unthrottle(event, false); perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx, tstamp); - perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { - event->state = PERF_EVENT_STATE_INACTIVE; + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; - if (!is_software_event(event)) - cpuctx->active_oncpu++; - if (!ctx->nr_active++) - perf_event_ctx_activate(ctx); - if (event->attr.freq && event->attr.sample_freq) + cpc->active_oncpu++; + if (is_event_in_freq_mode(event)) { ctx->nr_freq++; - + epc->nr_freq++; + } if (event->attr.exclusive) - cpuctx->exclusive = 1; + cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); @@ -2118,31 +2763,24 @@ out: } static int -group_sched_in(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = ctx->pmu; - u64 now = ctx->time; - bool simulate = false; + struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); - if (event_sched_in(group_event, cpuctx, ctx)) { - pmu->cancel_txn(pmu); - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; - } + if (event_sched_in(group_event, ctx)) + goto error; /* * Schedule in siblings as one group (if any): */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx)) { + for_each_sibling_event(event, group_event) { + if (event_sched_in(event, ctx)) { 
partial_group = event; goto group_error; } @@ -2155,44 +2793,29 @@ group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: - * The events up to the failed event are scheduled out normally, - * tstamp_stopped will be updated. - * - * The failed events and the remaining siblings need to have - * their timings updated as if they had gone thru event_sched_in() - * and event_sched_out(). This is required to get consistent timings - * across the group. This also takes care of the case where the group - * could never be scheduled by ensuring tstamp_stopped is set to mark - * the time the event was actually stopped, such that time delta - * calculation in update_event_times() is correct. - */ - list_for_each_entry(event, &group_event->sibling_list, group_entry) { + * The events up to the failed event are scheduled out normally. + */ + for_each_sibling_event(event, group_event) { if (event == partial_group) - simulate = true; + break; - if (simulate) { - event->tstamp_running += now - event->tstamp_stopped; - event->tstamp_stopped = now; - } else { - event_sched_out(event, cpuctx, ctx); - } + event_sched_out(event, ctx); } - event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, ctx); +error: pmu->cancel_txn(pmu); - - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; } /* * Work out whether we can put this event group on the CPU now. */ -static int group_can_go_on(struct perf_event *event, - struct perf_cpu_context *cpuctx, - int can_add_hw) +static int group_can_go_on(struct perf_event *event, int can_add_hw) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); + /* * Groups consisting entirely of software events can always go on. */ @@ -2202,13 +2825,13 @@ static int group_can_go_on(struct perf_event *event, * If an exclusive group is already on, no other hardware * events can go on. 
*/ - if (cpuctx->exclusive) + if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already * events on the CPU, it can't go on. */ - if (event->attr.exclusive && cpuctx->active_oncpu) + if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -2220,47 +2843,35 @@ static int group_can_go_on(struct perf_event *event, static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { - u64 tstamp = perf_event_time(event); - list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; } -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type); -static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); - -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, +static void task_ctx_sched_out(struct perf_event_context *ctx, + struct pmu *pmu, enum event_type_t event_type) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, event_type); + ctx_sched_out(ctx, pmu, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, - struct task_struct *task) + struct pmu *pmu) { - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); } /* @@ 
-2280,10 +2891,10 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, - enum event_type_t event_type) + struct pmu *pmu, enum event_type_t event_type) { - enum event_type_t ctx_event_type = event_type & EVENT_ALL; bool cpu_event = !!(event_type & EVENT_CPU); + struct perf_event_pmu_context *epc; /* * If pinned groups are involved, flexible groups also need to be @@ -2292,9 +2903,17 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; - perf_pmu_disable(cpuctx->ctx.pmu); - if (task_ctx) - task_ctx_sched_out(cpuctx, task_ctx, event_type); + event_type &= EVENT_ALL; + + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_disable(epc->pmu); + + if (task_ctx) { + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_disable(epc->pmu); + + task_ctx_sched_out(task_ctx, pmu, event_type); + } /* * Decide which cpu ctx groups to schedule out based on the types @@ -2304,12 +2923,29 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, * - otherwise, do nothing more. 
*/ if (cpu_event) - cpu_ctx_sched_out(cpuctx, ctx_event_type); - else if (ctx_event_type & EVENT_PINNED) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, pmu, event_type); + else if (event_type & EVENT_PINNED) + ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); + + perf_event_sched_in(cpuctx, task_ctx, pmu); - perf_event_sched_in(cpuctx, task_ctx, current); - perf_pmu_enable(cpuctx->ctx.pmu); + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_enable(epc->pmu); + + if (task_ctx) { + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_enable(epc->pmu); + } +} + +void perf_pmu_resched(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + + perf_ctx_lock(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); + perf_ctx_unlock(cpuctx, task_ctx); } /* @@ -2322,7 +2958,7 @@ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; @@ -2351,10 +2987,23 @@ static int __perf_install_in_context(void *info) raw_spin_lock(&task_ctx->lock); } +#ifdef CONFIG_CGROUP_PERF + if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { + /* + * If the current cgroup doesn't match the event's + * cgroup, we should not try to schedule it. 
+ */ + struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); + reprogram = cgroup_is_descendant(cgrp->css.cgroup, + event->cgrp->css.cgroup); + } +#endif + if (reprogram) { - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); add_event_to_ctx(event, ctx); - ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, + get_event_type(event)); } else { add_event_to_ctx(event, ctx); } @@ -2365,6 +3014,9 @@ unlock: return ret; } +static bool exclusive_event_installable(struct perf_event *event, + struct perf_event_context *ctx); + /* * Attach a performance event to a context. * @@ -2379,8 +3031,10 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); + WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); + if (event->cpu != -1) - event->cpu = cpu; + WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx @@ -2388,6 +3042,26 @@ perf_install_in_context(struct perf_event_context *ctx, */ smp_store_release(&event->ctx, ctx); + /* + * perf_event_attr::disabled events will not run and can be initialized + * without IPI. Except when this is the first event for the context, in + * that case we need the magic of the IPI to set ctx->is_active. + * + * The IOC_ENABLE that is sure to follow the creation of a disabled + * event will issue the IPI and reprogram the hardware. + */ + if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && + ctx->nr_events && !is_cgroup_event(event)) { + raw_spin_lock_irq(&ctx->lock); + if (ctx->task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } + add_event_to_ctx(event, ctx); + raw_spin_unlock_irq(&ctx->lock); + return; + } + if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; @@ -2458,27 +3132,6 @@ again: } /* - * Put a event into inactive state and update time fields. 
- * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_event_mark_enabled(struct perf_event *event) -{ - struct perf_event *sub; - u64 tstamp = perf_event_time(event); - - event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; - } -} - -/* * Cross CPU call to enable a performance event */ static void __perf_event_enable(struct perf_event *event, @@ -2493,39 +3146,33 @@ static void __perf_event_enable(struct perf_event *event, event->state <= PERF_EVENT_STATE_ERROR) return; - if (ctx->is_active) - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); + perf_cgroup_event_enable(event, ctx); if (!ctx->is_active) return; - if (!event_filter_match(event)) { - if (is_cgroup_event(event)) - perf_cgroup_defer_enabled(event); - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + if (!event_filter_match(event)) return; - } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return; - } task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); - ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } /* - * Enable a event. + * Enable an event. 
* * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to @@ -2540,6 +3187,7 @@ static void _perf_event_enable(struct perf_event *event) raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { +out: raw_spin_unlock_irq(&ctx->lock); return; } @@ -2551,8 +3199,16 @@ static void _perf_event_enable(struct perf_event *event) * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ - if (event->state == PERF_EVENT_STATE_ERROR) + if (event->state == PERF_EVENT_STATE_ERROR) { + /* + * Detached SIBLING events cannot leave ERROR state. + */ + if (event->event_caps & PERF_EV_CAP_SIBLING && + event->group_leader == event) + goto out; + event->state = PERF_EVENT_STATE_OFF; + } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); @@ -2603,7 +3259,7 @@ static int __perf_event_stop(void *info) * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * - * Since this is happening on a event-local CPU, no trace is lost + * Since this is happening on an event-local CPU, no trace is lost * while restarting. */ if (sd->restart) @@ -2645,7 +3301,7 @@ static int perf_event_stop(struct perf_event *event, int restart) * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in - * event::addr_filters_offs array and bump the event::addr_filters_gen; + * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. 
@@ -2656,7 +3312,7 @@ static int perf_event_stop(struct perf_event *event, int restart) * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly - * registered mapping, called for every new mmap(), with mm::mmap_sem down + * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. @@ -2707,12 +3363,130 @@ int perf_event_refresh(struct perf_event *event, int refresh) } EXPORT_SYMBOL_GPL(perf_event_refresh); -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type) +static int perf_event_modify_breakpoint(struct perf_event *bp, + struct perf_event_attr *attr) +{ + int err; + + _perf_event_disable(bp); + + err = modify_user_hw_breakpoint_check(bp, attr, true); + + if (!bp->attr.disabled) + _perf_event_enable(bp); + + return err; +} + +/* + * Copy event-type-independent attributes that may be modified. + */ +static void perf_event_modify_copy_attr(struct perf_event_attr *to, + const struct perf_event_attr *from) +{ + to->sig_data = from->sig_data; +} + +static int perf_event_modify_attr(struct perf_event *event, + struct perf_event_attr *attr) +{ + int (*func)(struct perf_event *, struct perf_event_attr *); + struct perf_event *child; + int err; + + if (event->attr.type != attr->type) + return -EINVAL; + + switch (event->attr.type) { + case PERF_TYPE_BREAKPOINT: + func = perf_event_modify_breakpoint; + break; + default: + /* Place holder for future additions. */ + return -EOPNOTSUPP; + } + + WARN_ON_ONCE(event->ctx->parent_ctx); + + mutex_lock(&event->child_mutex); + /* + * Event-type-independent attributes must be copied before event-type + * modification, which will validate that final attributes match the + * source attributes after all relevant attributes have been copied. 
+ */ + perf_event_modify_copy_attr(&event->attr, attr); + err = func(event, attr); + if (err) + goto out; + list_for_each_entry(child, &event->child_list, child_list) { + perf_event_modify_copy_attr(&child->attr, attr); + err = func(child, attr); + if (err) + goto out; + } +out: + mutex_unlock(&event->child_mutex); + return err; +} + +static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) +{ + struct perf_event_context *ctx = pmu_ctx->ctx; + struct perf_event *event, *tmp; + struct pmu *pmu = pmu_ctx->pmu; + + if (ctx->task && !(ctx->is_active & EVENT_ALL)) { + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); + + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = NULL; + } + + if (!(event_type & EVENT_ALL)) + return; + + perf_pmu_disable(pmu); + if (event_type & EVENT_PINNED) { + list_for_each_entry_safe(event, tmp, + &pmu_ctx->pinned_active, + active_list) + group_sched_out(event, ctx); + } + + if (event_type & EVENT_FLEXIBLE) { + list_for_each_entry_safe(event, tmp, + &pmu_ctx->flexible_active, + active_list) + group_sched_out(event, ctx); + /* + * Since we cleared EVENT_FLEXIBLE, also clear + * rotate_necessary, is will be reset by + * ctx_flexible_sched_in() when needed. + */ + pmu_ctx->rotate_necessary = 0; + } + perf_pmu_enable(pmu); +} + +/* + * Be very careful with the @pmu argument since this will change ctx state. + * The @pmu argument works for ctx_resched(), because that is symmetric in + * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. + * + * However, if you were to be asymmetrical, you could end up with messed up + * state, eg. ctx->is_active cleared even though most EPCs would still actually + * be active. 
+ */ +static void +ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - struct perf_event *event; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -2726,16 +3500,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, return; } - ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - - if (ctx->task) { - WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) - cpuctx->task_ctx = NULL; - } - /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last @@ -2746,28 +3510,36 @@ static void ctx_sched_out(struct perf_event_context *ctx, * * would only update time for the pinned events. */ - if (is_active & EVENT_TIME) { - /* update (and stop) ctx time */ - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - } - - is_active ^= ctx->is_active; /* changed bits */ + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); - if (!ctx->nr_active || !(is_active & EVENT_ALL)) - return; + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + ctx->is_active &= ~event_type; - perf_pmu_disable(ctx->pmu); - if (is_active & EVENT_PINNED) { - list_for_each_entry(event, &ctx->pinned_groups, group_entry) - group_sched_out(event, cpuctx, ctx); + if (!(ctx->is_active & EVENT_ALL)) { + /* + * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() + * does not observe a hole. perf_ctx_unlock() will clean up. 
+ */ + if (ctx->is_active & EVENT_FROZEN) + ctx->is_active &= EVENT_TIME_FROZEN; + else + ctx->is_active = 0; } - if (is_active & EVENT_FLEXIBLE) { - list_for_each_entry(event, &ctx->flexible_groups, group_entry) - group_sched_out(event, cpuctx, ctx); + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!(ctx->is_active & EVENT_ALL)) + cpuctx->task_ctx = NULL; } - perf_pmu_enable(ctx->pmu); + + is_active ^= ctx->is_active; /* changed bits */ + + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_out(pmu_ctx, is_active); } /* @@ -2823,18 +3595,9 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - switch (event->state) { - case PERF_EVENT_STATE_ACTIVE: - event->pmu->read(event); - /* fall-through */ + perf_pmu_read(event); - case PERF_EVENT_STATE_INACTIVE: - update_event_times(event); - break; - - default: - break; - } + perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event @@ -2880,24 +3643,33 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -static void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) +static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, + struct task_struct *task, bool sched_in) +{ + struct perf_event_pmu_context *pmu_ctx; + struct perf_cpu_pmu_context *cpc; + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + cpc = this_cpc(pmu_ctx->pmu); + + if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) + pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in); + } +} + +static void +perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) { - struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; + struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; - struct perf_cpu_context 
*cpuctx; int do_switch = 1; if (likely(!ctx)) return; - cpuctx = __get_cpu_context(ctx); - if (!cpuctx->task_ctx) - return; - rcu_read_lock(); - next_ctx = next->perf_event_ctxp[ctxn]; + next_ctx = rcu_dereference(next->perf_event_ctxp); if (!next_ctx) goto unlock; @@ -2921,20 +3693,42 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + + perf_ctx_disable(ctx, false); + + /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ + if (local_read(&ctx->nr_no_switch_fast) || + local_read(&next_ctx->nr_no_switch_fast)) { + /* + * Must not swap out ctx when there's pending + * events that rely on the ctx->task relation. + * + * Likewise, when a context contains inherit + + * SAMPLE_READ events they should be switched + * out using the slow path so that they are + * treated as if they were distinct contexts. + */ + raw_spin_unlock(&next_ctx->lock); + rcu_read_unlock(); + goto inside_switch; + } + WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + perf_ctx_sched_task_cb(ctx, task, false); + + perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of - * ctx->task and ctx->task_ctx_data are immaterial - * since those values are always verified under - * ctx->lock which we're now holding. + * ctx->task is immaterial since this value is + * always verified under ctx->lock which we're now + * holding. 
*/ - RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); - RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp, ctx); do_switch = 0; @@ -2948,31 +3742,40 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); + perf_ctx_disable(ctx, false); + +inside_switch: + perf_ctx_sched_task_cb(ctx, task, false); + task_ctx_sched_out(ctx, NULL, EVENT_ALL); + + perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); } } static DEFINE_PER_CPU(struct list_head, sched_cb_list); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); this_cpu_dec(perf_sched_cb_usages); + barrier(); - if (!--cpuctx->sched_cb_usage) - list_del(&cpuctx->sched_cb_entry); + if (!--cpc->sched_cb_usage) + list_del(&cpc->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); - if (!cpuctx->sched_cb_usage++) - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + if (!cpc->sched_cb_usage++) + list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + barrier(); this_cpu_inc(perf_sched_cb_usages); } @@ -2984,38 +3787,45 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. 
*/ -static void perf_pmu_sched_task(struct task_struct *prev, - struct task_struct *next, - bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, + struct task_struct *task, bool sched_in) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; - if (prev == next) + pmu = cpc->epc.pmu; + + /* software PMUs will not have sched_task */ + if (WARN_ON_ONCE(!pmu->sched_task)) return; - list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(pmu); - if (WARN_ON_ONCE(!pmu->sched_task)) - continue; + pmu->sched_task(cpc->task_epc, task, sched_in); - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(pmu); + perf_pmu_enable(pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); +} - pmu->sched_task(cpuctx->task_ctx, sched_in); +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_cpu_pmu_context *cpc; - perf_pmu_enable(pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ + if (prev == next || cpuctx->task_ctx) + return; + + list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) + __perf_pmu_sched_task(cpc, sched_in ? next : prev, sched_in); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); -#define for_each_task_context_nr(ctxn) \ - for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) - /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. 
@@ -3030,111 +3840,248 @@ static void perf_event_switch(struct task_struct *task, void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { - int ctxn; - if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); - for_each_task_context_nr(ctxn) - perf_event_context_sched_out(task, ctxn, next); + perf_event_context_sched_out(task, next); /* * if cgroup events exist on this CPU, then we need * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_out(task, next); + perf_cgroup_switch(next); +} + +static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args) +{ + const struct perf_event *le = *(const struct perf_event **)l; + const struct perf_event *re = *(const struct perf_event **)r; + + return le->group_index < re->group_index; +} + +DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap); + +static const struct min_heap_callbacks perf_min_heap = { + .less = perf_less_group_idx, + .swp = NULL, +}; + +static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event) +{ + struct perf_event **itrs = heap->data; + + if (event) { + itrs[heap->nr] = event; + heap->nr++; + } +} + +static void __link_epc(struct perf_event_pmu_context *pmu_ctx) +{ + struct perf_cpu_pmu_context *cpc; + + if (!pmu_ctx->ctx->task) + return; + + cpc = this_cpc(pmu_ctx->pmu); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = pmu_ctx; +} + +static noinline int visit_groups_merge(struct perf_event_context *ctx, + struct perf_event_groups *groups, int cpu, + struct pmu *pmu, + int (*func)(struct perf_event *, void *), + void *data) +{ +#ifdef CONFIG_CGROUP_PERF + struct cgroup_subsys_state *css = NULL; +#endif + struct perf_cpu_context *cpuctx = NULL; + /* Space for per CPU and/or any CPU 
event iterators. */ + struct perf_event *itrs[2]; + struct perf_event_min_heap event_heap; + struct perf_event **evt; + int ret; + + if (pmu->filter && pmu->filter(pmu, cpu)) + return 0; + + if (!ctx->task) { + cpuctx = this_cpu_ptr(&perf_cpu_context); + event_heap = (struct perf_event_min_heap){ + .data = cpuctx->heap, + .nr = 0, + .size = cpuctx->heap_size, + }; + + lockdep_assert_held(&cpuctx->ctx.lock); + +#ifdef CONFIG_CGROUP_PERF + if (cpuctx->cgrp) + css = &cpuctx->cgrp->css; +#endif + } else { + event_heap = (struct perf_event_min_heap){ + .data = itrs, + .nr = 0, + .size = ARRAY_SIZE(itrs), + }; + /* Events not within a CPU context may be on any CPU. */ + __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); + } + evt = event_heap.data; + + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); + +#ifdef CONFIG_CGROUP_PERF + for (; css; css = css->parent) + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); +#endif + + if (event_heap.nr) { + __link_epc((*evt)->pmu_ctx); + perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); + } + + min_heapify_all_inline(&event_heap, &perf_min_heap, NULL); + + while (event_heap.nr) { + ret = func(*evt, data); + if (ret) + return ret; + + *evt = perf_event_groups_next(*evt, pmu); + if (*evt) + min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL); + else + min_heap_pop_inline(&event_heap, &perf_min_heap, NULL); + } + + return 0; } /* - * Called with IRQs disabled + * Because the userpage is strictly per-event (there is no concept of context, + * so there cannot be a context indirection), every userpage must be updated + * when context time starts :-( + * + * IOW, we must not miss EVENT_TIME edges. 
*/ -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) +static inline bool event_update_userpage(struct perf_event *event) { - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); + if (likely(!refcount_read(&event->mmap_count))) + return false; + + perf_event_update_time(event); + perf_event_update_userpage(event); + + return true; } -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static inline void group_update_userpage(struct perf_event *group_event) { struct perf_event *event; - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - if (!event_filter_match(event)) - continue; + if (!event_update_userpage(group_event)) + return; - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); + for_each_sibling_event(event, group_event) + event_update_userpage(event); +} - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx); +static int merge_sched_in(struct perf_event *event, void *data) +{ + struct perf_event_context *ctx = event->ctx; + int *can_add_hw = data; - /* - * If this pinned group hasn't been scheduled, - * put it in error state. 
- */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_ERROR; + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (group_can_go_on(event, *can_add_hw)) { + if (!group_sched_in(event, ctx)) + list_add_tail(&event->active_list, get_event_list(event)); + } + + if (event->state == PERF_EVENT_STATE_INACTIVE) { + *can_add_hw = 0; + if (event->attr.pinned) { + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + + if (*perf_event_fasync(event)) + event->pending_kill = POLL_ERR; + + perf_event_wakeup(event); + } else { + struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); + + event->pmu_ctx->rotate_necessary = 1; + perf_mux_hrtimer_restart(cpc); + group_update_userpage(event); } } + + return 0; } -static void -ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static void pmu_groups_sched_in(struct perf_event_context *ctx, + struct perf_event_groups *groups, + struct pmu *pmu) { - struct perf_event *event; int can_add_hw = 1; + visit_groups_merge(ctx, groups, smp_processor_id(), pmu, + merge_sched_in, &can_add_hw); +} - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - /* Ignore events in OFF or ERROR state */ - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - /* - * Listen to the 'cpu' scheduling filter constraint - * of events: - */ - if (!event_filter_match(event)) - continue; - - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) +{ + struct perf_event_context *ctx = pmu_ctx->ctx; - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx)) - can_add_hw = 0; - } - } + if (event_type & EVENT_PINNED) + pmu_groups_sched_in(ctx, 
&ctx->pinned_groups, pmu_ctx->pmu); + if (event_type & EVENT_FLEXIBLE) + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); } static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) +ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - u64 now; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; + if (!(is_active & EVENT_TIME)) { + /* start ctx time */ + __update_context_time(ctx, false); + perf_cgroup_set_timestamp(cpuctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { - if (!is_active) + if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); @@ -3142,45 +4089,52 @@ ctx_sched_in(struct perf_event_context *ctx, is_active ^= ctx->is_active; /* changed bits */ - if (is_active & EVENT_TIME) { - /* start ctx time */ - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); - } - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. 
*/ - if (is_active & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx); + if (is_active & EVENT_PINNED) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + } /* Then walk through the lower prio flexible groups */ - if (is_active & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx); + if (is_active & EVENT_FLEXIBLE) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + } } -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) +static void perf_event_context_sched_in(struct task_struct *task) { - struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_context *ctx; - ctx_sched_in(ctx, cpuctx, event_type, task); -} + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp); + if (!ctx) + goto rcu_unlock; -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; + if (cpuctx->task_ctx == ctx) { + perf_ctx_lock(cpuctx, ctx); + perf_ctx_disable(ctx, false); - cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) - return; + perf_ctx_sched_task_cb(ctx, task, true); + + perf_ctx_enable(ctx, false); + perf_ctx_unlock(cpuctx, ctx); + goto rcu_unlock; + } perf_ctx_lock(cpuctx, ctx); - perf_pmu_disable(ctx->pmu); + /* + * We must check ctx->nr_events while holding ctx->lock, such + * that we serialize against perf_install_in_context(). + */ + if (!ctx->nr_events) + goto unlock; + + perf_ctx_disable(ctx, false); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3189,11 +4143,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. 
*/ - if (!list_empty(&ctx->pinned_groups)) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, task); - perf_pmu_enable(ctx->pmu); + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { + perf_ctx_disable(&cpuctx->ctx, false); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); + } + + perf_event_sched_in(cpuctx, ctx, NULL); + + perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); + + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) + perf_ctx_enable(&cpuctx->ctx, false); + + perf_ctx_enable(ctx, false); + +unlock: perf_ctx_unlock(cpuctx, ctx); +rcu_unlock: + rcu_read_unlock(); } /* @@ -3210,26 +4177,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - struct perf_event_context *ctx; - int ctxn; - - /* - * If cgroup events exist on this CPU, then we need to check if we have - * to switch in PMU state; cgroup event are system-wide mode only. - * - * Since cgroup events are CPU events, we must schedule these in before - * we schedule in the task events. 
- */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (likely(!ctx)) - continue; - - perf_event_context_sched_in(ctx, task); - } + perf_event_context_sched_in(task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); @@ -3323,7 +4271,11 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ + if (delta >= 0) + delta += 7; + else + delta -= 7; + delta /= 8; /* low pass filter */ sample_period = hwc->sample_period + delta; @@ -3343,49 +4295,28 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo } } -/* - * combine freq adjustment with unthrottling to avoid two passes over the - * events. At the same time, make sure, having freq events does not change - * the rate of unthrottling as that would introduce bias. 
- */ -static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, - int needs_unthr) +static void perf_adjust_freq_unthr_events(struct list_head *event_list) { struct perf_event *event; struct hw_perf_event *hwc; u64 now, period = TICK_NSEC; s64 delta; - /* - * only need to iterate over all events iff: - * - context have events in frequency mode (needs freq adjust) - * - there are events to unthrottle on this cpu - */ - if (!(ctx->nr_freq || needs_unthr)) - return; - - raw_spin_lock(&ctx->lock); - perf_pmu_disable(ctx->pmu); - - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + list_for_each_entry(event, event_list, active_list) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; + // XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue; - perf_pmu_disable(event->pmu); - hwc = &event->hw; - if (hwc->interrupts == MAX_INTERRUPTS) { - hwc->interrupts = 0; - perf_log_throttle(event, 1); - event->pmu->start(event, 0); - } + if (hwc->interrupts == MAX_INTERRUPTS) + perf_event_unthrottle_group(event, is_event_in_freq_mode(event)); - if (!event->attr.freq || !event->attr.sample_freq) - goto next; + if (!is_event_in_freq_mode(event)) + continue; /* * stop the event and update event->count @@ -3407,80 +4338,191 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); - next: - perf_pmu_enable(event->pmu); + } +} + +/* + * combine freq adjustment with unthrottling to avoid two passes over the + * events. At the same time, make sure, having freq events does not change + * the rate of unthrottling as that would introduce bias. 
+ */ +static void +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) +{ + struct perf_event_pmu_context *pmu_ctx; + + /* + * only need to iterate over all events iff: + * - context have events in frequency mode (needs freq adjust) + * - there are events to unthrottle on this cpu + */ + if (!(ctx->nr_freq || unthrottle)) + return; + + raw_spin_lock(&ctx->lock); + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (!(pmu_ctx->nr_freq || unthrottle)) + continue; + if (!perf_pmu_ctx_is_active(pmu_ctx)) + continue; + if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) + continue; + + perf_pmu_disable(pmu_ctx->pmu); + perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active); + perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active); + perf_pmu_enable(pmu_ctx->pmu); } - perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } /* - * Round-robin a context's events: + * Move @event to the tail of the @ctx's elegible events. */ -static void rotate_ctx(struct perf_event_context *ctx) +static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. 
*/ - if (!ctx->rotate_disable) - list_rotate_left(&ctx->flexible_groups); + if (ctx->rotate_disable) + return; + + perf_event_groups_delete(&ctx->flexible_groups, event); + perf_event_groups_insert(&ctx->flexible_groups, event); } -static int perf_rotate_context(struct perf_cpu_context *cpuctx) +/* pick an event from the flexible_groups to rotate */ +static inline struct perf_event * +ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) { - struct perf_event_context *ctx = NULL; - int rotate = 0; + struct perf_event *event; + struct rb_node *node; + struct rb_root *tree; + struct __group_key key = { + .pmu = pmu_ctx->pmu, + }; + + /* pick the first active flexible event */ + event = list_first_entry_or_null(&pmu_ctx->flexible_active, + struct perf_event, active_list); + if (event) + goto out; + + /* if no active flexible event, pick the first event */ + tree = &pmu_ctx->ctx->flexible_groups.tree; - if (cpuctx->ctx.nr_events) { - if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; + if (!pmu_ctx->ctx->task) { + key.cpu = smp_processor_id(); + + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) + event = __node_2_pe(node); + goto out; } - ctx = cpuctx->task_ctx; - if (ctx && ctx->nr_events) { - if (ctx->nr_events != ctx->nr_active) - rotate = 1; + key.cpu = -1; + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) { + event = __node_2_pe(node); + goto out; } - if (!rotate) - goto done; + key.cpu = smp_processor_id(); + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) + event = __node_2_pe(node); + +out: + /* + * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() + * finds there are unschedulable events, it will set it again. 
+ */ + pmu_ctx->rotate_necessary = 0; + + return event; +} + +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; + struct perf_event *cpu_event = NULL, *task_event = NULL; + int cpu_rotate, task_rotate; + struct pmu *pmu; + + /* + * Since we run this from IRQ context, nobody can install new + * events, thus the event count values are stable. + */ + + cpu_epc = &cpc->epc; + pmu = cpu_epc->pmu; + task_epc = cpc->task_epc; + + cpu_rotate = cpu_epc->rotate_necessary; + task_rotate = task_epc ? task_epc->rotate_necessary : 0; + + if (!(cpu_rotate || task_rotate)) + return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); + perf_pmu_disable(pmu); - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (task_rotate) + task_event = ctx_event_to_rotate(task_epc); + if (cpu_rotate) + cpu_event = ctx_event_to_rotate(cpu_epc); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + /* + * As per the order given at ctx_resched() first 'pop' task flexible + * and then, if needed CPU flexible. 
+ */ + if (task_event || (task_epc && cpu_event)) { + update_context_time(task_epc->ctx); + __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE); + } + + if (cpu_event) { + update_context_time(&cpuctx->ctx); + __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); + rotate_ctx(&cpuctx->ctx, cpu_event); + __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE); + } - perf_event_sched_in(cpuctx, ctx, current); + if (task_event) + rotate_ctx(task_epc->ctx, task_event); - perf_pmu_enable(cpuctx->ctx.pmu); + if (task_event || (task_epc && cpu_event)) + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE); + + perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); -done: - return rotate; + return true; } void perf_event_task_tick(void) { - struct list_head *head = this_cpu_ptr(&active_ctx_list); - struct perf_event_context *ctx, *tmp; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_context *ctx; int throttled; - WARN_ON(!irqs_disabled()); + lockdep_assert_irqs_disabled(); __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) - perf_adjust_freq_unthr_context(ctx, throttled); + perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled); + + rcu_read_lock(); + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_adjust_freq_unthr_context(ctx, !!throttled); + rcu_read_unlock(); } static int event_enable_on_exec(struct perf_event *event, @@ -3493,7 +4535,7 @@ static int event_enable_on_exec(struct perf_event *event, if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; - __perf_event_mark_enabled(event); + perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); return 1; } @@ -3502,9 +4544,9 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. 
*/ -static void perf_event_enable_on_exec(int ctxn) +static void perf_event_enable_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event_context *clone_ctx = NULL; enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; @@ -3512,13 +4554,16 @@ static void perf_event_enable_on_exec(int ctxn) int enabled = 0; local_irq_save(flags); - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx || !ctx->nr_events) + if (WARN_ON_ONCE(current->perf_event_ctxp != ctx)) goto out; - cpuctx = __get_cpu_context(ctx); + if (!ctx->nr_events) + goto out; + + cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); + list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); @@ -3529,9 +4574,7 @@ static void perf_event_enable_on_exec(int ctxn) */ if (enabled) { clone_ctx = unclone_ctx(ctx); - ctx_resched(cpuctx, ctx, event_type); - } else { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_resched(cpuctx, ctx, NULL, event_type); } perf_ctx_unlock(cpuctx, ctx); @@ -3542,19 +4585,75 @@ out: put_ctx(clone_ctx); } +static void perf_remove_from_owner(struct perf_event *event); +static void perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx, + bool revoke); + +/* + * Removes all events from the current task that have been marked + * remove-on-exec, and feeds their values back to parent events. 
+ */ +static void perf_event_remove_on_exec(struct perf_event_context *ctx) +{ + struct perf_event_context *clone_ctx = NULL; + struct perf_event *event, *next; + unsigned long flags; + bool modified = false; + + mutex_lock(&ctx->mutex); + + if (WARN_ON_ONCE(ctx->task != current)) + goto unlock; + + list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { + if (!event->attr.remove_on_exec) + continue; + + if (!is_kernel_event(event)) + perf_remove_from_owner(event); + + modified = true; + + perf_event_exit_event(event, ctx, false); + } + + raw_spin_lock_irqsave(&ctx->lock, flags); + if (modified) + clone_ctx = unclone_ctx(ctx); + raw_spin_unlock_irqrestore(&ctx->lock, flags); + +unlock: + mutex_unlock(&ctx->mutex); + + if (clone_ctx) + put_ctx(clone_ctx); +} + struct perf_read_data { struct perf_event *event; bool group; int ret; }; +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); + static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { + int local_cpu = smp_processor_id(); u16 local_pkg, event_pkg; - if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { - int local_cpu = smp_processor_id(); + if ((unsigned)event_cpu >= nr_cpu_ids) + return event_cpu; + + if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); + + if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) + return local_cpu; + } + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { event_pkg = topology_physical_package_id(event_cpu); local_pkg = topology_physical_package_id(local_cpu); @@ -3573,7 +4672,7 @@ static void __perf_event_read(void *info) struct perf_read_data *data = info; struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu = 
event->pmu; /* @@ -3587,12 +4686,12 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event); + + perf_event_update_time(event); + if (data->group) + perf_event_update_sibling_time(event); - update_event_times(event); if (event->state != PERF_EVENT_STATE_ACTIVE) goto unlock; @@ -3606,16 +4705,8 @@ static void __perf_event_read(void *info) pmu->read(event); - list_for_each_entry(sub, &event->sibling_list, group_entry) { - update_event_times(sub); - if (sub->state == PERF_EVENT_STATE_ACTIVE) { - /* - * Use sibling's PMU rather than @event's since - * sibling could be on different (eg: software) PMU. - */ - sub->pmu->read(sub); - } - } + for_each_sibling_event(sub, event) + perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); @@ -3623,12 +4714,24 @@ unlock: raw_spin_unlock(&ctx->lock); } -static inline u64 perf_event_count(struct perf_event *event) +static inline u64 perf_event_count(struct perf_event *event, bool self) { - if (event->pmu->count) - return event->pmu->count(event); + if (self) + return local64_read(&event->count); - return __perf_event_count(event); + return local64_read(&event->count) + atomic64_read(&event->child_count); +} + +static void calc_timer_values(struct perf_event *event, + u64 *now, + u64 *enabled, + u64 *running) +{ + u64 ctx_time; + + *now = perf_clock(); + ctx_time = perf_event_time_now(event, *now); + __perf_update_times(event, ctx_time, enabled, running); } /* @@ -3639,9 +4742,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; + int event_oncpu; + int event_cpu; int ret = 0; /* @@ -3659,15 
+4765,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } - /* - * It must not have a pmu::count method, those are not - * NMI safe. - */ - if (event->pmu->count) { - ret = -EOPNOTSUPP; - goto out; - } - /* If this is a per-task event, it must be for current */ if ((event->attach_state & PERF_ATTACH_TASK) && event->hw.target != current) { @@ -3675,22 +4772,44 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + /* + * Get the event CPU numbers, and adjust them to local if the event is + * a per-package event that can be read locally + */ + event_oncpu = __perf_event_read_cpu(event, event->oncpu); + event_cpu = __perf_event_read_cpu(event, event->cpu); + /* If this is a per-CPU event, it must be for this CPU */ if (!(event->attach_state & PERF_ATTACH_TASK) && - event->cpu != smp_processor_id()) { + event_cpu != smp_processor_id()) { ret = -EINVAL; goto out; } + /* If this is a pinned event it must be running on this CPU */ + if (event->attr.pinned && event_oncpu != smp_processor_id()) { + ret = -EBUSY; + goto out; + } + /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). 
*/ - if (event->oncpu == smp_processor_id()) + if (event_oncpu == smp_processor_id()) event->pmu->read(event); *value = local64_read(&event->count); + if (enabled || running) { + u64 __enabled, __running, __now; + + calc_timer_values(event, &__now, &__enabled, &__running); + if (enabled) + *enabled = __enabled; + if (running) + *running = __running; + } out: local_irq_restore(flags); @@ -3699,23 +4818,35 @@ out: static int perf_event_read(struct perf_event *event, bool group) { + enum perf_event_state state = READ_ONCE(event->state); int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { - struct perf_read_data data = { - .event = event, - .group = group, - .ret = 0, - }; +again: + if (state == PERF_EVENT_STATE_ACTIVE) { + struct perf_read_data data; + + /* + * Orders the ->state and ->oncpu loads such that if we see + * ACTIVE we must also see the right ->oncpu. + * + * Matches the smp_wmb() from event_sched_in(). 
+ */ + smp_rmb(); event_cpu = READ_ONCE(event->oncpu); if ((unsigned)event_cpu >= nr_cpu_ids) return 0; + data = (struct perf_read_data){ + .event = event, + .group = group, + .ret = 0, + }; + preempt_disable(); event_cpu = __perf_event_read_cpu(event, event_cpu); @@ -3732,24 +4863,27 @@ static int perf_event_read(struct perf_event *event, bool group) (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1); preempt_enable(); ret = data.ret; - } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + + } else if (state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); + state = event->state; + if (state != PERF_EVENT_STATE_INACTIVE) { + raw_spin_unlock_irqrestore(&ctx->lock, flags); + goto again; + } + /* - * may read while context is not active - * (e.g., thread is blocked), in that case - * we cannot update context time + * May read while context is not active (e.g., thread is + * blocked), in that case we cannot update context time */ - if (ctx->is_active) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event); + + perf_event_update_time(event); if (group) - update_group_times(event); - else - update_event_times(event); + perf_event_update_sibling_time(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -3763,15 +4897,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->active_ctx_list); - INIT_LIST_HEAD(&ctx->pinned_groups); - INIT_LIST_HEAD(&ctx->flexible_groups); + INIT_LIST_HEAD(&ctx->pmu_ctx_list); + perf_event_groups_init(&ctx->pinned_groups); + perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); - atomic_set(&ctx->refcount, 1); + refcount_set(&ctx->refcount, 1); +} + +static void +__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu) 
+{ + epc->pmu = pmu; + INIT_LIST_HEAD(&epc->pmu_ctx_entry); + INIT_LIST_HEAD(&epc->pinned_active); + INIT_LIST_HEAD(&epc->flexible_active); + atomic_set(&epc->refcount, 1); } static struct perf_event_context * -alloc_perf_context(struct pmu *pmu, struct task_struct *task) +alloc_perf_context(struct task_struct *task) { struct perf_event_context *ctx; @@ -3780,11 +4924,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task) return NULL; __perf_event_init_context(ctx); - if (task) { - ctx->task = task; - get_task_struct(task); - } - ctx->pmu = pmu; + if (task) + ctx->task = get_task_struct(task); return ctx; } @@ -3813,67 +4954,46 @@ find_lively_task_by_vpid(pid_t vpid) * Returns a matching context with refcount and pincount. */ static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, - struct perf_event *event) +find_get_context(struct task_struct *task, struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; - void *task_ctx_data = NULL; unsigned long flags; - int ctxn, err; - int cpu = event->cpu; + int err; if (!task) { /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); + err = perf_allow_cpu(); + if (err) + return ERR_PTR(err); - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); ctx = &cpuctx->ctx; get_ctx(ctx); + raw_spin_lock_irqsave(&ctx->lock, flags); ++ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); return ctx; } err = -EINVAL; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto errout; - - if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); - if (!task_ctx_data) { - err = -ENOMEM; - goto errout; - } - } - retry: - ctx = perf_lock_task_context(task, ctxn, &flags); + ctx = perf_lock_task_context(task, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); 
++ctx->pin_count; - if (task_ctx_data && !ctx->task_ctx_data) { - ctx->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - } raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); } else { - ctx = alloc_perf_context(pmu, task); + ctx = alloc_perf_context(task); err = -ENOMEM; if (!ctx) goto errout; - if (task_ctx_data) { - ctx->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - } - err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -3882,12 +5002,12 @@ retry: */ if (task->flags & PF_EXITING) err = -ESRCH; - else if (task->perf_event_ctxp[ctxn]) + else if (task->perf_event_ctxp) err = -EAGAIN; else { get_ctx(ctx); ++ctx->pin_count; - rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + rcu_assign_pointer(task->perf_event_ctxp, ctx); } mutex_unlock(&task->perf_event_mutex); @@ -3900,30 +5020,157 @@ retry: } } - kfree(task_ctx_data); return ctx; errout: - kfree(task_ctx_data); return ERR_PTR(err); } +static struct perf_event_pmu_context * +find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, + struct perf_event *event) +{ + struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; + + if (!ctx->task) { + /* + * perf_pmu_migrate_context() / __perf_pmu_install_event() + * relies on the fact that find_get_pmu_context() cannot fail + * for CPU contexts. + */ + struct perf_cpu_pmu_context *cpc; + + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); + epc = &cpc->epc; + raw_spin_lock_irq(&ctx->lock); + if (!epc->ctx) { + /* + * One extra reference for the pmu; see perf_pmu_free(). 
+ */ + atomic_set(&epc->refcount, 2); + epc->embedded = 1; + list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + epc->ctx = ctx; + } else { + WARN_ON_ONCE(epc->ctx != ctx); + atomic_inc(&epc->refcount); + } + raw_spin_unlock_irq(&ctx->lock); + return epc; + } + + new = kzalloc(sizeof(*epc), GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + __perf_init_event_pmu_context(new, pmu); + + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because perf_event_init_task() doesn't actually hold the + * child_ctx->mutex. + */ + + raw_spin_lock_irq(&ctx->lock); + list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (epc->pmu == pmu) { + WARN_ON_ONCE(epc->ctx != ctx); + atomic_inc(&epc->refcount); + goto found_epc; + } + /* Make sure the pmu_ctx_list is sorted by PMU type: */ + if (!pos && epc->pmu->type > pmu->type) + pos = epc; + } + + epc = new; + new = NULL; + + if (!pos) + list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + else + list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev); + + epc->ctx = ctx; + +found_epc: + raw_spin_unlock_irq(&ctx->lock); + kfree(new); + + return epc; +} + +static void get_pmu_ctx(struct perf_event_pmu_context *epc) +{ + WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); +} + +static void free_cpc_rcu(struct rcu_head *head) +{ + struct perf_cpu_pmu_context *cpc = + container_of(head, typeof(*cpc), epc.rcu_head); + + kfree(cpc); +} + +static void free_epc_rcu(struct rcu_head *head) +{ + struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); + + kfree(epc); +} + +static void put_pmu_ctx(struct perf_event_pmu_context *epc) +{ + struct perf_event_context *ctx = epc->ctx; + unsigned long flags; + + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because of the call-site in _free_event()/put_event() + * which isn't always called under ctx->mutex. 
+ */ + if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) + return; + + WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); + + list_del_init(&epc->pmu_ctx_entry); + epc->ctx = NULL; + + WARN_ON_ONCE(!list_empty(&epc->pinned_active)); + WARN_ON_ONCE(!list_empty(&epc->flexible_active)); + + raw_spin_unlock_irqrestore(&ctx->lock, flags); + + if (epc->embedded) { + call_rcu(&epc->rcu_head, free_cpc_rcu); + return; + } + + call_rcu(&epc->rcu_head, free_epc_rcu); +} + static void perf_event_free_filter(struct perf_event *event); -static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { - struct perf_event *event; + struct perf_event *event = container_of(head, typeof(*event), rcu_head); - event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); - kfree(event); + kmem_cache_free(perf_event_cache, event); } static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb); + struct perf_buffer *rb); static void detach_sb_event(struct perf_event *event) { @@ -3946,9 +5193,11 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || - attr->task || - attr->context_switch) + attr->task || attr->ksymbol || + attr->context_switch || attr->text_poke || + attr->bpf_event) return true; + return false; } @@ -3958,15 +5207,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event) detach_sb_event(event); } -static void unaccount_event_cpu(struct perf_event *event, int cpu) -{ - if (event->parent) - return; - - if (is_cgroup_event(event)) - atomic_dec(&per_cpu(perf_cgroup_events, cpu)); -} - #ifdef CONFIG_NO_HZ_FULL static DEFINE_SPINLOCK(nr_freq_lock); #endif @@ -3989,6 +5229,225 @@ static void unaccount_freq_event(void) atomic_dec(&nr_freq_events); } + +static struct perf_ctx_data * +alloc_perf_ctx_data(struct kmem_cache *ctx_cache, 
bool global) +{ + struct perf_ctx_data *cd; + + cd = kzalloc(sizeof(*cd), GFP_KERNEL); + if (!cd) + return NULL; + + cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL); + if (!cd->data) { + kfree(cd); + return NULL; + } + + cd->global = global; + cd->ctx_cache = ctx_cache; + refcount_set(&cd->refcount, 1); + + return cd; +} + +static void free_perf_ctx_data(struct perf_ctx_data *cd) +{ + kmem_cache_free(cd->ctx_cache, cd->data); + kfree(cd); +} + +static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head) +{ + struct perf_ctx_data *cd; + + cd = container_of(rcu_head, struct perf_ctx_data, rcu_head); + free_perf_ctx_data(cd); +} + +static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd) +{ + call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu); +} + +static int +attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, + bool global) +{ + struct perf_ctx_data *cd, *old = NULL; + + cd = alloc_perf_ctx_data(ctx_cache, global); + if (!cd) + return -ENOMEM; + + for (;;) { + if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) { + if (old) + perf_free_ctx_data_rcu(old); + return 0; + } + + if (!old) { + /* + * After seeing a dead @old, we raced with + * removal and lost, try again to install @cd. + */ + continue; + } + + if (refcount_inc_not_zero(&old->refcount)) { + free_perf_ctx_data(cd); /* unused */ + return 0; + } + + /* + * @old is a dead object, refcount==0 is stable, try and + * replace it with @cd. 
+ */ + } + return 0; +} + +static void __detach_global_ctx_data(void); +DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem); +static refcount_t global_ctx_data_ref; + +static int +attach_global_ctx_data(struct kmem_cache *ctx_cache) +{ + struct task_struct *g, *p; + struct perf_ctx_data *cd; + int ret; + + if (refcount_inc_not_zero(&global_ctx_data_ref)) + return 0; + + guard(percpu_write)(&global_ctx_data_rwsem); + if (refcount_inc_not_zero(&global_ctx_data_ref)) + return 0; +again: + /* Allocate everything */ + scoped_guard (rcu) { + for_each_process_thread(g, p) { + cd = rcu_dereference(p->perf_ctx_data); + if (cd && !cd->global) { + cd->global = 1; + if (!refcount_inc_not_zero(&cd->refcount)) + cd = NULL; + } + if (!cd) { + get_task_struct(p); + goto alloc; + } + } + } + + refcount_set(&global_ctx_data_ref, 1); + + return 0; +alloc: + ret = attach_task_ctx_data(p, ctx_cache, true); + put_task_struct(p); + if (ret) { + __detach_global_ctx_data(); + return ret; + } + goto again; +} + +static int +attach_perf_ctx_data(struct perf_event *event) +{ + struct task_struct *task = event->hw.target; + struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache; + int ret; + + if (!ctx_cache) + return -ENOMEM; + + if (task) + return attach_task_ctx_data(task, ctx_cache, false); + + ret = attach_global_ctx_data(ctx_cache); + if (ret) + return ret; + + event->attach_state |= PERF_ATTACH_GLOBAL_DATA; + return 0; +} + +static void +detach_task_ctx_data(struct task_struct *p) +{ + struct perf_ctx_data *cd; + + scoped_guard (rcu) { + cd = rcu_dereference(p->perf_ctx_data); + if (!cd || !refcount_dec_and_test(&cd->refcount)) + return; + } + + /* + * The old ctx_data may be lost because of the race. + * Nothing is required to do for the case. + * See attach_task_ctx_data(). 
+ */ + if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL)) + perf_free_ctx_data_rcu(cd); +} + +static void __detach_global_ctx_data(void) +{ + struct task_struct *g, *p; + struct perf_ctx_data *cd; + +again: + scoped_guard (rcu) { + for_each_process_thread(g, p) { + cd = rcu_dereference(p->perf_ctx_data); + if (!cd || !cd->global) + continue; + cd->global = 0; + get_task_struct(p); + goto detach; + } + } + return; +detach: + detach_task_ctx_data(p); + put_task_struct(p); + goto again; +} + +static void detach_global_ctx_data(void) +{ + if (refcount_dec_not_one(&global_ctx_data_ref)) + return; + + guard(percpu_write)(&global_ctx_data_rwsem); + if (!refcount_dec_and_test(&global_ctx_data_ref)) + return; + + /* remove everything */ + __detach_global_ctx_data(); +} + +static void detach_perf_ctx_data(struct perf_event *event) +{ + struct task_struct *task = event->hw.target; + + event->attach_state &= ~PERF_ATTACH_TASK_DATA; + + if (task) + return detach_task_ctx_data(task); + + if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) { + detach_global_ctx_data(); + event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA; + } +} + static void unaccount_event(struct perf_event *event) { bool dec = false; @@ -3996,14 +5455,18 @@ static void unaccount_event(struct perf_event *event) if (event->parent) return; - if (event->attach_state & PERF_ATTACH_TASK) + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); + if (event->attr.build_id) + atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) atomic_dec(&nr_namespaces_events); + if (event->attr.cgroup) + atomic_dec(&nr_cgroup_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) @@ -4016,14 +5479,18 @@ static void unaccount_event(struct perf_event *event) dec = true; if (has_branch_stack(event)) dec = true; + if 
(event->attr.ksymbol) + atomic_dec(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_dec(&nr_bpf_events); + if (event->attr.text_poke) + atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) schedule_delayed_work(&perf_sched_work, HZ); } - unaccount_event_cpu(event, event->cpu); - unaccount_pmu_sb_event(event); } @@ -4042,7 +5509,7 @@ static void perf_sched_delayed(struct work_struct *work) * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence of cpu-wide events, - * 3) two matching events on the same context. + * 3) two matching events on the same perf_event_context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). @@ -4051,7 +5518,7 @@ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + if (!is_exclusive_pmu(pmu)) return 0; /* @@ -4075,6 +5542,8 @@ static int exclusive_event_init(struct perf_event *event) return -EBUSY; } + event->attach_state |= PERF_ATTACH_EXCLUSIVE; + return 0; } @@ -4082,14 +5551,13 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) - return; - /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); + + event->attach_state &= ~PERF_ATTACH_EXCLUSIVE; } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) @@ -4102,14 +5570,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) return false; } -/* Called under the same ctx::mutex as perf_install_in_context() */ static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu 
*pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + lockdep_assert_held(&ctx->mutex); + + if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { @@ -4120,15 +5589,78 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void perf_addr_filters_splice(struct perf_event *event, - struct list_head *head); +static void perf_free_addr_filters(struct perf_event *event); +/* vs perf_event_alloc() error */ +static void __free_event(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + + if (event->attach_state & PERF_ATTACH_CALLCHAIN) + put_callchain_buffers(); + + kfree(event->addr_filter_ranges); + + if (event->attach_state & PERF_ATTACH_EXCLUSIVE) + exclusive_event_destroy(event); + + if (is_cgroup_event(event)) + perf_detach_cgroup(event); + + if (event->attach_state & PERF_ATTACH_TASK_DATA) + detach_perf_ctx_data(event); + + if (event->destroy) + event->destroy(event); + + /* + * Must be after ->destroy(), due to uprobe_perf_close() using + * hw.target. + */ + if (event->hw.target) + put_task_struct(event->hw.target); + + if (event->pmu_ctx) { + /* + * put_pmu_ctx() needs an event->ctx reference, because of + * epc->ctx. + */ + WARN_ON_ONCE(!pmu); + WARN_ON_ONCE(!event->ctx); + WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); + put_pmu_ctx(event->pmu_ctx); + } + + /* + * perf_event_free_task() relies on put_ctx() being 'last', in + * particular all task references must be cleaned up. 
+ */ + if (event->ctx) + put_ctx(event->ctx); + + if (pmu) { + module_put(pmu->module); + scoped_guard (spinlock, &pmu->events_lock) { + list_del(&event->pmu_list); + wake_up_var(pmu); + } + } + + call_rcu(&event->rcu_head, free_event_rcu); +} + +DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) + +/* vs perf_event_alloc() success */ static void _free_event(struct perf_event *event) { - irq_work_sync(&event->pending); + irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); unaccount_event(event); + security_perf_event_free(event); + if (event->rb) { /* * Can happen when we close an event with re-directed output. @@ -4141,39 +5673,21 @@ static void _free_event(struct perf_event *event) mutex_unlock(&event->mmap_mutex); } - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - perf_event_free_bpf_prog(event); - perf_addr_filters_splice(event, NULL); - kfree(event->addr_filters_offs); - - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); + perf_free_addr_filters(event); - exclusive_event_destroy(event); - module_put(event->pmu->module); - - call_rcu(&event->rcu_head, free_event_rcu); + __free_event(event); } /* * Used to free events which have a known refcount of 1, such as in error paths - * where the event isn't exposed yet and inherited events. + * of inherited events. 
*/ static void free_event(struct perf_event *event) { if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, - "unexpected event refcount: %ld; ptr=%p\n", - atomic_long_read(&event->refcount), event)) { + "unexpected event refcount: %ld; ptr=%p\n", + atomic_long_read(&event->refcount), event)) { /* leak to avoid use-after-free */ return; } @@ -4195,7 +5709,7 @@ static void perf_remove_from_owner(struct perf_event *event) * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - owner = lockless_dereference(event->owner); + owner = READ_ONCE(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -4234,10 +5748,17 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { + struct perf_event *parent; + if (!atomic_long_dec_and_test(&event->refcount)) return; + parent = event->parent; _free_event(event); + + /* Matches the refcount bump in inherit_event() */ + if (parent) + put_event(parent); } /* @@ -4251,8 +5772,8 @@ int perf_event_release_kernel(struct perf_event *event) struct perf_event *child, *tmp; /* - * If we got here through err_file: fput(event_file); we will not have - * attached to a context yet. + * If we got here through err_alloc: free_event(event); we will not + * have attached to a context yet. */ if (!ctx) { WARN_ON_ONCE(event->attach_state & @@ -4265,9 +5786,7 @@ int perf_event_release_kernel(struct perf_event *event) ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, DETACH_GROUP); - raw_spin_lock_irq(&ctx->lock); /* * Mark this event as STATE_DEAD, there is no external reference to it * anymore. @@ -4279,20 +5798,22 @@ int perf_event_release_kernel(struct perf_event *event) * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. 
*/ - event->state = PERF_EVENT_STATE_DEAD; - raw_spin_unlock_irq(&ctx->lock); + if (event->state > PERF_EVENT_STATE_REVOKED) { + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + } else { + event->state = PERF_EVENT_STATE_DEAD; + } perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { - /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). */ - ctx = lockless_dereference(child->ctx); + ctx = READ_ONCE(child->ctx); /* * Since child_mutex nests inside ctx::mutex, we must jump * through hoops. We start by grabbing a reference on the ctx. @@ -4320,25 +5841,30 @@ again: tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { - perf_remove_from_context(child, DETACH_GROUP); - list_del(&child->child_list); - free_event(child); - /* - * This matches the refcount bump in inherit_event(); - * this can't be the last reference. - */ - put_event(event); + perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD); + } else { + child = NULL; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); + + if (child) { + /* Last reference unless ->pending_task work is pending */ + put_event(child); + } put_ctx(ctx); + goto again; } mutex_unlock(&event->child_mutex); no_ctx: - put_event(event); /* Must be the 'last' reference */ + /* + * Last reference unless ->pending_task work is pending on this event + * or any of its children. 
+ */ + put_event(event); return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -4352,7 +5878,7 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; @@ -4363,7 +5889,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) mutex_lock(&event->child_mutex); (void)perf_event_read(event, false); - total += perf_event_count(event); + total += perf_event_count(event, false); *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -4372,7 +5898,7 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); - total += perf_event_count(child); + total += perf_event_count(child, false); *enabled += child->total_time_enabled; *running += child->total_time_running; } @@ -4380,13 +5906,25 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) return total; } + +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) +{ + struct perf_event_context *ctx; + u64 count; + + ctx = perf_event_ctx_lock(event); + count = __perf_event_read_value(event, enabled, running); + perf_event_ctx_unlock(event, ctx); + + return count; +} EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event_context *ctx = leader->ctx; - struct perf_event *sub; + struct perf_event *sub, *parent; unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -4395,6 +5933,35 @@ static int __perf_read_group_add(struct perf_event *leader, if (ret) return ret; + raw_spin_lock_irqsave(&ctx->lock, flags); + /* + * Verify the grouping between the parent and 
child (inherited) + * events is still in tact. + * + * Specifically: + * - leader->ctx->lock pins leader->sibling_list + * - parent->child_mutex pins parent->child_list + * - parent->ctx->mutex pins parent->sibling_list + * + * Because parent->ctx != leader->ctx (and child_list nests inside + * ctx->mutex), group destruction is not atomic between children, also + * see perf_event_release_kernel(). Additionally, parent can grow the + * group. + * + * Therefore it is possible to have parent and child groups in a + * different configuration and summing over such a beast makes no sense + * what so ever. + * + * Reject this. + */ + parent = leader->parent; + if (parent && + (parent->group_generation != leader->group_generation || + parent->nr_siblings != leader->nr_siblings)) { + ret = -ECHILD; + goto unlock; + } + /* * Since we co-schedule groups, {enabled,running} times of siblings * will be identical to those of the leader, so we only publish one @@ -4413,20 +5980,23 @@ static int __perf_read_group_add(struct perf_event *leader, /* * Write {count,id} tuples for every sibling. 
*/ - values[n++] += perf_event_count(leader); + values[n++] += perf_event_count(leader, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); - raw_spin_lock_irqsave(&ctx->lock, flags); - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - values[n++] += perf_event_count(sub); + for_each_sibling_event(sub, leader) { + values[n++] += perf_event_count(sub, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); } +unlock: raw_spin_unlock_irqrestore(&ctx->lock, flags); - return 0; + return ret; } static int perf_read_group(struct perf_event *event, @@ -4445,10 +6015,6 @@ static int perf_read_group(struct perf_event *event, values[0] = 1 + leader->nr_siblings; - /* - * By locking the child_mutex of the leader we effectively - * lock the child list of all siblings.. XXX explain how. 
- */ mutex_lock(&leader->child_mutex); ret = __perf_read_group_add(leader, read_format, values); @@ -4479,16 +6045,18 @@ static int perf_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 enabled, running; - u64 values[4]; + u64 values[5]; int n = 0; - values[n++] = perf_event_read_value(event, &enabled, &running); + values[n++] = __perf_event_read_value(event, &enabled, &running); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; @@ -4519,7 +6087,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count) int ret; /* - * Return end-of-file for a read on a event that is in + * Return end-of-file for a read on an event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). 
*/ @@ -4545,6 +6113,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) struct perf_event_context *ctx; int ret; + ret = security_perf_event_read(event); + if (ret) + return ret; + ctx = perf_event_ctx_lock(event); ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); @@ -4552,17 +6124,27 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) return ret; } -static unsigned int perf_poll(struct file *file, poll_table *wait) +static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct ring_buffer *rb; - unsigned int events = POLLHUP; + struct perf_buffer *rb; + __poll_t events = EPOLLHUP; + + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; poll_wait(file, &event->waitq, wait); + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + if (is_event_hup(event)) return events; + if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR && + event->attr.pinned)) + return EPOLLERR; + /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. @@ -4582,6 +6164,24 @@ static void _perf_event_reset(struct perf_event *event) perf_event_update_userpage(event); } +/* Assume it's not an event with inherit set. 
*/ +u64 perf_event_pause(struct perf_event *event, bool reset) +{ + struct perf_event_context *ctx; + u64 count; + + ctx = perf_event_ctx_lock(event); + WARN_ON_ONCE(event->attr.inherit); + _perf_event_disable(event); + count = local64_read(&event->count); + if (reset) + local64_set(&event->count, 0); + perf_event_ctx_unlock(event, ctx); + + return count; +} +EXPORT_SYMBOL_GPL(perf_event_pause); + /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block @@ -4613,7 +6213,7 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - list_for_each_entry(sibling, &event->sibling_list, group_entry) + for_each_sibling_event(sibling, event) perf_event_for_each_child(sibling, func); } @@ -4634,15 +6234,7 @@ static void __perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { - perf_pmu_disable(ctx->pmu); - /* - * We could be throttled; unthrottle now to avoid the tick - * trying to unthrottle while we already re-started the event. - */ - if (event->hw.interrupts == MAX_INTERRUPTS) { - event->hw.interrupts = 0; - perf_log_throttle(event, 1); - } + perf_pmu_disable(event->pmu); event->pmu->stop(event, PERF_EF_UPDATE); } @@ -4650,57 +6242,83 @@ static void __perf_event_period(struct perf_event *event, if (active) { event->pmu->start(event, PERF_EF_RELOAD); - perf_pmu_enable(ctx->pmu); + /* + * Once the period is force-reset, the event starts immediately. + * But the event/group could be throttled. Unthrottle the + * event/group now to avoid the next tick trying to unthrottle + * while we already re-started the event/group. 
+ */ + if (event->hw.interrupts == MAX_INTERRUPTS) + perf_event_unthrottle_group(event, true); + perf_pmu_enable(event->pmu); } } -static int perf_event_period(struct perf_event *event, u64 __user *arg) +static int perf_event_check_period(struct perf_event *event, u64 value) { - u64 value; + return event->pmu->check_period(event, value); +} +static int _perf_event_period(struct perf_event *event, u64 value) +{ if (!is_sampling_event(event)) return -EINVAL; - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - if (!value) return -EINVAL; - if (event->attr.freq && value > sysctl_perf_event_sample_rate) - return -EINVAL; + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) + return -EINVAL; + } else { + if (perf_event_check_period(event, value)) + return -EINVAL; + if (value & (1ULL << 63)) + return -EINVAL; + } event_function_call(event, __perf_event_period, &value); return 0; } +int perf_event_period(struct perf_event *event, u64 value) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_event_period(event, value); + perf_event_ctx_unlock(event, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(perf_event_period); + static const struct file_operations perf_fops; -static inline int perf_fget_light(int fd, struct fd *p) +static inline bool is_perf_file(struct fd f) { - struct fd f = fdget(fd); - if (!f.file) - return -EBADF; - - if (f.file->f_op != &perf_fops) { - fdput(f); - return -EBADF; - } - *p = f; - return 0; + return !fd_empty(f) && fd_file(f)->f_op == &perf_fops; } static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr); +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog 
*prog, + u64 bpf_cookie); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; @@ -4716,8 +6334,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: - return perf_event_period(event, (u64 __user *)arg); + { + u64 value; + if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) + return -EFAULT; + + return _perf_event_period(event, value); + } case PERF_EVENT_IOC_ID: { u64 id = primary_event_id(event); @@ -4729,30 +6353,39 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_OUTPUT: { - int ret; + CLASS(fd, output)(arg); // arg == -1 => empty + struct perf_event *output_event = NULL; if (arg != -1) { - struct perf_event *output_event; - struct fd output; - ret = perf_fget_light(arg, &output); - if (ret) - return ret; - output_event = output.file->private_data; - ret = perf_event_set_output(event, output_event); - fdput(output); - } else { - ret = perf_event_set_output(event, NULL); + if (!is_perf_file(output)) + return -EBADF; + output_event = fd_file(output)->private_data; } - return ret; + return perf_event_set_output(event, output_event); } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); case PERF_EVENT_IOC_SET_BPF: - return perf_event_set_bpf_prog(event, arg); + { + struct bpf_prog *prog; + int err; + + prog = bpf_prog_get(arg); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + err = __perf_event_set_bpf_prog(event, prog, 0); + if (err) { + bpf_prog_put(prog); + return err; + } + + return 0; + } case PERF_EVENT_IOC_PAUSE_OUTPUT: { - struct ring_buffer *rb; + struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); @@ -4764,6 
+6397,20 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon rcu_read_unlock(); return 0; } + + case PERF_EVENT_IOC_QUERY_BPF: + return perf_event_query_prog_array(event, (void __user *)arg); + + case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: { + struct perf_event_attr new_attr; + int err = perf_copy_attr((struct perf_event_attr __user *)arg, + &new_attr); + + if (err) + return err; + + return perf_event_modify_attr(event, &new_attr); + } default: return -ENOTTY; } @@ -4782,6 +6429,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct perf_event_context *ctx; long ret; + /* Treat ioctl like writes as it is likely a mutating operation. */ + ret = security_perf_event_write(event); + if (ret) + return ret; + ctx = perf_event_ctx_lock(event); ret = _perf_ioctl(event, cmd, arg); perf_event_ctx_unlock(event, ctx); @@ -4796,6 +6448,8 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd, switch (_IOC_NR(cmd)) { case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): case _IOC_NR(PERF_EVENT_IOC_ID): + case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF): + case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES): /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { cmd &= ~IOCSIZE_MASK; @@ -4852,23 +6506,10 @@ static int perf_event_index(struct perf_event *event) return event->pmu->event_idx(event); } -static void calc_timer_values(struct perf_event *event, - u64 *now, - u64 *enabled, - u64 *running) -{ - u64 ctx_time; - - *now = perf_clock(); - ctx_time = event->shadow_ctx_time + *now; - *enabled = ctx_time - event->tstamp_enabled; - *running = ctx_time - event->tstamp_running; -} - static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct ring_buffer *rb; + struct perf_buffer *rb; rcu_read_lock(); rb = rcu_dereference(event->rb); @@ -4900,7 +6541,7 @@ void __weak arch_perf_update_userpage( void 
perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct ring_buffer *rb; + struct perf_buffer *rb; u64 enabled, running, now; rcu_read_lock(); @@ -4921,14 +6562,14 @@ void perf_event_update_userpage(struct perf_event *event) userpg = rb->user_page; /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. + * Disable preemption to guarantee consistent time stamps are stored to + * the user page. */ preempt_disable(); ++userpg->lock; barrier(); userpg->index = perf_event_index(event); - userpg->offset = perf_event_count(event); + userpg->offset = perf_event_count(event, false); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); @@ -4946,48 +6587,16 @@ void perf_event_update_userpage(struct perf_event *event) unlock: rcu_read_unlock(); } - -static int perf_mmap_fault(struct vm_fault *vmf) -{ - struct perf_event *event = vmf->vma->vm_file->private_data; - struct ring_buffer *rb; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) - goto unlock; - - vmf->page = perf_mmap_to_page(rb, vmf->pgoff); - if (!vmf->page) - goto unlock; - - get_page(vmf->page); - vmf->page->mapping = vmf->vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} +EXPORT_SYMBOL_GPL(perf_event_update_userpage); static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb) + struct perf_buffer *rb) { - struct ring_buffer *old_rb = NULL; + struct perf_buffer *old_rb = NULL; unsigned long flags; + WARN_ON_ONCE(event->parent); + if (event->rb) { /* * Should be impossible, we set this when removing @@ -5043,7 +6652,10 @@ static void ring_buffer_attach(struct perf_event *event, static void 
ring_buffer_wakeup(struct perf_event *event) { - struct ring_buffer *rb; + struct perf_buffer *rb; + + if (event->parent) + event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); @@ -5054,14 +6666,17 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_unlock(); } -struct ring_buffer *ring_buffer_get(struct perf_event *event) +struct perf_buffer *ring_buffer_get(struct perf_event *event) { - struct ring_buffer *rb; + struct perf_buffer *rb; + + if (event->parent) + event = event->parent; rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { - if (!atomic_inc_not_zero(&rb->refcount)) + if (!refcount_inc_not_zero(&rb->refcount)) rb = NULL; } rcu_read_unlock(); @@ -5069,9 +6684,9 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -void ring_buffer_put(struct ring_buffer *rb) +void ring_buffer_put(struct perf_buffer *rb) { - if (!atomic_dec_and_test(&rb->refcount)) + if (!refcount_dec_and_test(&rb->refcount)) return; WARN_ON_ONCE(!list_empty(&rb->event_list)); @@ -5079,18 +6694,31 @@ void ring_buffer_put(struct ring_buffer *rb) call_rcu(&rb->rcu_head, rb_free_rcu); } +typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm); + +#define get_mapped(event, func) \ +({ struct pmu *pmu; \ + mapped_f f = NULL; \ + guard(rcu)(); \ + pmu = READ_ONCE(event->pmu); \ + if (pmu) \ + f = pmu->func; \ + f; \ +}) + static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f mapped = get_mapped(event, event_mapped); - atomic_inc(&event->mmap_count); - atomic_inc(&event->rb->mmap_count); + refcount_inc(&event->mmap_count); + refcount_inc(&event->rb->mmap_count); if (vma->vm_pgoff) - atomic_inc(&event->rb->aux_mmap_count); + refcount_inc(&event->rb->aux_mmap_count); - if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + if (mapped) + mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ 
-5106,22 +6734,23 @@ static void perf_pmu_output_stop(struct perf_event *event); static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - - struct ring_buffer *rb = ring_buffer_get(event); + mapped_f unmapped = get_mapped(event, event_unmapped); + struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); + bool detach_rest = false; - if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event); + /* FIXIES vs perf_pmu_unregister() */ + if (unmapped) + unmapped(event, vma->vm_mm); /* - * rb->aux_mmap_count will always drop before rb->mmap_count and - * event->mmap_count, so it is ok to use event->mmap_mutex to - * serialize with perf_mmap here. + * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex + * to avoid complications. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -5131,26 +6760,27 @@ static void perf_mmap_close(struct vm_area_struct *vma) perf_pmu_output_stop(event); /* now it's safe to free the pages */ - atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); - WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); - mutex_unlock(&event->mmap_mutex); + mutex_unlock(&rb->aux_mutex); } - atomic_dec(&rb->mmap_count); + if (refcount_dec_and_test(&rb->mmap_count)) + detach_rest = true; - if 
(!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ - if (atomic_read(&rb->mmap_count)) + if (!detach_rest) goto out_put; /* @@ -5204,105 +6834,157 @@ again: * undo the VM accounting. */ - atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= mmap_locked; + atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, + &mmap_user->locked_vm); + atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: ring_buffer_put(rb); /* could be last */ } +static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) +{ + /* The first page is the user control page, others are read-only. */ + return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS; +} + +static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Forbid splitting perf mappings to prevent refcount leaks due to + * the resulting non-matching offsets and sizes. See open()/close(). 
+ */ + return -EINVAL; +} + static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, - .close = perf_mmap_close, /* non mergable */ - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, + .close = perf_mmap_close, /* non mergeable */ + .pfn_mkwrite = perf_mmap_pfn_mkwrite, + .may_split = perf_mmap_may_split, }; -static int perf_mmap(struct file *file, struct vm_area_struct *vma) +static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) { - struct perf_event *event = file->private_data; - unsigned long user_locked, user_lock_limit; - struct user_struct *user = current_user(); - unsigned long locked, lock_limit; - struct ring_buffer *rb = NULL; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra = 0, extra = 0; - int ret = 0, flags = 0; + unsigned long nr_pages = vma_pages(vma); + int err = 0; + unsigned long pagenum; /* - * Don't allow mmap() of inherited per-task counters. This would - * create a performance issue due to all children writing to the - * same rb. + * We map this as a VM_PFNMAP VMA. + * + * This is not ideal as this is designed broadly for mappings of PFNs + * referencing memory-mapped I/O ranges or non-system RAM i.e. for which + * !pfn_valid(pfn). + * + * We are mapping kernel-allocated memory (memory we manage ourselves) + * which would more ideally be mapped using vm_insert_page() or a + * similar mechanism, that is as a VM_MIXEDMAP mapping. + * + * However this won't work here, because: + * + * 1. It uses vma->vm_page_prot, but this field has not been completely + * setup at the point of the f_op->mmp() hook, so we are unable to + * indicate that this should be mapped CoW in order that the + * mkwrite() hook can be invoked to make the first page R/W and the + * rest R/O as desired. + * + * 2. 
Anything other than a VM_PFNMAP of valid PFNs will result in + * vm_normal_page() returning a struct page * pointer, which means + * vm_ops->page_mkwrite() will be invoked rather than + * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping + * to work around retry logic in the fault handler, however this + * field is no longer allowed to be used within struct page. + * + * 3. Having a struct page * made available in the fault logic also + * means that the page gets put on the rmap and becomes + * inappropriately accessible and subject to map and ref counting. + * + * Ideally we would have a mechanism that could explicitly express our + * desires, but this is not currently the case, so we instead use + * VM_PFNMAP. + * + * We manage the lifetime of these mappings with internal refcounts (see + * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of + * this mapping is maintained correctly. */ - if (event->cpu == -1 && event->attr.inherit) - return -EINVAL; + for (pagenum = 0; pagenum < nr_pages; pagenum++) { + unsigned long va = vma->vm_start + PAGE_SIZE * pagenum; + struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum); - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; - - vma_size = vma->vm_end - vma->vm_start; - - if (vma->vm_pgoff == 0) { - nr_pages = (vma_size / PAGE_SIZE) - 1; - } else { - /* - * AUX area mapping: if rb->aux_nr_pages != 0, it's already - * mapped, all subsequent mappings should have the same size - * and offset. Must be above the normal perf buffer. - */ - u64 aux_offset, aux_size; - - if (!event->rb) - return -EINVAL; + if (page == NULL) { + err = -EINVAL; + break; + } - nr_pages = vma_size / PAGE_SIZE; + /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. 
*/ + err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE, + vm_get_page_prot(vma->vm_flags & ~VM_SHARED)); + if (err) + break; + } - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; +#ifdef CONFIG_MMU + /* Clear any partial mappings on error. */ + if (err) + zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); +#endif - rb = event->rb; - if (!rb) - goto aux_unlock; + return err; +} - aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); - aux_size = ACCESS_ONCE(rb->user_page->aux_size); +static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra) +{ + unsigned long user_locked, user_lock_limit, locked, lock_limit; + struct user_struct *user = current_user(); - if (aux_offset < perf_data_size(rb) + PAGE_SIZE) - goto aux_unlock; + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + /* Increase the limit linearly with more CPUs */ + user_lock_limit *= num_online_cpus(); - if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) - goto aux_unlock; + user_locked = atomic_long_read(&user->locked_vm); - /* already mapped with a different offset */ - if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) - goto aux_unlock; + /* + * sysctl_perf_event_mlock may have changed, so that + * user->locked_vm > user_lock_limit + */ + if (user_locked > user_lock_limit) + user_locked = user_lock_limit; + user_locked += *user_extra; - if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) - goto aux_unlock; + if (user_locked > user_lock_limit) { + /* + * charge locked_vm until it hits user_lock_limit; + * charge the rest from pinned_vm + */ + *extra = user_locked - user_lock_limit; + *user_extra -= *extra; + } - /* already mapped with a different size */ - if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) - goto aux_unlock; + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra; - if (!is_power_of_2(nr_pages)) - goto aux_unlock; + return locked 
<= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK); +} - if (!atomic_inc_not_zero(&rb->mmap_count)) - goto aux_unlock; +static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra) +{ + struct user_struct *user = current_user(); - if (rb_has_aux(rb)) { - atomic_inc(&rb->aux_mmap_count); - ret = 0; - goto unlock; - } + atomic_long_add(user_extra, &user->locked_vm); + atomic64_add(extra, &vma->vm_mm->pinned_vm); +} - atomic_set(&rb->aux_mmap_count, 1); - user_extra = nr_pages; +static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + struct perf_buffer *rb; + int rb_flags = 0; - goto accounting; - } + nr_pages -= 1; /* * If we have rb pages ensure they're a power-of-two number, so we @@ -5311,107 +6993,203 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - WARN_ON_ONCE(event->ctx->parent_ctx); -again: - mutex_lock(&event->mmap_mutex); + if (event->rb) { - if (event->rb->nr_pages != nr_pages) { - ret = -EINVAL; - goto unlock; - } + if (data_page_nr(event->rb) != nr_pages) + return -EINVAL; - if (!atomic_inc_not_zero(&event->rb->mmap_count)) { + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* - * Raced against perf_mmap_close() through - * perf_event_set_output(). Try again, hope for better - * luck. + * Success -- managed to mmap() the same buffer + * multiple times. 
*/ - mutex_unlock(&event->mmap_mutex); - goto again; + perf_mmap_account(vma, user_extra, extra); + refcount_inc(&event->mmap_count); + return 0; } - goto unlock; + /* + * Raced against perf_mmap_close()'s + * refcount_dec_and_mutex_lock() remove the + * event and continue as if !event->rb + */ + ring_buffer_attach(event, NULL); } - user_extra = nr_pages + 1; + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) + return -EPERM; -accounting: - user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; + + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, rb_flags); + + if (!rb) + return -ENOMEM; + + refcount_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; + + ring_buffer_attach(event, rb); + + perf_event_update_time(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); + + perf_mmap_account(vma, user_extra, extra); + refcount_set(&event->mmap_count, 1); + + return 0; +} + +static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, + unsigned long nr_pages) +{ + long extra = 0, user_extra = nr_pages; + u64 aux_offset, aux_size; + struct perf_buffer *rb; + int ret, rb_flags = 0; + + rb = event->rb; + if (!rb) + return -EINVAL; + + guard(mutex)(&rb->aux_mutex); /* - * Increase the limit linearly with more CPUs: + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. 
*/ - user_lock_limit *= num_online_cpus(); + aux_offset = READ_ONCE(rb->user_page->aux_offset); + aux_size = READ_ONCE(rb->user_page->aux_size); - user_locked = atomic_long_read(&user->locked_vm) + user_extra; + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + return -EINVAL; - if (user_locked > user_lock_limit) - extra = user_locked - user_lock_limit; + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + return -EINVAL; - lock_limit = rlimit(RLIMIT_MEMLOCK); - lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->pinned_vm + extra; + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + return -EINVAL; - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && - !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } + if (aux_size != nr_pages * PAGE_SIZE) + return -EINVAL; - WARN_ON(!rb && event->rb); + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + return -EINVAL; - if (vma->vm_flags & VM_WRITE) - flags |= RING_BUFFER_WRITABLE; + if (!is_power_of_2(nr_pages)) + return -EINVAL; - if (!rb) { - rb = rb_alloc(nr_pages, - event->attr.watermark ? 
event->attr.wakeup_watermark : 0, - event->cpu, flags); + if (!refcount_inc_not_zero(&rb->mmap_count)) + return -EINVAL; - if (!rb) { - ret = -ENOMEM; - goto unlock; + if (rb_has_aux(rb)) { + refcount_inc(&rb->aux_mmap_count); + + } else { + if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { + refcount_dec(&rb->mmap_count); + return -EPERM; } - atomic_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); - rb->mmap_locked = extra; + WARN_ON(!rb && event->rb); - ring_buffer_attach(event, rb); + if (vma->vm_flags & VM_WRITE) + rb_flags |= RING_BUFFER_WRITABLE; - perf_event_init_userpage(event); - perf_event_update_userpage(event); - } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, - event->attr.aux_watermark, flags); - if (!ret) - rb->aux_mmap_locked = extra; + event->attr.aux_watermark, rb_flags); + if (ret) { + refcount_dec(&rb->mmap_count); + return ret; + } + + refcount_set(&rb->aux_mmap_count, 1); + rb->aux_mmap_locked = extra; } -unlock: - if (!ret) { - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + perf_mmap_account(vma, user_extra, extra); + refcount_inc(&event->mmap_count); + + return 0; +} + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long vma_size, nr_pages; + mapped_f mapped; + int ret; + + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same rb. 
+ */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + ret = security_perf_event_read(event); + if (ret) + return ret; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = vma_size / PAGE_SIZE; + + if (nr_pages > INT_MAX) + return -ENOMEM; + + if (vma_size != PAGE_SIZE * nr_pages) + return -EINVAL; - atomic_inc(&event->mmap_count); - } else if (rb) { - atomic_dec(&rb->mmap_count); + scoped_guard (mutex, &event->mmap_mutex) { + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. + */ + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + + if (vma->vm_pgoff == 0) + ret = perf_mmap_rb(vma, event, nr_pages); + else + ret = perf_mmap_aux(vma, event, nr_pages); + if (ret) + return ret; } -aux_unlock: - mutex_unlock(&event->mmap_mutex); /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; - if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + mapped = get_mapped(event, event_mapped); + if (mapped) + mapped(event, vma->vm_mm); + + /* + * Try to map it into the page table. On fail, invoke + * perf_mmap_close() to undo the above, as the callsite expects + * full cleanup in this case and therefore does not invoke + * vmops::close(). 
+ */ + ret = map_range(event->rb, vma); + if (ret) + perf_mmap_close(vma); return ret; } @@ -5422,6 +7200,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); @@ -5433,7 +7214,6 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { - .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, @@ -5450,14 +7230,6 @@ static const struct file_operations perf_fops = { * to user-space before waking everybody up. */ -static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) -{ - /* only the parent has fasync state */ - if (event->parent) - event = event->parent; - return &event->fasync; -} - void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); @@ -5468,23 +7240,105 @@ void perf_event_wakeup(struct perf_event *event) } } -static void perf_pending_event(struct irq_work *entry) +static void perf_sigtrap(struct perf_event *event) +{ + /* + * Both perf_pending_task() and perf_pending_irq() can race with the + * task exiting. + */ + if (current->flags & PF_EXITING) + return; + + /* + * We'd expect this to only occur if the irq_work is delayed and either + * ctx->task or current has changed in the meantime. This can be the + * case on architectures that do not implement arch_irq_work_raise(). + */ + if (WARN_ON_ONCE(event->ctx->task != current)) + return; + + send_sig_perf((void __user *)event->pending_addr, + event->orig_type, event->attr.sig_data); +} + +/* + * Deliver the pending work in-event-context or follow the context. + */ +static void __perf_pending_disable(struct perf_event *event) +{ + int cpu = READ_ONCE(event->oncpu); + + /* + * If the event isn't running; we done. 
event_sched_out() will have + * taken care of things. + */ + if (cpu < 0) + return; + + /* + * Yay, we hit home and are in the context of the event. + */ + if (cpu == smp_processor_id()) { + if (event->pending_disable) { + event->pending_disable = 0; + perf_event_disable_local(event); + } + return; + } + + /* + * CPU-A CPU-B + * + * perf_event_disable_inatomic() + * @pending_disable = 1; + * irq_work_queue(); + * + * sched-out + * @pending_disable = 0; + * + * sched-in + * perf_event_disable_inatomic() + * @pending_disable = 1; + * irq_work_queue(); // FAILS + * + * irq_work_run() + * perf_pending_disable() + * + * But the event runs on CPU-B and wants disabling there. + */ + irq_work_queue_on(&event->pending_disable_irq, cpu); +} + +static void perf_pending_disable(struct irq_work *entry) { - struct perf_event *event = container_of(entry, - struct perf_event, pending); + struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); int rctx; - rctx = perf_swevent_get_recursion_context(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ + rctx = perf_swevent_get_recursion_context(); + __perf_pending_disable(event); + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); +} - if (event->pending_disable) { - event->pending_disable = 0; - perf_event_disable_local(event); - } +static void perf_pending_irq(struct irq_work *entry) +{ + struct perf_event *event = container_of(entry, struct perf_event, pending_irq); + int rctx; + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + rctx = perf_swevent_get_recursion_context(); + + /* + * The wakeup isn't bound to the context of the event -- it can happen + * irrespective of where the event is. 
+ */ if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); @@ -5494,26 +7348,88 @@ static void perf_pending_event(struct irq_work *entry) perf_swevent_put_recursion_context(rctx); } -/* - * We assume there is only KVM supporting the callbacks. - * Later on, we might change it to a list if there is - * another virtualization implementation supporting the callbacks. - */ -struct perf_guest_info_callbacks *perf_guest_cbs; +static void perf_pending_task(struct callback_head *head) +{ + struct perf_event *event = container_of(head, struct perf_event, pending_task); + int rctx; + + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + rctx = perf_swevent_get_recursion_context(); + + if (event->pending_work) { + event->pending_work = 0; + perf_sigtrap(event); + local_dec(&event->ctx->nr_no_switch_fast); + } + put_event(event); + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); +} + +#ifdef CONFIG_GUEST_PERF_EVENTS +struct perf_guest_info_callbacks __rcu *perf_guest_cbs; + +DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); +DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); -int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = cbs; - return 0; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) + return; + + rcu_assign_pointer(perf_guest_cbs, cbs); + static_call_update(__perf_guest_state, cbs->state); + static_call_update(__perf_guest_get_ip, cbs->get_ip); + + /* Implementing ->handle_intel_pt_intr is optional. 
*/ + if (cbs->handle_intel_pt_intr) + static_call_update(__perf_guest_handle_intel_pt_intr, + cbs->handle_intel_pt_intr); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); -int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = NULL; - return 0; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) + return; + + rcu_assign_pointer(perf_guest_cbs, NULL); + static_call_update(__perf_guest_state, (void *)&__static_call_return0); + static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, + (void *)&__static_call_return0); + synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +#endif + +static bool should_sample_guest(struct perf_event *event) +{ + return !event->attr.exclude_guest && perf_guest_state(); +} + +unsigned long perf_misc_flags(struct perf_event *event, + struct pt_regs *regs) +{ + if (should_sample_guest(event)) + return perf_arch_guest_misc_flags(regs); + + return perf_arch_misc_flags(regs); +} + +unsigned long perf_instruction_pointer(struct perf_event *event, + struct pt_regs *regs) +{ + if (should_sample_guest(event)) + return perf_guest_get_ip(); + + return perf_arch_instruction_pointer(regs); +} static void perf_output_sample_regs(struct perf_output_handle *handle, @@ -5532,14 +7448,13 @@ perf_output_sample_regs(struct perf_output_handle *handle, } static void perf_sample_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (current->mm) { - perf_get_regs_user(regs_user, regs, regs_user_copy); + } else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; 
regs_user->regs = NULL; @@ -5558,7 +7473,7 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr, * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more - * precisly, but there's no way to get it safely under interrupt, + * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. */ static u64 perf_ustack_task_size(struct pt_regs *regs) @@ -5581,6 +7496,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size, if (!regs) return 0; + /* No mm, no stack, no dump. */ + if (!current->mm) + return 0; + /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE @@ -5649,14 +7568,136 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, } } -static void __perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) +static unsigned long perf_prepare_sample_aux(struct perf_event *event, + struct perf_sample_data *data, + size_t size) { - u64 sample_type = event->attr.sample_type; + struct perf_event *sampler = event->aux_event; + struct perf_buffer *rb; + + data->aux_size = 0; + + if (!sampler) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) + goto out; + + rb = ring_buffer_get(sampler); + if (!rb) + goto out; + + /* + * If this is an NMI hit inside sampling code, don't take + * the sample. See also perf_aux_sample_output(). 
+ */ + if (READ_ONCE(rb->aux_in_sampling)) { + data->aux_size = 0; + } else { + size = min_t(size_t, size, perf_aux_size(rb)); + data->aux_size = ALIGN(size, sizeof(u64)); + } + ring_buffer_put(rb); + +out: + return data->aux_size; +} + +static long perf_pmu_snapshot_aux(struct perf_buffer *rb, + struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size) +{ + unsigned long flags; + long ret; + + /* + * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler + * paths. If we start calling them in NMI context, they may race with + * the IRQ ones, that is, for example, re-starting an event that's just + * been stopped, which is why we're using a separate callback that + * doesn't change the event state. + * + * IRQs need to be disabled to prevent IPIs from racing with us. + */ + local_irq_save(flags); + /* + * Guard against NMI hits inside the critical section; + * see also perf_prepare_sample_aux(). + */ + WRITE_ONCE(rb->aux_in_sampling, 1); + barrier(); + + ret = event->pmu->snapshot_aux(event, handle, size); + + barrier(); + WRITE_ONCE(rb->aux_in_sampling, 0); + local_irq_restore(flags); + + return ret; +} + +static void perf_aux_sample_output(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + struct perf_event *sampler = event->aux_event; + struct perf_buffer *rb; + unsigned long pad; + long size; + + if (WARN_ON_ONCE(!sampler || !data->aux_size)) + return; + + rb = ring_buffer_get(sampler); + if (!rb) + return; + + size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); + + /* + * An error here means that perf_output_copy() failed (returned a + * non-zero surplus that it didn't copy), which in its current + * enlightened implementation is not possible. If that changes, we'd + * like to know. 
+ */ + if (WARN_ON_ONCE(size < 0)) + goto out_put; + + /* + * The pad comes from ALIGN()ing data->aux_size up to u64 in + * perf_prepare_sample_aux(), so should not be more than that. + */ + pad = data->aux_size - size; + if (WARN_ON_ONCE(pad >= sizeof(u64))) + pad = 8; - data->type = sample_type; - header->size += event->id_header_size; + if (pad) { + u64 zero = 0; + perf_output_copy(handle, &zero, pad); + } + +out_put: + ring_buffer_put(rb); +} + +/* + * A set of common sample data types saved even for non-sample records + * when event->attr.sample_id_all is set. + */ +#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \ + PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER) + +static void __perf_event_header__init_id(struct perf_sample_data *data, + struct perf_event *event, + u64 sample_type) +{ + data->type = event->attr.sample_type; + data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL; if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ @@ -5683,8 +7724,10 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { - if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); + if (event->attr.sample_id_all) { + header->size += event->id_header_size; + __perf_event_header__init_id(data, event, event->attr.sample_type); + } } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -5724,10 +7767,10 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 enabled, u64 running) { u64 read_format = event->attr.read_format; - u64 values[4]; + u64 values[5]; int n = 0; - values[n++] = perf_event_count(event); + values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr)); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); @@ -5738,18 +7781,28 @@ static void 
perf_output_read_one(struct perf_output_handle *handle, } if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&event->lost_samples); __output_copy(handle, values, n * sizeof(u64)); } static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; - u64 values[5]; + unsigned long flags; + u64 values[6]; int n = 0; + bool self = has_inherit_and_sample_read(&event->attr); + + /* + * Disabling interrupts avoids all counter scheduling + * (context switches, timer based rotation and IPIs). + */ + local_irq_save(flags); values[n++] = 1 + leader->nr_siblings; @@ -5759,28 +7812,33 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if (leader != event) - leader->pmu->read(leader); + if ((leader != event) && !handle->skip_read) + perf_pmu_read(leader); - values[n++] = perf_event_count(leader); + values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&leader->lost_samples); __output_copy(handle, values, n * sizeof(u64)); - list_for_each_entry(sub, &leader->sibling_list, group_entry) { + for_each_sibling_event(sub, leader) { n = 0; - if ((sub != event) && - (sub->state == PERF_EVENT_STATE_ACTIVE)) - sub->pmu->read(sub); + if ((sub != event) && !handle->skip_read) + perf_pmu_read(sub); - values[n++] = perf_event_count(sub); + values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); + if (read_format & PERF_FORMAT_LOST) + values[n++] = atomic64_read(&sub->lost_samples); __output_copy(handle, 
values, n * sizeof(u64)); } + + local_irq_restore(flags); } #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ @@ -5792,6 +7850,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, * The problem is that its both hard and excessively expensive to iterate the * child list, not to mention that its impossible to IPI the children running * on another CPU, from interrupt/NMI context. + * + * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread + * counts rather than attempting to accumulate some value across all children on + * all cores. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) @@ -5824,6 +7886,9 @@ void perf_output_sample(struct perf_output_handle *handle, { u64 sample_type = data->type; + if (data->sample_flags & PERF_SAMPLE_READ) + handle->skip_read = 1; + perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) @@ -5857,19 +7922,11 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (data->callchain) { - int size = 1; - - if (data->callchain) - size += data->callchain->nr; - - size *= sizeof(u64); + int size = 1; - __output_copy(handle, data->callchain, size); - } else { - u64 nr = 0; - perf_output_put(handle, nr); - } + size += data->callchain->nr; + size *= sizeof(u64); + __output_copy(handle, data->callchain, size); } if (sample_type & PERF_SAMPLE_RAW) { @@ -5913,7 +7970,17 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); + if (branch_sample_hw_index(event)) + perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); + /* + * Add the extension space which is appended + * right after the struct perf_branch_stack. 
+ */ + if (data->br_stack_cntr) { + size = data->br_stack->nr * sizeof(u64); + perf_output_copy(handle, data->br_stack_cntr, size); + } } else { /* * we always store at least the value of nr @@ -5946,8 +8013,8 @@ void perf_output_sample(struct perf_output_handle *handle, data->regs_user.regs); } - if (sample_type & PERF_SAMPLE_WEIGHT) - perf_output_put(handle, data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); @@ -5972,11 +8039,30 @@ void perf_output_sample(struct perf_output_handle *handle, } } + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + perf_output_put(handle, data->phys_addr); + + if (sample_type & PERF_SAMPLE_CGROUP) + perf_output_put(handle, data->cgroup); + + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + perf_output_put(handle, data->data_page_size); + + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + perf_output_put(handle, data->code_page_size); + + if (sample_type & PERF_SAMPLE_AUX) { + perf_output_put(handle, data->aux_size); + + if (data->aux_size) + perf_aux_sample_output(event, handle, data); + } + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; if (wakeup_events) { - struct ring_buffer *rb = handle->rb; + struct perf_buffer *rb = handle->rb; int events = local_inc_return(&rb->events); if (events >= wakeup_events) { @@ -5987,74 +8073,228 @@ void perf_output_sample(struct perf_output_handle *handle, } } -void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event, - struct pt_regs *regs) +static u64 perf_virt_to_phys(u64 virt) { - u64 sample_type = event->attr.sample_type; + u64 phys_addr = 0; - header->type = PERF_RECORD_SAMPLE; - header->size = sizeof(*header) + event->header_size; + if (!virt) + return 0; - header->misc = 0; - header->misc |= perf_misc_flags(regs); + if (virt >= TASK_SIZE) { + /* If it's vmalloc()d memory, 
leave phys_addr as 0 */ + if (virt_addr_valid((void *)(uintptr_t)virt) && + !(virt >= VMALLOC_START && virt < VMALLOC_END)) + phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); + } else { + /* + * Walking the pages tables for user address. + * Interrupts are disabled, so it prevents any tear down + * of the page tables. + * Try IRQ-safe get_user_page_fast_only first. + * If failed, leave phys_addr as 0. + */ + if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { + struct page *p; - __perf_event_header__init_id(header, data, event); + pagefault_disable(); + if (get_user_page_fast_only(virt, 0, &p)) { + phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + put_page(p); + } + pagefault_enable(); + } + } - if (sample_type & PERF_SAMPLE_IP) - data->ip = perf_instruction_pointer(regs); + return phys_addr; +} - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - int size = 1; +/* + * Return the pagetable size of a given virtual address. + */ +static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) +{ + u64 size = 0; - data->callchain = perf_callchain(event, regs); +#ifdef CONFIG_HAVE_GUP_FAST + pgd_t *pgdp, pgd; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; - if (data->callchain) - size += data->callchain->nr; + pgdp = pgd_offset(mm, addr); + pgd = READ_ONCE(*pgdp); + if (pgd_none(pgd)) + return 0; + + if (pgd_leaf(pgd)) + return pgd_leaf_size(pgd); + + p4dp = p4d_offset_lockless(pgdp, pgd, addr); + p4d = READ_ONCE(*p4dp); + if (!p4d_present(p4d)) + return 0; + + if (p4d_leaf(p4d)) + return p4d_leaf_size(p4d); + + pudp = pud_offset_lockless(p4dp, p4d, addr); + pud = READ_ONCE(*pudp); + if (!pud_present(pud)) + return 0; + + if (pud_leaf(pud)) + return pud_leaf_size(pud); + + pmdp = pmd_offset_lockless(pudp, pud, addr); +again: + pmd = pmdp_get_lockless(pmdp); + if (!pmd_present(pmd)) + return 0; + + if (pmd_leaf(pmd)) + return pmd_leaf_size(pmd); + + ptep = pte_offset_map(&pmd, addr); + if (!ptep) + goto again; + + pte = 
ptep_get_lockless(ptep); + if (pte_present(pte)) + size = __pte_leaf_size(pmd, pte); + pte_unmap(ptep); +#endif /* CONFIG_HAVE_GUP_FAST */ + + return size; +} + +static u64 perf_get_page_size(unsigned long addr) +{ + struct mm_struct *mm; + unsigned long flags; + u64 size; + + if (!addr) + return 0; + + /* + * Software page-table walkers must disable IRQs, + * which prevents any tear down of the page tables. + */ + local_irq_save(flags); - header->size += size * sizeof(u64); + mm = current->mm; + if (!mm) { + /* + * For kernel threads and the like, use init_mm so that + * we can find kernel memory. + */ + mm = &init_mm; } - if (sample_type & PERF_SAMPLE_RAW) { - struct perf_raw_record *raw = data->raw; - int size; + size = perf_get_pgtable_size(mm, addr); - if (raw) { - struct perf_raw_frag *frag = &raw->frag; - u32 sum = 0; + local_irq_restore(flags); - do { - sum += frag->size; - if (perf_raw_frag_last(frag)) - break; - frag = frag->next; - } while (1); + return size; +} - size = round_up(sum + sizeof(u32), sizeof(u64)); - raw->size = size - sizeof(u32); - frag->pad = raw->size - sum; - } else { - size = sizeof(u64); - } +static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; + +static struct unwind_work perf_unwind_work; + +struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs) +{ + bool kernel = !event->attr.exclude_callchain_kernel; + bool user = !event->attr.exclude_callchain_user && + !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); + /* Disallow cross-task user callchains. 
*/ + bool crosstask = event->ctx->task && event->ctx->task != current; + bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && + event->attr.defer_callchain; + const u32 max_stack = event->attr.sample_max_stack; + struct perf_callchain_entry *callchain; + u64 defer_cookie; + + if (!current->mm) + user = false; + + if (!kernel && !user) + return &__empty_callchain; + + if (!(user && defer_user && !crosstask && + unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0)) + defer_cookie = 0; + + callchain = get_perf_callchain(regs, kernel, user, max_stack, + crosstask, true, defer_cookie); + + return callchain ?: &__empty_callchain; +} + +static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) +{ + return d * !!(flags & s); +} + +void perf_prepare_sample(struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + u64 sample_type = event->attr.sample_type; + u64 filtered_sample_type; - header->size += size; + /* + * Add the sample flags that are dependent to others. And clear the + * sample flags that have already been done by the PMU driver. 
+ */ + filtered_sample_type = sample_type; + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, + PERF_SAMPLE_IP); + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE | + PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR); + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, + PERF_SAMPLE_REGS_USER); + filtered_sample_type &= ~data->sample_flags; + + if (filtered_sample_type == 0) { + /* Make sure it has the correct data->type for output */ + data->type = event->attr.sample_type; + return; } - if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - int size = sizeof(u64); /* nr */ - if (data->br_stack) { - size += data->br_stack->nr - * sizeof(struct perf_branch_entry); - } - header->size += size; + __perf_event_header__init_id(data, event, filtered_sample_type); + + if (filtered_sample_type & PERF_SAMPLE_IP) { + data->ip = perf_instruction_pointer(event, regs); + data->sample_flags |= PERF_SAMPLE_IP; } - if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) - perf_sample_regs_user(&data->regs_user, regs, - &data->regs_user_copy); + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) + perf_sample_save_callchain(data, event, regs); - if (sample_type & PERF_SAMPLE_REGS_USER) { + if (filtered_sample_type & PERF_SAMPLE_RAW) { + data->raw = NULL; + data->dyn_size += sizeof(u64); + data->sample_flags |= PERF_SAMPLE_RAW; + } + + if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) { + data->br_stack = NULL; + data->dyn_size += sizeof(u64); + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } + + if (filtered_sample_type & PERF_SAMPLE_REGS_USER) + perf_sample_regs_user(&data->regs_user, regs); + + /* + * It cannot use the filtered_sample_type here as REGS_USER can be set + * by STACK_USER (using __cond_set() above) and we don't want to update + * the dyn_size if it's not requested by users. 
+ */ + if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); @@ -6063,20 +8303,22 @@ void perf_prepare_sample(struct perf_event_header *header, size += hweight64(mask) * sizeof(u64); } - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_REGS_USER; } - if (sample_type & PERF_SAMPLE_STACK_USER) { + if (filtered_sample_type & PERF_SAMPLE_STACK_USER) { /* - * Either we need PERF_SAMPLE_STACK_USER bit to be allways + * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size. */ u16 stack_size = event->attr.sample_stack_user; + u16 header_size = perf_sample_data_size(data, event); u16 size = sizeof(u64); - stack_size = perf_sample_ustack_size(stack_size, header->size, + stack_size = perf_sample_ustack_size(stack_size, header_size, data->regs_user.regs); /* @@ -6088,10 +8330,31 @@ void perf_prepare_sample(struct perf_event_header *header, size += sizeof(u64) + stack_size; data->stack_user_size = stack_size; - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_STACK_USER; } - if (sample_type & PERF_SAMPLE_REGS_INTR) { + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + data->weight.full = 0; + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) { + data->data_src.val = PERF_MEM_NA; + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } + + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) { + data->txn = 0; + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } + + if (filtered_sample_type & PERF_SAMPLE_ADDR) { + data->addr = 0; + data->sample_flags |= PERF_SAMPLE_ADDR; + } + + if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); @@ -6103,27 +8366,148 @@ void perf_prepare_sample(struct 
perf_event_header *header, size += hweight64(mask) * sizeof(u64); } - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_REGS_INTR; + } + + if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) { + data->phys_addr = perf_virt_to_phys(data->addr); + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; + } + +#ifdef CONFIG_CGROUP_PERF + if (filtered_sample_type & PERF_SAMPLE_CGROUP) { + struct cgroup *cgrp; + + /* protected by RCU */ + cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; + data->cgroup = cgroup_id(cgrp); + data->sample_flags |= PERF_SAMPLE_CGROUP; + } +#endif + + /* + * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't + * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, + * but the value will not dump to the userspace. + */ + if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) { + data->data_page_size = perf_get_page_size(data->addr); + data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE; + } + + if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) { + data->code_page_size = perf_get_page_size(data->ip); + data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE; + } + + if (filtered_sample_type & PERF_SAMPLE_AUX) { + u64 size; + u16 header_size = perf_sample_data_size(data, event); + + header_size += sizeof(u64); /* size */ + + /* + * Given the 16bit nature of header::size, an AUX sample can + * easily overflow it, what with all the preceding sample bits. + * Make sure this doesn't happen by using up to U16_MAX bytes + * per sample in total (rounded down to 8 byte boundary). 
+ */ + size = min_t(size_t, U16_MAX - header_size, + event->attr.aux_sample_size); + size = rounddown(size, 8); + size = perf_prepare_sample_aux(event, data, size); + + WARN_ON_ONCE(size + header_size > U16_MAX); + data->dyn_size += size + sizeof(u64); /* size above */ + data->sample_flags |= PERF_SAMPLE_AUX; } } -static void __always_inline +void perf_prepare_header(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + header->type = PERF_RECORD_SAMPLE; + header->size = perf_sample_data_size(data, event); + header->misc = perf_misc_flags(event, regs); + + /* + * If you're adding more sample types here, you likely need to do + * something about the overflowing header::size, like repurpose the + * lowest 3 bits of size, which should be always zero at the moment. + * This raises a more important question, do we really need 512k sized + * samples and why, so good argumentation is in order for whatever you + * do here next. + */ + WARN_ON_ONCE(header->size & 7); +} + +static void __perf_event_aux_pause(struct perf_event *event, bool pause) +{ + if (pause) { + if (!event->hw.aux_paused) { + event->hw.aux_paused = 1; + event->pmu->stop(event, PERF_EF_PAUSE); + } + } else { + if (event->hw.aux_paused) { + event->hw.aux_paused = 0; + event->pmu->start(event, PERF_EF_RESUME); + } + } +} + +static void perf_event_aux_pause(struct perf_event *event, bool pause) +{ + struct perf_buffer *rb; + + if (WARN_ON_ONCE(!event)) + return; + + rb = ring_buffer_get(event); + if (!rb) + return; + + scoped_guard (irqsave) { + /* + * Guard against self-recursion here. Another event could trip + * this same from NMI context. 
+ */ + if (READ_ONCE(rb->aux_in_pause_resume)) + break; + + WRITE_ONCE(rb->aux_in_pause_resume, 1); + barrier(); + __perf_event_aux_pause(event, pause); + barrier(); + WRITE_ONCE(rb->aux_in_pause_resume, 0); + } + ring_buffer_put(rb); +} + +static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, + struct perf_sample_data *, struct perf_event *, unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; + int err; /* protect the callchain buffers */ rcu_read_lock(); - perf_prepare_sample(&header, data, event, regs); + perf_prepare_sample(data, event, regs); + perf_prepare_header(&header, data, event, regs); - if (output_begin(&handle, event, header.size)) + err = output_begin(&handle, data, event, header.size); + if (err) goto exit; perf_output_sample(&handle, &header, data, event); @@ -6132,6 +8516,7 @@ __perf_event_output(struct perf_event *event, exit: rcu_read_unlock(); + return err; } void @@ -6150,12 +8535,12 @@ perf_event_output_backward(struct perf_event *event, __perf_event_output(event, data, regs, perf_output_begin_backward); } -void +int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - __perf_event_output(event, data, regs, perf_output_begin); + return __perf_event_output(event, data, regs, perf_output_begin); } /* @@ -6187,7 +8572,7 @@ perf_event_read_event(struct perf_event *event, int ret; perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size); + ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; @@ -6252,7 +8637,6 @@ perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; - int ctxn; rcu_read_lock(); preempt_disable(); @@ -6269,11 +8653,9 @@ perf_iterate_sb(perf_iterate_f output, 
void *data, perf_iterate_sb_cpu(output, data); - for_each_task_context_nr(ctxn) { - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_iterate_ctx(ctx, output, data, false); - } + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_iterate_ctx(ctx, output, data, false); done: preempt_enable(); rcu_read_unlock(); @@ -6295,8 +8677,9 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - if (filter->inode) { - event->addr_filters_offs[count] = 0; + if (filter->path.dentry) { + event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; restart++; } @@ -6314,24 +8697,22 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) void perf_event_exec(void) { struct perf_event_context *ctx; - int ctxn; - rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx) - continue; + ctx = perf_pin_task_context(current); + if (!ctx) + return; - perf_event_enable_on_exec(ctxn); + perf_event_enable_on_exec(ctx); + perf_event_remove_on_exec(ctx); + scoped_guard(rcu) + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, - true); - } - rcu_read_unlock(); + perf_unpin_context(ctx); + put_ctx(ctx); } struct remote_output { - struct ring_buffer *rb; + struct perf_buffer *rb; int err; }; @@ -6339,7 +8720,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) { struct perf_event *parent = event->parent; struct remote_output *ro = data; - struct ring_buffer *rb = ro->rb; + struct perf_buffer *rb = ro->rb; struct stop_event_data sd = { .event = event, }; @@ -6367,8 +8748,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; - struct pmu *pmu 
= event->pmu; - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct remote_output ro = { .rb = event->rb, }; @@ -6455,16 +8835,23 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; task_event->event_id.pid = perf_event_pid(event, task); - task_event->event_id.ppid = perf_event_pid(event, current); - task_event->event_id.tid = perf_event_tid(event, task); - task_event->event_id.ptid = perf_event_tid(event, current); + + if (task_event->event_id.header.type == PERF_RECORD_EXIT) { + task_event->event_id.ppid = perf_event_pid(event, + task->real_parent); + task_event->event_id.ptid = perf_event_pid(event, + task->real_parent); + } else { /* PERF_RECORD_FORK */ + task_event->event_id.ppid = perf_event_pid(event, current); + task_event->event_id.ptid = perf_event_tid(event, current); + } task_event->event_id.time = perf_event_clock(event); @@ -6510,10 +8897,58 @@ static void perf_event_task(struct task_struct *task, task_ctx); } +/* + * Allocate data for a new task when profiling system-wide + * events which require PMU specific data + */ +static void +perf_event_alloc_task_data(struct task_struct *child, + struct task_struct *parent) +{ + struct kmem_cache *ctx_cache = NULL; + struct perf_ctx_data *cd; + + if (!refcount_read(&global_ctx_data_ref)) + return; + + scoped_guard (rcu) { + cd = rcu_dereference(parent->perf_ctx_data); + if (cd) + ctx_cache = cd->ctx_cache; + } + + if (!ctx_cache) + return; + + guard(percpu_read)(&global_ctx_data_rwsem); + scoped_guard (rcu) { + cd = rcu_dereference(child->perf_ctx_data); + if (!cd) { + /* + * A system-wide event may be unaccount, + * when attaching the perf_ctx_data. 
+ */ + if (!refcount_read(&global_ctx_data_ref)) + return; + goto attach; + } + + if (!cd->global) { + cd->global = 1; + refcount_inc(&cd->refcount); + } + } + + return; +attach: + attach_task_ctx_data(child, ctx_cache, true); +} + void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); perf_event_namespaces(task); + perf_event_alloc_task_data(task, current); } /* @@ -6551,7 +8986,7 @@ static void perf_event_comm_output(struct perf_event *event, return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) @@ -6577,7 +9012,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) unsigned int size; memset(comm, 0, sizeof(comm)); - strlcpy(comm, comm_event->task->comm, sizeof(comm)); + strscpy(comm, comm_event->task->comm); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -6643,6 +9078,7 @@ static void perf_event_namespaces_output(struct perf_event *event, struct perf_namespaces_event *namespaces_event = data; struct perf_output_handle handle; struct perf_sample_data sample; + u16 header_size = namespaces_event->event_id.header.size; int ret; if (!perf_event_namespaces_match(event)) @@ -6650,10 +9086,10 @@ static void perf_event_namespaces_output(struct perf_event *event, perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) - return; + goto out; namespaces_event->event_id.pid = perf_event_pid(event, namespaces_event->task); @@ -6665,6 +9101,8 @@ static void perf_event_namespaces_output(struct perf_event *event, perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); +out: + namespaces_event->event_id.header.size = header_size; } static void 
perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, @@ -6673,13 +9111,14 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, { struct path ns_path; struct inode *ns_inode; - void *error; + int error; error = ns_get_path(&ns_path, task, ns_ops); if (!error) { ns_inode = ns_path.dentry->d_inode; ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); ns_link_info->ino = ns_inode->i_ino; + path_put(&ns_path); } } @@ -6742,6 +9181,105 @@ void perf_event_namespaces(struct task_struct *task) } /* + * cgroup tracking + */ +#ifdef CONFIG_CGROUP_PERF + +struct perf_cgroup_event { + char *path; + int path_size; + struct { + struct perf_event_header header; + u64 id; + char path[]; + } event_id; +}; + +static int perf_event_cgroup_match(struct perf_event *event) +{ + return event->attr.cgroup; +} + +static void perf_event_cgroup_output(struct perf_event *event, void *data) +{ + struct perf_cgroup_event *cgroup_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u16 header_size = cgroup_event->event_id.header.size; + int ret; + + if (!perf_event_cgroup_match(event)) + return; + + perf_event_header__init_id(&cgroup_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, &sample, event, + cgroup_event->event_id.header.size); + if (ret) + goto out; + + perf_output_put(&handle, cgroup_event->event_id); + __output_copy(&handle, cgroup_event->path, cgroup_event->path_size); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + cgroup_event->event_id.header.size = header_size; +} + +static void perf_event_cgroup(struct cgroup *cgrp) +{ + struct perf_cgroup_event cgroup_event; + char path_enomem[16] = "//enomem"; + char *pathname; + size_t size; + + if (!atomic_read(&nr_cgroup_events)) + return; + + cgroup_event = (struct perf_cgroup_event){ + .event_id = { + .header = { + .type = PERF_RECORD_CGROUP, + .misc = 0, + .size = 
sizeof(cgroup_event.event_id), + }, + .id = cgroup_id(cgrp), + }, + }; + + pathname = kmalloc(PATH_MAX, GFP_KERNEL); + if (pathname == NULL) { + cgroup_event.path = path_enomem; + } else { + /* just to be sure to have enough space for alignment */ + cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64)); + cgroup_event.path = pathname; + } + + /* + * Since our buffer works in 8 byte units we need to align our string + * size to a multiple of 8. However, we must guarantee the tail end is + * zero'd out to avoid leaking random bits to userspace. + */ + size = strlen(cgroup_event.path) + 1; + while (!IS_ALIGNED(size, sizeof(u64))) + cgroup_event.path[size++] = '\0'; + + cgroup_event.event_id.header.size += size; + cgroup_event.path_size = size; + + perf_iterate_sb(perf_event_cgroup_output, + &cgroup_event, + NULL); + + kfree(pathname); +} + +#endif + +/* * mmap tracking */ @@ -6754,6 +9292,8 @@ struct perf_mmap_event { u64 ino; u64 ino_generation; u32 prot, flags; + u8 build_id[BUILD_ID_SIZE_MAX]; + u32 build_id_size; struct { struct perf_event_header header; @@ -6784,6 +9324,8 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; + u32 type = mmap_event->event_id.header.type; + bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) @@ -6800,7 +9342,7 @@ static void perf_event_mmap_output(struct perf_event *event, } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; @@ -6808,13 +9350,25 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); + use_build_id = event->attr.build_id && mmap_event->build_id_size; + + if (event->attr.mmap2 && 
use_build_id) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; + perf_output_put(&handle, mmap_event->event_id); if (event->attr.mmap2) { - perf_output_put(&handle, mmap_event->maj); - perf_output_put(&handle, mmap_event->min); - perf_output_put(&handle, mmap_event->ino); - perf_output_put(&handle, mmap_event->ino_generation); + if (use_build_id) { + u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; + + __output_copy(&handle, size, 4); + __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); + } else { + perf_output_put(&handle, mmap_event->maj); + perf_output_put(&handle, mmap_event->min); + perf_output_put(&handle, mmap_event->ino); + perf_output_put(&handle, mmap_event->ino_generation); + } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } @@ -6827,6 +9381,7 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_end(&handle); out: mmap_event->event_id.header.size = size; + mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) @@ -6839,7 +9394,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) unsigned int size; char tmp[16]; char *buf = NULL; - char *name; + char *name = NULL; if (vma->vm_flags & VM_READ) prot |= PROT_READ; @@ -6853,17 +9408,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) else flags = MAP_PRIVATE; - if (vma->vm_flags & VM_DENYWRITE) - flags |= MAP_DENYWRITE; - if (vma->vm_flags & VM_MAYEXEC) - flags |= MAP_EXECUTABLE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; - if (vma->vm_flags & VM_HUGETLB) + if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; if (file) { - struct inode *inode; + const struct inode *inode; dev_t dev; buf = kmalloc(PATH_MAX, GFP_KERNEL); @@ -6876,12 +9427,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) * need to add enough zero bytes after the string to handle * the 64bit alignment 
we do later. */ - name = file_path(file, buf, PATH_MAX - sizeof(u64)); + name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { name = "//toolong"; goto cpy_name; } - inode = file_inode(vma->vm_file); + inode = file_user_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; gen = inode->i_generation; @@ -6890,33 +9441,22 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } else { - if (vma->vm_ops && vma->vm_ops->name) { + if (vma->vm_ops && vma->vm_ops->name) name = (char *) vma->vm_ops->name(vma); - if (name) - goto cpy_name; + if (!name) + name = (char *)arch_vma_name(vma); + if (!name) { + if (vma_is_initial_heap(vma)) + name = "[heap]"; + else if (vma_is_initial_stack(vma)) + name = "[stack]"; + else + name = "//anon"; } - - name = (char *)arch_vma_name(vma); - if (name) - goto cpy_name; - - if (vma->vm_start <= vma->vm_mm->start_brk && - vma->vm_end >= vma->vm_mm->brk) { - name = "[heap]"; - goto cpy_name; - } - if (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack) { - name = "[stack]"; - goto cpy_name; - } - - name = "//anon"; - goto cpy_name; } cpy_name: - strlcpy(tmp, name, sizeof(tmp)); + strscpy(tmp, name); name = tmp; got_name: /* @@ -6942,6 +9482,9 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + if (atomic_read(&nr_build_id_events)) + build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size); + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -6956,7 +9499,11 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { - if (filter->inode != file_inode(file)) + /* d_inode(NULL) won't be equal to any mapped user-space file */ + if (!filter->path.dentry) + return false; + + if (d_inode(filter->path.dentry) != file_user_inode(file)) return false; if (filter->offset > offset + size) @@ -6968,28 
+9515,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, return true; } +static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, + struct vm_area_struct *vma, + struct perf_addr_filter_range *fr) +{ + unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct file *file = vma->vm_file; + + if (!perf_addr_filter_match(filter, file, off, vma_size)) + return false; + + if (filter->offset < off) { + fr->start = vma->vm_start; + fr->size = min(vma_size, filter->size - (off - filter->offset)); + } else { + fr->start = vma->vm_start + filter->offset - off; + fr->size = min(vma->vm_end - fr->start, filter->size); + } + + return true; +} + static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; - struct file *file = vma->vm_file; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; + unsigned long flags; if (!has_addr_filter(event)) return; - if (!file) + if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - if (perf_addr_filter_match(filter, file, off, - vma->vm_end - vma->vm_start)) { - event->addr_filters_offs[count] = vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, + &event->addr_filter_ranges[count])) restart++; - } count++; } @@ -7008,7 +9574,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) static void perf_addr_filters_adjust(struct vm_area_struct *vma) { struct perf_event_context *ctx; - int ctxn; /* * Data tracing isn't supported yet and as such there is no need @@ -7018,13 +9583,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) return; rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 
- if (!ctx) - continue; - + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); - } rcu_read_unlock(); } @@ -7086,7 +9647,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, int ret; perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; @@ -7120,7 +9681,7 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) perf_event_header__init_id(&lost_samples_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; @@ -7175,7 +9736,7 @@ static void perf_event_switch_output(struct perf_event *event, void *data) perf_event_header__init_id(&se->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, se->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; @@ -7210,9 +9771,12 @@ static void perf_event_switch(struct task_struct *task, }, }; - perf_iterate_sb(perf_event_switch_output, - &switch_event, - NULL); + if (!sched_in && task_is_runnable(task)) { + switch_event.event_id.header.misc |= + PERF_RECORD_MISC_SWITCH_OUT_PREEMPT; + } + + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } /* @@ -7246,7 +9810,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; @@ -7256,6 +9820,349 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +/* + * ksymbol register/unregister tracking + */ + +struct perf_ksymbol_event { + const char *name; + int name_len; + 
struct { + struct perf_event_header header; + u64 addr; + u32 len; + u16 ksym_type; + u16 flags; + } event_id; +}; + +static int perf_event_ksymbol_match(struct perf_event *event) +{ + return event->attr.ksymbol; +} + +static void perf_event_ksymbol_output(struct perf_event *event, void *data) +{ + struct perf_ksymbol_event *ksymbol_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_ksymbol_match(event)) + return; + + perf_event_header__init_id(&ksymbol_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, &sample, event, + ksymbol_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, ksymbol_event->event_id); + __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, + const char *sym) +{ + struct perf_ksymbol_event ksymbol_event; + char name[KSYM_NAME_LEN]; + u16 flags = 0; + int name_len; + + if (!atomic_read(&nr_ksymbol_events)) + return; + + if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || + ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) + goto err; + + strscpy(name, sym); + name_len = strlen(name) + 1; + while (!IS_ALIGNED(name_len, sizeof(u64))) + name[name_len++] = '\0'; + BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); + + if (unregister) + flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; + + ksymbol_event = (struct perf_ksymbol_event){ + .name = name, + .name_len = name_len, + .event_id = { + .header = { + .type = PERF_RECORD_KSYMBOL, + .size = sizeof(ksymbol_event.event_id) + + name_len, + }, + .addr = addr, + .len = len, + .ksym_type = ksym_type, + .flags = flags, + }, + }; + + perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); + return; +err: + WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); +} + +/* + * bpf program load/unload 
tracking + */ + +struct perf_bpf_event { + struct bpf_prog *prog; + struct { + struct perf_event_header header; + u16 type; + u16 flags; + u32 id; + u8 tag[BPF_TAG_SIZE]; + } event_id; +}; + +static int perf_event_bpf_match(struct perf_event *event) +{ + return event->attr.bpf_event; +} + +static void perf_event_bpf_output(struct perf_event *event, void *data) +{ + struct perf_bpf_event *bpf_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_bpf_match(event)) + return; + + perf_event_header__init_id(&bpf_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, &sample, event, + bpf_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, bpf_event->event_id); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, + enum perf_bpf_event_type type) +{ + bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; + int i; + + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)prog->bpf_func, + prog->jited_len, unregister, + prog->aux->ksym.name); + + for (i = 1; i < prog->aux->func_cnt; i++) { + struct bpf_prog *subprog = prog->aux->func[i]; + + perf_event_ksymbol( + PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)subprog->bpf_func, + subprog->jited_len, unregister, + subprog->aux->ksym.name); + } +} + +void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags) +{ + struct perf_bpf_event bpf_event; + + switch (type) { + case PERF_BPF_EVENT_PROG_LOAD: + case PERF_BPF_EVENT_PROG_UNLOAD: + if (atomic_read(&nr_ksymbol_events)) + perf_event_bpf_emit_ksymbols(prog, type); + break; + default: + return; + } + + if (!atomic_read(&nr_bpf_events)) + return; + + bpf_event = (struct perf_bpf_event){ + .prog = prog, + .event_id = { + .header = { + .type = PERF_RECORD_BPF_EVENT, + .size = sizeof(bpf_event.event_id), + }, + 
.type = type, + .flags = flags, + .id = prog->aux->id, + }, + }; + + BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); + + memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); + perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); +} + +struct perf_callchain_deferred_event { + struct unwind_stacktrace *trace; + struct { + struct perf_event_header header; + u64 cookie; + u64 nr; + u64 ips[]; + } event; +}; + +static void perf_callchain_deferred_output(struct perf_event *event, void *data) +{ + struct perf_callchain_deferred_event *deferred_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret, size = deferred_event->event.header.size; + + if (!event->attr.defer_output) + return; + + /* XXX do we really need sample_id_all for this ??? */ + perf_event_header__init_id(&deferred_event->event.header, &sample, event); + + ret = perf_output_begin(&handle, &sample, event, + deferred_event->event.header.size); + if (ret) + goto out; + + perf_output_put(&handle, deferred_event->event); + for (int i = 0; i < deferred_event->trace->nr; i++) { + u64 entry = deferred_event->trace->entries[i]; + perf_output_put(&handle, entry); + } + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + deferred_event->event.header.size = size; +} + +static void perf_unwind_deferred_callback(struct unwind_work *work, + struct unwind_stacktrace *trace, u64 cookie) +{ + struct perf_callchain_deferred_event deferred_event = { + .trace = trace, + .event = { + .header = { + .type = PERF_RECORD_CALLCHAIN_DEFERRED, + .misc = PERF_RECORD_MISC_USER, + .size = sizeof(deferred_event.event) + + (trace->nr * sizeof(u64)), + }, + .cookie = cookie, + .nr = trace->nr, + }, + }; + + perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL); +} + +struct perf_text_poke_event { + const void *old_bytes; + const void *new_bytes; + size_t pad; + u16 old_len; + u16 new_len; + + struct { + struct perf_event_header header; + + u64 
addr; + } event_id; +}; + +static int perf_event_text_poke_match(struct perf_event *event) +{ + return event->attr.text_poke; +} + +static void perf_event_text_poke_output(struct perf_event *event, void *data) +{ + struct perf_text_poke_event *text_poke_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u64 padding = 0; + int ret; + + if (!perf_event_text_poke_match(event)) + return; + + perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, &sample, event, + text_poke_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, text_poke_event->event_id); + perf_output_put(&handle, text_poke_event->old_len); + perf_output_put(&handle, text_poke_event->new_len); + + __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); + __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); + + if (text_poke_event->pad) + __output_copy(&handle, &padding, text_poke_event->pad); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_text_poke(const void *addr, const void *old_bytes, + size_t old_len, const void *new_bytes, size_t new_len) +{ + struct perf_text_poke_event text_poke_event; + size_t tot, pad; + + if (!atomic_read(&nr_text_poke_events)) + return; + + tot = sizeof(text_poke_event.old_len) + old_len; + tot += sizeof(text_poke_event.new_len) + new_len; + pad = ALIGN(tot, sizeof(u64)) - tot; + + text_poke_event = (struct perf_text_poke_event){ + .old_bytes = old_bytes, + .new_bytes = new_bytes, + .pad = pad, + .old_len = old_len, + .new_len = new_len, + .event_id = { + .header = { + .type = PERF_RECORD_TEXT_POKE, + .misc = PERF_RECORD_MISC_KERNEL, + .size = sizeof(text_poke_event.event_id) + tot + pad, + }, + .addr = (unsigned long)addr, + }, + }; + + perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); +} + +void 
perf_event_itrace_started(struct perf_event *event) +{ + WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE); +} + static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; @@ -7271,7 +10178,7 @@ static void perf_log_itrace_start(struct perf_event *event) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || - event->hw.itrace_started) + event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; @@ -7281,7 +10188,7 @@ static void perf_log_itrace_start(struct perf_event *event) rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; @@ -7292,6 +10199,37 @@ static void perf_log_itrace_start(struct perf_event *event) perf_output_end(&handle); } +void perf_report_aux_output_id(struct perf_event *event, u64 hw_id) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u64 hw_id; + } rec; + int ret; + + if (event->parent) + event = event->parent; + + rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID; + rec.header.misc = 0; + rec.header.size = sizeof(rec); + rec.hw_id = hw_id; + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} +EXPORT_SYMBOL_GPL(perf_report_aux_output_id); + static int __perf_event_account_interrupt(struct perf_event *event, int throttle) { @@ -7305,14 +10243,13 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle) hwc->interrupts = 1; } else { hwc->interrupts++; - if (unlikely(throttle - && hwc->interrupts >= max_samples_per_tick)) 
{ - __this_cpu_inc(perf_throttled_count); - tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } + } + + if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); + perf_event_throttle_group(event); + ret = 1; } if (event->attr.freq) { @@ -7333,13 +10270,120 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } +static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) +{ + /* + * Due to interrupt latency (AKA "skid"), we may enter the + * kernel before taking an overflow, even if the PMU is only + * counting user events. + */ + if (event->attr.exclude_kernel && !user_mode(regs)) + return false; + + return true; +} + +#ifdef CONFIG_BPF_SYSCALL +static int bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct bpf_perf_event_data_kern ctx = { + .data = data, + .event = event, + }; + struct bpf_prog *prog; + int ret = 0; + + ctx.regs = perf_arch_bpf_user_pt_regs(regs); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) + goto out; + rcu_read_lock(); + prog = READ_ONCE(event->prog); + if (prog) { + perf_prepare_sample(data, event, regs); + ret = bpf_prog_run(prog, &ctx); + } + rcu_read_unlock(); +out: + __this_cpu_dec(bpf_prog_active); + + return ret; +} + +static inline int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + if (event->overflow_handler_context) + /* hw breakpoint or kernel counter */ + return -EINVAL; + + if (event->prog) + return -EEXIST; + + if (prog->type != BPF_PROG_TYPE_PERF_EVENT) + return -EINVAL; + + if (event->attr.precise_ip && + prog->call_get_stack && + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || + 
event->attr.exclude_callchain_kernel || + event->attr.exclude_callchain_user)) { + /* + * On perf_event with precise_ip, calling bpf_get_stack() + * may trigger unwinder warnings and occasional crashes. + * bpf_get_[stack|stackid] works around this issue by using + * callchain attached to perf_sample_data. If the + * perf_event does not full (kernel and user) callchain + * attached to perf_sample_data, do not allow attaching BPF + * program that calls bpf_get_[stack|stackid]. + */ + return -EPROTO; + } + + event->prog = prog; + event->bpf_cookie = bpf_cookie; + return 0; +} + +static inline void perf_event_free_bpf_handler(struct perf_event *event) +{ + struct bpf_prog *prog = event->prog; + + if (!prog) + return; + + event->prog = NULL; + bpf_prog_put(prog); +} +#else +static inline int bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return 1; +} + +static inline int perf_event_set_bpf_handler(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + return -EOPNOTSUPP; +} + +static inline void perf_event_free_bpf_handler(struct perf_event *event) +{ +} +#endif + /* * Generic event overflow handling, sampling. 
*/ static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; @@ -7353,6 +10397,13 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); + if (event->attr.aux_pause) + perf_event_aux_pause(event->aux_event, true); + + if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && + !bpf_overflow_handler(event, data, regs)) + goto out; + /* * XXX event_limit might not quite work as expected on inherited * events @@ -7362,23 +10413,69 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - perf_event_disable_inatomic(event); + event->pmu->stop(event, 0); + } + + if (event->attr.sigtrap) { + /* + * The desired behaviour of sigtrap vs invalid samples is a bit + * tricky; on the one hand, one should not loose the SIGTRAP if + * it is the first event, on the other hand, we should also not + * trigger the WARN or override the data address. + */ + bool valid_sample = sample_is_allowed(event, regs); + unsigned int pending_id = 1; + enum task_work_notify_mode notify_mode; + + if (regs) + pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; + + notify_mode = in_nmi() ? 
TWA_NMI_CURRENT : TWA_RESUME; + + if (!event->pending_work && + !task_work_add(current, &event->pending_task, notify_mode)) { + event->pending_work = pending_id; + local_inc(&event->ctx->nr_no_switch_fast); + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); + + event->pending_addr = 0; + if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) + event->pending_addr = data->addr; + + } else if (event->attr.exclude_kernel && valid_sample) { + /* + * Should not be able to return to user space without + * consuming pending_work; with exceptions: + * + * 1. Where !exclude_kernel, events can overflow again + * in the kernel without returning to user space. + * + * 2. Events that can overflow again before the IRQ- + * work without user space progress (e.g. hrtimer). + * To approximate progress (with false negatives), + * check 32-bit hash of the current IP. + */ + WARN_ON_ONCE(event->pending_work != pending_id); + } } READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; - irq_work_queue(&event->pending); + irq_work_queue(&event->pending_irq); } +out: + if (event->attr.aux_resume) + perf_event_aux_pause(event->aux_event, false); return ret; } int perf_event_overflow(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } @@ -7391,11 +10488,7 @@ struct swevent_htable { struct swevent_hlist *swevent_hlist; struct mutex hlist_mutex; int hlist_refcount; - - /* Recursion avoidance in each contexts */ - int recursion[PERF_NR_CONTEXTS]; }; - static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); /* @@ -7414,16 +10507,16 @@ u64 perf_swevent_set_period(struct perf_event *event) hwc->last_period = hwc->sample_period; -again: - old = val = local64_read(&hwc->period_left); - if (val < 0) - return 0; + old = local64_read(&hwc->period_left); + 
do { + val = old; + if (val < 0) + return 0; - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (local64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + } while (!local64_try_cmpxchg(&hwc->period_left, &old, val)); return nr; } @@ -7483,8 +10576,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, data, regs); } -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) +int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; @@ -7593,17 +10685,13 @@ DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); int perf_swevent_get_recursion_context(void) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - return get_recursion_context(swhash->recursion); + return get_recursion_context(current->perf_recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void perf_swevent_put_recursion_context(int rctx) { - struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); - - put_recursion_context(swhash->recursion, rctx); + put_recursion_context(current->perf_recursion, rctx); } void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) @@ -7774,6 +10862,9 @@ static void sw_perf_event_destroy(struct perf_event *event) swevent_hlist_put(); } +static struct pmu perf_cpu_clock; /* fwd declaration */ +static struct pmu perf_task_clock; + static int perf_swevent_init(struct perf_event *event) { u64 event_id = event->attr.config; @@ -7789,7 +10880,10 @@ static int perf_swevent_init(struct perf_event *event) switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: + event->attr.type = perf_cpu_clock.type; + return -ENOENT; case PERF_COUNT_SW_TASK_CLOCK: + event->attr.type = perf_task_clock.type; return -ENOENT; default: @@ -7828,10 +10922,48 @@ static struct pmu perf_swevent = { #ifdef 
CONFIG_EVENT_TRACING +static void tp_perf_event_destroy(struct perf_event *event) +{ + perf_trace_destroy(event); +} + +static int perf_tp_event_init(struct perf_event *event) +{ + int err; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + err = perf_trace_init(event); + if (err) + return err; + + event->destroy = tp_perf_event_destroy; + + return 0; +} + +static struct pmu perf_tracepoint = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_tp_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + static int perf_tp_filter_match(struct perf_event *event, - struct perf_sample_data *data) + struct perf_raw_record *raw) { - void *record = data->raw->frag.data; + void *record = raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -7843,18 +10975,18 @@ static int perf_tp_filter_match(struct perf_event *event, } static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data, + struct perf_raw_record *raw, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 0; /* - * All tracepoints are from kernel-space. 
+ * If exclude_kernel, only trace user-space tracepoints (uprobes) */ - if (event->attr.exclude_kernel) + if (event->attr.exclude_kernel && !user_mode(regs)) return 0; - if (!perf_tp_filter_match(event, data)) + if (!perf_tp_filter_match(event, raw)) return 0; return 1; @@ -7865,11 +10997,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { - struct bpf_prog *prog = call->prog; - - if (prog) { + if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; - if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { + if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } @@ -7879,6 +11009,49 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); +static void __perf_tp_event_target_task(u64 count, void *record, + struct pt_regs *regs, + struct perf_sample_data *data, + struct perf_raw_record *raw, + struct perf_event *event) +{ + struct trace_entry *entry = record; + + if (event->attr.config != entry->type) + return; + /* Cannot deliver synchronous signal to other task. 
*/ + if (event->attr.sigtrap) + return; + if (perf_tp_event_match(event, raw, regs)) { + perf_sample_data_init(data, 0, 0); + perf_sample_save_raw_data(data, event, raw); + perf_swevent_event(event, count, data, regs); + } +} + +static void perf_tp_event_target_task(u64 count, void *record, + struct pt_regs *regs, + struct perf_sample_data *data, + struct perf_raw_record *raw, + struct perf_event_context *ctx) +{ + unsigned int cpu = smp_processor_id(); + struct pmu *pmu = &perf_tracepoint; + struct perf_event *event, *sibling; + + perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { + __perf_tp_event_target_task(count, record, regs, data, raw, event); + for_each_sibling_event(sibling, event) + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); + } + + perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { + __perf_tp_event_target_task(count, record, regs, data, raw, event); + for_each_sibling_event(sibling, event) + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); + } +} + void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) @@ -7893,14 +11066,22 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, }, }; - perf_sample_data_init(&data, 0, 0); - data.raw = &raw; - perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) + if (perf_tp_event_match(event, &raw, regs)) { + /* + * Here use the same on-stack perf_sample_data, + * some members in data are event-specific and + * need to be re-computed for different swevents. + * Re-initialize data->sample_flags safely to avoid + * the problem that next event skips preparing data + * because data->sample_flags is set. 
+ */ + perf_sample_data_init(&data, 0, 0); + perf_sample_save_raw_data(&data, event, &raw); perf_swevent_event(event, count, &data, regs); + } } /* @@ -7909,21 +11090,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, */ if (task && task != current) { struct perf_event_context *ctx; - struct trace_entry *entry = record; rcu_read_lock(); - ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto unlock; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->attr.type != PERF_TYPE_TRACEPOINT) - continue; - if (event->attr.config != entry->type) - continue; - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } + raw_spin_lock(&ctx->lock); + perf_tp_event_target_task(count, record, regs, &data, &raw, ctx); + raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); } @@ -7932,177 +11107,253 @@ unlock: } EXPORT_SYMBOL_GPL(perf_tp_event); -static void tp_perf_event_destroy(struct perf_event *event) -{ - perf_trace_destroy(event); -} +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) +/* + * Flags in config, used by dynamic PMU kprobe and uprobe + * The flags should match the following PMU_FORMAT_ATTR(). + * + * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe + * if not set, create kprobe/uprobe + * + * The following values specify a reference counter (or semaphore in the + * terminology of tools like dtrace, systemtap, etc.) for Userspace Statically + * Defined Tracepoints (USDT). Currently, we use 32 bits for the offset. 
+ * + * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as the offset + * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left + */ +enum perf_probe_config { + PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ + PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, + PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, +}; -static int perf_tp_event_init(struct perf_event *event) +PMU_FORMAT_ATTR(retprobe, "config:0"); +#endif + +#ifdef CONFIG_KPROBE_EVENTS +static struct attribute *kprobe_attrs[] = { + &format_attr_retprobe.attr, + NULL, +}; + +static struct attribute_group kprobe_format_group = { + .name = "format", + .attrs = kprobe_attrs, +}; + +static const struct attribute_group *kprobe_attr_groups[] = { + &kprobe_format_group, + NULL, +}; + +static int perf_kprobe_event_init(struct perf_event *event); +static struct pmu perf_kprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_kprobe_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, + .attr_groups = kprobe_attr_groups, +}; + +static int perf_kprobe_event_init(struct perf_event *event) { int err; + bool is_retprobe; - if (event->attr.type != PERF_TYPE_TRACEPOINT) + if (event->attr.type != perf_kprobe.type) return -ENOENT; + if (!perfmon_capable()) + return -EACCES; + /* - * no branch sampling for tracepoint events + * no branch sampling for probe events */ if (has_branch_stack(event)) return -EOPNOTSUPP; - err = perf_trace_init(event); + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + err = perf_kprobe_init(event, is_retprobe); if (err) return err; - event->destroy = tp_perf_event_destroy; + event->destroy = perf_kprobe_destroy; return 0; } +#endif /* CONFIG_KPROBE_EVENTS */ -static struct pmu perf_tracepoint = { - .task_ctx_nr = perf_sw_context, +#ifdef CONFIG_UPROBE_EVENTS +PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); - .event_init = 
perf_tp_event_init, +static struct attribute *uprobe_attrs[] = { + &format_attr_retprobe.attr, + &format_attr_ref_ctr_offset.attr, + NULL, +}; + +static struct attribute_group uprobe_format_group = { + .name = "format", + .attrs = uprobe_attrs, +}; + +static const struct attribute_group *uprobe_attr_groups[] = { + &uprobe_format_group, + NULL, +}; + +static int perf_uprobe_event_init(struct perf_event *event); +static struct pmu perf_uprobe = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_uprobe_event_init, .add = perf_trace_add, .del = perf_trace_del, .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + .attr_groups = uprobe_attr_groups, }; -static inline void perf_tp_register(void) -{ - perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); -} - -static void perf_event_free_filter(struct perf_event *event) +static int perf_uprobe_event_init(struct perf_event *event) { - ftrace_profile_free_filter(event); -} - -#ifdef CONFIG_BPF_SYSCALL -static void bpf_overflow_handler(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct bpf_perf_event_data_kern ctx = { - .data = data, - .regs = regs, - }; - int ret = 0; - - preempt_disable(); - if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) - goto out; - rcu_read_lock(); - ret = BPF_PROG_RUN(event->prog, &ctx); - rcu_read_unlock(); -out: - __this_cpu_dec(bpf_prog_active); - preempt_enable(); - if (!ret) - return; + int err; + unsigned long ref_ctr_offset; + bool is_retprobe; - event->orig_overflow_handler(event, data, regs); -} + if (event->attr.type != perf_uprobe.type) + return -ENOENT; -static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) -{ - struct bpf_prog *prog; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; - if (event->overflow_handler_context) - /* hw breakpoint or kernel counter */ - return -EINVAL; + /* + * no branch sampling for probe events + */ + if (has_branch_stack(event)) + 
return -EOPNOTSUPP; - if (event->prog) - return -EEXIST; + is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; + ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; + err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); + if (err) + return err; - prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); - if (IS_ERR(prog)) - return PTR_ERR(prog); + event->destroy = perf_uprobe_destroy; - event->prog = prog; - event->orig_overflow_handler = READ_ONCE(event->overflow_handler); - WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); return 0; } +#endif /* CONFIG_UPROBE_EVENTS */ -static void perf_event_free_bpf_handler(struct perf_event *event) +static inline void perf_tp_register(void) { - struct bpf_prog *prog = event->prog; - - if (!prog) - return; - - WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); - event->prog = NULL; - bpf_prog_put(prog); + perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); +#ifdef CONFIG_KPROBE_EVENTS + perf_pmu_register(&perf_kprobe, "kprobe", -1); +#endif +#ifdef CONFIG_UPROBE_EVENTS + perf_pmu_register(&perf_uprobe, "uprobe", -1); +#endif } -#else -static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) + +static void perf_event_free_filter(struct perf_event *event) { - return -EOPNOTSUPP; + ftrace_profile_free_filter(event); } -static void perf_event_free_bpf_handler(struct perf_event *event) + +/* + * returns true if the event is a tracepoint, or a kprobe/uprobe created + * with perf_event_open() + */ +static inline bool perf_event_is_tracing(struct perf_event *event) { -} + if (event->pmu == &perf_tracepoint) + return true; +#ifdef CONFIG_KPROBE_EVENTS + if (event->pmu == &perf_kprobe) + return true; +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (event->pmu == &perf_uprobe) + return true; #endif + return false; +} -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +static int 
__perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { - bool is_kprobe, is_tracepoint; - struct bpf_prog *prog; + bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return perf_event_set_bpf_handler(event, prog_fd); + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; - if (event->tp_event->prog) - return -EEXIST; + if (!perf_event_is_tracing(event)) + return perf_event_set_bpf_handler(event, prog, bpf_cookie); - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; + is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE; + is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; - if (!is_kprobe && !is_tracepoint) + is_syscall_tp = is_syscall_trace_event(event->tp_event); + if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; - prog = bpf_prog_get(prog_fd); - if (IS_ERR(prog)) - return PTR_ERR(prog); + if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) || + (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || + (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) + return -EINVAL; - if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || - (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { - /* valid fd, but invalid bpf program type */ - bpf_prog_put(prog); + if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe) + /* only uprobe programs are allowed to be sleepable */ + return -EINVAL; + + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && !is_kprobe) + return -EINVAL; + + /* Writing to context allowed only for uprobes. 
*/ + if (prog->aux->kprobe_write_ctx && !is_uprobe) return -EINVAL; - } - if (is_tracepoint) { + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); - if (prog->aux->max_ctx_offset > off) { - bpf_prog_put(prog); + if (prog->aux->max_ctx_offset > off) return -EACCES; - } } - event->tp_event->prog = prog; - return 0; + return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } -static void perf_event_free_bpf_prog(struct perf_event *event) +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { - struct bpf_prog *prog; + struct perf_event_context *ctx; + int ret; - perf_event_free_bpf_handler(event); + ctx = perf_event_ctx_lock(event); + ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie); + perf_event_ctx_unlock(event, ctx); - if (!event->tp_event) + return ret; +} + +void perf_event_free_bpf_prog(struct perf_event *event) +{ + if (!event->prog) return; - prog = event->tp_event->prog; - if (prog) { - event->tp_event->prog = NULL; - bpf_prog_put(prog); + if (!perf_event_is_tracing(event)) { + perf_event_free_bpf_handler(event); + return; } + perf_event_detach_bpf_prog(event); } #else @@ -8115,12 +11366,21 @@ static void perf_event_free_filter(struct perf_event *event) { } -static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { return -ENOENT; } -static void perf_event_free_bpf_prog(struct perf_event *event) +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + return -ENOENT; +} + +void perf_event_free_bpf_prog(struct perf_event *event) { } #endif /* CONFIG_EVENT_TRACING */ @@ -8162,8 +11422,7 @@ static void free_filters_list(struct list_head *filters) struct perf_addr_filter *filter, *iter; list_for_each_entry_safe(filter, iter, filters, entry) { - if (filter->inode) - iput(filter->inode); + 
path_put(&filter->path); list_del(&filter->entry); kfree(filter); } @@ -8196,31 +11455,36 @@ static void perf_addr_filters_splice(struct perf_event *event, free_filters_list(&list); } +static void perf_free_addr_filters(struct perf_event *event) +{ + /* + * Used during free paths, there is no concurrency. + */ + if (list_empty(&event->addr_filters.list)) + return; + + perf_addr_filters_splice(event, NULL); +} + /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. - * Called with mm::mmap_sem down for reading. + * Called with mm::mmap_lock down for reading. */ -static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, - struct mm_struct *mm) +static void perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm, + struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - struct file *file = vma->vm_file; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - unsigned long vma_size = vma->vm_end - vma->vm_start; - - if (!file) + for_each_vma(vmi, vma) { + if (!vma->vm_file) continue; - if (!perf_addr_filter_match(filter, file, off, vma_size)) - continue; - - return vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, fr)) + return; } - - return 0; } /* @@ -8243,26 +11507,29 @@ static void perf_event_addr_filters_apply(struct perf_event *event) if (task == TASK_TOMBSTONE) return; - if (!ifh->nr_file_filters) - return; - - mm = get_task_mm(event->ctx->task); - if (!mm) - goto restart; + if (ifh->nr_file_filters) { + mm = get_task_mm(task); + if (!mm) + goto restart; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); + } raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - event->addr_filters_offs[count] = 0; + if (filter->path.dentry) { + /* + * Adjust base offset if the filter is associated to a + * binary that needs to be mapped: + */ + 
event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; - /* - * Adjust base offset if the filter is associated to a binary - * that needs to be mapped: - */ - if (filter->inode) - event->addr_filters_offs[count] = - perf_addr_filter_apply(filter, mm); + perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); + } else { + event->addr_filter_ranges[count].start = filter->offset; + event->addr_filter_ranges[count].size = filter->size; + } count++; } @@ -8270,9 +11537,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); - up_read(&mm->mmap_sem); + if (ifh->nr_file_filters) { + mmap_read_unlock(mm); - mmput(mm); + mmput(mm); + } restart: perf_event_stop(event, 1); @@ -8294,7 +11563,8 @@ restart: * * for kernel addresses: <start address>[/<size>] * * for object files: <start address>[/<size>]@</path/to/object/file> * - * if <size> is not specified, the range is treated as a single address. + * if <size> is not specified or is zero, the range is treated as a single + * address; not valid for ACTION=="filter". 
*/ enum { IF_ACT_NONE = -1, @@ -8333,7 +11603,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, { struct perf_addr_filter *filter = NULL; char *start, *orig, *filename = NULL; - struct path path; substring_t args[MAX_OPT_ARGS]; int state = IF_STATE_ACTION, token; unsigned int kernel = 0; @@ -8344,6 +11613,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, return -ENOMEM; while ((start = strsep(&fstr, " ,\n")) != NULL) { + static const enum perf_addr_filter_action_t actions[] = { + [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER, + [IF_ACT_START] = PERF_ADDR_FILTER_ACTION_START, + [IF_ACT_STOP] = PERF_ADDR_FILTER_ACTION_STOP, + }; ret = -EINVAL; if (!*start) @@ -8360,33 +11634,30 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, switch (token) { case IF_ACT_FILTER: case IF_ACT_START: - filter->filter = 1; - case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail; + filter->action = actions[token]; state = IF_STATE_SOURCE; break; case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; + fallthrough; case IF_SRC_FILEADDR: case IF_SRC_FILE: if (state != IF_STATE_SOURCE) goto fail; - if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) - filter->range = 1; - *args[0].to = 0; ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail; - if (filter->range) { + if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) { *args[1].to = 0; ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) @@ -8394,8 +11665,9 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, } if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { - int fpos = filter->range ? 2 : 1; + int fpos = token == IF_SRC_FILE ? 
2 : 1; + kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; @@ -8417,7 +11689,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ if (state == IF_STATE_END) { ret = -EINVAL; - if (kernel && event->attr.exclude_kernel) + + /* + * ACTION "filter" must have a non-zero length region + * specified. + */ + if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER && + !filter->size) goto fail; if (!kernel) { @@ -8434,43 +11712,42 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ ret = -EOPNOTSUPP; if (!event->ctx->task) - goto fail_free_name; + goto fail; /* look up the path and grab its inode */ - ret = kern_path(filename, LOOKUP_FOLLOW, &path); + ret = kern_path(filename, LOOKUP_FOLLOW, + &filter->path); if (ret) - goto fail_free_name; - - filter->inode = igrab(d_inode(path.dentry)); - path_put(&path); - kfree(filename); - filename = NULL; + goto fail; ret = -EINVAL; - if (!filter->inode || - !S_ISREG(filter->inode->i_mode)) - /* free_filters_list() will iput() */ + if (!filter->path.dentry || + !S_ISREG(d_inode(filter->path.dentry) + ->i_mode)) goto fail; event->addr_filters.nr_file_filters++; } /* ready to consume more filters */ + kfree(filename); + filename = NULL; state = IF_STATE_ACTION; filter = NULL; + kernel = 0; } } if (state != IF_STATE_ACTION) goto fail; + kfree(filename); kfree(orig); return 0; -fail_free_name: - kfree(filename); fail: + kfree(filename); free_filters_list(filters); kfree(orig); @@ -8519,23 +11796,34 @@ fail_clear_files: static int perf_event_set_filter(struct perf_event *event, void __user *arg) { - char *filter_str; int ret = -EINVAL; - - if ((event->attr.type != PERF_TYPE_TRACEPOINT || - !IS_ENABLED(CONFIG_EVENT_TRACING)) && - !has_addr_filter(event)) - return -EINVAL; + char *filter_str; filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str); - if (IS_ENABLED(CONFIG_EVENT_TRACING) && - event->attr.type == 
PERF_TYPE_TRACEPOINT) - ret = ftrace_profile_set_filter(event, event->attr.config, - filter_str); - else if (has_addr_filter(event)) +#ifdef CONFIG_EVENT_TRACING + if (perf_event_is_tracing(event)) { + struct perf_event_context *ctx = event->ctx; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but + * the tracepoint stuff does not actually need it. So + * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we + * already have a reference on ctx. + * + * This can result in event getting moved to a different ctx, + * but that does not affect the tracepoint state. + */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + } else +#endif + if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); kfree(filter_str); @@ -8556,7 +11844,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event = container_of(hrtimer, struct perf_event, hw.hrtimer); - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state != PERF_EVENT_STATE_ACTIVE || + event->hw.state & PERF_HES_STOPPED) return HRTIMER_NORESTART; event->pmu->read(event); @@ -8594,18 +11883,28 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) period = max_t(u64, 10000, hwc->sample_period); } hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (is_sampling_event(event)) { + /* + * Careful: this function can be triggered in the hrtimer handler, + * for cpu-clock events, so hrtimer_cancel() would cause a + * deadlock. 
+ * + * So use hrtimer_try_to_cancel() to try to stop the hrtimer, + * and the cpu-clock handler also sets the PERF_HES_STOPPED flag, + * which guarantees that perf_swevent_hrtimer() will stop the + * hrtimer once it sees the PERF_HES_STOPPED flag. + */ + if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); - hrtimer_cancel(&hwc->hrtimer); + hrtimer_try_to_cancel(&hwc->hrtimer); } } @@ -8616,8 +11915,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; + hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); /* * Since hrtimers have a fixed rate, we can do a static freq->period @@ -8650,14 +11948,17 @@ static void cpu_clock_event_update(struct perf_event *event) static void cpu_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); - cpu_clock_event_update(event); + if (flags & PERF_EF_UPDATE) + cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) @@ -8671,7 +11972,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) static void cpu_clock_event_del(struct perf_event *event, int flags) { - cpu_clock_event_stop(event, flags); + cpu_clock_event_stop(event, PERF_EF_UPDATE); } static void cpu_clock_event_read(struct perf_event *event) @@ -8681,7 +11982,7 @@ static void cpu_clock_event_read(struct perf_event *event) static int cpu_clock_event_init(struct perf_event *event) { - if (event->attr.type != 
PERF_TYPE_SOFTWARE) + if (event->attr.type != perf_cpu_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) @@ -8702,6 +12003,7 @@ static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, + .dev = PMU_NULL_DEV, .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, @@ -8727,14 +12029,17 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time); + if (flags & PERF_EF_UPDATE) + task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) @@ -8762,7 +12067,7 @@ static void task_clock_event_read(struct perf_event *event) static int task_clock_event_init(struct perf_event *event) { - if (event->attr.type != PERF_TYPE_SOFTWARE) + if (event->attr.type != perf_task_clock.type) return -ENOENT; if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) @@ -8783,6 +12088,7 @@ static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, .capabilities = PERF_PMU_CAP_NO_NMI, + .dev = PMU_NULL_DEV, .event_init = task_clock_event_init, .add = task_clock_event_add, @@ -8805,6 +12111,11 @@ static int perf_pmu_nop_int(struct pmu *pmu) return 0; } +static int perf_event_nop_int(struct perf_event *event, u64 value) +{ + return 0; +} + static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) @@ -8848,32 +12159,6 @@ static int perf_event_idx_default(struct perf_event *event) } /* - * Ensures all contexts with the same task_ctx_nr have the same - * pmu_cpu_context too. 
- */ -static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) -{ - struct pmu *pmu; - - if (ctxn < 0) - return NULL; - - list_for_each_entry(pmu, &pmus, entry) { - if (pmu->task_ctx_nr == ctxn) - return pmu->pmu_cpu_context; - } - - return NULL; -} - -static void free_pmu_context(struct pmu *pmu) -{ - mutex_lock(&pmus_lock); - free_percpu(pmu->pmu_cpu_context); - mutex_unlock(&pmus_lock); -} - -/* * Let userspace know that this PMU supports address range filtering: */ static ssize_t nr_addr_filters_show(struct device *dev, @@ -8882,7 +12167,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return sysfs_emit(page, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -8893,7 +12178,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); + return sysfs_emit(page, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -8904,7 +12189,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); + return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); @@ -8934,12 +12219,11 @@ perf_event_mux_interval_ms_store(struct device *dev, /* update all cpuctx for this PMU */ cpus_read_lock(); for_each_online_cpu(cpu) { - struct perf_cpu_context *cpuctx; - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + struct perf_cpu_pmu_context *cpc; + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); + cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - cpu_function_call(cpu, - (remote_function_f)perf_mux_hrtimer_restart, cpuctx); + cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); } 
cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); @@ -8948,15 +12232,90 @@ perf_event_mux_interval_ms_store(struct device *dev, } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return topology_sibling_cpumask(cpu); + case PERF_PMU_SCOPE_DIE: + return topology_die_cpumask(cpu); + case PERF_PMU_SCOPE_CLUSTER: + return topology_cluster_cpumask(cpu); + case PERF_PMU_SCOPE_PKG: + return topology_core_cpumask(cpu); + case PERF_PMU_SCOPE_SYS_WIDE: + return cpu_online_mask; + } + + return NULL; +} + +static inline struct cpumask *perf_scope_cpumask(unsigned int scope) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return perf_online_core_mask; + case PERF_PMU_SCOPE_DIE: + return perf_online_die_mask; + case PERF_PMU_SCOPE_CLUSTER: + return perf_online_cluster_mask; + case PERF_PMU_SCOPE_PKG: + return perf_online_pkg_mask; + case PERF_PMU_SCOPE_SYS_WIDE: + return perf_online_sys_mask; + } + + return NULL; +} + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct cpumask *mask = perf_scope_cpumask(pmu->scope); + + if (mask) + return cpumap_print_to_pagebuf(true, buf, mask); + return 0; +} + +static DEVICE_ATTR_RO(cpumask); + static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, + &dev_attr_nr_addr_filters.attr, + &dev_attr_cpumask.attr, + NULL, +}; + +static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pmu *pmu = dev_get_drvdata(dev); + + if (n == 2 && !pmu->nr_addr_filters) + return 0; + + /* cpumask */ + if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) + return 0; + + return a->mode; +} + +static struct attribute_group pmu_dev_attr_group = { + .is_visible = pmu_dev_is_visible, + 
.attrs = pmu_dev_attrs, +}; + +static const struct attribute_group *pmu_dev_groups[] = { + &pmu_dev_attr_group, NULL, }; -ATTRIBUTE_GROUPS(pmu_dev); static int pmu_bus_running; -static struct bus_type pmu_bus = { +static const struct bus_type pmu_bus = { .name = "event_source", .dev_groups = pmu_dev_groups, }; @@ -8976,23 +12335,25 @@ static int pmu_dev_alloc(struct pmu *pmu) pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); - ret = dev_set_name(pmu->dev, "%s", pmu->name); - if (ret) - goto free_dev; dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; + pmu->dev->parent = pmu->parent; pmu->dev->release = pmu_dev_release; - ret = device_add(pmu->dev); + + ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) goto free_dev; - /* For PMUs with address filters, throw in an extra attribute: */ - if (pmu->nr_addr_filters) - ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); - + ret = device_add(pmu->dev); if (ret) - goto del_dev; + goto free_dev; + + if (pmu->attr_update) { + ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); + if (ret) + goto del_dev; + } out: return ret; @@ -9002,81 +12363,111 @@ del_dev: free_dev: put_device(pmu->dev); + pmu->dev = NULL; goto out; } static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; -int perf_pmu_register(struct pmu *pmu, const char *name, int type) +static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) { - int cpu, ret; + void *tmp, *val = idr_find(idr, id); - mutex_lock(&pmus_lock); - ret = -ENOMEM; - pmu->pmu_disable_count = alloc_percpu(int); - if (!pmu->pmu_disable_count) - goto unlock; + if (val != old) + return false; - pmu->type = -1; - if (!name) - goto skip_type; - pmu->name = name; + tmp = idr_replace(idr, new, id); + if (IS_ERR(tmp)) + return false; - if (type < 0) { - type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); - if (type < 0) { - ret = type; - goto free_pdc; - } - } - pmu->type = type; + 
WARN_ON_ONCE(tmp != val); + return true; +} - if (pmu_bus_running) { - ret = pmu_dev_alloc(pmu); - if (ret) - goto free_idr; +static void perf_pmu_free(struct pmu *pmu) +{ + if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); } -skip_type: - if (pmu->task_ctx_nr == perf_hw_context) { - static int hw_context_taken = 0; + if (pmu->cpu_pmu_context) { + int cpu; - /* - * Other than systems with heterogeneous CPUs, it never makes - * sense for two PMUs to share perf_hw_context. PMUs which are - * uncore must use perf_invalid_context. - */ - if (WARN_ON_ONCE(hw_context_taken && - !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) - pmu->task_ctx_nr = perf_invalid_context; + for_each_possible_cpu(cpu) { + struct perf_cpu_pmu_context *cpc; - hw_context_taken = 1; + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); + if (!cpc) + continue; + if (cpc->epc.embedded) { + /* refcount managed */ + put_pmu_ctx(&cpc->epc); + continue; + } + kfree(cpc); + } + free_percpu(pmu->cpu_pmu_context); } +} - pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); - if (pmu->pmu_cpu_context) - goto got_cpu_context; +DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T)) - ret = -ENOMEM; - pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); - if (!pmu->pmu_cpu_context) - goto free_dev; +int perf_pmu_register(struct pmu *_pmu, const char *name, int type) +{ + int cpu, max = PERF_TYPE_MAX; + + struct pmu *pmu __free(pmu_unregister) = _pmu; + guard(mutex)(&pmus_lock); + + if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) + return -EINVAL; + + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, + "Can not register a pmu with an invalid scope.\n")) + return -EINVAL; + + pmu->name = name; + + if (type >= 0) + max = type; + + CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL); + if (pmu_type.id < 0) + return 
pmu_type.id; + + WARN_ON(type >= 0 && pmu_type.id != type); + + pmu->type = pmu_type.id; + atomic_set(&pmu->exclusive_cnt, 0); + + if (pmu_bus_running && !pmu->dev) { + int ret = pmu_dev_alloc(pmu); + if (ret) + return ret; + } + + pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *); + if (!pmu->cpu_pmu_context) + return -ENOMEM; for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_pmu_context *cpc = + kmalloc_node(sizeof(struct perf_cpu_pmu_context), + GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx); - lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); - lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.pmu = pmu; - cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); + if (!cpc) + return -ENOMEM; - __perf_mux_hrtimer_init(cpuctx, cpu); + *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc; + __perf_init_event_pmu_context(&cpc->epc, pmu); + __perf_mux_hrtimer_init(cpc, cpu); } -got_cpu_context: if (!pmu->start_txn) { if (pmu->pmu_enable) { /* @@ -9099,60 +12490,174 @@ got_cpu_context: pmu->pmu_disable = perf_pmu_nop_void; } + if (!pmu->check_period) + pmu->check_period = perf_event_nop_int; + if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; + INIT_LIST_HEAD(&pmu->events); + spin_lock_init(&pmu->events_lock); + + /* + * Now that the PMU is complete, make it visible to perf_try_init_event(). 
+ */ + if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) + return -EINVAL; list_add_rcu(&pmu->entry, &pmus); - atomic_set(&pmu->exclusive_cnt, 0); - ret = 0; -unlock: - mutex_unlock(&pmus_lock); - return ret; + take_idr_id(pmu_type); + _pmu = no_free_ptr(pmu); // let it rip + return 0; +} +EXPORT_SYMBOL_GPL(perf_pmu_register); -free_dev: - device_del(pmu->dev); - put_device(pmu->dev); +static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, + struct perf_event_context *ctx) +{ + /* + * De-schedule the event and mark it REVOKED. + */ + perf_event_exit_event(event, ctx, true); -free_idr: - if (pmu->type >= PERF_TYPE_MAX) - idr_remove(&pmu_idr, pmu->type); + /* + * All _free_event() bits that rely on event->pmu: + * + * Notably, perf_mmap() relies on the ordering here. + */ + scoped_guard (mutex, &event->mmap_mutex) { + WARN_ON_ONCE(pmu->event_unmapped); + /* + * Mostly an empty lock sequence, such that perf_mmap(), which + * relies on mmap_mutex, is sure to observe the state change. 
+ */ + } + + perf_event_free_bpf_prog(event); + perf_free_addr_filters(event); + + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } + + if (event->pmu_ctx) { + put_pmu_ctx(event->pmu_ctx); + event->pmu_ctx = NULL; + } + + exclusive_event_destroy(event); + module_put(pmu->module); -free_pdc: - free_percpu(pmu->pmu_disable_count); - goto unlock; + event->pmu = NULL; /* force fault instead of UAF */ } -EXPORT_SYMBOL_GPL(perf_pmu_register); -void perf_pmu_unregister(struct pmu *pmu) +static void pmu_detach_event(struct pmu *pmu, struct perf_event *event) { - int remove_device; + struct perf_event_context *ctx; - mutex_lock(&pmus_lock); - remove_device = pmu_bus_running; - list_del_rcu(&pmu->entry); - mutex_unlock(&pmus_lock); + ctx = perf_event_ctx_lock(event); + __pmu_detach_event(pmu, event, ctx); + perf_event_ctx_unlock(event, ctx); + + scoped_guard (spinlock, &pmu->events_lock) + list_del(&event->pmu_list); +} + +static struct perf_event *pmu_get_event(struct pmu *pmu) +{ + struct perf_event *event; + + guard(spinlock)(&pmu->events_lock); + list_for_each_entry(event, &pmu->events, pmu_list) { + if (atomic_long_inc_not_zero(&event->refcount)) + return event; + } + + return NULL; +} + +static bool pmu_empty(struct pmu *pmu) +{ + guard(spinlock)(&pmu->events_lock); + return list_empty(&pmu->events); +} + +static void pmu_detach_events(struct pmu *pmu) +{ + struct perf_event *event; + + for (;;) { + event = pmu_get_event(pmu); + if (!event) + break; + + pmu_detach_event(pmu, event); + put_event(event); + } + + /* + * wait for pending _free_event()s + */ + wait_var_event(pmu, pmu_empty(pmu)); +} + +int perf_pmu_unregister(struct pmu *pmu) +{ + scoped_guard (mutex, &pmus_lock) { + if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL)) + return -EINVAL; + + list_del_rcu(&pmu->entry); + } /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. 
+ * + * Notably, the entirety of event creation, from perf_init_event() + * (which will now fail, because of the above) until + * perf_install_in_context() should be under SRCU such that + * this synchronizes against event creation. This avoids trying to + * detach events that are not fully formed. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); - free_percpu(pmu->pmu_disable_count); - if (pmu->type >= PERF_TYPE_MAX) - idr_remove(&pmu_idr, pmu->type); - if (remove_device) { - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); + if (pmu->event_unmapped && !pmu_empty(pmu)) { + /* + * Can't force remove events when pmu::event_unmapped() + * is used in perf_mmap_close(). + */ + guard(mutex)(&pmus_lock); + idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu); + list_add_rcu(&pmu->entry, &pmus); + return -EBUSY; } - free_pmu_context(pmu); + + scoped_guard (mutex, &pmus_lock) + idr_remove(&pmu_idr, pmu->type); + + /* + * PMU is removed from the pmus list, so no new events will + * be created, now take care of the existing ones. + */ + pmu_detach_events(pmu); + + /* + * PMU is unused, make it go away. + */ + perf_pmu_free(pmu); + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_unregister); +static inline bool has_extended_regs(struct perf_event *event) +{ + return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || + (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); +} + static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; @@ -9161,7 +12666,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (!try_module_get(pmu->module)) return -ENODEV; - if (event->group_leader != event) { + /* + * A number of pmu->event_init() methods iterate the sibling_list to, + * for example, validate if the group fits on the PMU. Therefore, + * if this is a sibling event, acquire the ctx->mutex to protect + * the sibling_list. 
+ */ + if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. @@ -9178,52 +12689,120 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) perf_event_ctx_unlock(event->group_leader, ctx); if (ret) - module_put(pmu->module); + goto err_pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) { + ret = -EOPNOTSUPP; + goto err_destroy; + } + + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && + event_has_any_exclude_flag(event)) { + ret = -EINVAL; + goto err_destroy; + } + + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask; + struct cpumask *pmu_cpumask; + int cpu; + + cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + pmu_cpumask = perf_scope_cpumask(pmu->scope); + + ret = -ENODEV; + if (!pmu_cpumask || !cpumask) + goto err_destroy; + + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + goto err_destroy; + + event->event_caps |= PERF_EV_CAP_READ_SCOPE; + } + + return 0; + +err_destroy: + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } +err_pmu: + event->pmu = NULL; + module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { + bool extended_type = false; struct pmu *pmu; - int idx; - int ret; + int type, ret; + + guard(srcu)(&pmus_srcu); /* pmu idr/list access */ - idx = srcu_read_lock(&pmus_srcu); + /* + * Save original type before calling pmu->event_init() since certain + * pmus overwrites event->attr.type to forward event to another pmu. 
+ */ + event->orig_type = event->attr.type; /* Try parent's PMU first: */ if (event->parent && event->parent->pmu) { pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; } - rcu_read_lock(); - pmu = idr_find(&pmu_idr, event->attr.type); - rcu_read_unlock(); + /* + * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * are often aliases for PERF_TYPE_RAW. + */ + type = event->attr.type; + if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { + type = event->attr.config >> PERF_PMU_TYPE_SHIFT; + if (!type) { + type = PERF_TYPE_RAW; + } else { + extended_type = true; + event->attr.config &= PERF_HW_EVENT_MASK; + } + } + +again: + scoped_guard (rcu) + pmu = idr_find(&pmu_idr, type); if (pmu) { + if (event->attr.type != type && type != PERF_TYPE_RAW && + !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) + return ERR_PTR(-ENOENT); + ret = perf_try_init_event(pmu, event); + if (ret == -ENOENT && event->attr.type != type && !extended_type) { + type = event->attr.type; + goto again; + } + if (ret) - pmu = ERR_PTR(ret); - goto unlock; + return ERR_PTR(ret); + + return pmu; } - list_for_each_entry_rcu(pmu, &pmus, entry) { + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; - if (ret != -ENOENT) { - pmu = ERR_PTR(ret); - goto unlock; - } + if (ret != -ENOENT) + return ERR_PTR(ret); } - pmu = ERR_PTR(-ENOENT); -unlock: - srcu_read_unlock(&pmus_srcu, idx); - return pmu; + return ERR_PTR(-ENOENT); } static void attach_sb_event(struct perf_event *event) @@ -9248,15 +12827,6 @@ static void account_pmu_sb_event(struct perf_event *event) attach_sb_event(event); } -static void account_event_cpu(struct perf_event *event, int cpu) -{ - if (event->parent) - return; - - if (is_cgroup_event(event)) - atomic_inc(&per_cpu(perf_cgroup_events, cpu)); -} - /* Freq events need the tick to stay alive (see perf_event_task_tick). 
*/ static void account_freq_event_nohz(void) { @@ -9285,14 +12855,18 @@ static void account_event(struct perf_event *event) if (event->parent) return; - if (event->attach_state & PERF_ATTACH_TASK) + if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB)) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); + if (event->attr.build_id) + atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) atomic_inc(&nr_namespaces_events); + if (event->attr.cgroup) + atomic_inc(&nr_cgroup_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) @@ -9305,8 +12879,19 @@ static void account_event(struct perf_event *event) inc = true; if (is_cgroup_event(event)) inc = true; + if (event->attr.ksymbol) + atomic_inc(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_inc(&nr_bpf_events); + if (event->attr.text_poke) + atomic_inc(&nr_text_poke_events); if (inc) { + /* + * We need the mutex here because static_branch_enable() + * must complete *before* the perf_sched_count increment + * becomes visible. + */ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled; @@ -9318,7 +12903,7 @@ static void account_event(struct perf_event *event) * call the perf scheduling hooks before proceeding to * install events that need them. 
*/ - synchronize_sched(); + synchronize_rcu(); } /* * Now that we have waited for the sync_sched(), allow further @@ -9329,13 +12914,11 @@ static void account_event(struct perf_event *event) } enabled: - account_event_cpu(event, event->cpu); - account_pmu_sb_event(event); } /* - * Allocate and initialize a event structure + * Allocate and initialize an event structure */ static struct perf_event * perf_event_alloc(struct perf_event_attr *attr, int cpu, @@ -9346,16 +12929,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, void *context, int cgroup_fd) { struct pmu *pmu; - struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; + int node; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) return ERR_PTR(-EINVAL); } + if (attr->sigtrap && !task) { + /* Requires a task: avoid signalling random tasks. */ + return ERR_PTR(-EINVAL); + } - event = kzalloc(sizeof(*event), GFP_KERNEL); + node = (cpu >= 0) ? cpu_to_node(cpu) : -1; + struct perf_event *event __free(__free_event) = + kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); @@ -9369,17 +12958,21 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->child_mutex); INIT_LIST_HEAD(&event->child_list); - INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + INIT_LIST_HEAD(&event->active_list); + init_event_group(event); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); + INIT_LIST_HEAD(&event->pmu_list); init_waitqueue_head(&event->waitq); - init_irq_work(&event->pending, perf_pending_event); + init_irq_work(&event->pending_irq, perf_pending_irq); + event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); + init_task_work(&event->pending_task, perf_pending_task); mutex_init(&event->mmap_mutex); 
raw_spin_lock_init(&event->addr_filters.lock); @@ -9398,6 +12991,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->state = PERF_EVENT_STATE_INACTIVE; + if (parent_event) + event->event_caps = parent_event->event_caps; + if (task) { event->attach_state = PERF_ATTACH_TASK; /* @@ -9405,7 +13001,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * and we cannot use the ctx information because we need the * pmu before we get a ctx. */ - event->hw.target = task; + event->hw.target = get_task_struct(task); } event->clock = &local_clock; @@ -9416,16 +13012,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) - if (overflow_handler == bpf_overflow_handler) { - struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + if (parent_event->prog) { + struct bpf_prog *prog = parent_event->prog; - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto err_ns; - } + bpf_prog_inc(prog); event->prog = prog; - event->orig_overflow_handler = - parent_event->orig_overflow_handler; } #endif } @@ -9447,45 +13038,91 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, hwc = &event->hw; hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) + if (is_event_in_freq_mode(event)) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); /* - * We currently do not support PERF_SAMPLE_READ on inherited events. + * We do not support PERF_SAMPLE_READ on inherited events unless + * PERF_SAMPLE_TID is also selected, which allows inherited events to + * collect per-thread samples. * See perf_output_read(). 
*/ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) - goto err_ns; + if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) + return ERR_PTR(-EINVAL); if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; - if (cgroup_fd != -1) { - err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); + pmu = perf_init_event(event); + if (IS_ERR(pmu)) + return (void*)pmu; + + /* + * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config(). + * The attach should be right after the perf_init_event(). + * Otherwise, the __free_event() would mistakenly detach the non-exist + * perf_ctx_data because of the other errors between them. + */ + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + err = attach_perf_ctx_data(event); if (err) - goto err_ns; + return ERR_PTR(err); } - pmu = perf_init_event(event); - if (IS_ERR(pmu)) { - err = PTR_ERR(pmu); - goto err_ns; + /* + * Disallow uncore-task events. Similarly, disallow uncore-cgroup + * events (they don't make sense as the cgroup will be different + * on other CPUs in the uncore mask). 
+ */ + if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) + return ERR_PTR(-EINVAL); + + if (event->attr.aux_output && + (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || + event->attr.aux_pause || event->attr.aux_resume)) + return ERR_PTR(-EOPNOTSUPP); + + if (event->attr.aux_pause && event->attr.aux_resume) + return ERR_PTR(-EINVAL); + + if (event->attr.aux_start_paused) { + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return ERR_PTR(-EOPNOTSUPP); + event->hw.aux_paused = 1; + } + + if (cgroup_fd != -1) { + err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); + if (err) + return ERR_PTR(err); } err = exclusive_event_init(event); if (err) - goto err_pmu; + return ERR_PTR(err); if (has_addr_filter(event)) { - event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, - sizeof(unsigned long), - GFP_KERNEL); - if (!event->addr_filters_offs) { - err = -ENOMEM; - goto err_per_task; + event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, + sizeof(struct perf_addr_filter_range), + GFP_KERNEL); + if (!event->addr_filter_ranges) + return ERR_PTR(-ENOMEM); + + /* + * Clone the parent's vma offsets: they are valid until exec() + * even if the mm is not shared with the parent. 
+ */ + if (event->parent) { + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + + raw_spin_lock_irq(&ifh->lock); + memcpy(event->addr_filter_ranges, + event->parent->addr_filter_ranges, + pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); + raw_spin_unlock_irq(&ifh->lock); } /* force hw sync on the address filters */ @@ -9496,33 +13133,26 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) - goto err_addr_filters; + return ERR_PTR(err); + event->attach_state |= PERF_ATTACH_CALLCHAIN; } } + err = security_perf_event_alloc(event); + if (err) + return ERR_PTR(err); + /* symmetric to unaccount_event() in _free_event() */ account_event(event); - return event; - -err_addr_filters: - kfree(event->addr_filters_offs); - -err_per_task: - exclusive_event_destroy(event); - -err_pmu: - if (event->destroy) - event->destroy(event); - module_put(pmu->module); -err_ns: - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - if (event->ns) - put_pid_ns(event->ns); - kfree(event); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + lockdep_assert_held(&pmus_srcu); + scoped_guard (spinlock, &pmu->events_lock) + list_add(&event->pmu_list, &pmu->events); - return ERR_PTR(err); + return_ptr(event); } static int perf_copy_attr(struct perf_event_attr __user *uattr, @@ -9531,56 +13161,29 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, u32 size; int ret; - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ + /* Zero the full structure, so that a short copy will be nice. 
*/ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ + /* ABI compatibility quirk: */ + if (!size) size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) + if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; + attr->size = size; - if (attr->__reserved_1) + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) @@ -9618,9 +13221,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->branch_sample_type = mask; } /* privileged levels capture (kernel, hv): check permissions */ - if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) - && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { + ret = perf_allow_kernel(); + if (ret) + return ret; + } } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { @@ -9639,13 +13244,34 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, * __u16 sample size limit. 
*/ if (attr->sample_stack_user >= USHRT_MAX) - ret = -EINVAL; + return -EINVAL; else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) - ret = -EINVAL; + return -EINVAL; } + if (!attr->sample_max_stack) + attr->sample_max_stack = sysctl_perf_event_max_stack; + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); + +#ifndef CONFIG_CGROUP_PERF + if (attr->sample_type & PERF_SAMPLE_CGROUP) + return -EINVAL; +#endif + if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && + (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) + return -EINVAL; + + if (!attr->inherit && attr->inherit_thread) + return -EINVAL; + + if (attr->remove_on_exec && attr->enable_on_exec) + return -EINVAL; + + if (attr->sigtrap && !attr->remove_on_exec) + return -EINVAL; + out: return ret; @@ -9655,14 +13281,25 @@ err_size: goto out; } +static void mutex_lock_double(struct mutex *a, struct mutex *b) +{ + if (b < a) + swap(a, b); + + mutex_lock(a); + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); +} + static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct ring_buffer *rb = NULL; + struct perf_buffer *rb = NULL; int ret = -EINVAL; - if (!output_event) + if (!output_event) { + mutex_lock(&event->mmap_mutex); goto set; + } /* don't allow circular references */ if (event == output_event) @@ -9677,7 +13314,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) /* * If its not a per-cpu rb, it must be the same task. */ - if (output_event->cpu == -1 && output_event->ctx != event->ctx) + if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) goto out; /* @@ -9700,17 +13337,33 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) event->pmu != output_event->pmu) goto out; + /* + * Hold both mmap_mutex to serialize against perf_mmap_close(). 
Since + * output_event is already on rb->event_list, and the list iteration + * restarts after every removal, it is guaranteed this new event is + * observed *OR* if output_event is already removed, it's guaranteed we + * observe !rb->mmap_count. + */ + mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: - mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) + if (refcount_read(&event->mmap_count)) goto unlock; if (output_event) { + if (output_event->state <= PERF_EVENT_STATE_REVOKED) + goto unlock; + /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) goto unlock; + + /* did we race against perf_mmap_close() */ + if (!refcount_read(&rb->mmap_count)) { + ring_buffer_put(rb); + goto unlock; + } } ring_buffer_attach(event, rb); @@ -9718,20 +13371,13 @@ set: ret = 0; unlock: mutex_unlock(&event->mmap_mutex); + if (output_event) + mutex_unlock(&output_event->mmap_mutex); out: return ret; } -static void mutex_lock_double(struct mutex *a, struct mutex *b) -{ - if (b < a) - swap(a, b); - - mutex_lock(a); - mutex_lock_nested(b, SINGLE_DEPTH_NESTING); -} - static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) { bool nmi_safe = false; @@ -9752,11 +13398,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) break; case CLOCK_BOOTTIME: - event->clock = &ktime_get_boot_ns; + event->clock = &ktime_get_boottime_ns; break; case CLOCK_TAI: - event->clock = &ktime_get_tai_ns; + event->clock = &ktime_get_clocktai_ns; break; default: @@ -9769,35 +13415,35 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) return 0; } -/* - * Variation on perf_event_ctx_lock_nested(), except we take two context - * mutexes. 
- */ -static struct perf_event_context * -__perf_event_ctx_lock_double(struct perf_event *group_leader, - struct perf_event_context *ctx) +static bool +perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) { - struct perf_event_context *gctx; + unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS; + bool is_capable = perfmon_capable(); -again: - rcu_read_lock(); - gctx = READ_ONCE(group_leader->ctx); - if (!atomic_inc_not_zero(&gctx->refcount)) { + if (attr->sigtrap) { + /* + * perf_event_attr::sigtrap sends signals to the other task. + * Require the current task to also have CAP_KILL. + */ + rcu_read_lock(); + is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL); rcu_read_unlock(); - goto again; - } - rcu_read_unlock(); - mutex_lock_double(&gctx->mutex, &ctx->mutex); - - if (group_leader->ctx != gctx) { - mutex_unlock(&ctx->mutex); - mutex_unlock(&gctx->mutex); - put_ctx(gctx); - goto again; + /* + * If the required capabilities aren't available, checks for + * ptrace permissions: upgrade to ATTACH, since sending signals + * can effectively change the target task. + */ + ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS; } - return gctx; + /* + * Preserve ptrace permission check for backwards compatibility. The + * ptrace check also includes checks that the current task and other + * task have matching uids, and is therefore not done here explicitly. 
+ */ + return is_capable || ptrace_may_access(task, ptrace_mode); } /** @@ -9807,17 +13453,18 @@ again: * @pid: target pid * @cpu: target cpu * @group_fd: group leader event fd + * @flags: perf event open flags */ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event_pmu_context *pmu_ctx; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx, *uninitialized_var(gctx); + struct perf_event_context *ctx; struct file *event_file = NULL; - struct fd group = {NULL, 0}; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; @@ -9834,13 +13481,19 @@ SYSCALL_DEFINE5(perf_event_open, if (err) return err; + /* Do we allow access to perf_event_open(2) ? */ + err = security_perf_event_open(PERF_SECURITY_OPEN); + if (err) + return err; + if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + err = perf_allow_kernel(); + if (err) + return err; } if (attr.namespaces) { - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; } @@ -9852,8 +13505,19 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } - if (!attr.sample_max_stack) - attr.sample_max_stack = sysctl_perf_event_max_stack; + /* Only privileged users can get physical addresses */ + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { + err = perf_allow_kernel(); + if (err) + return err; + } + + /* REGS_INTR can leak data, lockdown must prevent this */ + if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { + err = security_locked_down(LOCKDOWN_PERF); + if (err) + return err; + } /* * In cgroup mode, the pid argument is used to pass the fd @@ -9871,11 +13535,22 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). 
+ */ + guard(srcu)(&pmus_srcu); + + CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { - err = perf_fget_light(group_fd, &group); - if (err) + if (!is_perf_file(group)) { + err = -EBADF; goto err_fd; - group_leader = group.file->private_data; + } + group_leader = fd_file(group)->private_data; + if (group_leader->state <= PERF_EVENT_STATE_REVOKED) { + err = -ENODEV; + goto err_fd; + } if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -9886,7 +13561,7 @@ SYSCALL_DEFINE5(perf_event_open, task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); - goto err_group_fd; + goto err_fd; } } @@ -9896,24 +13571,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } - if (task) { - err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); - if (err) - goto err_task; - - /* - * Reuse ptrace permission checks for now. - * - * We must hold cred_guard_mutex across this and any potential - * perf_install_in_context() call for this new event to - * serialize against exec() altering our credentials (and the - * perf_event_exit_task() that could imply). - */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) - goto err_cred; - } - if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; @@ -9921,7 +13578,7 @@ SYSCALL_DEFINE5(perf_event_open, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_cred; + goto err_task; } if (is_sampling_event(event)) { @@ -9946,46 +13603,53 @@ SYSCALL_DEFINE5(perf_event_open, if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; - if (group_leader && - (is_software_event(event) != is_software_event(group_leader))) { - if (is_software_event(event)) { - /* - * If event and group_leader are not both a software - * event, and event is, then group leader is not. 
- * - * Allow the addition of software events to !software - * groups, this is safe because software events never - * fail to schedule. - */ - pmu = group_leader->pmu; - } else if (is_software_event(group_leader) && - (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { - /* - * In case the group is a pure software group, and we - * try to add a hardware event, move the whole group to - * the hardware context. - */ - move_group = 1; - } + if (task) { + err = down_read_interruptible(&task->signal->exec_update_lock); + if (err) + goto err_alloc; + + /* + * We must hold exec_update_lock across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). + */ + err = -EACCES; + if (!perf_check_permission(&attr, task)) + goto err_cred; } /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, event); + ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_alloc; + goto err_cred; } - if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { - err = -EBUSY; - goto err_context; + mutex_lock(&ctx->mutex); + + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). 
+ */ + struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); + + if (!cpuctx->online) { + err = -ENODEV; + goto err_locked; + } } - /* - * Look up the group leader (we will attach this event to it): - */ if (group_leader) { err = -EINVAL; @@ -9994,154 +13658,123 @@ SYSCALL_DEFINE5(perf_event_open, * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) - goto err_context; + goto err_locked; /* All events in a group should have the same clock */ if (group_leader->clock != event->clock) - goto err_context; + goto err_locked; /* - * Do not allow to attach to a group in a different - * task or CPU context: + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. */ - if (move_group) { - /* - * Make sure we're both on the same task, or both - * per-cpu events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + if (group_leader->cpu != event->cpu) + goto err_locked; - /* - * Make sure we're both events for the same CPU; - * grouping events for different CPUs is broken; since - * you can never concurrently schedule them anyhow. - */ - if (group_leader->cpu != event->cpu) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } + /* + * Make sure we're both on the same context; either task or cpu. 
+ */ + if (group_leader->ctx != ctx) + goto err_locked; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) - goto err_context; - } - - if (output_event) { - err = perf_event_set_output(event, output_event); - if (err) - goto err_context; - } - - event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, - f_flags); - if (IS_ERR(event_file)) { - err = PTR_ERR(event_file); - event_file = NULL; - goto err_context; - } - - if (move_group) { - gctx = __perf_event_ctx_lock_double(group_leader, ctx); - - if (gctx->task == TASK_TOMBSTONE) { - err = -ESRCH; goto err_locked; - } - /* - * Check if we raced against another sys_perf_event_open() call - * moving the software group underneath us. - */ - if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + if (is_software_event(event) && + !in_software_context(group_leader)) { /* - * If someone moved the group out from under us, check - * if this new event wound up on the same ctx, if so - * its the regular !move_group case, otherwise fail. + * If the event is a sw event, but the group_leader + * is on hw context. + * + * Allow the addition of software events to hw + * groups, this is safe because software events + * never fail to schedule. + * + * Note the comment that goes with struct + * perf_event_pmu_context. */ - if (gctx != ctx) { - err = -EINVAL; - goto err_locked; - } else { - perf_event_ctx_unlock(group_leader, gctx); - move_group = 0; + pmu = group_leader->pmu_ctx->pmu; + } else if (!is_software_event(event)) { + if (is_software_event(group_leader) && + (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. 
+ */ + move_group = 1; } + + /* Don't allow group of multiple hw events from different pmus */ + if (!in_software_context(group_leader) && + group_leader->pmu_ctx->pmu != pmu) + goto err_locked; } - } else { - mutex_lock(&ctx->mutex); } - if (ctx->task == TASK_TOMBSTONE) { - err = -ESRCH; + /* + * Now that we're certain of the pmu; find the pmu_ctx. + */ + pmu_ctx = find_get_pmu_context(pmu, ctx, event); + if (IS_ERR(pmu_ctx)) { + err = PTR_ERR(pmu_ctx); goto err_locked; } + event->pmu_ctx = pmu_ctx; + + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_context; + } if (!perf_event_validate_size(event)) { err = -E2BIG; - goto err_locked; + goto err_context; } - if (!task) { - /* - * Check if the @cpu we're creating an event for is online. - * - * We use the perf_cpu_context::ctx::mutex to serialize against - * the hotplug notifiers. See perf_event_{init,exit}_cpu(). - */ - struct perf_cpu_context *cpuctx = - container_of(ctx, struct perf_cpu_context, ctx); - - if (!cpuctx->online) { - err = -ENODEV; - goto err_locked; - } + if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { + err = -EINVAL; + goto err_context; } - /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. */ if (!exclusive_event_installable(event, ctx)) { - /* exclusive and group stuff are assumed mutually exclusive */ - WARN_ON_ONCE(move_group); - err = -EBUSY; - goto err_locked; + goto err_context; } WARN_ON_ONCE(ctx->parent_ctx); + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); + if (IS_ERR(event_file)) { + err = PTR_ERR(event_file); + event_file = NULL; + goto err_context; + } + /* * This is the point on no return; we cannot fail hereafter. This is * where we start modifying current state. */ if (move_group) { - /* - * See perf_event_ctx_lock() for comments on the details - * of swizzling perf_event::ctx. 
- */ perf_remove_from_context(group_leader, 0); - put_ctx(gctx); + put_pmu_ctx(group_leader->pmu_ctx); - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { + for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); - put_ctx(gctx); + put_pmu_ctx(sibling->pmu_ctx); } /* - * Wait for everybody to stop referencing the events through - * the old lists, before installing it on new lists. - */ - synchronize_rcu(); - - /* * Install the group siblings before the group leader. * * Because a group leader will try and install the entire group @@ -10151,11 +13784,11 @@ SYSCALL_DEFINE5(perf_event_open, * By installing siblings first we NO-OP because they're not * reachable through the group lists. */ - list_for_each_entry(sibling, &group_leader->sibling_list, - group_entry) { + for_each_sibling_event(sibling, group_leader) { + sibling->pmu_ctx = pmu_ctx; + get_pmu_ctx(pmu_ctx); perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); - get_ctx(ctx); } /* @@ -10163,9 +13796,10 @@ SYSCALL_DEFINE5(perf_event_open, * event. What we want here is event in the initial * startup state, ready to be add into new context. */ + group_leader->pmu_ctx = pmu_ctx; + get_pmu_ctx(pmu_ctx); perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); - get_ctx(ctx); } /* @@ -10182,12 +13816,10 @@ SYSCALL_DEFINE5(perf_event_open, perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); - if (move_group) - perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); if (task) { - mutex_unlock(&task->signal->cred_guard_mutex); + up_read(&task->signal->exec_update_lock); put_task_struct(task); } @@ -10196,39 +13828,29 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&current->perf_event_mutex); /* - * Drop the reference on the group_event after placing the - * new event on the sibling_list. 
This ensures destruction - * of the group leader will find the pointer to itself in - * perf_group_detach(). + * File reference in group guarantees that group_leader has been + * kept alive until we place the new event on the sibling_list. + * This ensures destruction of the group leader will find + * the pointer to itself in perf_group_detach(). */ - fdput(group); fd_install(event_fd, event_file); return event_fd; +err_context: + put_pmu_ctx(event->pmu_ctx); + event->pmu_ctx = NULL; /* _free_event() */ err_locked: - if (move_group) - perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); -/* err_file: */ - fput(event_file); -err_context: perf_unpin_context(ctx); put_ctx(ctx); -err_alloc: - /* - * If event_file is set, the fput() above will have called ->release() - * and that will take care of freeing the event. - */ - if (!event_file) - free_event(event); err_cred: if (task) - mutex_unlock(&task->signal->cred_guard_mutex); + up_read(&task->signal->exec_update_lock); +err_alloc: + put_event(event); err_task: if (task) put_task_struct(task); -err_group_fd: - fdput(group); err_fd: put_unused_fd(event_fd); return err; @@ -10240,6 +13862,8 @@ err_fd: * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound * @task: task to profile (NULL for percpu) + * @overflow_handler: callback to trigger when we hit the event + * @context: context data could be used in overflow_handler callback */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, @@ -10247,13 +13871,23 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, perf_overflow_handler_t overflow_handler, void *context) { + struct perf_event_pmu_context *pmu_ctx; struct perf_event_context *ctx; struct perf_event *event; + struct pmu *pmu; int err; /* - * Get the target context (task or percpu): + * Grouping is not supported for kernel events, neither is 'AUX', + * make sure the caller's intentions are adjusted. 
+ */ + if (attr->aux_output || attr->aux_action) + return ERR_PTR(-EINVAL); + + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). */ + guard(srcu)(&pmus_srcu); event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); @@ -10264,11 +13898,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; + pmu = event->pmu; - ctx = find_get_context(event->pmu, task, event); + if (pmu->task_ctx_nr == perf_sw_context) + event->event_caps |= PERF_EV_CAP_SOFTWARE; + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_free; + goto err_alloc; } WARN_ON_ONCE(ctx->parent_ctx); @@ -10278,6 +13919,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } + pmu_ctx = find_get_pmu_context(pmu, ctx, event); + if (IS_ERR(pmu_ctx)) { + err = PTR_ERR(pmu_ctx); + goto err_unlock; + } + event->pmu_ctx = pmu_ctx; + if (!task) { /* * Check if the @cpu we're creating an event for is online. 
@@ -10289,59 +13937,82 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; - goto err_unlock; + goto err_pmu_ctx; } } if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; - goto err_unlock; + goto err_pmu_ctx; } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; +err_pmu_ctx: + put_pmu_ctx(pmu_ctx); + event->pmu_ctx = NULL; /* _free_event() */ err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); -err_free: - free_event(event); +err_alloc: + put_event(event); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); -void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +static void __perf_pmu_remove(struct perf_event_context *ctx, + int cpu, struct pmu *pmu, + struct perf_event_groups *groups, + struct list_head *events) { - struct perf_event_context *src_ctx; - struct perf_event_context *dst_ctx; - struct perf_event *event, *tmp; - LIST_HEAD(events); - - src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; - dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + struct perf_event *event, *sibling; - /* - * See perf_event_ctx_lock() for comments on the details - * of swizzling perf_event::ctx. 
- */ - mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); - list_for_each_entry_safe(event, tmp, &src_ctx->event_list, - event_entry) { + perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); - unaccount_event_cpu(event, src_cpu); - put_ctx(src_ctx); - list_add(&event->migrate_entry, &events); + put_pmu_ctx(event->pmu_ctx); + list_add(&event->migrate_entry, events); + + for_each_sibling_event(sibling, event) { + perf_remove_from_context(sibling, 0); + put_pmu_ctx(sibling->pmu_ctx); + list_add(&sibling->migrate_entry, events); + } } +} + +static void __perf_pmu_install_event(struct pmu *pmu, + struct perf_event_context *ctx, + int cpu, struct perf_event *event) +{ + struct perf_event_pmu_context *epc; + struct perf_event_context *old_ctx = event->ctx; + + get_ctx(ctx); /* normally find_get_context() */ + + event->cpu = cpu; + epc = find_get_pmu_context(pmu, ctx, event); + event->pmu_ctx = epc; + + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + perf_install_in_context(ctx, event, cpu); /* - * Wait for the events to quiesce before re-instating them. + * Now that event->ctx is updated and visible, put the old ctx. */ - synchronize_rcu(); + put_ctx(old_ctx); +} + +static void __perf_pmu_install(struct perf_event_context *ctx, + int cpu, struct pmu *pmu, struct list_head *events) +{ + struct perf_event *event, *tmp; /* * Re-instate events in 2 passes. @@ -10351,45 +14022,72 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) * leader will enable its siblings, even if those are still on the old * context. 
*/ - list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_for_each_entry_safe(event, tmp, events, migrate_entry) { if (event->group_leader == event) continue; list_del(&event->migrate_entry); - if (event->state >= PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, dst_cpu); - perf_install_in_context(dst_ctx, event, dst_cpu); - get_ctx(dst_ctx); + __perf_pmu_install_event(pmu, ctx, cpu, event); } /* * Once all the siblings are setup properly, install the group leaders * to make it go. */ - list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_for_each_entry_safe(event, tmp, events, migrate_entry) { list_del(&event->migrate_entry); - if (event->state >= PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, dst_cpu); - perf_install_in_context(dst_ctx, event, dst_cpu); - get_ctx(dst_ctx); + __perf_pmu_install_event(pmu, ctx, cpu, event); } +} + +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx, *dst_ctx; + LIST_HEAD(events); + + /* + * Since per-cpu context is persistent, no need to grab an extra + * reference. + */ + src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; + + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); + + __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events); + __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events); + + if (!list_empty(&events)) { + /* + * Wait for the events to quiesce before re-instating them. 
+ */ + synchronize_rcu(); + + __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events); + } + mutex_unlock(&dst_ctx->mutex); mutex_unlock(&src_ctx->mutex); } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); -static void sync_child_event(struct perf_event *child_event, - struct task_struct *child) +static void sync_child_event(struct perf_event *child_event) { struct perf_event *parent_event = child_event->parent; u64 child_val; - if (child_event->attr.inherit_stat) - perf_event_read_event(child_event, child); + if (child_event->attr.inherit_stat) { + struct task_struct *task = child_event->ctx->task; - child_val = perf_event_count(child_event); + if (task && task != TASK_TOMBSTONE) + perf_event_read_event(child_event, task); + } + + child_val = perf_event_count(child_event, false); /* * Add back the child's count to the parent's count: @@ -10402,71 +14100,74 @@ static void sync_child_event(struct perf_event *child_event, } static void -perf_event_exit_event(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) +perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx, bool revoke) { - struct perf_event *parent_event = child_event->parent; + struct perf_event *parent_event = event->parent; + unsigned long detach_flags = DETACH_EXIT; + unsigned int attach_state; - /* - * Do not destroy the 'original' grouping; because of the context - * switch optimization the original events could've ended up in a - * random child task. - * - * If we were to destroy the original group, all group related - * operations would cease to function properly after this random - * child dies. - * - * Do destroy all inherited groups, we don't care about those - * and being thorough is better. 
- */ - raw_spin_lock_irq(&child_ctx->lock); - WARN_ON_ONCE(child_ctx->is_active); + if (parent_event) { + /* + * Do not destroy the 'original' grouping; because of the + * context switch optimization the original events could've + * ended up in a random child task. + * + * If we were to destroy the original group, all group related + * operations would cease to function properly after this + * random child dies. + * + * Do destroy all inherited groups, we don't care about those + * and being thorough is better. + */ + detach_flags |= DETACH_GROUP | DETACH_CHILD; + mutex_lock(&parent_event->child_mutex); + /* PERF_ATTACH_ITRACE might be set concurrently */ + attach_state = READ_ONCE(event->attach_state); + } - if (parent_event) - perf_group_detach(child_event); - list_del_event(child_event, child_ctx); - child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ - raw_spin_unlock_irq(&child_ctx->lock); + if (revoke) + detach_flags |= DETACH_GROUP | DETACH_REVOKE; + perf_remove_from_context(event, detach_flags); /* - * Parent events are governed by their filedesc, retain them. - */ - if (!parent_event) { - perf_event_wakeup(child_event); - return; - } - /* - * Child events can be cleaned up. + * Child events can be freed. */ + if (parent_event) { + mutex_unlock(&parent_event->child_mutex); - sync_child_event(child_event, child); + /* + * Match the refcount initialization. Make sure it doesn't happen + * twice if pmu_detach_event() calls it on an already exited task. + */ + if (attach_state & PERF_ATTACH_CHILD) { + /* + * Kick perf_poll() for is_event_hup(); + */ + perf_event_wakeup(parent_event); + /* + * pmu_detach_event() will have an extra refcount. + * perf_pending_task() might have one too. 
+ */ + put_event(event); + } - /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); + return; + } /* - * Kick perf_poll() for is_event_hup(). + * Parent events are governed by their filedesc, retain them. */ - perf_event_wakeup(parent_event); - free_event(child_event); - put_event(parent_event); + perf_event_wakeup(event); } -static void perf_event_exit_task_context(struct task_struct *child, int ctxn) +static void perf_event_exit_task_context(struct task_struct *task, bool exit) { - struct perf_event_context *child_ctx, *clone_ctx = NULL; + struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; - WARN_ON_ONCE(child != current); - - child_ctx = perf_pin_task_context(child, ctxn); - if (!child_ctx) + ctx = perf_pin_task_context(task); + if (!ctx) return; /* @@ -10479,27 +14180,28 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). */ - mutex_lock(&child_ctx->mutex); + mutex_lock(&ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ - raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL); + raw_spin_lock_irq(&ctx->lock); + if (exit) + task_ctx_sched_out(ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. 
*/ - RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); - put_ctx(child_ctx); /* cannot be last */ - WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); - put_task_struct(current); /* cannot be last */ + RCU_INIT_POINTER(task->perf_event_ctxp, NULL); + put_ctx(ctx); /* cannot be last */ + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ - clone_ctx = unclone_ctx(child_ctx); - raw_spin_unlock_irq(&child_ctx->lock); + clone_ctx = unclone_ctx(ctx); + raw_spin_unlock_irq(&ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -10509,29 +14211,48 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events. */ - perf_event_task(child, child_ctx, 0); + if (exit) + perf_event_task(task, ctx, 0); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - perf_event_exit_event(child_event, child_ctx, child); + list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry) + perf_event_exit_event(child_event, ctx, false); - mutex_unlock(&child_ctx->mutex); + mutex_unlock(&ctx->mutex); - put_ctx(child_ctx); + if (!exit) { + /* + * perf_event_release_kernel() could still have a reference on + * this context. In that case we must wait for these events to + * have been freed (in particular all their references to this + * task must've been dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, + refcount_read(&ctx->refcount) == 1); + } + put_ctx(ctx); } /* - * When a child task exits, feed back event values to parent events. + * When a task exits, feed back event values to parent events. 
* - * Can be called with cred_guard_mutex held when called from - * install_exec_creds(). + * Can be called with exec_update_lock held when called from + * setup_new_exec(). */ -void perf_event_exit_task(struct task_struct *child) +void perf_event_exit_task(struct task_struct *task) { struct perf_event *event, *tmp; - int ctxn; - mutex_lock(&child->perf_event_mutex); - list_for_each_entry_safe(event, tmp, &child->perf_event_list, + WARN_ON_ONCE(task != current); + + mutex_lock(&task->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &task->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); @@ -10542,93 +14263,45 @@ void perf_event_exit_task(struct task_struct *child) */ smp_store_release(&event->owner, NULL); } - mutex_unlock(&child->perf_event_mutex); + mutex_unlock(&task->perf_event_mutex); - for_each_task_context_nr(ctxn) - perf_event_exit_task_context(child, ctxn); + perf_event_exit_task_context(task, true); /* * The perf_event_exit_task_context calls perf_event_task - * with child's task_ctx, which generates EXIT events for - * child contexts and sets child->perf_event_ctxp[] to NULL. + * with task's task_ctx, which generates EXIT events for + * task contexts and sets task->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts. */ - perf_event_task(child, NULL, 0); -} - -static void perf_free_event(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *parent = event->parent; - - if (WARN_ON_ONCE(!parent)) - return; + perf_event_task(task, NULL, 0); - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); - - put_event(parent); - - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); - free_event(event); + /* + * Detach the perf_ctx_data for the system-wide event. 
+ */ + guard(percpu_read)(&global_ctx_data_rwsem); + detach_task_ctx_data(task); } /* - * Free an unexposed, unused context as created by inheritance by - * perf_event_init_task below, used by fork() in case of fail. + * Free a context as created by inheritance by perf_event_init_task() below, + * used by fork() in case of fail. * - * Not all locks are strictly required, but take them anyway to be nice and - * help out with the lockdep assertions. + * Even though the task has never lived, the context and events have been + * exposed through the child_list, so we must take care tearing it all down. */ void perf_event_free_task(struct task_struct *task) { - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - mutex_lock(&ctx->mutex); - raw_spin_lock_irq(&ctx->lock); - /* - * Destroy the task <-> ctx relation and mark the context dead. - * - * This is important because even though the task hasn't been - * exposed yet the context has been (through child_list). 
- */ - RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL); - WRITE_ONCE(ctx->task, TASK_TOMBSTONE); - put_task_struct(task); /* cannot be last */ - raw_spin_unlock_irq(&ctx->lock); - - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) - perf_free_event(event, ctx); - - mutex_unlock(&ctx->mutex); - put_ctx(ctx); - } + perf_event_exit_task_context(task, false); } void perf_event_delayed_put(struct task_struct *task) { - int ctxn; - - for_each_task_context_nr(ctxn) - WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); + WARN_ON_ONCE(task->perf_event_ctxp); } struct file *perf_event_get(unsigned int fd) { - struct file *file; - - file = fget_raw(fd); + struct file *file = fget(fd); if (!file) return ERR_PTR(-EBADF); @@ -10640,6 +14313,14 @@ struct file *perf_event_get(unsigned int fd) return file; } +const struct perf_event *perf_get_event(struct file *file) +{ + if (file->f_op != &perf_fops) + return ERR_PTR(-EINVAL); + + return file->private_data; +} + const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { if (!event) @@ -10648,8 +14329,17 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) return &event->attr; } +int perf_allow_kernel(void) +{ + if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) + return -EACCES; + + return security_perf_event_open(PERF_SECURITY_KERNEL); +} +EXPORT_SYMBOL_GPL(perf_allow_kernel); + /* - * Inherit a event from parent task to child task. + * Inherit an event from parent task to child task. 
* * Returns: * - valid pointer on success @@ -10664,7 +14354,8 @@ inherit_event(struct perf_event *parent_event, struct perf_event *group_leader, struct perf_event_context *child_ctx) { - enum perf_event_active_state parent_state = parent_event->state; + enum perf_event_state parent_state = parent_event->state; + struct perf_event_pmu_context *pmu_ctx; struct perf_event *child_event; unsigned long flags; @@ -10677,6 +14368,14 @@ inherit_event(struct perf_event *parent_event, if (parent_event->parent) parent_event = parent_event->parent; + if (parent_event->state <= PERF_EVENT_STATE_REVOKED) + return NULL; + + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, @@ -10685,6 +14384,16 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + get_ctx(child_ctx); + child_event->ctx = child_ctx; + + pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); + if (IS_ERR(pmu_ctx)) { + free_event(child_event); + return ERR_CAST(pmu_ctx); + } + child_event->pmu_ctx = pmu_ctx; + /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) * must be under the same lock in order to serialize against @@ -10699,8 +14408,6 @@ inherit_event(struct perf_event *parent_event, return NULL; } - get_ctx(child_ctx); - /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. 
We hold the parent's mutex, @@ -10721,7 +14428,6 @@ inherit_event(struct perf_event *parent_event, local64_set(&hwc->period_left, sample_period); } - child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; @@ -10737,6 +14443,7 @@ inherit_event(struct perf_event *parent_event, */ raw_spin_lock_irqsave(&child_ctx->lock, flags); add_event_to_ctx(child_event, child_ctx); + child_event->attach_state |= PERF_ATTACH_CHILD; raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* @@ -10777,12 +14484,18 @@ static int inherit_group(struct perf_event *parent_event, * case inherit_event() will create individual events, similar to what * perf_group_detach() would do anyway. */ - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + for_each_sibling_event(sub, parent_event) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); + + if (sub->aux_event == parent_event && child_ctr && + !perf_get_aux_event(child_ctr, leader)) + return -EINVAL; } + if (leader) + leader->group_generation = parent_event->group_generation; return 0; } @@ -10800,18 +14513,21 @@ static int inherit_group(struct perf_event *parent_event, static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, - struct task_struct *child, int ctxn, - int *inherited_all) + struct task_struct *child, + u64 clone_flags, int *inherited_all) { - int ret; struct perf_event_context *child_ctx; + int ret; - if (!event->attr.inherit) { + if (!event->attr.inherit || + (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || + /* Do not inherit if sigtrap and signal handlers were cleared. 
*/ + (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) { *inherited_all = 0; return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -10819,16 +14535,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * First allocate and initialize a context for the * child. */ - child_ctx = alloc_perf_context(parent_ctx->pmu, child); + child_ctx = alloc_perf_context(child); if (!child_ctx) return -ENOMEM; - child->perf_event_ctxp[ctxn] = child_ctx; + child->perf_event_ctxp = child_ctx; } - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) *inherited_all = 0; @@ -10838,7 +14552,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, /* * Initialize the perf_event context in task_struct */ -static int perf_event_init_context(struct task_struct *child, int ctxn) +static int perf_event_init_context(struct task_struct *child, u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -10848,14 +14562,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) unsigned long flags; int ret = 0; - if (likely(!parent->perf_event_ctxp[ctxn])) + if (likely(!parent->perf_event_ctxp)) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. 
*/ - parent_ctx = perf_pin_task_context(parent, ctxn); + parent_ctx = perf_pin_task_context(parent); if (!parent_ctx) return 0; @@ -10876,9 +14590,9 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); + child, clone_flags, &inherited_all); if (ret) goto out_unlock; } @@ -10892,9 +14606,9 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); + child, clone_flags, &inherited_all); if (ret) goto out_unlock; } @@ -10902,7 +14616,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp; if (child_ctx && inherited_all) { /* @@ -10936,20 +14650,20 @@ out_unlock: /* * Initialize the perf_event context in task_struct */ -int perf_event_init_task(struct task_struct *child) +int perf_event_init_task(struct task_struct *child, u64 clone_flags) { - int ctxn, ret; + int ret; - memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + memset(child->perf_recursion, 0, sizeof(child->perf_recursion)); + child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); + child->perf_ctx_data = NULL; - for_each_task_context_nr(ctxn) { - ret = perf_event_init_context(child, ctxn); - if (ret) { - 
perf_event_free_task(child); - return ret; - } + ret = perf_event_init_context(child, clone_flags); + if (ret) { + perf_event_free_task(child); + return ret; } return 0; @@ -10958,26 +14672,37 @@ int perf_event_init_task(struct task_struct *child) static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; + struct perf_cpu_context *cpuctx; int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); -#ifdef CONFIG_CGROUP_PERF - INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); -#endif INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); + + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); + lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); + cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); + cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); + cpuctx->heap = cpuctx->heap_default; } } -void perf_swevent_init_cpu(unsigned int cpu) +static void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -10995,33 +14720,70 @@ void perf_swevent_init_cpu(unsigned int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx = __info; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct 
perf_event *event; raw_spin_lock(&ctx->lock); + ctx_sched_out(ctx, NULL, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } +static void perf_event_clear_cpumask(unsigned int cpu) +{ + int target[PERF_PMU_MAX_SCOPE]; + unsigned int scope; + struct pmu *pmu; + + cpumask_clear_cpu(cpu, perf_online_mask); + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); + + target[scope] = -1; + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) + continue; + target[scope] = cpumask_any_but(cpumask, cpu); + if (target[scope] < nr_cpu_ids) + cpumask_set_cpu(target[scope], pmu_cpumask); + } + + /* migrate */ + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->scope == PERF_PMU_SCOPE_NONE || + WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) + continue; + + if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) + perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); + } +} + static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; + // XXX simplify cpuctx->online mutex_lock(&pmus_lock); - list_for_each_entry(pmu, &pmus, entry) { - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; + /* + * Clear the cpumasks, and migrate to other CPUs if possible. + * Must be invoked before the __perf_event_exit_context. 
+ */ + perf_event_clear_cpumask(cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); - cpuctx->online = 0; - mutex_unlock(&ctx->mutex); - } - cpumask_clear_cpu(cpu, perf_online_mask); + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; + mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); } #else @@ -11030,24 +14792,57 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +static void perf_event_setup_cpumask(unsigned int cpu) +{ + struct cpumask *pmu_cpumask; + unsigned int scope; + + /* + * Early boot stage, the cpumask hasn't been set yet. + * The perf_online_<domain>_masks includes the first CPU of each domain. + * Always unconditionally set the boot CPU for the perf_online_<domain>_masks. + */ + if (cpumask_empty(perf_online_mask)) { + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + pmu_cpumask = perf_scope_cpumask(scope); + if (WARN_ON_ONCE(!pmu_cpumask)) + continue; + cpumask_set_cpu(cpu, pmu_cpumask); + } + goto end; + } + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + + pmu_cpumask = perf_scope_cpumask(scope); + + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_empty(cpumask) && + cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) + cpumask_set_cpu(cpu, pmu_cpumask); + } +end: + cpumask_set_cpu(cpu, perf_online_mask); +} + int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); - cpumask_set_cpu(cpu, perf_online_mask); - list_for_each_entry(pmu, &pmus, entry) { - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; + perf_event_setup_cpumask(cpu); + cpuctx = 
per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; - mutex_lock(&ctx->mutex); - cpuctx->online = 1; - mutex_unlock(&ctx->mutex); - } + mutex_lock(&ctx->mutex); + cpuctx->online = 1; + mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); return 0; @@ -11085,11 +14880,14 @@ void __init perf_event_init(void) idr_init(&pmu_idr); + unwind_deferred_init(&perf_unwind_work, + perf_unwind_deferred_callback); + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); - perf_pmu_register(&perf_cpu_clock, NULL, -1); - perf_pmu_register(&perf_task_clock, NULL, -1); + perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1); + perf_pmu_register(&perf_task_clock, "task_clock", -1); perf_tp_register(); perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); @@ -11097,6 +14895,8 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); + perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC); + /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. 
@@ -11130,7 +14930,7 @@ static int __init perf_event_sysfs_init(void) goto unlock; list_for_each_entry(pmu, &pmus, entry) { - if (!pmu->name || pmu->type < 0) + if (pmu->dev) continue; ret = pmu_dev_alloc(pmu); @@ -11173,12 +14973,20 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css) kfree(jc); } +static int perf_cgroup_css_online(struct cgroup_subsys_state *css) +{ + perf_event_cgroup(css->cgroup); + return 0; +} + static int __perf_cgroup_move(void *info) { struct task_struct *task = info; - rcu_read_lock(); - perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); - rcu_read_unlock(); + + preempt_disable(); + perf_cgroup_switch(task); + preempt_enable(); + return 0; } @@ -11194,6 +15002,7 @@ static void perf_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, + .css_online = perf_cgroup_css_online, .attach = perf_cgroup_attach, /* * Implicitly enable on dfl hierarchy so that perf events can @@ -11201,5 +15010,8 @@ struct cgroup_subsys perf_event_cgrp_subsys = { * controller is not mounted on a legacy hierarchy. */ .implicit_on_dfl = true, + .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ + +DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); |
