diff options
Diffstat (limited to 'kernel/trace/trace_event_perf.c')
| -rw-r--r-- | kernel/trace/trace_event_perf.c | 334 |
1 files changed, 262 insertions, 72 deletions
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 84b1e045faba..a6bb7577e8c5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -1,19 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace event based perf event profiling/tracing * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> */ #include <linux/module.h> #include <linux/kprobes.h> +#include <linux/security.h> #include "trace.h" +#include "trace_probe.h" static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses - * suprises + * surprises */ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) perf_trace_t; @@ -21,13 +24,53 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) /* Count the events in use (per event id, not per instance) */ static int total_ref_count; -static int perf_trace_event_perm(struct ftrace_event_call *tp_event, +static int perf_trace_event_perm(struct trace_event_call *tp_event, struct perf_event *p_event) { + int ret; + + if (tp_event->perf_perm) { + ret = tp_event->perf_perm(tp_event, p_event); + if (ret) + return ret; + } + + /* + * We checked and allowed to create parent, + * allow children without checking. + */ + if (p_event->parent) + return 0; + + /* + * It's ok to check current process (owner) permissions in here, + * because code below is called only via perf_event_open syscall. + */ + /* The ftrace function trace is allowed only for root. */ - if (ftrace_event_is_function(tp_event) && - perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EPERM; + if (ftrace_event_is_function(tp_event)) { + ret = perf_allow_tracepoint(); + if (ret) + return ret; + + if (!is_sampling_event(p_event)) + return 0; + + /* + * We don't allow user space callchains for function trace + * event, due to issues with page faults while tracing page + * fault handler and its overall trickiness nature. + */ + if (!p_event->attr.exclude_callchain_user) + return -EINVAL; + + /* + * Same reason to disable user stack dump as for user space + * callchains above. + */ + if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) + return -EINVAL; + } /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) @@ -43,13 +86,14 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, * ...otherwise raw tracepoint data can be a severe data leak, * only allow root to have these. */ - if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return -EPERM; + ret = perf_allow_tracepoint(); + if (ret) + return ret; return 0; } -static int perf_trace_event_reg(struct ftrace_event_call *tp_event, +static int perf_trace_event_reg(struct trace_event_call *tp_event, struct perf_event *p_event) { struct hlist_head __percpu *list; @@ -109,11 +153,11 @@ fail: static void perf_trace_event_unreg(struct perf_event *p_event) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; int i; if (--tp_event->perf_refcount > 0) - goto out; + return; tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); @@ -132,23 +176,21 @@ static void perf_trace_event_unreg(struct perf_event *p_event) perf_trace_buf[i] = NULL; } } -out: - module_put(tp_event->mod); } static int perf_trace_event_open(struct perf_event *p_event) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); } static void perf_trace_event_close(struct perf_event *p_event) { - struct ftrace_event_call *tp_event = p_event->tp_event; + struct trace_event_call *tp_event = p_event->tp_event; tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); } -static int perf_trace_event_init(struct ftrace_event_call *tp_event, +static int perf_trace_event_init(struct trace_event_call *tp_event, struct perf_event *p_event) { int ret; @@ -172,18 +214,18 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, int perf_trace_init(struct perf_event *p_event) { - struct ftrace_event_call *tp_event; - int event_id = p_event->attr.config; + struct trace_event_call *tp_event; + u64 event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && tp_event->class && tp_event->class->reg && - try_module_get(tp_event->mod)) { + trace_event_try_get_ref(tp_event)) { ret = perf_trace_event_init(tp_event, p_event); if (ret) - module_put(tp_event->mod); + trace_event_put_ref(tp_event); break; } } @@ -197,93 +239,248 @@ void perf_trace_destroy(struct perf_event *p_event) mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); mutex_unlock(&event_mutex); } -int perf_trace_add(struct perf_event *p_event, int flags) +#ifdef CONFIG_KPROBE_EVENTS +int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) { - struct ftrace_event_call *tp_event = p_event->tp_event; - struct hlist_head __percpu *pcpu_list; - struct hlist_head *list; + int ret; + char *func = NULL; + struct trace_event_call *tp_event; + + if (p_event->attr.kprobe_func) { + func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func), + KSYM_NAME_LEN); + if (IS_ERR(func)) { + ret = PTR_ERR(func); + return (ret == -EINVAL) ? -E2BIG : ret; + } + + if (func[0] == '\0') { + kfree(func); + func = NULL; + } + } + + tp_event = create_local_trace_kprobe( + func, (void *)(unsigned long)(p_event->attr.kprobe_addr), + p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } - pcpu_list = tp_event->perf_events; - if (WARN_ON_ONCE(!pcpu_list)) + mutex_lock(&event_mutex); + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_kprobe(tp_event); + mutex_unlock(&event_mutex); +out: + kfree(func); + return ret; +} + +void perf_kprobe_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); + mutex_unlock(&event_mutex); + + destroy_local_trace_kprobe(p_event->tp_event); +} +#endif /* CONFIG_KPROBE_EVENTS */ + +#ifdef CONFIG_UPROBE_EVENTS +int perf_uprobe_init(struct perf_event *p_event, + unsigned long ref_ctr_offset, bool is_retprobe) +{ + int ret; + char *path = NULL; + struct trace_event_call *tp_event; + + if (!p_event->attr.uprobe_path) return -EINVAL; + path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path), + PATH_MAX); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + return (ret == -EINVAL) ? -E2BIG : ret; + } + if (path[0] == '\0') { + ret = -EINVAL; + goto out; + } + + tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset, + ref_ctr_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + /* + * local trace_uprobe need to hold event_mutex to call + * uprobe_buffer_enable() and uprobe_buffer_disable(). + * event_mutex is not required for local trace_kprobes. + */ + mutex_lock(&event_mutex); + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_uprobe(tp_event); + mutex_unlock(&event_mutex); +out: + kfree(path); + return ret; +} + +void perf_uprobe_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); + mutex_unlock(&event_mutex); + destroy_local_trace_uprobe(p_event->tp_event); +} +#endif /* CONFIG_UPROBE_EVENTS */ + +int perf_trace_add(struct perf_event *p_event, int flags) +{ + struct trace_event_call *tp_event = p_event->tp_event; + struct hw_perf_event *hwc = &p_event->hw; + if (!(flags & PERF_EF_START)) p_event->hw.state = PERF_HES_STOPPED; - list = this_cpu_ptr(pcpu_list); - hlist_add_head_rcu(&p_event->hlist_entry, list); + if (is_sampling_event(p_event)) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(p_event); + } + + /* + * If TRACE_REG_PERF_ADD returns false; no custom action was performed + * and we need to take the default action of enqueueing our event on + * the right per-cpu hlist. + */ + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) { + struct hlist_head __percpu *pcpu_list; + struct hlist_head *list; + + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) + return -EINVAL; + + list = this_cpu_ptr(pcpu_list); + hlist_add_head_rcu(&p_event->hlist_entry, list); + } - return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); + return 0; } void perf_trace_del(struct perf_event *p_event, int flags) { - struct ftrace_event_call *tp_event = p_event->tp_event; - hlist_del_rcu(&p_event->hlist_entry); - tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); + struct trace_event_call *tp_event = p_event->tp_event; + + /* + * If TRACE_REG_PERF_DEL returns false; no custom action was performed + * and we need to take the default action of dequeueing our event from + * the right per-cpu hlist. + */ + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event)) + hlist_del_rcu(&p_event->hlist_entry); } -__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) +void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) { - struct trace_entry *entry; - unsigned long flags; char *raw_data; - int pc; + int rctx; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); - pc = preempt_count(); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough, wanted %d, have %d", + size, PERF_MAX_TRACE_SIZE)) + return NULL; - *rctxp = perf_swevent_get_recursion_context(); - if (*rctxp < 0) + *rctxp = rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) return NULL; - raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); + if (regs) + *regs = this_cpu_ptr(&__perf_regs[rctx]); + raw_data = this_cpu_ptr(perf_trace_buf[rctx]); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); + return raw_data; +} +EXPORT_SYMBOL_GPL(perf_trace_buf_alloc); +NOKPROBE_SYMBOL(perf_trace_buf_alloc); - entry = (struct trace_entry *)raw_data; - local_save_flags(flags); - tracing_generic_entry_update(entry, flags, pc); - entry->type = type; +void perf_trace_buf_update(void *record, u16 type) +{ + struct trace_entry *entry = record; - return raw_data; + tracing_generic_entry_update(entry, type, tracing_gen_ctx()); } -EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); +NOKPROBE_SYMBOL(perf_trace_buf_update); #ifdef CONFIG_FUNCTION_TRACER static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *pt_regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_entry *entry; - struct hlist_head *head; + struct perf_event *event; + struct hlist_head head; struct pt_regs regs; int rctx; + int bit; + + if (!rcu_is_watching()) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + if ((unsigned long)ops->private != smp_processor_id()) + goto out; + + event = container_of(ops, struct perf_event, ftrace_ops); + + /* + * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all + * the perf code does is hlist_for_each_entry_rcu(), so we can + * get away with simply setting the @head.first pointer in order + * to create a singular list. + */ + head.first = &event->hlist_entry; #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ sizeof(u64)) - sizeof(u32)) BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + memset(®s, 0, sizeof(regs)); perf_fetch_caller_regs(®s); - entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); if (!entry) - return; + goto out; entry->ip = ip; entry->parent_ip = parent_ip; + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, + 1, ®s, &head, NULL); - head = this_cpu_ptr(event_function.perf_events); - perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, - 1, ®s, head, NULL); - +out: + ftrace_test_recursion_unlock(bit); #undef ENTRY_SIZE } @@ -291,8 +488,9 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags |= FTRACE_OPS_FL_CONTROL; - ops->func = perf_ftrace_function_call; + ops->func = perf_ftrace_function_call; + ops->private = (void *)(unsigned long)nr_cpu_ids; + return register_ftrace_function(ops); } @@ -304,19 +502,11 @@ static int perf_ftrace_function_unregister(struct perf_event *event) return ret; } -static void perf_ftrace_function_enable(struct perf_event *event) -{ - ftrace_function_local_enable(&event->ftrace_ops); -} - -static void perf_ftrace_function_disable(struct perf_event *event) -{ - ftrace_function_local_disable(&event->ftrace_ops); -} - -int perf_ftrace_event_register(struct ftrace_event_call *call, +int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data) { + struct perf_event *event = data; + switch (type) { case TRACE_REG_REGISTER: case TRACE_REG_UNREGISTER: @@ -329,11 +519,11 @@ int perf_ftrace_event_register(struct ftrace_event_call *call, case TRACE_REG_PERF_CLOSE: return perf_ftrace_function_unregister(data); case TRACE_REG_PERF_ADD: - perf_ftrace_function_enable(data); - return 0; + event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id(); + return 1; case TRACE_REG_PERF_DEL: - perf_ftrace_function_disable(data); - return 0; + event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids; + return 1; } return -EINVAL; |
