diff options
Diffstat (limited to 'kernel/events')
-rw-r--r-- | kernel/events/core.c | 211 | ||||
-rw-r--r-- | kernel/events/internal.h | 6 | ||||
-rw-r--r-- | kernel/events/ring_buffer.c | 31 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 2 |
4 files changed, 169 insertions, 81 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..3e691b75b2db 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx) return parent_ctx; } -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, + enum pid_type type) { + u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; - return task_tgid_nr_ns(p, event->ns); + nr = __task_pid_nr_ns(p, type, event->ns); + /* avoid -1 if it is idle thread or runs in another ns */ + if (!nr && !pid_alive(p)) + nr = -1; + return nr; } -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; + return perf_event_pid_type(event, p, __PIDTYPE_TGID); +} - return task_pid_nr_ns(p, event->ns); +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + return perf_event_pid_type(event, p, PIDTYPE_PID); } /* @@ -1570,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + size += sizeof(data->phys_addr); + event->header_size = size; } @@ -2217,6 +2225,33 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } +/* + * Complement to update_event_times(). This computes the tstamp_* values to + * continue 'enabled' state from @now, and effectively discards the time + * between the prior tstamp_stopped and now (as we were in the OFF state, or + * just switched (context) time base). + * + * This further assumes '@event->state == INACTIVE' (we just came from OFF) and + * cannot have been scheduled in yet. And going into INACTIVE state means + * '@event->tstamp_stopped = @now'. + * + * Thus given the rules of update_event_times(): + * + * total_time_enabled = tstamp_stopped - tstamp_enabled + * total_time_running = tstamp_stopped - tstamp_running + * + * We can insert 'tstamp_stopped == now' and reverse them to compute new + * tstamp_* values. + */ +static void __perf_event_enable_time(struct perf_event *event, u64 now) +{ + WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); + + event->tstamp_stopped = now; + event->tstamp_enabled = now - event->total_time_enabled; + event->tstamp_running = now - event->total_time_running; +} + static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { @@ -2224,9 +2259,12 @@ static void add_event_to_ctx(struct perf_event *event, list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; + /* + * We can be called with event->state == STATE_OFF when we create with + * .disabled = 1. In that case the IOC_ENABLE will call this function. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) + __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2471,10 +2509,11 @@ static void __perf_event_mark_enabled(struct perf_event *event) u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; + __perf_event_enable_time(event, tstamp); list_for_each_entry(sub, &event->sibling_list, group_entry) { + /* XXX should not be > INACTIVE if event isn't */ if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; + __perf_event_enable_time(sub, tstamp); } } @@ -3180,6 +3219,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, return; perf_ctx_lock(cpuctx, ctx); + /* + * We must check ctx->nr_events while holding ctx->lock, such + * that we serialize against perf_install_in_context(). + */ + if (!ctx->nr_events) + goto unlock; + perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: @@ -3193,6 +3239,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); + +unlock: perf_ctx_unlock(cpuctx, ctx); } @@ -3625,10 +3673,7 @@ unlock: static inline u64 perf_event_count(struct perf_event *event) { - if (event->pmu->count) - return event->pmu->count(event); - - return __perf_event_count(event); + return local64_read(&event->count) + atomic64_read(&event->child_count); } /* @@ -3659,15 +3704,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } - /* - * It must not have a pmu::count method, those are not - * NMI safe. - */ - if (event->pmu->count) { - ret = -EOPNOTSUPP; - goto out; - } - /* If this is a per-task event, it must be for current */ if ((event->attach_state & PERF_ATTACH_TASK) && event->hw.target != current) { @@ -5090,7 +5126,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -5113,7 +5149,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) unsigned long size = perf_data_size(rb); if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event); + event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and @@ -5411,7 +5447,7 @@ aux_unlock: vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); return ret; } @@ -5972,6 +6008,9 @@ void perf_output_sample(struct perf_output_handle *handle, } } + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + perf_output_put(handle, data->phys_addr); + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -5987,6 +6026,38 @@ void perf_output_sample(struct perf_output_handle *handle, } } +static u64 perf_virt_to_phys(u64 virt) +{ + u64 phys_addr = 0; + struct page *p = NULL; + + if (!virt) + return 0; + + if (virt >= TASK_SIZE) { + /* If it's vmalloc()d memory, leave phys_addr as 0 */ + if (virt_addr_valid((void *)(uintptr_t)virt) && + !(virt >= VMALLOC_START && virt < VMALLOC_END)) + phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); + } else { + /* + * Walking the pages tables for user address. + * Interrupts are disabled, so it prevents any tear down + * of the page tables. + * Try IRQ-safe __get_user_pages_fast first. + * If failed, leave phys_addr as 0. + */ + if ((current->mm != NULL) && + (__get_user_pages_fast(virt, 1, 0, &p) == 1)) + phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + + if (p) + put_page(p); + } + + return phys_addr; +} + void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, @@ -6105,6 +6176,9 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + data->phys_addr = perf_virt_to_phys(data->addr); } static void __always_inline @@ -7256,6 +7330,11 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +void perf_event_itrace_started(struct perf_event *event) +{ + event->attach_state |= PERF_ATTACH_ITRACE; +} + static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; @@ -7271,7 +7350,7 @@ static void perf_log_itrace_start(struct perf_event *event) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || - event->hw.itrace_started) + event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; @@ -7875,16 +7954,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, - rctx, task); + rctx, task, NULL); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task) + struct task_struct *task, struct perf_event *event) { struct perf_sample_data data; - struct perf_event *event; struct perf_raw_record raw = { .frag = { @@ -7898,9 +7976,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_trace_buf_update(record, event_type); - hlist_for_each_entry_rcu(event, head, hlist_entry) { + /* Use the given event instead of the hlist */ + if (event) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); + } else { + hlist_for_each_entry_rcu(event, head, hlist_entry) { + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } } /* @@ -8050,7 +8134,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event) static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { - bool is_kprobe, is_tracepoint; + bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; if (event->attr.type != PERF_TYPE_TRACEPOINT) @@ -8061,7 +8145,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; - if (!is_kprobe && !is_tracepoint) + is_syscall_tp = is_syscall_trace_event(event->tp_event); + if (!is_kprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; @@ -8070,13 +8155,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return PTR_ERR(prog); if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || - (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { + (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || + (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { /* valid fd, but invalid bpf program type */ bpf_prog_put(prog); return -EINVAL; } - if (is_tracepoint) { + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); if (prog->aux->max_ctx_offset > off) { @@ -9580,6 +9666,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; + attr->size = size; + if (attr->__reserved_1) return -EINVAL; @@ -9852,6 +9940,11 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* Only privileged users can get physical addresses */ + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && + perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!attr.sample_max_stack) attr.sample_max_stack = sysctl_perf_event_max_stack; @@ -10001,28 +10094,27 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; /* - * Do not allow to attach to a group in a different - * task or CPU context: + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. */ - if (move_group) { - /* - * Make sure we're both on the same task, or both - * per-cpu events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + if (group_leader->cpu != event->cpu) + goto err_context; - /* - * Make sure we're both events for the same CPU; - * grouping events for different CPUs is broken; since - * you can never concurrently schedule them anyhow. - */ - if (group_leader->cpu != event->cpu) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } + /* + * Make sure we're both on the same task, or both + * per-CPU events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Do not allow to attach to a group in a different task + * or CPU context. If we're moving SW events, we'll fix + * this up later, so allow that. + */ + if (!move_group && group_leader->ctx != ctx) + goto err_context; /* * Only a group leader can be exclusive or pinned @@ -11201,5 +11293,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = { * controller is not mounted on a legacy hierarchy. */ .implicit_on_dfl = true, + .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 486fd78eb8d5..843e97047335 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -38,9 +38,9 @@ struct ring_buffer { struct user_struct *mmap_user; /* AUX area */ - local_t aux_head; + long aux_head; local_t aux_nest; - local_t aux_wakeup; + long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; @@ -208,7 +208,7 @@ static inline int get_recursion_context(int *recursion) { int rctx; - if (in_nmi()) + if (unlikely(in_nmi())) rctx = 3; else if (in_irq()) rctx = 2; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ee97196bb151..af71a84e12ee 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) goto err_put; - aux_head = local_read(&rb->aux_head); + aux_head = rb->aux_head; handle->rb = rb; handle->event = event; @@ -382,7 +382,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, */ if (!rb->aux_overwrite) { aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); - handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; + handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); @@ -433,12 +433,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; aux_head = handle->head; - local_set(&rb->aux_head, aux_head); + rb->aux_head = aux_head; } else { handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; - aux_head = local_read(&rb->aux_head); - local_add(size, &rb->aux_head); + aux_head = rb->aux_head; + rb->aux_head += size; } if (size || handle->aux_flags) { @@ -450,11 +450,10 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->aux_flags); } - aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); - - if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + rb->user_page->aux_head = rb->aux_head; + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { wakeup = true; - local_add(rb->aux_watermark, &rb->aux_wakeup); + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); } if (wakeup) { @@ -478,22 +477,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { struct ring_buffer *rb = handle->rb; - unsigned long aux_head; if (size > handle->size) return -ENOSPC; - local_add(size, &rb->aux_head); + rb->aux_head += size; - aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); - if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + rb->user_page->aux_head = rb->aux_head; + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { perf_output_wakeup(handle); - local_add(rb->aux_watermark, &rb->aux_wakeup); - handle->wakeup = local_read(&rb->aux_wakeup) + - rb->aux_watermark; + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); + handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } - handle->head = aux_head; + handle->head = rb->aux_head; handle->size -= size; return 0; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0e137f98a50c..267f6ef91d97 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void) void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - newmm->uprobes_state.xol_area = NULL; - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ |