From da97e18458fb42d7c00fac5fd1c56a3896ec666e Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 14 Oct 2019 13:03:08 -0400 Subject: perf_event: Add support for LSM and SELinux checks In current mainline, the degree of access to perf_event_open(2) system call depends on the perf_event_paranoid sysctl. This has a number of limitations: 1. The sysctl is only a single value. Many types of accesses are controlled based on the single value thus making the control very limited and coarse grained. 2. The sysctl is global, so if the sysctl is changed, then that means all processes get access to perf_event_open(2) opening the door to security issues. This patch adds LSM and SELinux access checking which will be used in Android to access perf_event_open(2) for the purposes of attaching BPF programs to tracepoints, perf profiling and other operations from userspace. These operations are intended for production systems. 5 new LSM hooks are added: 1. perf_event_open: This controls access during the perf_event_open(2) syscall itself. The hook is called from all the places that the perf_event_paranoid sysctl is checked to keep it consistent with the systctl. The hook gets passed a 'type' argument which controls CPU, kernel and tracepoint accesses (in this context, CPU, kernel and tracepoint have the same semantics as the perf_event_paranoid sysctl). Additionally, I added an 'open' type which is similar to perf_event_paranoid sysctl == 3 patch carried in Android and several other distros but was rejected in mainline [1] in 2016. 2. perf_event_alloc: This allocates a new security object for the event which stores the current SID within the event. It will be useful when the perf event's FD is passed through IPC to another process which may try to read the FD. Appropriate security checks will limit access. 3. perf_event_free: Called when the event is closed. 4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event. 5. perf_event_write: Called from the ioctl(2) syscalls for the event. [1] https://lwn.net/Articles/696240/ Since Peter had suggest LSM hooks in 2016 [1], I am adding his Suggested-by tag below. To use this patch, we set the perf_event_paranoid sysctl to -1 and then apply selinux checking as appropriate (default deny everything, and then add policy rules to give access to domains that need it). In the future we can remove the perf_event_paranoid sysctl altogether. Suggested-by: Peter Zijlstra Co-developed-by: Peter Zijlstra Signed-off-by: Joel Fernandes (Google) Signed-off-by: Peter Zijlstra (Intel) Acked-by: James Morris Cc: Arnaldo Carvalho de Melo Cc: rostedt@goodmis.org Cc: Yonghong Song Cc: Kees Cook Cc: Ingo Molnar Cc: Alexei Starovoitov Cc: jeffv@google.com Cc: Jiri Olsa Cc: Daniel Borkmann Cc: primiano@google.com Cc: Song Liu Cc: rsavitski@google.com Cc: Namhyung Kim Cc: Matthew Garrett Link: https://lkml.kernel.org/r/20191014170308.70668-1-joel@joelfernandes.org --- kernel/events/core.c | 57 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 11 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9ec0b0bfddbd..f9a5d4356562 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4229,8 +4229,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (!task) { /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); + err = perf_allow_cpu(&event->attr); + if (err) + return ERR_PTR(err); cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; @@ -4539,6 +4540,8 @@ static void _free_event(struct perf_event *event) unaccount_event(event); + security_perf_event_free(event); + if (event->rb) { /* * Can happen when we close an event with re-directed output. @@ -4992,6 +4995,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) struct perf_event_context *ctx; int ret; + ret = security_perf_event_read(event); + if (ret) + return ret; + ctx = perf_event_ctx_lock(event); ret = __perf_read(event, buf, count); perf_event_ctx_unlock(event, ctx); @@ -5256,6 +5263,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct perf_event_context *ctx; long ret; + /* Treat ioctl like writes as it is likely a mutating operation. */ + ret = security_perf_event_write(event); + if (ret) + return ret; + ctx = perf_event_ctx_lock(event); ret = _perf_ioctl(event, cmd, arg); perf_event_ctx_unlock(event, ctx); @@ -5719,6 +5731,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; + ret = security_perf_event_read(event); + if (ret) + return ret; + vma_size = vma->vm_end - vma->vm_start; if (vma->vm_pgoff == 0) { @@ -5844,7 +5860,7 @@ accounting: lock_limit >>= PAGE_SHIFT; locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + if ((locked > lock_limit) && perf_is_paranoid() && !capable(CAP_IPC_LOCK)) { ret = -EPERM; goto unlock; @@ -10578,11 +10594,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + err = security_perf_event_alloc(event); + if (err) + goto err_callchain_buffer; + /* symmetric to unaccount_event() in _free_event() */ account_event(event); return event; +err_callchain_buffer: + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } err_addr_filters: kfree(event->addr_filter_ranges); @@ -10671,9 +10696,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->branch_sample_type = mask; } /* privileged levels capture (kernel, hv): check permissions */ - if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) - && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { + ret = perf_allow_kernel(attr); + if (ret) + return ret; + } } if (attr->sample_type & PERF_SAMPLE_REGS_USER) { @@ -10886,13 +10913,19 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; + /* Do we allow access to perf_event_open(2) ? */ + err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + if (err) + return err; + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + err = perf_allow_kernel(&attr); + if (err) + return err; } if (attr.namespaces) { @@ -10909,9 +10942,11 @@ SYSCALL_DEFINE5(perf_event_open, } /* Only privileged users can get physical addresses */ - if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && - perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { + err = perf_allow_kernel(&attr); + if (err) + return err; + } err = security_locked_down(LOCKDOWN_PERF); if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) -- cgit From 8a9f91c51ea72b126864e0db616b1bac12261200 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Mon, 14 Oct 2019 16:14:59 +0800 Subject: perf/ring_buffer: Modify the parameter type of perf_mmap_free_page() In perf_mmap_free_page(), the unsigned long type is converted to the pointer type, but where the call is made, the pointer type is converted to the unsigned long type. There is no need to do these operations. Modify the parameter type of perf_mmap_free_page() to pointer type. Signed-off-by: Yunfeng Ye Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Link: https://lkml.kernel.org/r/e6ae3f0c-d04c-50f9-544a-aee3b30330cd@huawei.com --- kernel/events/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ffb59a4ef4ff..abc145cbfedf 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -799,9 +799,9 @@ fail: return NULL; } -static void perf_mmap_free_page(unsigned long addr) +static void perf_mmap_free_page(void *addr) { - struct page *page = virt_to_page((void *)addr); + struct page *page = virt_to_page(addr); page->mapping = NULL; __free_page(page); @@ -811,9 +811,9 @@ void rb_free(struct ring_buffer *rb) { int i; - perf_mmap_free_page((unsigned long)rb->user_page); + perf_mmap_free_page(rb->user_page); for (i = 0; i < rb->nr_pages; i++) - perf_mmap_free_page((unsigned long)rb->data_pages[i]); + perf_mmap_free_page(rb->data_pages[i]); kfree(rb); } -- cgit From d7e78706e43107fa269fe34b1a69e653f5ec9f2c Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Mon, 14 Oct 2019 16:15:57 +0800 Subject: perf/ring_buffer: Matching the memory allocate and free, in rb_alloc() Currently perf_mmap_alloc_page() is used to allocate memory in rb_alloc(), but using free_page() to free memory in the failure path. It's better to use perf_mmap_free_page() instead. Signed-off-by: Yunfeng Ye Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Cc: Cc: Cc: Link: https://lkml.kernel.org/r/575c7e8c-90c7-4e3a-b41d-f894d8cdbd7f@huawei.com --- kernel/events/ring_buffer.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index abc145cbfedf..246c83ac5643 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -754,6 +754,14 @@ static void *perf_mmap_alloc_page(int cpu) return page_address(page); } +static void perf_mmap_free_page(void *addr) +{ + struct page *page = virt_to_page(addr); + + page->mapping = NULL; + __free_page(page); +} + struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { struct ring_buffer *rb; @@ -788,9 +796,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) fail_data_pages: for (i--; i >= 0; i--) - free_page((unsigned long)rb->data_pages[i]); + perf_mmap_free_page(rb->data_pages[i]); - free_page((unsigned long)rb->user_page); + perf_mmap_free_page(rb->user_page); fail_user_page: kfree(rb); @@ -799,14 +807,6 @@ fail: return NULL; } -static void perf_mmap_free_page(void *addr) -{ - struct page *page = virt_to_page(addr); - - page->mapping = NULL; - __free_page(page); -} - void rb_free(struct ring_buffer *rb) { int i; -- cgit From aa5de305c90c995321df452f274fe75b52bd8fcc Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 18 Oct 2019 20:20:40 -0700 Subject: kernel/events/uprobes.c: only do FOLL_SPLIT_PMD for uprobe register Attaching uprobe to text section in THP splits the PMD mapped page table into PTE mapped entries. On uprobe detach, we would like to regroup PMD mapped page table entry to regain performance benefit of THP. However, the regroup is broken For perf_event based trace_uprobe. This is because perf_event based trace_uprobe calls uprobe_unregister twice on close: first in TRACE_REG_PERF_CLOSE, then in TRACE_REG_PERF_UNREGISTER. The second call will split the PMD mapped page table entry, which is not the desired behavior. Fix this by only use FOLL_SPLIT_PMD for uprobe register case. Add a WARN() to confirm uprobe unregister never work on huge pages, and abort the operation when this WARN() triggers. Link: http://lkml.kernel.org/r/20191017164223.2762148-6-songliubraving@fb.com Fixes: 5a52c9df62b4 ("uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT") Signed-off-by: Song Liu Reviewed-by: Srikar Dronamraju Cc: Kirill A. Shutemov Cc: Oleg Nesterov Cc: Matthew Wilcox (Oracle) Cc: William Kucharski Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 94d38a39d72e..c74761004ee5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -474,14 +474,17 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; bool orig_page_huge = false; + unsigned int gup_flags = FOLL_FORCE; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); retry: + if (is_register) + gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags, + &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -489,6 +492,12 @@ retry: if (ret <= 0) goto put_old; + if (WARN(!is_register && PageCompound(old_page), + "uprobe unregister should never work on compound page\n")) { + ret = -EINVAL; + goto put_old; + } + /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); -- cgit From 5e6c3c7b1ec217c1c4c95d9148182302b9969b97 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 21 Oct 2019 10:33:54 +0200 Subject: perf/aux: Fix tracking of auxiliary trace buffer allocation The following commit from the v5.4 merge window: d44248a41337 ("perf/core: Rework memory accounting in perf_mmap()") ... breaks auxiliary trace buffer tracking. If I run command 'perf record -e rbd000' to record samples and saving them in the **auxiliary** trace buffer then the value of 'locked_vm' becomes negative after all trace buffers have been allocated and released: During allocation the values increase: [52.250027] perf_mmap user->locked_vm:0x87 pinned_vm:0x0 ret:0 [52.250115] perf_mmap user->locked_vm:0x107 pinned_vm:0x0 ret:0 [52.250251] perf_mmap user->locked_vm:0x188 pinned_vm:0x0 ret:0 [52.250326] perf_mmap user->locked_vm:0x208 pinned_vm:0x0 ret:0 [52.250441] perf_mmap user->locked_vm:0x289 pinned_vm:0x0 ret:0 [52.250498] perf_mmap user->locked_vm:0x309 pinned_vm:0x0 ret:0 [52.250613] perf_mmap user->locked_vm:0x38a pinned_vm:0x0 ret:0 [52.250715] perf_mmap user->locked_vm:0x408 pinned_vm:0x2 ret:0 [52.250834] perf_mmap user->locked_vm:0x408 pinned_vm:0x83 ret:0 [52.250915] perf_mmap user->locked_vm:0x408 pinned_vm:0x103 ret:0 [52.251061] perf_mmap user->locked_vm:0x408 pinned_vm:0x184 ret:0 [52.251146] perf_mmap user->locked_vm:0x408 pinned_vm:0x204 ret:0 [52.251299] perf_mmap user->locked_vm:0x408 pinned_vm:0x285 ret:0 [52.251383] perf_mmap user->locked_vm:0x408 pinned_vm:0x305 ret:0 [52.251544] perf_mmap user->locked_vm:0x408 pinned_vm:0x386 ret:0 [52.251634] perf_mmap user->locked_vm:0x408 pinned_vm:0x406 ret:0 [52.253018] perf_mmap user->locked_vm:0x408 pinned_vm:0x487 ret:0 [52.253197] perf_mmap user->locked_vm:0x408 pinned_vm:0x508 ret:0 [52.253374] perf_mmap user->locked_vm:0x408 pinned_vm:0x589 ret:0 [52.253550] perf_mmap user->locked_vm:0x408 pinned_vm:0x60a ret:0 [52.253726] perf_mmap user->locked_vm:0x408 pinned_vm:0x68b ret:0 [52.253903] perf_mmap user->locked_vm:0x408 pinned_vm:0x70c ret:0 [52.254084] perf_mmap user->locked_vm:0x408 pinned_vm:0x78d ret:0 [52.254263] perf_mmap user->locked_vm:0x408 pinned_vm:0x80e ret:0 The value of user->locked_vm increases to a limit then the memory is tracked by pinned_vm. During deallocation the size is subtracted from pinned_vm until it hits a limit. Then a larger value is subtracted from locked_vm leading to a large number (because of type unsigned): [64.267797] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x78d [64.267826] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x70c [64.267848] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x68b [64.267869] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x60a [64.267891] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x589 [64.267911] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x508 [64.267933] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x487 [64.267952] perf_mmap_close mmap_user->locked_vm:0x408 pinned_vm:0x406 [64.268883] perf_mmap_close mmap_user->locked_vm:0x307 pinned_vm:0x406 [64.269117] perf_mmap_close mmap_user->locked_vm:0x206 pinned_vm:0x406 [64.269433] perf_mmap_close mmap_user->locked_vm:0x105 pinned_vm:0x406 [64.269536] perf_mmap_close mmap_user->locked_vm:0x4 pinned_vm:0x404 [64.269797] perf_mmap_close mmap_user->locked_vm:0xffffffffffffff84 pinned_vm:0x303 [64.270105] perf_mmap_close mmap_user->locked_vm:0xffffffffffffff04 pinned_vm:0x202 [64.270374] perf_mmap_close mmap_user->locked_vm:0xfffffffffffffe84 pinned_vm:0x101 [64.270628] perf_mmap_close mmap_user->locked_vm:0xfffffffffffffe04 pinned_vm:0x0 This value sticks for the user until system is rebooted, causing follow-on system calls using locked_vm resource limit to fail. Note: There is no issue using the normal trace buffer. In fact the issue is in perf_mmap_close(). During allocation auxiliary trace buffer memory is either traced as 'extra' and added to 'pinned_vm' or trace as 'user_extra' and added to 'locked_vm'. This applies for normal trace buffers and auxiliary trace buffer. However in function perf_mmap_close() all auxiliary trace buffer is subtraced from 'locked_vm' and never from 'pinned_vm'. This breaks the ballance. Signed-off-by: Thomas Richter Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: gor@linux.ibm.com Cc: hechaol@fb.com Cc: heiko.carstens@de.ibm.com Cc: linux-perf-users@vger.kernel.org Cc: songliubraving@fb.com Fixes: d44248a41337 ("perf/core: Rework memory accounting in perf_mmap()") Link: https://lkml.kernel.org/r/20191021083354.67868-1-tmricht@linux.ibm.com [ Minor readability edits. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9ec0b0bfddbd..f5d7950d1931 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5607,8 +5607,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) perf_pmu_output_stop(event); /* now it's safe to free the pages */ - atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); + if (!rb->aux_mmap_locked) + atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + else + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); -- cgit From f3a519e4add93b7b31a6616f0b09635ff2e6a159 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 22 Oct 2019 10:39:40 +0300 Subject: perf/aux: Fix AUX output stopping Commit: 8a58ddae2379 ("perf/core: Fix exclusive events' grouping") allows CAP_EXCLUSIVE events to be grouped with other events. Since all of those also happen to be AUX events (which is not the case the other way around, because arch/s390), this changes the rules for stopping the output: the AUX event may not be on its PMU's context any more, if it's grouped with a HW event, in which case it will be on that HW event's context instead. If that's the case, munmap() of the AUX buffer can't find and stop the AUX event, potentially leaving the last reference with the atomic context, which will then end up freeing the AUX buffer. This will then trip warnings: Fix this by using the context's PMU context when looking for events to stop, instead of the event's PMU context. Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20191022073940.61814-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index f5d7950d1931..bb3748d29b04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6949,7 +6949,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; - struct pmu *pmu = event->pmu; + struct pmu *pmu = event->ctx->pmu; struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct remote_output ro = { .rb = event->rb, -- cgit From 8c7e975667fbc3b7c816119dd56104739899f125 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 25 Oct 2019 15:16:36 +0300 Subject: perf/core: Start rejecting the syscall with attr.__reserved_2 set Commit: 1a5941312414c ("perf: Add wakeup watermark control to the AUX area") added attr.__reserved_2 padding, but forgot to add an ABI check to reject attributes with this field set. Fix that. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: adrian.hunter@intel.com Cc: mathieu.poirier@linaro.org Link: https://lkml.kernel.org/r/20191025121636.75182-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index bb3748d29b04..aec8dba2bea4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10635,7 +10635,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->size = size; - if (attr->__reserved_1) + if (attr->__reserved_1 || attr->__reserved_2) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) -- cgit From c2b98a8661514f29a44ebd0925cf4b1503beb48c Mon Sep 17 00:00:00 2001 From: Alexey Budankov Date: Wed, 23 Oct 2019 10:13:56 +0300 Subject: perf/x86: Synchronize PMU task contexts on optimized context switches Install Intel specific PMU task context synchronization adapter and extend optimized context switch path with PMU specific task context synchronization to fix LBR callstack virtualization on context switches. Signed-off-by: Alexey Budankov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Song Liu Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/9c6445a9-bdba-ef03-3859-f1f91198f27a@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0940c8810be0..f48d38b55e7b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3204,10 +3204,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + struct pmu *pmu = ctx->pmu; + WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * PMU specific parts of task perf context can require + * additional synchronization. As an example of such + * synchronization see implementation details of Intel + * LBR call stack data profiling; + */ + if (pmu->swap_task_ctx) + pmu->swap_task_ctx(ctx, next_ctx); + else + swap(ctx->task_ctx_data, next_ctx->task_ctx_data); /* * RCU_INIT_POINTER here is safe because we've not -- cgit From db0503e4f6751f2c719d002ba1becd1811633e6e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Oct 2019 16:02:39 +0200 Subject: perf/core: Optimize perf_install_in_event() Andi reported that when creating a lot of events, a lot of time is spent in IPIs and asked if it would be possible to elide some of that. Now when, as for example the perf-tool always does, events are created disabled, then these events will not need to be scheduled when added to the context (they're still disable) and therefore the IPI is not required -- except for the very first event, that will need to set ctx->is_active. ( It might be possible to set ctx->is_active remotely for cpu_ctx, but we really need the IPI for task_ctx, so lets not make that distinction. ) Also use __perf_effective_state() since group events depend on the state of the leader, if the leader is OFF, the whole group is OFF. So when sibling events are created enabled (XXX check tool) then we only need a single IPI to create and enable the whole group (+ that initial IPI to initialize the context). Suggested-by: Andi Kleen Reported-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index f48d38b55e7b..ea70ca614987 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2666,6 +2666,25 @@ perf_install_in_context(struct perf_event_context *ctx, */ smp_store_release(&event->ctx, ctx); + /* + * perf_event_attr::disabled events will not run and can be initialized + * without IPI. Except when this is the first event for the context, in + * that case we need the magic of the IPI to set ctx->is_active. + * + * The IOC_ENABLE that is sure to follow the creation of a disabled + * event will issue the IPI and reprogram the hardware. + */ + if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) { + raw_spin_lock_irq(&ctx->lock); + if (ctx->task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } + add_event_to_ctx(event, ctx); + raw_spin_unlock_irq(&ctx->lock); + return; + } + if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; -- cgit From 66d258c5b048840991de49697264af75f5b09def Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Oct 2019 20:31:03 +0200 Subject: perf/core: Optimize perf_init_event() Andi reported that he was hitting the linear search in perf_init_event() a lot. Make more agressive use of the IDR lookup to avoid hitting the linear search. With exception of PERF_TYPE_SOFTWARE (which relies on a hideous hack), we can put everything in the IDR. On top of that, we can alias TYPE_HARDWARE and TYPE_HW_CACHE to TYPE_RAW on the lookup side. This greatly reduces the chances of hitting the linear search. Reported-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index ea70ca614987..4d67c5d35c13 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10080,7 +10080,7 @@ static struct lock_class_key cpuctx_lock; int perf_pmu_register(struct pmu *pmu, const char *name, int type) { - int cpu, ret; + int cpu, ret, max = PERF_TYPE_MAX; mutex_lock(&pmus_lock); ret = -ENOMEM; @@ -10093,12 +10093,17 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) goto skip_type; pmu->name = name; - if (type < 0) { - type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); - if (type < 0) { - ret = type; + if (type != PERF_TYPE_SOFTWARE) { + if (type >= 0) + max = type; + + ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); + if (ret < 0) goto free_pdc; - } + + WARN_ON(type >= 0 && ret != type); + + type = ret; } pmu->type = type; @@ -10188,7 +10193,7 @@ free_dev: put_device(pmu->dev); free_idr: - if (pmu->type >= PERF_TYPE_MAX) + if (pmu->type != PERF_TYPE_SOFTWARE) idr_remove(&pmu_idr, pmu->type); free_pdc: @@ -10210,7 +10215,7 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); - if (pmu->type >= PERF_TYPE_MAX) + if (pmu->type != PERF_TYPE_SOFTWARE) idr_remove(&pmu_idr, pmu->type); if (pmu_bus_running) { if (pmu->nr_addr_filters) @@ -10280,9 +10285,8 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { + int idx, type, ret; struct pmu *pmu; - int idx; - int ret; idx = srcu_read_lock(&pmus_srcu); @@ -10295,12 +10299,27 @@ static struct pmu *perf_init_event(struct perf_event *event) } rcu_read_lock(); - pmu = idr_find(&pmu_idr, event->attr.type); + /* + * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * are often aliases for PERF_TYPE_RAW. + */ + type = event->attr.type; + if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) + type = PERF_TYPE_RAW; + +again: + pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { ret = perf_try_init_event(pmu, event); + if (ret == -ENOENT && event->attr.type != type) { + type = event->attr.type; + goto again; + } + if (ret) pmu = ERR_PTR(ret); + goto unlock; } -- cgit From d44f821b0e13275735e8f3fe4db8703b45f05d52 Mon Sep 17 00:00:00 2001 From: "Liang, Kan" Date: Tue, 22 Oct 2019 11:13:09 +0200 Subject: perf/core: Optimize perf_init_event() for TYPE_SOFTWARE Andi reported that he was hitting the linear search in perf_init_event() a lot. Now that all !TYPE_SOFTWARE events should hit the IDR, make sure the TYPE_SOFTWARE events are at the head of the list such that we'll quickly find the right PMU (provided a valid event was given). Signed-off-by: Liang, Kan Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d67c5d35c13..cfd89b4a02d8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10180,7 +10180,16 @@ got_cpu_context: if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; - list_add_rcu(&pmu->entry, &pmus); + /* + * Ensure the TYPE_SOFTWARE PMUs are at the head of the list, + * since these cannot be in the IDR. This way the linear search + * is fast, provided a valid software event is provided. + */ + if (type == PERF_TYPE_SOFTWARE || !name) + list_add_rcu(&pmu->entry, &pmus); + else + list_add_tail_rcu(&pmu->entry, &pmus); + atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: -- cgit From 09f4e8f05d85bfc98fe9227e988a7c1b3ec416ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Nov 2019 12:51:04 +0100 Subject: perf/core: Disallow uncore-cgroup events While discussing uncore event scheduling, I noticed we do not in fact seem to dis-allow making uncore-cgroup events. Such events make no sense what so ever because the cgroup is a CPU local state where uncore counts across a number of CPUs. Disallow them. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index aec8dba2bea4..022a34b66e60 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10535,6 +10535,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + /* + * Disallow uncore-cgroup events, they don't make sense as the cgroup will + * be different on other CPUs in the uncore mask. + */ + if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { + err = -EINVAL; + goto err_pmu; + } + if (event->attr.aux_output && !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { err = -EOPNOTSUPP; -- cgit From 00496fe5e09e8c8bb115540e7e3470553cd07a5c Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 1 Nov 2019 17:12:48 +0200 Subject: perf/aux: Fix the aux_output group inheritance fix Commit f733c6b508bc ("perf/core: Fix inheritance of aux_output groups") adds a NULL pointer dereference in case inherit_group() races with perf_release(), which causes the below crash: > BUG: kernel NULL pointer dereference, address: 000000000000010b > #PF: supervisor read access in kernel mode > #PF: error_code(0x0000) - not-present page > PGD 3b203b067 P4D 3b203b067 PUD 3b2040067 PMD 0 > Oops: 0000 [#1] SMP KASAN > CPU: 0 PID: 315 Comm: exclusive-group Tainted: G B 5.4.0-rc3-00181-g72e1839403cb-dirty #878 > RIP: 0010:perf_get_aux_event+0x86/0x270 > Call Trace: > ? __perf_read_group_add+0x3b0/0x3b0 > ? __kasan_check_write+0x14/0x20 > ? __perf_event_init_context+0x154/0x170 > inherit_task_group.isra.0.part.0+0x14b/0x170 > perf_event_init_task+0x296/0x4b0 Fix this by skipping over events that are getting closed, in the inheritance path. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: f733c6b508bc ("perf/core: Fix inheritance of aux_output groups") Link: https://lkml.kernel.org/r/20191101151248.47327-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 022a34b66e60..b752bd3aa03b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11899,7 +11899,7 @@ static int inherit_group(struct perf_event *parent_event, if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); - if (sub->aux_event == parent_event && + if (sub->aux_event == parent_event && child_ctr && !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } -- cgit From f25d8ba9e1b204b90fbf55970ea6e68955006068 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 30 Oct 2019 15:47:30 +0200 Subject: perf/core: Reattach a misplaced comment A comment is in a wrong place in perf_event_create_kernel_counter(). Fix that. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/20191030134731.5437-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index b752bd3aa03b..e18d8d34ca77 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11331,10 +11331,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct perf_event *event; int err; - /* - * Get the target context (task or percpu): - */ - event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { @@ -11345,6 +11341,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; + /* + * Get the target context (task or percpu): + */ ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); -- cgit From dce5affb94eb54edfff17727a6240a6a5d998666 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 30 Oct 2019 15:47:31 +0200 Subject: perf/aux: Disallow aux_output for kernel events Commit ab43762ef0109 ("perf: Allow normal events to output AUX data") added 'aux_output' bit to the attribute structure, which relies on AUX events and grouping, neither of which is supported for the kernel events. This notwithstanding, attempts have been made to use it in the kernel code, suggesting the necessity of an explicit hard -EINVAL. Fix this by rejecting attributes with aux_output set for kernel events. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/20191030134731.5437-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index e18d8d34ca77..7655441065a9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11331,6 +11331,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct perf_event *event; int err; + /* + * Grouping is not supported for kernel events, neither is 'AUX', + * make sure the caller's intentions are adjusted. + */ + if (attr->aux_output) + return ERR_PTR(-EINVAL); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { -- cgit From 697d877849d4b34ab58d7078d6930bad0ef6fc66 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 5 Nov 2019 09:57:02 +0200 Subject: perf/core: Consistently fail fork on allocation failures Commit: 313ccb9615948 ("perf: Allocate context task_ctx_data for child event") makes the inherit path skip over the current event in case of task_ctx_data allocation failure. This, however, is inconsistent with allocation failures in perf_event_alloc(), which would abort the fork. Correct this by returning an error code on task_ctx_data allocation failure and failing the fork in that case. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/20191105075702.60319-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7655441065a9..466e333db1f3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11802,7 +11802,7 @@ inherit_event(struct perf_event *parent_event, GFP_KERNEL); if (!child_ctx->task_ctx_data) { free_event(child_event); - return NULL; + return ERR_PTR(-ENOMEM); } } -- cgit From d00dbd29814236ad128ff9517e8f7af6b6ef4ba0 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Wed, 6 Nov 2019 13:25:27 +0000 Subject: perf/core: Fix missing static inline on perf_cgroup_switch() It looks like a "static inline" has been missed in front of the empty definition of perf_cgroup_switch() under certain configurations. Fixes the following sparse warning: kernel/events/core.c:1035:1: warning: symbol 'perf_cgroup_switch' was not declared. Should it be static? Signed-off-by: Ben Dooks (Codethink) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lkml.kernel.org/r/20191106132527.19977-1-ben.dooks@codethink.co.uk Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 466e333db1f3..00a014670ed0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1031,7 +1031,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, { } -void +static inline void perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } -- cgit From deb0c3c29d552ab81ecd5481bb83bf2f4e41927d Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 6 Nov 2019 00:29:35 -0500 Subject: perf/core: Fix unlock balance in perf_init_event() Commit: 66d258c5b048 ("perf/core: Optimize perf_init_event()") introduced an unlock imbalance in perf_init_event() where it calls "goto again" and then only repeat rcu_read_unlock(). Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 66d258c5b048 ("perf/core: Optimize perf_init_event()") Link: https://lkml.kernel.org/r/20191106052935.8352-1-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6cb6d685191d..8d65e03b98f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10307,7 +10307,6 @@ static struct pmu *perf_init_event(struct perf_event *event) goto unlock; } - rcu_read_lock(); /* * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE * are often aliases for PERF_TYPE_RAW. @@ -10317,6 +10316,7 @@ static struct pmu *perf_init_event(struct perf_event *event) type = PERF_TYPE_RAW; again: + rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { -- cgit From a4faf00d994c40e64f656805ac375c65e324eefb Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 25 Oct 2019 17:08:33 +0300 Subject: perf/aux: Allow using AUX data in perf samples AUX data can be used to annotate perf events such as performance counters or tracepoints/breakpoints by including it in sample records when PERF_SAMPLE_AUX flag is set. Such samples would be instrumental in debugging and profiling by providing, for example, a history of instruction flow leading up to the event's overflow. The implementation makes use of grouping an AUX event with all the events that wish to take samples of the AUX data, such that the former is the group leader. The samplees should also specify the desired size of the AUX sample via attr.aux_sample_size. AUX capable PMUs need to explicitly add support for sampling, because it relies on a new callback to take a snapshot of the buffer without touching the event states. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: adrian.hunter@intel.com Cc: mathieu.poirier@linaro.org Link: https://lkml.kernel.org/r/20191025140835.53665-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 173 +++++++++++++++++++++++++++++++++++++++++++- kernel/events/internal.h | 1 + kernel/events/ring_buffer.c | 36 +++++++++ 3 files changed, 207 insertions(+), 3 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8d65e03b98f2..16d80ad8d6d7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1941,6 +1941,11 @@ static void perf_put_aux_event(struct perf_event *event) } } +static bool perf_need_aux_event(struct perf_event *event) +{ + return !!event->attr.aux_output || !!event->attr.aux_sample_size; +} + static int perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader) { @@ -1953,7 +1958,17 @@ static int perf_get_aux_event(struct perf_event *event, if (!group_leader) return 0; - if (!perf_aux_output_match(event, group_leader)) + /* + * aux_output and aux_sample_size are mutually exclusive. + */ + if (event->attr.aux_output && event->attr.aux_sample_size) + return 0; + + if (event->attr.aux_output && + !perf_aux_output_match(event, group_leader)) + return 0; + + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; if (!atomic_long_inc_not_zero(&group_leader->refcount)) @@ -6222,6 +6237,122 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, } } +static unsigned long perf_prepare_sample_aux(struct perf_event *event, + struct perf_sample_data *data, + size_t size) +{ + struct perf_event *sampler = event->aux_event; + struct ring_buffer *rb; + + data->aux_size = 0; + + if (!sampler) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) + goto out; + + if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) + goto out; + + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + if (!rb) + goto out; + + /* + * If this is an NMI hit inside sampling code, don't take + * the sample. See also perf_aux_sample_output(). + */ + if (READ_ONCE(rb->aux_in_sampling)) { + data->aux_size = 0; + } else { + size = min_t(size_t, size, perf_aux_size(rb)); + data->aux_size = ALIGN(size, sizeof(u64)); + } + ring_buffer_put(rb); + +out: + return data->aux_size; +} + +long perf_pmu_snapshot_aux(struct ring_buffer *rb, + struct perf_event *event, + struct perf_output_handle *handle, + unsigned long size) +{ + unsigned long flags; + long ret; + + /* + * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler + * paths. If we start calling them in NMI context, they may race with + * the IRQ ones, that is, for example, re-starting an event that's just + * been stopped, which is why we're using a separate callback that + * doesn't change the event state. + * + * IRQs need to be disabled to prevent IPIs from racing with us. + */ + local_irq_save(flags); + /* + * Guard against NMI hits inside the critical section; + * see also perf_prepare_sample_aux(). + */ + WRITE_ONCE(rb->aux_in_sampling, 1); + barrier(); + + ret = event->pmu->snapshot_aux(event, handle, size); + + barrier(); + WRITE_ONCE(rb->aux_in_sampling, 0); + local_irq_restore(flags); + + return ret; +} + +static void perf_aux_sample_output(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + struct perf_event *sampler = event->aux_event; + unsigned long pad; + struct ring_buffer *rb; + long size; + + if (WARN_ON_ONCE(!sampler || !data->aux_size)) + return; + + rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + if (!rb) + return; + + size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size); + + /* + * An error here means that perf_output_copy() failed (returned a + * non-zero surplus that it didn't copy), which in its current + * enlightened implementation is not possible. If that changes, we'd + * like to know. + */ + if (WARN_ON_ONCE(size < 0)) + goto out_put; + + /* + * The pad comes from ALIGN()ing data->aux_size up to u64 in + * perf_prepare_sample_aux(), so should not be more than that. + */ + pad = data->aux_size - size; + if (WARN_ON_ONCE(pad >= sizeof(u64))) + pad = 8; + + if (pad) { + u64 zero = 0; + perf_output_copy(handle, &zero, pad); + } + +out_put: + ring_buffer_put(rb); +} + static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -6541,6 +6672,13 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); + if (sample_type & PERF_SAMPLE_AUX) { + perf_output_put(handle, data->aux_size); + + if (data->aux_size) + perf_aux_sample_output(event, handle, data); + } + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -6729,6 +6867,35 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); + + if (sample_type & PERF_SAMPLE_AUX) { + u64 size; + + header->size += sizeof(u64); /* size */ + + /* + * Given the 16bit nature of header::size, an AUX sample can + * easily overflow it, what with all the preceding sample bits. + * Make sure this doesn't happen by using up to U16_MAX bytes + * per sample in total (rounded down to 8 byte boundary). + */ + size = min_t(size_t, U16_MAX - header->size, + event->attr.aux_sample_size); + size = rounddown(size, 8); + size = perf_prepare_sample_aux(event, data, size); + + WARN_ON_ONCE(size + header->size > U16_MAX); + header->size += size; + } + /* + * If you're adding more sample types here, you likely need to do + * something about the overflowing header::size, like repurpose the + * lowest 3 bits of size, which should be always zero at the moment. + * This raises a more important question, do we really need 512k sized + * samples and why, so good argumentation is in order for whatever you + * do here next. + */ + WARN_ON_ONCE(header->size & 7); } static __always_inline int @@ -10727,7 +10894,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->size = size; - if (attr->__reserved_1 || attr->__reserved_2) + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) @@ -11277,7 +11444,7 @@ SYSCALL_DEFINE5(perf_event_open, } } - if (event->attr.aux_output && !perf_get_aux_event(event, group_leader)) + if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) goto err_locked; /* diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 3aef4191798c..747d67f130cb 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -50,6 +50,7 @@ struct ring_buffer { unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; + int aux_in_sampling; void **aux_pages; void *aux_priv; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 246c83ac5643..7ffd5c763f93 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -562,6 +562,42 @@ void *perf_get_aux(struct perf_output_handle *handle) } EXPORT_SYMBOL_GPL(perf_get_aux); +/* + * Copy out AUX data from an AUX handle. + */ +long perf_output_copy_aux(struct perf_output_handle *aux_handle, + struct perf_output_handle *handle, + unsigned long from, unsigned long to) +{ + unsigned long tocopy, remainder, len = 0; + struct ring_buffer *rb = aux_handle->rb; + void *addr; + + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + + do { + tocopy = PAGE_SIZE - offset_in_page(from); + if (to > from) + tocopy = min(tocopy, to - from); + if (!tocopy) + break; + + addr = rb->aux_pages[from >> PAGE_SHIFT]; + addr += offset_in_page(from); + + remainder = perf_output_copy(handle, addr, tocopy); + if (remainder) + return -EFAULT; + + len += tocopy; + from += tocopy; + from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1; + } while (to != from); + + return len; +} + #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) static struct page *rb_alloc_aux_page(int node, int order) -- cgit From 3ca270fc9edb258d5bfa271bcf851614e9e6e7d4 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sun, 27 Oct 2019 18:52:38 +0800 Subject: perf/core: Provide a kernel-internal interface to recalibrate event period Currently, perf_event_period() is used by user tools via ioctl. Based on naming convention, exporting perf_event_period() for kernel users (such as KVM) who may recalibrate the event period for their assigned counter according to their requirements. The perf_event_period() is an external accessor, just like the perf_event_{en,dis}able() and should thus use perf_event_ctx_lock(). Suggested-by: Kan Liang Signed-off-by: Like Xu Acked-by: Peter Zijlstra Signed-off-by: Paolo Bonzini --- kernel/events/core.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9ec0b0bfddbd..e1b83d2731da 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5106,16 +5106,11 @@ static int perf_event_check_period(struct perf_event *event, u64 value) return event->pmu->check_period(event, value); } -static int perf_event_period(struct perf_event *event, u64 __user *arg) +static int _perf_event_period(struct perf_event *event, u64 value) { - u64 value; - if (!is_sampling_event(event)) return -EINVAL; - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - if (!value) return -EINVAL; @@ -5133,6 +5128,19 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) return 0; } +int perf_event_period(struct perf_event *event, u64 value) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_event_period(event, value); + perf_event_ctx_unlock(event, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(perf_event_period); + static const struct file_operations perf_fops; static inline int perf_fget_light(int fd, struct fd *p) @@ -5176,8 +5184,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: - return perf_event_period(event, (u64 __user *)arg); + { + u64 value; + + if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) + return -EFAULT; + return _perf_event_period(event, value); + } case PERF_EVENT_IOC_ID: { u64 id = primary_event_id(event); -- cgit From 52ba4b0b99770e892f43da1238f437155acb8b58 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sun, 27 Oct 2019 18:52:39 +0800 Subject: perf/core: Provide a kernel-internal interface to pause perf_event Exporting perf_event_pause() as an external accessor for kernel users (such as KVM) who may do both disable perf_event and read count with just one time to hold perf_event_ctx_lock. Also the value could be reset optionally. Suggested-by: Peter Zijlstra Signed-off-by: Like Xu Acked-by: Peter Zijlstra Signed-off-by: Paolo Bonzini --- kernel/events/core.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index e1b83d2731da..fc9f5ebf4849 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5029,6 +5029,24 @@ static void _perf_event_reset(struct perf_event *event) perf_event_update_userpage(event); } +/* Assume it's not an event with inherit set. */ +u64 perf_event_pause(struct perf_event *event, bool reset) +{ + struct perf_event_context *ctx; + u64 count; + + ctx = perf_event_ctx_lock(event); + WARN_ON_ONCE(event->attr.inherit); + _perf_event_disable(event); + count = local64_read(&event->count); + if (reset) + local64_set(&event->count, 0); + perf_event_ctx_unlock(event, ctx); + + return count; +} +EXPORT_SYMBOL_GPL(perf_event_pause); + /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block -- cgit From 85192dbf4de08795afe2b88e52a36fc6abfc3dba Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:03 -0800 Subject: bpf: Convert bpf_prog refcnt to atomic64_t Similarly to bpf_map's refcnt/usercnt, convert bpf_prog's refcnt to atomic64 and remove artificial 32k limit. This allows to make bpf_prog's refcounting non-failing, simplifying logic of users of bpf_prog_add/bpf_prog_inc. Validated compilation by running allyesconfig kernel build. Suggested-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191117172806.2195367-3-andriin@fb.com --- kernel/events/core.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index aec8dba2bea4..73c616876597 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10477,12 +10477,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) if (overflow_handler == bpf_overflow_handler) { - struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + struct bpf_prog *prog = parent_event->prog; - if (IS_ERR(prog)) { - err = PTR_ERR(prog); - goto err_ns; - } + bpf_prog_inc(prog); event->prog = prog; event->orig_overflow_handler = parent_event->orig_overflow_handler; -- cgit From 36b3db03b4741b8935b68fffc7e69951d8d70a89 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 15 Nov 2019 18:08:18 +0200 Subject: perf/core: Fix the mlock accounting, again Commit: 5e6c3c7b1ec2 ("perf/aux: Fix tracking of auxiliary trace buffer allocation") tried to guess the correct combination of arithmetic operations that would undo the AUX buffer's mlock accounting, and failed, leaking the bottom part when an allocation needs to be charged partially to both user->locked_vm and mm->pinned_vm, eventually leaving the user with no locked bonus: $ perf record -e intel_pt//u -m1,128 uname [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.061 MB perf.data ] $ perf record -e intel_pt//u -m1,128 uname Permission error mapping pages. Consider increasing /proc/sys/kernel/perf_event_mlock_kb, or try again with a smaller value of -m/--mmap_pages. (current value: 1,128) Fix this by subtracting both locked and pinned counts when AUX buffer is unmapped. Reported-by: Thomas Richter Tested-by: Thomas Richter Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mark Rutland Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 00a014670ed0..8f66a4833ded 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5607,10 +5607,8 @@ static void perf_mmap_close(struct vm_area_struct *vma) perf_pmu_output_stop(event); /* now it's safe to free the pages */ - if (!rb->aux_mmap_locked) - atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - else - atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); + atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm); + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); -- cgit From c4b75479741c9c3a4f0abff5baa5013d27640ac1 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 20 Nov 2019 19:06:40 +0200 Subject: perf/core: Make the mlock accounting simple again Commit: d44248a41337 ("perf/core: Rework memory accounting in perf_mmap()") does a lot of things to the mlock accounting arithmetics, while the only thing that actually needed to happen is subtracting the part that is charged to the mm from the part that is charged to the user, so that the former isn't charged twice. Signed-off-by: Alexander Shishkin Acked-by: Song Liu Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wanpeng Li Cc: Yauheni Kaliuta Cc: songliubraving@fb.com Link: https://lkml.kernel.org/r/20191120170640.54123-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f66a4833ded..7e8980d0b997 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5825,13 +5825,7 @@ accounting: user_locked = atomic_long_read(&user->locked_vm) + user_extra; - if (user_locked <= user_lock_limit) { - /* charge all to locked_vm */ - } else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) { - /* charge all to pinned_vm */ - extra = user_extra; - user_extra = 0; - } else { + if (user_locked > user_lock_limit) { /* * charge locked_vm until it hits user_lock_limit; * charge the rest from pinned_vm -- cgit From ff68dac6d65cd1347dad5d780dd8c90f29dc1b0b Mon Sep 17 00:00:00 2001 From: Gaowei Pu Date: Sat, 30 Nov 2019 17:51:03 -0800 Subject: mm/mmap.c: use IS_ERR_VALUE to check return value of get_unmapped_area MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_unmapped_area() returns an address or -errno on failure. Historically we have checked for the failure by offset_in_page() which is correct but quite hard to read. Newer code started using IS_ERR_VALUE which is much easier to read. Convert remaining users of offset_in_page as well. [mhocko@suse.com: rewrite changelog] [mhocko@kernel.org: fix mremap.c and uprobes.c sites also] Link: http://lkml.kernel.org/r/20191012102512.28051-1-pugaowei@gmail.com Signed-off-by: Gaowei Pu Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Wei Yang Cc: Konstantin Khlebnikov Cc: Kirill A. Shutemov Cc: "Jérôme Glisse" Cc: Mike Kravetz Cc: Rik van Riel Cc: Qian Cai Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c74761004ee5..ece7e13f6e4a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1457,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); - if (area->vaddr & ~PAGE_MASK) { + if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; } -- cgit