From fdccc3fb7a42ea4e4cd77d2fb8fa3a45c66ec0bf Mon Sep 17 00:00:00 2001 From: "leilei.lin" Date: Wed, 9 Aug 2017 08:29:21 +0800 Subject: perf/core: Reduce context switch overhead Skip most of the PMU context switching overhead when ctx->nr_events is 0. 50% performance overhead was observed under an extreme testcase. Signed-off-by: leilei.lin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: eranian@gmail.com Cc: jolsa@redhat.com Cc: linxiulei@gmail.com Cc: yang_oliver@hotmail.com Link: http://lkml.kernel.org/r/20170809002921.69813-1-leilei.lin@alibaba-inc.com [ Rewrote the changelog. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index ee20d4c546b5..d704e23914bf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3211,6 +3211,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, return; perf_ctx_lock(cpuctx, ctx); + /* + * We must check ctx->nr_events while holding ctx->lock, such + * that we serialize against perf_install_in_context(). 
+ */ + if (!ctx->nr_events) + goto unlock; + perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: @@ -3224,6 +3231,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); + +unlock: perf_ctx_unlock(cpuctx, ctx); } -- cgit From 2ab346cfb0decf01523949e29f5cf542f2304611 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Aug 2017 17:18:16 +0100 Subject: perf/aux: Make aux_{head,wakeup} ring_buffer members long The aux_head and aux_wakeup members of struct ring_buffer are defined using the local_t type, despite the fact that they are only accessed via the perf_aux_output_*() functions, which cannot race with each other for a given ring buffer. This patch changes the type of the members to long, so we can avoid using the local_*() API where it isn't needed. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1502900297-21839-1-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 4 ++-- kernel/events/ring_buffer.c | 31 ++++++++++++++----------------- 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 486fd78eb8d5..2941b868353c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -38,9 +38,9 @@ struct ring_buffer { struct user_struct *mmap_user; /* AUX area */ - local_t aux_head; + long aux_head; local_t aux_nest; - local_t aux_wakeup; + long aux_wakeup; unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ee97196bb151..25437fda56e3 100644 --- a/kernel/events/ring_buffer.c +++ 
b/kernel/events/ring_buffer.c @@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) goto err_put; - aux_head = local_read(&rb->aux_head); + aux_head = rb->aux_head; handle->rb = rb; handle->event = event; @@ -382,7 +382,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, */ if (!rb->aux_overwrite) { aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); - handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; + handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); @@ -433,12 +433,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; aux_head = handle->head; - local_set(&rb->aux_head, aux_head); + rb->aux_head = aux_head; } else { handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; - aux_head = local_read(&rb->aux_head); - local_add(size, &rb->aux_head); + aux_head = rb->aux_head; + rb->aux_head += size; } if (size || handle->aux_flags) { @@ -450,11 +450,10 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->aux_flags); } - aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); - - if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + rb->user_page->aux_head = rb->aux_head; + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { wakeup = true; - local_add(rb->aux_watermark, &rb->aux_wakeup); + rb->aux_wakeup += rb->aux_watermark; } if (wakeup) { @@ -478,22 +477,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { struct ring_buffer *rb = handle->rb; - unsigned long aux_head; if (size > handle->size) return -ENOSPC; - local_add(size, &rb->aux_head); + rb->aux_head += size; - aux_head = rb->user_page->aux_head = 
local_read(&rb->aux_head); - if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + rb->user_page->aux_head = rb->aux_head; + if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { perf_output_wakeup(handle); - local_add(rb->aux_watermark, &rb->aux_wakeup); - handle->wakeup = local_read(&rb->aux_wakeup) + - rb->aux_watermark; + rb->aux_wakeup += rb->aux_watermark; + handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } - handle->head = aux_head; + handle->head = rb->aux_head; handle->size -= size; return 0; -- cgit From d9a50b0256f06bd39a1bed1ba40baec37c356b11 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Aug 2017 17:18:17 +0100 Subject: perf/aux: Ensure aux_wakeup represents most recent wakeup index The aux_watermark member of struct ring_buffer represents the period (in terms of bytes) at which wakeup events should be generated when data is written to the aux buffer in non-snapshot mode. On hardware that cannot generate an interrupt when the aux_head reaches an arbitrary wakeup index (such as ARM SPE), the aux_head sampled from handle->head in perf_aux_output_{skip,end} may in fact be past the wakeup index. This can lead to wakeup slowly falling behind the head. For example, consider the case where hardware can only generate an interrupt on a page-boundary and the aux buffer is initialised as follows: // Buffer size is 2 * PAGE_SIZE rb->aux_head = rb->aux_wakeup = 0 rb->aux_watermark = PAGE_SIZE / 2 following the first perf_aux_output_begin call, the handle is initialised with: handle->head = 0 handle->size = 2 * PAGE_SIZE handle->wakeup = PAGE_SIZE / 2 and the hardware will be programmed to generate an interrupt at PAGE_SIZE. 
When the interrupt is raised, the hardware head will be at PAGE_SIZE, so calling perf_aux_output_end(handle, PAGE_SIZE) puts the ring buffer into the following state: rb->aux_head = PAGE_SIZE rb->aux_wakeup = PAGE_SIZE / 2 rb->aux_watermark = PAGE_SIZE / 2 and then the next call to perf_aux_output_begin will result in: handle->head = handle->wakeup = PAGE_SIZE for which the semantics are unclear and, for a smaller aux_watermark (e.g. PAGE_SIZE / 4), then the wakeup would in fact be behind head at this point. This patch fixes the problem by rounding down the aux_head (as sampled from the handle) to the nearest aux_watermark boundary when updating rb->aux_wakeup, therefore taking into account any overruns by the hardware. Reported-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1502900297-21839-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 2 +- kernel/events/ring_buffer.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2941b868353c..5377c591c57a 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -40,7 +40,7 @@ struct ring_buffer { /* AUX area */ long aux_head; local_t aux_nest; - long aux_wakeup; + long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 25437fda56e3..af71a84e12ee 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -453,7 +453,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) rb->user_page->aux_head = rb->aux_head; if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { 
wakeup = true; - rb->aux_wakeup += rb->aux_watermark; + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); } if (wakeup) { @@ -486,7 +486,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->user_page->aux_head = rb->aux_head; if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { perf_output_wakeup(handle); - rb->aux_wakeup += rb->aux_watermark; + rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } -- cgit From 1d953111b648e48923171c3c9cf17be2250544fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Aug 2017 17:59:28 +0200 Subject: perf/core: Don't report zero PIDs for exiting tasks The exiting/dead task has no PIDs and in this case perf_event_pid/tid() return zero, change them to return -1 to distinguish this case from idle threads. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170822155928.GA6892@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1ac5015bab04..b411321b6c26 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx) return parent_ctx; } -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, + enum pid_type type) { + u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; - return task_tgid_nr_ns(p, event->ns); + nr = __task_pid_nr_ns(p, type, event->ns); + /* avoid -1 if it is idle thread or runs in another ns */ + if (!nr && !pid_alive(p)) + nr = -1; + return nr; } -static u32 
perf_event_tid(struct perf_event *event, struct task_struct *p) +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; + return perf_event_pid_type(event, p, __PIDTYPE_TGID); +} - return task_pid_nr_ns(p, event->ns); +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + return perf_event_pid_type(event, p, PIDTYPE_PID); } /* -- cgit From d0618410eced4eb092295fad10312a4545fcdfaf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 22 Aug 2017 19:22:43 +0200 Subject: tracing, perf: Adjust code layout in get_recursion_context() In an XDP redirect application using the xdp:xdp_redirect tracepoint to diagnose TX overrun, I noticed perf_swevent_get_recursion_context() was consuming 2% CPU. This was reduced to 1.85% with this simple change. Looking at the annotated asm code, it was clear that the unlikely case in_nmi() test was chosen (by the compiler) as the most likely event/branch. This small adjustment makes the compiler (GCC version 7.1.1 20170622 (Red Hat 7.1.1-3)) put in_nmi() as an unlikely branch. 
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/150342256382.16595.986861478681783732.stgit@firesoul Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 5377c591c57a..843e97047335 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -208,7 +208,7 @@ static inline int get_recursion_context(int *recursion) { int rctx; - if (in_nmi()) + if (unlikely(in_nmi())) rctx = 3; else if (in_irq()) rctx = 2; -- cgit From 8d4e6c4caa12dafbcba138e5450b7af17b0b2194 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Mar 2017 18:39:56 +0300 Subject: perf/core, pt, bts: Get rid of itrace_started I just noticed that hw.itrace_started and hw.config are aliased to the same location. Now, the PT driver happens to use both, which works out fine by sheer luck: - STORE(hw.itrace_started) is ordered before STORE(hw.config), in the program order, although there are no compiler barriers to ensure that, - to perf_log_itrace_start(), hw.itrace_started looks set at the same time as when it is intended to be set because both stores happen in the same path, - hw.config is never reset to zero in the PT driver. Now, the use of hw.config by the PT driver makes more sense (it being a HW PMU) than messing around with itrace_started, which is an awkward API to begin with. This patch replaces hw.itrace_started with an attach_state bit and an API call for the PMU drivers to use to communicate the condition. 
Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/20170330153956.25994-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index e5467e107624..77fd6b11ef22 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7301,6 +7301,11 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +void perf_event_itrace_started(struct perf_event *event) +{ + event->attach_state |= PERF_ATTACH_ITRACE; +} + static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; @@ -7316,7 +7321,7 @@ static void perf_log_itrace_start(struct perf_event *event) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || - event->hw.itrace_started) + event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; -- cgit From fc7ce9c74c3ad232b084d80148654f926d01ece7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 28 Aug 2017 20:52:49 -0400 Subject: perf/core, x86: Add PERF_SAMPLE_PHYS_ADDR For understanding how the workload maps to memory channels and hardware behavior, it's very important to collect address maps with physical addresses. For example, 3D XPoint access can only be found by filtering the physical address. Add a new sample type for physical address. perf already has a facility to collect data virtual address. This patch introduces a function to convert the virtual address to physical address. The function is quite generic and can be extended to any architecture as long as a virtual address is provided. 
- For kernel direct mapping addresses, virt_to_phys is used to convert the virtual address to a physical address. - For user virtual addresses, __get_user_pages_fast is used to walk the page tables to obtain the user physical address. - This does not work for vmalloc addresses right now. These are not resolved, but code to do that could be added. The new sample type requires collecting the virtual address. The virtual address will not be output unless SAMPLE_ADDR is applied. For security, the physical address can only be exposed to root or a privileged user. Tested-by: Madhavan Srinivasan Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mpe@ellerman.id.au Link: http://lkml.kernel.org/r/1503967969-48278-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index 77fd6b11ef22..ce64f3fed5c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1575,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + size += sizeof(data->phys_addr); + event->header_size = size; } @@ -6017,6 +6020,9 @@ void perf_output_sample(struct perf_output_handle *handle, } } + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + perf_output_put(handle, data->phys_addr); + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -6032,6 +6038,38 @@ void perf_output_sample(struct perf_output_handle *handle, } } +static u64 perf_virt_to_phys(u64 virt) +{ + u64 phys_addr = 0; + struct page *p = NULL; + + if (!virt) + return 0; + + if 
(virt >= TASK_SIZE) { + /* If it's vmalloc()d memory, leave phys_addr as 0 */ + if (virt_addr_valid((void *)(uintptr_t)virt) && + !(virt >= VMALLOC_START && virt < VMALLOC_END)) + phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); + } else { + /* + * Walking the pages tables for user address. + * Interrupts are disabled, so it prevents any tear down + * of the page tables. + * Try IRQ-safe __get_user_pages_fast first. + * If failed, leave phys_addr as 0. + */ + if ((current->mm != NULL) && + (__get_user_pages_fast(virt, 1, 0, &p) == 1)) + phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + + if (p) + put_page(p); + } + + return phys_addr; +} + void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, @@ -6150,6 +6188,9 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + data->phys_addr = perf_virt_to_phys(data->addr); } static void __always_inline @@ -9909,6 +9950,11 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* Only privileged users can get physical addresses */ + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && + perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!attr.sample_max_stack) attr.sample_max_stack = sysctl_perf_event_max_stack; -- cgit