From fdccc3fb7a42ea4e4cd77d2fb8fa3a45c66ec0bf Mon Sep 17 00:00:00 2001
From: "leilei.lin" <leilei.lin@alibaba-inc.com>
Date: Wed, 9 Aug 2017 08:29:21 +0800
Subject: perf/core: Reduce context switch overhead

Skip most of the PMU context switching overhead when ctx->nr_events is 0.

50% performance overhead was observed under an extreme testcase.

Signed-off-by: leilei.lin <leilei.lin@alibaba-inc.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@kernel.org
Cc: alexander.shishkin@linux.intel.com
Cc: eranian@gmail.com
Cc: jolsa@redhat.com
Cc: linxiulei@gmail.com
Cc: yang_oliver@hotmail.com
Link: http://lkml.kernel.org/r/20170809002921.69813-1-leilei.lin@alibaba-inc.com
[ Rewrote the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel/events')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ee20d4c546b5..d704e23914bf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3211,6 +3211,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 		return;
 
 	perf_ctx_lock(cpuctx, ctx);
+	/*
+	 * We must check ctx->nr_events while holding ctx->lock, such
+	 * that we serialize against perf_install_in_context().
+	 */
+	if (!ctx->nr_events)
+		goto unlock;
+
 	perf_pmu_disable(ctx->pmu);
 	/*
 	 * We want to keep the following priority order:
@@ -3224,6 +3231,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
 	perf_pmu_enable(ctx->pmu);
+
+unlock:
 	perf_ctx_unlock(cpuctx, ctx);
 }
 
-- 
cgit 


From 2ab346cfb0decf01523949e29f5cf542f2304611 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 16 Aug 2017 17:18:16 +0100
Subject: perf/aux: Make aux_{head,wakeup} ring_buffer members long

The aux_head and aux_wakeup members of struct ring_buffer are defined
using the local_t type, despite the fact that they are only accessed via
the perf_aux_output_*() functions, which cannot race with each other for a
given ring buffer.

This patch changes the type of the members to long, so we can avoid
using the local_*() API where it isn't needed.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1502900297-21839-1-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/internal.h    |  4 ++--
 kernel/events/ring_buffer.c | 31 ++++++++++++++-----------------
 2 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'kernel/events')

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 486fd78eb8d5..2941b868353c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -38,9 +38,9 @@ struct ring_buffer {
 	struct user_struct		*mmap_user;
 
 	/* AUX area */
-	local_t				aux_head;
+	long				aux_head;
 	local_t				aux_nest;
-	local_t				aux_wakeup;
+	long				aux_wakeup;
 	unsigned long			aux_pgoff;
 	int				aux_nr_pages;
 	int				aux_overwrite;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ee97196bb151..25437fda56e3 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
 	if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
 		goto err_put;
 
-	aux_head = local_read(&rb->aux_head);
+	aux_head = rb->aux_head;
 
 	handle->rb = rb;
 	handle->event = event;
@@ -382,7 +382,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
 	 */
 	if (!rb->aux_overwrite) {
 		aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
-		handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
 		if (aux_head - aux_tail < perf_aux_size(rb))
 			handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
 
@@ -433,12 +433,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 		handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
 
 		aux_head = handle->head;
-		local_set(&rb->aux_head, aux_head);
+		rb->aux_head = aux_head;
 	} else {
 		handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
 
-		aux_head = local_read(&rb->aux_head);
-		local_add(size, &rb->aux_head);
+		aux_head = rb->aux_head;
+		rb->aux_head += size;
 	}
 
 	if (size || handle->aux_flags) {
@@ -450,11 +450,10 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 		                     handle->aux_flags);
 	}
 
-	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
-
-	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+	rb->user_page->aux_head = rb->aux_head;
+	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
 		wakeup = true;
-		local_add(rb->aux_watermark, &rb->aux_wakeup);
+		rb->aux_wakeup += rb->aux_watermark;
 	}
 
 	if (wakeup) {
@@ -478,22 +477,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
 {
 	struct ring_buffer *rb = handle->rb;
-	unsigned long aux_head;
 
 	if (size > handle->size)
 		return -ENOSPC;
 
-	local_add(size, &rb->aux_head);
+	rb->aux_head += size;
 
-	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
-	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+	rb->user_page->aux_head = rb->aux_head;
+	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
 		perf_output_wakeup(handle);
-		local_add(rb->aux_watermark, &rb->aux_wakeup);
-		handle->wakeup = local_read(&rb->aux_wakeup) +
-				 rb->aux_watermark;
+		rb->aux_wakeup += rb->aux_watermark;
+		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
 	}
 
-	handle->head = aux_head;
+	handle->head = rb->aux_head;
 	handle->size -= size;
 
 	return 0;
-- 
cgit 


From d9a50b0256f06bd39a1bed1ba40baec37c356b11 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 16 Aug 2017 17:18:17 +0100
Subject: perf/aux: Ensure aux_wakeup represents most recent wakeup index

The aux_watermark member of struct ring_buffer represents the period (in
terms of bytes) at which wakeup events should be generated when data is
written to the aux buffer in non-snapshot mode. On hardware that cannot
generate an interrupt when the aux_head reaches an arbitrary wakeup index
(such as ARM SPE), the aux_head sampled from handle->head in
perf_aux_output_{skip,end} may in fact be past the wakeup index. This
can lead to wakeup slowly falling behind the head. For example, consider
the case where hardware can only generate an interrupt on a page-boundary
and the aux buffer is initialised as follows:

  // Buffer size is 2 * PAGE_SIZE
  rb->aux_head = rb->aux_wakeup = 0
  rb->aux_watermark = PAGE_SIZE / 2

following the first perf_aux_output_begin call, the handle is
initialised with:

  handle->head = 0
  handle->size = 2 * PAGE_SIZE
  handle->wakeup = PAGE_SIZE / 2

and the hardware will be programmed to generate an interrupt at
PAGE_SIZE.

When the interrupt is raised, the hardware head will be at PAGE_SIZE,
so calling perf_aux_output_end(handle, PAGE_SIZE) puts the ring buffer
into the following state:

  rb->aux_head = PAGE_SIZE
  rb->aux_wakeup = PAGE_SIZE / 2
  rb->aux_watermark = PAGE_SIZE / 2

and then the next call to perf_aux_output_begin will result in:

  handle->head = handle->wakeup = PAGE_SIZE

for which the semantics are unclear and, for a smaller aux_watermark
(e.g. PAGE_SIZE / 4), then the wakeup would in fact be behind head at
this point.

This patch fixes the problem by rounding down the aux_head (as sampled
from the handle) to the nearest aux_watermark boundary when updating
rb->aux_wakeup, therefore taking into account any overruns by the
hardware.

Reported-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1502900297-21839-2-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/internal.h    | 2 +-
 kernel/events/ring_buffer.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/events')

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2941b868353c..5377c591c57a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -40,7 +40,7 @@ struct ring_buffer {
 	/* AUX area */
 	long				aux_head;
 	local_t				aux_nest;
-	long				aux_wakeup;
+	long				aux_wakeup;	/* last aux_watermark boundary crossed by aux_head */
 	unsigned long			aux_pgoff;
 	int				aux_nr_pages;
 	int				aux_overwrite;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 25437fda56e3..af71a84e12ee 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -453,7 +453,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 	rb->user_page->aux_head = rb->aux_head;
 	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
 		wakeup = true;
-		rb->aux_wakeup += rb->aux_watermark;
+		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
 	}
 
 	if (wakeup) {
@@ -486,7 +486,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
 	rb->user_page->aux_head = rb->aux_head;
 	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
 		perf_output_wakeup(handle);
-		rb->aux_wakeup += rb->aux_watermark;
+		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
 		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
 	}
 
-- 
cgit 


From 1d953111b648e48923171c3c9cf17be2250544fa Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 22 Aug 2017 17:59:28 +0200
Subject: perf/core: Don't report zero PIDs for exiting tasks

The exiting/dead task has no PIDs and in this case perf_event_pid/tid()
return zero, change them to return -1 to distinguish this case from
idle threads.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170822155928.GA6892@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'kernel/events')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1ac5015bab04..b411321b6c26 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx)
 	return parent_ctx;
 }
 
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
+				enum pid_type type)
 {
+	u32 nr;
 	/*
 	 * only top level events have the pid namespace they were created in
 	 */
 	if (event->parent)
 		event = event->parent;
 
-	return task_tgid_nr_ns(p, event->ns);
+	nr = __task_pid_nr_ns(p, type, event->ns);
+	/* avoid -1 if it is idle thread or runs in another ns */
+	if (!nr && !pid_alive(p))
+		nr = -1;
+	return nr;
 }
 
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
 {
-	/*
-	 * only top level events have the pid namespace they were created in
-	 */
-	if (event->parent)
-		event = event->parent;
+	return perf_event_pid_type(event, p, __PIDTYPE_TGID);
+}
 
-	return task_pid_nr_ns(p, event->ns);
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+	return perf_event_pid_type(event, p, PIDTYPE_PID);
 }
 
 /*
-- 
cgit 


From d0618410eced4eb092295fad10312a4545fcdfaf Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 22 Aug 2017 19:22:43 +0200
Subject: tracing, perf: Adjust code layout in get_recursion_context()

In an XDP redirect applications using tracepoint xdp:xdp_redirect to
diagnose TX overrun, I noticed perf_swevent_get_recursion_context()
was consuming 2% CPU. This was reduced to 1.85% with this simple
change.

Looking at the annotated asm code, it was clear that the unlikely case
in_nmi() test was chosen (by the compiler) as the most likely
event/branch.  This small adjustment makes the compiler (GCC version
7.1.1 20170622 (Red Hat 7.1.1-3)) put in_nmi() as an unlikely branch.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/150342256382.16595.986861478681783732.stgit@firesoul
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/events')

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 5377c591c57a..843e97047335 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -208,7 +208,7 @@ static inline int get_recursion_context(int *recursion)
 {
 	int rctx;
 
-	if (in_nmi())
+	if (unlikely(in_nmi()))
 		rctx = 3;
 	else if (in_irq())
 		rctx = 2;
-- 
cgit 


From 8d4e6c4caa12dafbcba138e5450b7af17b0b2194 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Thu, 30 Mar 2017 18:39:56 +0300
Subject: perf/core, pt, bts: Get rid of itrace_started

I just noticed that hw.itrace_started and hw.config are aliased to the
same location. Now, the PT driver happens to use both, which works out
fine by sheer luck:

 - STORE(hw.itrace_start) is ordered before STORE(hw.config), in the
    program order, although there are no compiler barriers to ensure that,

 - to the perf_log_itrace_start() hw.itrace_start looks set at the same
   time as when it is intended to be set because both stores happen in the
   same path,

 - hw.config is never reset to zero in the PT driver.

Now, the use of hw.config by the PT driver makes more sense (it being a
HW PMU) than messing around with itrace_started, which is an awkward API
to begin with.

This patch replaces hw.itrace_started with an attach_state bit and an
API call for the PMU drivers to use to communicate the condition.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20170330153956.25994-1-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel/events')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5467e107624..77fd6b11ef22 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7301,6 +7301,11 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_output_end(&handle);
 }
 
+void perf_event_itrace_started(struct perf_event *event)
+{
+	event->attach_state |= PERF_ATTACH_ITRACE;
+}
+
 static void perf_log_itrace_start(struct perf_event *event)
 {
 	struct perf_output_handle handle;
@@ -7316,7 +7321,7 @@ static void perf_log_itrace_start(struct perf_event *event)
 		event = event->parent;
 
 	if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
-	    event->hw.itrace_started)
+	    event->attach_state & PERF_ATTACH_ITRACE)
 		return;
 
 	rec.header.type	= PERF_RECORD_ITRACE_START;
-- 
cgit 


From fc7ce9c74c3ad232b084d80148654f926d01ece7 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Mon, 28 Aug 2017 20:52:49 -0400
Subject: perf/core, x86: Add PERF_SAMPLE_PHYS_ADDR

For understanding how the workload maps to memory channels and hardware
behavior, it's very important to collect address maps with physical
addresses. For example, 3D XPoint access can only be found by filtering
the physical address.

Add a new sample type for physical address.

perf already has a facility to collect data virtual address. This patch
introduces a function to convert the virtual address to physical address.
The function is quite generic and can be extended to any architecture as
long as a virtual address is provided.

 - For kernel direct mapping addresses, virt_to_phys is used to convert
   the virtual addresses to physical address.

 - For user virtual addresses, __get_user_pages_fast is used to walk the
   pages tables for user physical address.

 - This does not work for vmalloc addresses right now. These are not
   resolved, but code to do that could be added.

The new sample type requires collecting the virtual address. The
virtual address will not be output unless SAMPLE_ADDR is applied.

For security, the physical address can only be exposed to root or
privileged user.

Tested-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Cc: mpe@ellerman.id.au
Link: http://lkml.kernel.org/r/1503967969-48278-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'kernel/events')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 77fd6b11ef22..ce64f3fed5c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1575,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		size += sizeof(data->txn);
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		size += sizeof(data->phys_addr);
+
 	event->header_size = size;
 }
 
@@ -6017,6 +6020,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		perf_output_put(handle, data->phys_addr);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6032,6 +6038,38 @@ void perf_output_sample(struct perf_output_handle *handle,
 	}
 }
 
+static u64 perf_virt_to_phys(u64 virt)
+{
+	u64 phys_addr = 0;
+	struct page *p = NULL;
+
+	if (!virt)
+		return 0;
+
+	if (virt >= TASK_SIZE) {
+		/* If it's vmalloc()d memory, leave phys_addr as 0 */
+		if (virt_addr_valid((void *)(uintptr_t)virt) &&
+		    !(virt >= VMALLOC_START && virt < VMALLOC_END))
+			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+	} else {
+		/*
+		 * Walking the pages tables for user address.
+		 * Interrupts are disabled, so it prevents any tear down
+		 * of the page tables.
+		 * Try IRQ-safe __get_user_pages_fast first.
+		 * If failed, leave phys_addr as 0.
+		 */
+		if ((current->mm != NULL) &&
+		    (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+			phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+		if (p)
+			put_page(p);
+	}
+
+	return phys_addr;
+}
+
 void perf_prepare_sample(struct perf_event_header *header,
 			 struct perf_sample_data *data,
 			 struct perf_event *event,
@@ -6150,6 +6188,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 		header->size += size;
 	}
+
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		data->phys_addr = perf_virt_to_phys(data->addr);
 }
 
 static void __always_inline
@@ -9909,6 +9950,11 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/* Only privileged users can get physical addresses */
+	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
 	if (!attr.sample_max_stack)
 		attr.sample_max_stack = sysctl_perf_event_max_stack;
 
-- 
cgit