539 files changed, 26459 insertions, 14916 deletions
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index 95b30fa25d56..62e847bcdcdd 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -2080,6 +2080,8 @@ Some of the relevant points of interest are as follows:
 <li>	<a href="#Scheduler and RCU">Scheduler and RCU</a>.
 <li>	<a href="#Tracing and RCU">Tracing and RCU</a>.
 <li>	<a href="#Energy Efficiency">Energy Efficiency</a>.
+<li>	<a href="#Scheduling-Clock Interrupts and RCU">
+	Scheduling-Clock Interrupts and RCU</a>.
 <li>	<a href="#Memory Efficiency">Memory Efficiency</a>.
 <li>	<a href="#Performance, Scalability, Response Time, and Reliability">
 	Performance, Scalability, Response Time, and Reliability</a>.
@@ -2532,6 +2534,134 @@ I learned of many of these requirements via angry phone calls:
 Flaming me on the Linux-kernel mailing list was apparently not
 sufficient to fully vent their ire at RCU's energy-efficiency bugs!
 
+<h3><a name="Scheduling-Clock Interrupts and RCU">
+Scheduling-Clock Interrupts and RCU</a></h3>
+
+<p>
+The kernel transitions between in-kernel non-idle execution, userspace
+execution, and the idle loop.
+Depending on kernel configuration, RCU handles these states differently:
+
+<table border=3>
+<tr><th><tt>HZ</tt> Kconfig</th>
+	<th>In-Kernel</th>
+		<th>Usermode</th>
+			<th>Idle</th></tr>
+<tr><th align="left"><tt>HZ_PERIODIC</tt></th>
+	<td>Can rely on scheduling-clock interrupt.</td>
+		<td>Can rely on scheduling-clock interrupt and its
+		    detection of interrupt from usermode.</td>
+			<td>Can rely on RCU's dyntick-idle detection.</td></tr>
+<tr><th align="left"><tt>NO_HZ_IDLE</tt></th>
+	<td>Can rely on scheduling-clock interrupt.</td>
+		<td>Can rely on scheduling-clock interrupt and its
+		    detection of interrupt from usermode.</td>
+			<td>Can rely on RCU's dyntick-idle detection.</td></tr>
+<tr><th align="left"><tt>NO_HZ_FULL</tt></th>
+	<td>Can only sometimes rely on scheduling-clock interrupt.
+	    In other cases, it is necessary to bound kernel execution
+	    times and/or use IPIs.</td>
+		<td>Can rely on RCU's dyntick-idle detection.</td>
+			<td>Can rely on RCU's dyntick-idle detection.</td></tr>
+</table>
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+	Why can't <tt>NO_HZ_FULL</tt> in-kernel execution rely on the
+	scheduling-clock interrupt, just like <tt>HZ_PERIODIC</tt>
+	and <tt>NO_HZ_IDLE</tt> do?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+	Because, as a performance optimization, <tt>NO_HZ_FULL</tt>
+	does not necessarily re-enable the scheduling-clock interrupt
+	on entry to each and every system call.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<p>
+However, RCU must be reliably informed as to whether any given
+CPU is currently in the idle loop, and, for <tt>NO_HZ_FULL</tt>,
+also whether that CPU is executing in usermode, as discussed
+<a href="#Energy Efficiency">earlier</a>.
+It also requires that the scheduling-clock interrupt be enabled when
+RCU needs it to be:
+
+<ol>
+<li>	If a CPU is either idle or executing in usermode, and RCU believes
+	it is non-idle, the scheduling-clock tick had better be running.
+	Otherwise, you will get RCU CPU stall warnings.  Or at best,
+	very long (11-second) grace periods, with a pointless IPI waking
+	the CPU from time to time.
+<li>	If a CPU is in a portion of the kernel that executes RCU read-side
+	critical sections, and RCU believes this CPU to be idle, you will get
+	random memory corruption.  <b>DON'T DO THIS!!!</b>
+
+	<br>This is one reason to test with lockdep, which will complain
+	about this sort of thing.
+<li>	If a CPU is in a portion of the kernel that is absolutely
+	positively no-joking guaranteed to never execute any RCU read-side
+	critical sections, and RCU believes this CPU to be idle,
+	no problem.  This sort of thing is used by some architectures
+	for light-weight exception handlers, which can then avoid the
+	overhead of <tt>rcu_irq_enter()</tt> and <tt>rcu_irq_exit()</tt>
+	at exception entry and exit, respectively.
+	Some go further and avoid the entireties of <tt>irq_enter()</tt>
+	and <tt>irq_exit()</tt>.
+
+	<br>Just make very sure you are running some of your tests with
+	<tt>CONFIG_PROVE_RCU=y</tt>, just in case one of your code paths
+	was in fact joking about not doing RCU read-side critical sections.
+<li>	If a CPU is executing in the kernel with the scheduling-clock
+	interrupt disabled and RCU believes this CPU to be non-idle,
+	and if the CPU goes idle (from an RCU perspective) every few
+	jiffies, no problem.  It is usually OK for there to be the
+	occasional gap between idle periods of up to a second or so.
+
+	<br>If the gap grows too long, you get RCU CPU stall warnings.
+<li>	If a CPU is either idle or executing in usermode, and RCU believes
+	it to be idle, of course no problem.
+<li>	If a CPU is executing in the kernel, the kernel code
+	path is passing through quiescent states at a reasonable
+	frequency (preferably about once per few jiffies, but the
+	occasional excursion to a second or so is usually OK) and the
+	scheduling-clock interrupt is enabled, of course no problem.
+
+	<br>If the gap between a successive pair of quiescent states grows
+	too long, you get RCU CPU stall warnings.
+</ol>
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+	But what if my driver has a hardware interrupt handler
+	that can run for many seconds?
+	I cannot invoke <tt>schedule()</tt> from a hardware
+	interrupt handler, after all!
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+	One approach is to do <tt>rcu_irq_exit();rcu_irq_enter();</tt>
+	every so often.
+	But given that long-running interrupt handlers can cause
+	other problems, not least for response time, shouldn't you
+	work to keep your interrupt handler's runtime within reasonable
+	bounds?
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<p>
+But as long as RCU is properly informed of kernel state transitions between
+in-kernel execution, usermode execution, and idle, and as long as the
+scheduling-clock interrupt is enabled when RCU needs it to be, you
+can rest assured that the bugs you encounter will be in some other
+part of RCU or some other part of the kernel!
+
 <h3><a name="Memory Efficiency">Memory Efficiency</a></h3>
 
 <p>
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 6beda556faf3..49747717d905 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -23,6 +23,14 @@ over a rather long period of time, but improvements are always welcome!
 	Yet another exception is where the low real-time latency of RCU's
 	read-side primitives is critically important.
 
+	One final exception is where RCU readers are used to prevent
+	the ABA problem (https://en.wikipedia.org/wiki/ABA_problem)
+	for lockless updates.  This does result in the mildly
+	counter-intuitive situation where rcu_read_lock() and
+	rcu_read_unlock() are used to protect updates, however, this
+	approach provides the same potential simplifications that garbage
+	collectors do.
+
 1.	Does the update code have proper mutual exclusion?
 
 	RCU does allow -readers- to run (almost) naked, but -writers- must
@@ -40,7 +48,9 @@ over a rather long period of time, but improvements are always welcome!
 	explain how this single task does not become a major bottleneck on
 	big multiprocessor machines (for example, if the task is updating
 	information relating to itself that other tasks can read, there
-	by definition can be no bottleneck).
+	by definition can be no bottleneck).  Note that the definition
+	of "large" has changed significantly:  Eight CPUs was "large"
+	in the year 2000, but a hundred CPUs was unremarkable in 2017.
 
 2.	Do the RCU read-side critical sections make proper use of
 	rcu_read_lock() and friends?  These primitives are needed
@@ -55,6 +65,12 @@ over a rather long period of time, but improvements are always welcome!
 	Disabling of preemption can serve as rcu_read_lock_sched(), but
 	is less readable.
 
+	Letting RCU-protected pointers "leak" out of an RCU read-side
+	critical section is every bit as bad as letting them leak out
+	from under a lock.  Unless, of course, you have arranged some
+	other means of protection, such as a lock or a reference count
+	-before- letting them out of the RCU read-side critical section.
+
 3.	Does the update code tolerate concurrent accesses?
 
 	The whole point of RCU is to permit readers to run without
@@ -78,10 +94,10 @@ over a rather long period of time, but improvements are always welcome!
 
 		This works quite well, also.
 
-	c.	Make updates appear atomic to readers.  For example,
+	c.	Make updates appear atomic to readers.	For example,
 		pointer updates to properly aligned fields will
 		appear atomic, as will individual atomic primitives.
-		Sequences of perations performed under a lock will -not-
+		Sequences of operations performed under a lock will -not-
 		appear to be atomic to RCU readers, nor will sequences
 		of multiple atomic primitives.
 
@@ -168,8 +184,8 @@ over a rather long period of time, but improvements are always welcome!
 
 5.	If call_rcu(), or a related primitive such as call_rcu_bh(),
 	call_rcu_sched(), or call_srcu() is used, the callback function
-	must be written to be called from softirq context.  In particular,
-	it cannot block.
+	will be called from softirq context.  In particular, it cannot
+	block.
 
 6.	Since synchronize_rcu() can block, it cannot be called from
 	any sort of irq context.  The same rule applies for
@@ -178,11 +194,14 @@ over a rather long period of time, but improvements are always welcome!
 	synchronize_sched_expedite(), and synchronize_srcu_expedited().
 
 	The expedited forms of these primitives have the same semantics
-	as the non-expedited forms, but expediting is both expensive
-	and unfriendly to real-time workloads.	Use of the expedited
-	primitives should be restricted to rare configuration-change
-	operations that would not normally be undertaken while a real-time
-	workload is running.
+	as the non-expedited forms, but expediting is both expensive and
+	(with the exception of synchronize_srcu_expedited()) unfriendly
+	to real-time workloads.  Use of the expedited primitives should
+	be restricted to rare configuration-change operations that would
+	not normally be undertaken while a real-time workload is running.
+	However, real-time workloads can use the rcupdate.rcu_normal kernel
+	boot parameter to completely disable expedited grace periods,
+	though this might have performance implications.
 
 	In particular, if you find yourself invoking one of the expedited
 	primitives repeatedly in a loop, please do everyone a favor:
@@ -193,11 +212,6 @@ over a rather long period of time, but improvements are always welcome!
 	of the system, especially to real-time workloads running on
 	the rest of the system.
 
-	In addition, it is illegal to call the expedited forms from
-	a CPU-hotplug notifier, or while holding a lock that is acquired
-	by a CPU-hotplug notifier.  Failing to observe this restriction
-	will result in deadlock.
-
 7.	If the updater uses call_rcu() or synchronize_rcu(), then the
 	corresponding readers must use rcu_read_lock() and
 	rcu_read_unlock().  If the updater uses call_rcu_bh() or
@@ -321,7 +335,7 @@ over a rather long period of time, but improvements are always welcome!
 	Similarly, disabling preemption is not an acceptable substitute
 	for rcu_read_lock().  Code that attempts to use preemption
 	disabling where it should be using rcu_read_lock() will break
-	in real-time kernel builds.
+	in CONFIG_PREEMPT=y kernel builds.
 
 	If you want to wait for interrupt handlers, NMI handlers, and
 	code under the influence of preempt_disable(), you instead
@@ -356,23 +370,22 @@ over a rather long period of time, but improvements are always welcome!
 	not the case, a self-spawning RCU callback would prevent the
 	victim CPU from ever going offline.)
 
-14.	SRCU (srcu_read_lock(), srcu_read_unlock(), srcu_dereference(),
-	synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu())
-	may only be invoked from process context.  Unlike other forms of
-	RCU, it -is- permissible to block in an SRCU read-side critical
-	section (demarked by srcu_read_lock() and srcu_read_unlock()),
-	hence the "SRCU": "sleepable RCU".  Please note that if you
-	don't need to sleep in read-side critical sections, you should be
-	using RCU rather than SRCU, because RCU is almost always faster
-	and easier to use than is SRCU.
-
-	Also unlike other forms of RCU, explicit initialization
-	and cleanup is required via init_srcu_struct() and
-	cleanup_srcu_struct().	These are passed a "struct srcu_struct"
-	that defines the scope of a given SRCU domain.	Once initialized,
-	the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock()
-	synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
-	A given synchronize_srcu() waits only for SRCU read-side critical
+14.	Unlike other forms of RCU, it -is- permissible to block in an
+	SRCU read-side critical section (demarked by srcu_read_lock()
+	and srcu_read_unlock()), hence the "SRCU": "sleepable RCU".
+	Please note that if you don't need to sleep in read-side critical
+	sections, you should be using RCU rather than SRCU, because RCU
+	is almost always faster and easier to use than is SRCU.
+
+	Also unlike other forms of RCU, explicit initialization and
+	cleanup is required either at build time via DEFINE_SRCU()
+	or DEFINE_STATIC_SRCU() or at runtime via init_srcu_struct()
+	and cleanup_srcu_struct().  These last two are passed a
+	"struct srcu_struct" that defines the scope of a given
+	SRCU domain.  Once initialized, the srcu_struct is passed
+	to srcu_read_lock(), srcu_read_unlock(), synchronize_srcu(),
+	synchronize_srcu_expedited(), and call_srcu().	A given
+	synchronize_srcu() waits only for SRCU read-side critical
 	sections governed by srcu_read_lock() and srcu_read_unlock()
 	calls that have been passed the same srcu_struct.  This property
 	is what makes sleeping read-side critical sections tolerable --
@@ -390,10 +403,16 @@ over a rather long period of time, but improvements are always welcome!
 	Therefore, SRCU should be used in preference to rw_semaphore
 	only in extremely read-intensive situations, or in situations
 	requiring SRCU's read-side deadlock immunity or low read-side
-	realtime latency.
+	realtime latency.  You should also consider percpu_rw_semaphore
+	when you need lightweight readers.
 
-	Note that, rcu_assign_pointer() relates to SRCU just as it does
-	to other forms of RCU.
+	SRCU's expedited primitive (synchronize_srcu_expedited())
+	never sends IPIs to other CPUs, so it is easier on
+	real-time workloads than is synchronize_rcu_expedited(),
+	synchronize_rcu_bh_expedited() or synchronize_sched_expedited().
+
+	Note that rcu_dereference() and rcu_assign_pointer() relate to
+	SRCU just as they do to other forms of RCU.
 
 15.	The whole point of call_rcu(), synchronize_rcu(), and friends
 	is to wait until all pre-existing readers have finished before
@@ -435,3 +454,33 @@ over a rather long period of time, but improvements are always welcome!
 
 	These debugging aids can help you find problems that are
 	otherwise extremely difficult to spot.
+
+18.	If you register a callback using call_rcu(), call_rcu_bh(),
+	call_rcu_sched(), or call_srcu(), and pass in a function defined
+	within a loadable module, then it is necessary to wait for
+	all pending callbacks to be invoked after the last invocation
+	and before unloading that module.  Note that it is absolutely
+	-not- sufficient to wait for a grace period!  The current (say)
+	synchronize_rcu() implementation waits only for all previous
+	callbacks registered on the CPU that synchronize_rcu() is running
+	on, but it is -not- guaranteed to wait for callbacks registered
+	on other CPUs.
+
+	You instead need to use one of the barrier functions:
+
+	o	call_rcu() -> rcu_barrier()
+	o	call_rcu_bh() -> rcu_barrier_bh()
+	o	call_rcu_sched() -> rcu_barrier_sched()
+	o	call_srcu() -> srcu_barrier()
+
+	However, these barrier functions are absolutely -not- guaranteed
+	to wait for a grace period.  In fact, if there are no call_rcu()
+	callbacks waiting anywhere in the system, rcu_barrier() is within
+	its rights to return immediately.
+
+	So if you need to wait for both an RCU grace period and for
+	all pre-existing call_rcu() callbacks, you will need to execute
+	both rcu_barrier() and synchronize_rcu(), if necessary, using
+	something like workqueues to execute them concurrently.
+
+	See rcubarrier.txt for more information.
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 745f429fda79..7d4ae110c2c9 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -76,15 +76,12 @@ o	I hear that RCU is patented?  What is with that?
 	Of these, one was allowed to lapse by the assignee, and the
 	others have been contributed to the Linux kernel under GPL.
 	There are now also LGPL implementations of user-level RCU
-	available (http://lttng.org/?q=node/18).
+	available (http://liburcu.org/).
 
 o	I hear that RCU needs work in order to support realtime kernels?
 
-	This work is largely completed.  Realtime-friendly RCU can be
-	enabled via the CONFIG_PREEMPT_RCU kernel configuration
-	parameter.  However, work is in progress for enabling priority
-	boosting of preempted RCU read-side critical sections.	This is
-	needed if you have CPU-bound realtime threads.
+	Realtime-friendly RCU can be enabled via the CONFIG_PREEMPT_RCU
+	kernel configuration parameter.
 
 o	Where can I find more information on RCU?
 
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt
index b2a613f16d74..1acb26b09b48 100644
--- a/Documentation/RCU/rcu_dereference.txt
+++ b/Documentation/RCU/rcu_dereference.txt
@@ -25,35 +25,35 @@ o	You must use one of the rcu_dereference() family of primitives
 	for an example where the compiler can in fact deduce the exact
 	value of the pointer, and thus cause misordering.
 
+o	You are only permitted to use rcu_dereference on pointer values.
+	The compiler simply knows too much about integral values to
+	trust it to carry dependencies through integer operations.
+	There are a very few exceptions, namely that you can temporarily
+	cast the pointer to uintptr_t in order to:
+
+	o	Set bits and clear bits down in the must-be-zero low-order
+		bits of that pointer.  This clearly means that the pointer
+		must have alignment constraints, for example, this does
+		-not- work in general for char* pointers.
+
+	o	XOR bits to translate pointers, as is done in some
+		classic buddy-allocator algorithms.
+
+	It is important to cast the value back to pointer before
+	doing much of anything else with it.
+
 o	Avoid cancellation when using the "+" and "-" infix arithmetic
 	operators.  For example, for a given variable "x", avoid
-	"(x-x)".  There are similar arithmetic pitfalls from other
-	arithmetic operators, such as "(x*0)", "(x/(x+1))" or "(x%1)".
-	The compiler is within its rights to substitute zero for all of
-	these expressions, so that subsequent accesses no longer depend
-	on the rcu_dereference(), again possibly resulting in bugs due
-	to misordering.
+	"(x-(uintptr_t)x)" for char* pointers.	The compiler is within its
+	rights to substitute zero for this sort of expression, so that
+	subsequent accesses no longer depend on the rcu_dereference(),
+	again possibly resulting in bugs due to misordering.
 
 	Of course, if "p" is a pointer from rcu_dereference(), and "a"
 	and "b" are integers that happen to be equal, the expression
 	"p+a-b" is safe because its value still necessarily depends on
 	the rcu_dereference(), thus maintaining proper ordering.
 
-o	Avoid all-zero operands to the bitwise "&" operator, and
-	similarly avoid all-ones operands to the bitwise "|" operator.
-	If the compiler is able to deduce the value of such operands,
-	it is within its rights to substitute the corresponding constant
-	for the bitwise operation.  Once again, this causes subsequent
-	accesses to no longer depend on the rcu_dereference(), causing
-	bugs due to misordering.
-
-	Please note that single-bit operands to bitwise "&" can also
-	be dangerous.  At this point, the compiler knows that the
-	resulting value can only take on one of two possible values.
-	Therefore, a very small amount of additional information will
-	allow the compiler to deduce the exact value, which again can
-	result in misordering.
-
 o	If you are using RCU to protect JITed functions, so that the
 	"()" function-invocation operator is applied to a value obtained
 	(directly or indirectly) from rcu_dereference(), you may need to
@@ -61,25 +61,6 @@ o	If you are using RCU to protect JITed functions, so that the
 	This issue arises on some systems when a newly JITed function is
 	using the same memory that was used by an earlier JITed function.
 
-o	Do not use the results from the boolean "&&" and "||" when
-	dereferencing.	For example, the following (rather improbable)
-	code is buggy:
-
-		int *p;
-		int *q;
-
-		...
-
-		p = rcu_dereference(gp)
-		q = &global_q;
-		q += p != &oom_p1 && p != &oom_p2;
-		r1 = *q;  /* BUGGY!!! */
-
-	The reason this is buggy is that "&&" and "||" are often compiled
-	using branches.  While weak-memory machines such as ARM or PowerPC
-	do order stores after such branches, they can speculate loads,
-	which can result in misordering bugs.
-
 o	Do not use the results from relational operators ("==", "!=",
 	">", ">=", "<", or "<=") when dereferencing.  For example,
 	the following (quite strange) code is buggy:
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt
index b10cfe711e68..5d7759071a3e 100644
--- a/Documentation/RCU/rcubarrier.txt
+++ b/Documentation/RCU/rcubarrier.txt
@@ -263,6 +263,11 @@ Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
 	are delayed for a full grace period? Couldn't this result in
 	rcu_barrier() returning prematurely?
 
+The current rcu_barrier() implementation is more complex, due to the need
+to avoid disturbing idle CPUs (especially on battery-powered systems)
+and the need to minimally disturb non-idle CPUs in real-time systems.
+However, the code above illustrates the concepts.
+
 
 rcu_barrier() Summary
 
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 278f6a9383b6..55918b54808b 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -276,15 +276,17 @@ o	"Free-Block Circulation": Shows the number of torture structures
 	somehow gets incremented farther than it should.
 
 Different implementations of RCU can provide implementation-specific
-additional information.  For example, SRCU provides the following
+additional information.  For example, Tree SRCU provides the following
 additional line:
 
-	srcu-torture: per-CPU(idx=1): 0(0,1) 1(0,1) 2(0,0) 3(0,1)
+	srcud-torture: Tree SRCU per-CPU(idx=0): 0(35,-21) 1(-4,24) 2(1,1) 3(-26,20) 4(28,-47) 5(-9,4) 6(-10,14) 7(-14,11) T(1,6)
 
-This line shows the per-CPU counter state.  The numbers in parentheses are
-the values of the "old" and "current" counters for the corresponding CPU.
-The "idx" value maps the "old" and "current" values to the underlying
-array, and is useful for debugging.
+This line shows the per-CPU counter state, in this case for Tree SRCU
+using a dynamically allocated srcu_struct (hence "srcud-" rather than
+"srcu-").  The numbers in parentheses are the values of the "old" and
+"current" counters for the corresponding CPU.  The "idx" value maps the
+"old" and "current" values to the underlying array, and is useful for
+debugging.  The final "T" entry contains the totals of the counters.
 
 
 USAGE
@@ -304,3 +306,9 @@ checked for such errors.  The "rmmod" command forces a "SUCCESS",
 "FAILURE", or "RCU_HOTPLUG" indication to be printk()ed.  The first
 two are self-explanatory, while the last indicates that while there
 were no RCU failures, CPU-hotplug problems were detected.
+
+However, the tools/testing/selftests/rcutorture/bin/kvm.sh script
+provides better automation, including automatic failure analysis.
+It assumes a qemu/kvm-enabled platform, and runs guest OSes out of initrd.
+See tools/testing/selftests/rcutorture/doc/initrd.txt for instructions
+on setting up such an initrd.
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 8ed6c9f6133c..df62466da4e0 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -890,6 +890,8 @@ SRCU:	Critical sections	Grace period		Barrier
 	srcu_read_lock_held
 
 SRCU:	Initialization/cleanup
+	DEFINE_SRCU
+	DEFINE_STATIC_SRCU
 	init_srcu_struct
 	cleanup_srcu_struct
 
@@ -913,7 +915,8 @@ a.	Will readers need to block?  If so, you need SRCU.
 b.	What about the -rt patchset?  If readers would need to block
 	in an non-rt kernel, you need SRCU.  If readers would block
 	in a -rt kernel, but not in a non-rt kernel, SRCU is not
-	necessary.
+	necessary.  (The -rt patchset turns spinlocks into sleeplocks,
+	hence this distinction.)
 
 c.	Do you need to treat NMI handlers, hardirq handlers,
 	and code segments with preemption disabled (whether
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index d9c171ce4190..3a99cc96b6b1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2633,9 +2633,10 @@
 			In kernels built with CONFIG_NO_HZ_FULL=y, set
 			the specified list of CPUs whose tick will be stopped
 			whenever possible. The boot CPU will be forced outside
-			the range to maintain the timekeeping.
-			The CPUs in this range must also be included in the
-			rcu_nocbs= set.
+			the range to maintain the timekeeping.  Any CPUs
+			in this list will have their RCU callbacks offloaded,
+			just as if they had also been called out in the
+			rcu_nocbs= boot parameter.
 
 	noiotrap	[SH] Disables trapped I/O port accesses.
 
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 17b00914c6ab..8282099e0cbf 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -344,3 +344,52 @@ codecs, and devices with strict requirements for interface clocking.
 
 .. kernel-doc:: include/linux/clk.h
    :internal:
+
+Synchronization Primitives
+==========================
+
+Read-Copy Update (RCU)
+----------------------
+
+.. kernel-doc:: include/linux/rcupdate.h
+   :external:
+
+.. kernel-doc:: include/linux/rcupdate_wait.h
+   :external:
+
+.. kernel-doc:: include/linux/rcutree.h
+   :external:
+
+.. kernel-doc:: kernel/rcu/tree.c
+   :external:
+
+.. kernel-doc:: kernel/rcu/tree_plugin.h
+   :external:
+
+.. kernel-doc:: kernel/rcu/tree_exp.h
+   :external:
+
+.. kernel-doc:: kernel/rcu/update.c
+   :external:
+
+.. kernel-doc:: include/linux/srcu.h
+   :external:
+
+.. kernel-doc:: kernel/rcu/srcutree.c
+   :external:
+
+.. kernel-doc:: include/linux/rculist_bl.h
+   :external:
+
+.. kernel-doc:: include/linux/rculist.h
+   :external:
+
+.. kernel-doc:: include/linux/rculist_nulls.h
+   :external:
+
+.. kernel-doc:: include/linux/rcu_sync.h
+   :external:
+
+.. kernel-doc:: kernel/rcu/sync.c
+   :external:
+
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index d1d1716f904b..b759a60624fd 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -594,7 +594,24 @@ between the address load and the data load:
 This enforces the occurrence of one of the two implications, and prevents the
 third possibility from arising.
 
-A data-dependency barrier must also order against dependent writes:
+
+[!] Note that this extremely counterintuitive situation arises most easily on
+machines with split caches, so that, for example, one cache bank processes
+even-numbered cache lines and the other bank processes odd-numbered cache
+lines.  The pointer P might be stored in an odd-numbered cache line, and the
+variable B might be stored in an even-numbered cache line.  Then, if the
+even-numbered bank of the reading CPU's cache is extremely busy while the
+odd-numbered bank is idle, one can see the new value of the pointer P (&B),
+but the old value of the variable B (2).
+
+
+A data-dependency barrier is not required to order dependent writes
+because the CPUs that the Linux kernel supports don't do writes
+until they are certain (1) that the write will actually happen, (2)
+of the location of the write, and (3) of the value to be written.
+But please carefully read the "CONTROL DEPENDENCIES" section and the
+Documentation/RCU/rcu_dereference.txt file:  The compiler can and does
+break dependencies in a great many highly creative ways.
 
 	CPU 1		      CPU 2
 	===============	      ===============
@@ -603,29 +620,19 @@ A data-dependency barrier must also order against dependent writes:
 	<write barrier>
 	WRITE_ONCE(P, &B);
 			      Q = READ_ONCE(P);
-			      <data dependency barrier>
-			      *Q = 5;
+			      WRITE_ONCE(*Q, 5);
 
-The data-dependency barrier must order the read into Q with the store
-into *Q.  This prohibits this outcome:
+Therefore, no data-dependency barrier is required to order the read into
+Q with the store into *Q.  In other words, this outcome is prohibited,
+even without a data-dependency barrier:
 
 	(Q == &B) && (B == 4)
 
 Please note that this pattern should be rare.  After all, the whole point
 of dependency ordering is to -prevent- writes to the data structure, along
 with the expensive cache misses associated with those writes.  This pattern
-can be used to record rare error conditions and the like, and the ordering
-prevents such records from being lost.
-
-
-[!] Note that this extremely counterintuitive situation arises most easily on
-machines with split caches, so that, for example, one cache bank processes
-even-numbered cache lines and the other bank processes odd-numbered cache
-lines.  The pointer P might be stored in an odd-numbered cache line, and the
-variable B might be stored in an even-numbered cache line.  Then, if the
-even-numbered bank of the reading CPU's cache is extremely busy while the
-odd-numbered bank is idle, one can see the new value of the pointer P (&B),
-but the old value of the variable B (2).
+can be used to record rare error conditions and the like, and the CPUs'
+naturally occurring ordering prevents such records from being lost.
 
 
 The data dependency barrier is very important to the RCU system,
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index bac23c198360..ce61d1fe08ca 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -61,6 +61,7 @@ show up in /proc/sys/kernel:
 - perf_cpu_time_max_percent
 - perf_event_paranoid
 - perf_event_max_stack
+- perf_event_mlock_kb
 - perf_event_max_contexts_per_stack
 - pid_max
 - powersave-nap               [ PPC only ]
@@ -654,7 +655,9 @@ Controls use of the performance events system by unprivileged
 users (without CAP_SYS_ADMIN).  The default value is 2.
 
  -1: Allow use of (almost) all events by all users
->=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
+     Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
+>=0: Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN
+     Disallow raw tracepoint access by users without CAP_SYS_ADMIN
 >=1: Disallow CPU event access by users without CAP_SYS_ADMIN
 >=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
 
@@ -673,6 +676,14 @@ The default value is 127.
 
 ==============================================================
 
+perf_event_mlock_kb:
+
+Controls the size of the per-cpu ring buffer not counted against the mlock limit.
+
+The default value is 512 + 1 page
+
+==============================================================
+
 perf_event_max_contexts_per_stack:
 
 Controls maximum number of stack frame context entries for
diff --git a/Documentation/x86/early-microcode.txt b/Documentation/x86/early-microcode.txt
deleted file mode 100644
index 07749e7f3d50..000000000000
--- a/Documentation/x86/early-microcode.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-Early load microcode
-====================
-By Fenghua Yu <fenghua.yu@intel.com>
-
-Kernel can update microcode in early phase of boot time. Loading microcode early
-can fix CPU issues before they are observed during kernel boot time.
-
-Microcode is stored in an initrd file. The microcode is read from the initrd
-file and loaded to CPUs during boot time.
-
-The format of the combined initrd image is microcode in cpio format followed by
-the initrd image (maybe compressed). Kernel parses the combined initrd image
-during boot time. The microcode file in cpio name space is:
-on Intel: kernel/x86/microcode/GenuineIntel.bin
-on AMD  : kernel/x86/microcode/AuthenticAMD.bin
-
-During BSP boot (before SMP starts), if the kernel finds the microcode file in
-the initrd file, it parses the microcode and saves matching microcode in memory.
-If matching microcode is found, it will be uploaded in BSP and later on in all
-APs.
-
-The cached microcode patch is applied when CPUs resume from a sleep state.
-
-There are two legacy user space interfaces to load microcode, either through
-/dev/cpu/microcode or through /sys/devices/system/cpu/microcode/reload file
-in sysfs.
-
-In addition to these two legacy methods, the early loading method described
-here is the third method with which microcode can be uploaded to a system's
-CPUs.
-
-The following example script shows how to generate a new combined initrd file in
-/boot/initrd-3.5.0.ucode.img with original microcode microcode.bin and
-original initrd image /boot/initrd-3.5.0.img.
-
-mkdir initrd
-cd initrd
-mkdir -p kernel/x86/microcode
-cp ../microcode.bin kernel/x86/microcode/GenuineIntel.bin (or AuthenticAMD.bin)
-find . | cpio -o -H newc >../ucode.cpio
-cd ..
-cat ucode.cpio /boot/initrd-3.5.0.img >/boot/initrd-3.5.0.ucode.img
-
-Builtin microcode
-=================
-
-We can also load builtin microcode supplied through the regular firmware
-builtin method CONFIG_FIRMWARE_IN_KERNEL. Only 64-bit is currently
-supported.
-
-Here's an example:
-
-CONFIG_FIRMWARE_IN_KERNEL=y
-CONFIG_EXTRA_FIRMWARE="intel-ucode/06-3a-09 amd-ucode/microcode_amd_fam15h.bin"
-CONFIG_EXTRA_FIRMWARE_DIR="/lib/firmware"
-
-This basically means, you have the following tree structure locally:
-
-/lib/firmware/
-|-- amd-ucode
-...
-|   |-- microcode_amd_fam15h.bin
-...
-|-- intel-ucode
-...
-|   |-- 06-3a-09
-...
-
-so that the build system can find those files and integrate them into
-the final kernel image. The early loader finds them and applies them.
diff --git a/Documentation/x86/microcode.txt b/Documentation/x86/microcode.txt
new file mode 100644
index 000000000000..f57e1b45e628
--- /dev/null
+++ b/Documentation/x86/microcode.txt
@@ -0,0 +1,137 @@
+	The Linux Microcode Loader
+
+Authors: Fenghua Yu <fenghua.yu@intel.com>
+	 Borislav Petkov <bp@suse.de>
+
+The kernel has an x86 microcode loading facility which is supposed to
+provide microcode loading methods in the OS. Potential use cases are
+updating the microcode on platforms beyond the OEM End-Of-Life support,
+and updating the microcode on long-running systems without rebooting.
+
+The loader supports three loading methods:
+
+1. Early load microcode
+=======================
+
+The kernel can update microcode very early during boot. Loading
+microcode early can fix CPU issues before they are observed during
+kernel boot time.
+
+The microcode is stored in an initrd file. During boot, it is read from
+the initrd and loaded into the CPU cores.
+
+The format of the combined initrd image is microcode in (uncompressed)
+cpio format followed by the (possibly compressed) initrd image. The
+loader parses the combined initrd image during boot.
+
+The microcode files in cpio name space are:
+
+on Intel: kernel/x86/microcode/GenuineIntel.bin
+on AMD  : kernel/x86/microcode/AuthenticAMD.bin
+
+During BSP (BootStrapping Processor) boot (pre-SMP), the kernel
+scans the microcode file in the initrd. If microcode matching the
+CPU is found, it will be applied in the BSP and later on in all APs
+(Application Processors).
+
+The loader also saves the matching microcode for the CPU in memory.
+Thus, the cached microcode patch is applied when CPUs resume from a
+sleep state.
+
+Here's a crude example of how to prepare an initrd with microcode (this is
+normally done automatically by the distribution, when recreating the
+initrd, so you don't really have to do it yourself. It is documented
+here for future reference only).
+
+---
+  #!/bin/bash
+
+  if [ -z "$1" ]; then
+      echo "You need to supply an initrd file"
+      exit 1
+  fi
+
+  INITRD="$1"
+
+  DSTDIR=kernel/x86/microcode
+  TMPDIR=/tmp/initrd
+
+  rm -rf $TMPDIR
+
+  mkdir $TMPDIR
+  cd $TMPDIR
+  mkdir -p $DSTDIR
+
+  if [ -d /lib/firmware/amd-ucode ]; then
+          cat /lib/firmware/amd-ucode/microcode_amd*.bin > $DSTDIR/AuthenticAMD.bin
+  fi
+
+  if [ -d /lib/firmware/intel-ucode ]; then
+          cat /lib/firmware/intel-ucode/* > $DSTDIR/GenuineIntel.bin
+  fi
+
+  find . | cpio -o -H newc >../ucode.cpio
+  cd ..
+  mv $INITRD $INITRD.orig
+  cat ucode.cpio $INITRD.orig > $INITRD
+
+  rm -rf $TMPDIR
+---
+
+The system needs to have the microcode packages installed into
+/lib/firmware or you need to fix up the paths above if yours are
+somewhere else and/or you've downloaded them directly from the processor
+vendor's site.
+
+2. Late loading
+===============
+
+There are two legacy user space interfaces to load microcode, either through
+/dev/cpu/microcode or through /sys/devices/system/cpu/microcode/reload file
+in sysfs.
+
+The /dev/cpu/microcode method is deprecated because it needs a special
+userspace tool for that.
+
+The easier method is simply installing the microcode packages your distro
+supplies and running:
+
+# echo 1 > /sys/devices/system/cpu/microcode/reload
+
+as root.
+
+The loading mechanism looks for microcode blobs in
+/lib/firmware/{intel-ucode,amd-ucode}. The default distro installation
+packages already put them there.
+
+3. Builtin microcode
+====================
+
+The loader also supports loading of builtin microcode supplied through
+the regular firmware builtin method CONFIG_FIRMWARE_IN_KERNEL. Only
+64-bit is currently supported.
+
+Here's an example:
+
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE="intel-ucode/06-3a-09 amd-ucode/microcode_amd_fam15h.bin"
+CONFIG_EXTRA_FIRMWARE_DIR="/lib/firmware"
+
+This basically means, you have the following tree structure locally:
+
+/lib/firmware/
+|-- amd-ucode
+...
+|   |-- microcode_amd_fam15h.bin
+...
+|-- intel-ucode
+...
+|   |-- 06-3a-09
+...
+
+so that the build system can find those files and integrate them into
+the final kernel image. The early loader finds them and applies them.
+
+Needless to say, this method is not the most flexible one because it
+requires rebuilding the kernel each time updated microcode from the CPU
+vendor is available.
diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt
new file mode 100644
index 000000000000..af0c9a4c65a6
--- /dev/null
+++ b/Documentation/x86/orc-unwinder.txt
@@ -0,0 +1,179 @@
+ORC unwinder
+============
+
+Overview
+--------
+
+The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is
+similar in concept to a DWARF unwinder.  The difference is that the
+format of the ORC data is much simpler than DWARF, which in turn allows
+the ORC unwinder to be much simpler and faster.
+
+The ORC data consists of unwind tables which are generated by objtool.
+They contain out-of-band data which is used by the in-kernel ORC
+unwinder.  Objtool generates the ORC data by first doing compile-time
+stack metadata validation (CONFIG_STACK_VALIDATION).  After analyzing
+all the code paths of a .o file, it determines information about the
+stack state at each instruction address in the file and outputs that
+information to the .orc_unwind and .orc_unwind_ip sections.
+
+The per-object ORC sections are combined at link time and are sorted and
+post-processed at boot time.  The unwinder uses the resulting data to
+correlate instruction addresses with their stack states at run time.
+
+
+ORC vs frame pointers
+---------------------
+
+With frame pointers enabled, GCC adds instrumentation code to every
+function in the kernel.  The kernel's .text size increases by about
+3.2%, resulting in a broad kernel-wide slowdown.  Measurements by Mel
+Gorman [1] have shown a slowdown of 5-10% for some workloads.
+
+In contrast, the ORC unwinder has no effect on text size or runtime
+performance, because the debuginfo is out of band.  So if you disable
+frame pointers and enable the ORC unwinder, you get a nice performance
+improvement across the board, and still have reliable stack traces.
+
+Ingo Molnar says:
+
+  "Note that it's not just a performance improvement, but also an
+  instruction cache locality improvement: 3.2% .text savings almost
+  directly transform into a similarly sized reduction in cache
+  footprint. That can transform to even higher speedups for workloads
+  whose cache locality is borderline."
+
+Another benefit of ORC compared to frame pointers is that it can
+reliably unwind across interrupts and exceptions.  Frame pointer based
+unwinds can sometimes skip the caller of the interrupted function, if it
+was a leaf function or if the interrupt hit before the frame pointer was
+saved.
+
+The main disadvantage of the ORC unwinder compared to frame pointers is
+that it needs more memory to store the ORC unwind tables: roughly 2-4MB
+depending on the kernel config.
+
+
+ORC vs DWARF
+------------
+
+ORC debuginfo's advantage over DWARF itself is that it's much simpler.
+It gets rid of the complex DWARF CFI state machine and also gets rid of
+the tracking of unnecessary registers.  This allows the unwinder to be
+much simpler, meaning fewer bugs, which is especially important for
+mission critical oops code.
+
+The simpler debuginfo format also enables the unwinder to be much faster
+than DWARF, which is important for perf and lockdep.  In a basic
+performance test by Jiri Slaby [2], the ORC unwinder was about 20x
+faster than an out-of-tree DWARF unwinder.  (Note: That measurement was
+taken before some performance tweaks were added, which doubled
+performance, so the speedup over DWARF may be closer to 40x.)
+
+The ORC data format does have a few downsides compared to DWARF.  ORC
+unwind tables take up ~50% more RAM (+1.3MB on an x86 defconfig kernel)
+than DWARF-based eh_frame tables.
+
+Another potential downside is that, as GCC evolves, it's conceivable
+that the ORC data may end up being *too* simple to describe the state of
+the stack for certain optimizations.  But IMO this is unlikely because
+GCC saves the frame pointer for any unusual stack adjustments it does,
+so I suspect we'll really only ever need to keep track of the stack
+pointer and the frame pointer between call frames.  But even if we do
+end up having to track all the registers DWARF tracks, at least we will
+still be able to control the format, e.g. no complex state machines.
+
+
+ORC unwind table generation
+---------------------------
+
+The ORC data is generated by objtool.  With the existing compile-time
+stack metadata validation feature, objtool already follows all code
+paths, and so it already has all the information it needs to be able to
+generate ORC data from scratch.  So it's an easy step to go from stack
+validation to ORC data generation.
+
+It should be possible to instead generate the ORC data with a simple
+tool which converts DWARF to ORC data.  However, such a solution would
+be incomplete due to the kernel's extensive use of asm, inline asm, and
+special sections like exception tables.
+
+That could be rectified by manually annotating those special code paths
+using GNU assembler .cfi annotations in .S files, and homegrown
+annotations for inline asm in .c files.  But asm annotations were tried
+in the past and were found to be unmaintainable.  They were often
+incorrect/incomplete and made the code harder to read and keep updated.
+And based on looking at glibc code, annotating inline asm in .c files
+might be even worse.
+
+Objtool still needs a few annotations, but only in code which does
+unusual things to the stack like entry code.  And even then, far fewer
+annotations are needed than what DWARF would need, so they're much more
+maintainable than DWARF CFI annotations.
+
+So the advantages of using objtool to generate ORC data are that it
+gives more accurate debuginfo, with very few annotations.  It also
+insulates the kernel from toolchain bugs which can be very painful to
+deal with in the kernel since we often have to work around issues in
+older versions of the toolchain for years.
+
+The downside is that the unwinder now becomes dependent on objtool's
+ability to reverse engineer GCC code flow.  If GCC optimizations become
+too complicated for objtool to follow, the ORC data generation might
+stop working or become incomplete.  (It's worth noting that livepatch
+already has such a dependency on objtool's ability to follow GCC code
+flow.)
+
+If newer versions of GCC come up with some optimizations which break
+objtool, we may need to revisit the current implementation.  Some
+possible solutions would be asking GCC to make the optimizations more
+palatable, or having objtool use DWARF as an additional input, or
+creating a GCC plugin to assist objtool with its analysis.  But for now,
+objtool follows GCC code quite well.
+
+
+Unwinder implementation details
+-------------------------------
+
+Objtool generates the ORC data by integrating with the compile-time
+stack metadata validation feature, which is described in detail in
+tools/objtool/Documentation/stack-validation.txt.  After analyzing all
+the code paths of a .o file, it creates an array of orc_entry structs,
+and a parallel array of instruction addresses associated with those
+structs, and writes them to the .orc_unwind and .orc_unwind_ip sections
+respectively.
+
+The ORC data is split into the two arrays for performance reasons, to
+make the searchable part of the data (.orc_unwind_ip) more compact.  The
+arrays are sorted in parallel at boot time.
+
+Performance is further improved by the use of a fast lookup table which
+is created at runtime.  The fast lookup table associates a given address
+with a range of indices for the .orc_unwind table, so that only a small
+subset of the table needs to be searched.
+
+
+Etymology
+---------
+
+Orcs, fearsome creatures of medieval folklore, are the Dwarves' natural
+enemies.  Similarly, the ORC unwinder was created in opposition to the
+complexity and slowness of DWARF.
+
+"Although Orcs rarely consider multiple solutions to a problem, they do
+excel at getting things done because they are creatures of action, not
+thought." [3]  Similarly, unlike the esoteric DWARF unwinder, the
+veracious ORC unwinder wastes no time or siloconic effort decoding
+variable-length zero-extended unsigned-integer byte-coded
+state-machine-based debug information entries.
+
+Similar to how Orcs frequently unravel the well-intentioned plans of
+their adversaries, the ORC unwinder frequently unravels stacks with
+brutal, unyielding efficiency.
+
+ORC stands for Oops Rewind Capability.
+
+
+[1] https://lkml.kernel.org/r/20170602104048.jkkzssljsompjdwy@suse.de
+[2] https://lkml.kernel.org/r/d2ca5435-6386-29b8-db87-7f227c2b713a@suse.cz
+[3] http://dustin.wikidot.com/half-orcs-and-orcs
diff --git a/MAINTAINERS b/MAINTAINERS
index a2add4a4f7f3..75994ad6333f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7660,17 +7660,6 @@ T:	git git://linuxtv.org/mkrufky/tuners.git
 S:	Maintained
 F:	drivers/media/dvb-frontends/lgdt3305.*
 
-LGUEST
-M:	Rusty Russell <rusty@rustcorp.com.au>
-L:	lguest@lists.ozlabs.org
-W:	http://lguest.ozlabs.org/
-S:	Odd Fixes
-F:	arch/x86/include/asm/lguest*.h
-F:	arch/x86/lguest/
-F:	drivers/lguest/
-F:	include/linux/lguest*.h
-F:	tools/lguest/
-
 LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
 M:	Viresh Kumar <vireshk@kernel.org>
 L:	linux-ide@vger.kernel.org
@@ -8649,7 +8638,7 @@ M:	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 M:	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
 L:	linux-kernel@vger.kernel.org
 S:	Supported
-F:	kernel/membarrier.c
+F:	kernel/sched/membarrier.c
 F:	include/uapi/linux/membarrier.h
 
 MEMORY MANAGEMENT
diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h
index a40b9fc0c6c3..718ac0b64adf 100644
--- a/arch/alpha/include/asm/spinlock.h
+++ b/arch/alpha/include/asm/spinlock.h
@@ -16,11 +16,6 @@
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 #define arch_spin_is_locked(x)	((x)->lock != 0)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
 static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
         return lock.lock == 0;
diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h
index 233d5ffe6ec7..a325e6a36523 100644
--- a/arch/arc/include/asm/spinlock.h
+++ b/arch/arc/include/asm/spinlock.h
@@ -16,11 +16,6 @@
 #define arch_spin_is_locked(x)	((x)->slock != __ARCH_SPIN_LOCK_UNLOCKED__)
 #define arch_spin_lock_flags(lock, flags)	arch_spin_lock(lock)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->slock, !VAL);
-}
-
 #ifdef CONFIG_ARC_HAS_LLSC
 
 static inline void arch_spin_lock(arch_spinlock_t *lock)
diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h
index 4bec45442072..c030143c18c6 100644
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -52,22 +52,6 @@ static inline void dsb_sev(void)
  * memory.
  */
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	u16 owner = READ_ONCE(lock->tickets.owner);
-
-	for (;;) {
-		arch_spinlock_t tmp = READ_ONCE(*lock);
-
-		if (tmp.tickets.owner == tmp.tickets.next ||
-		    tmp.tickets.owner != owner)
-			break;
-
-		wfe();
-	}
-	smp_acquire__after_ctrl_dep();
-}
-
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
 static inline void arch_spin_lock(arch_spinlock_t *lock)
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 776757d1604a..1d468b527b7b 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 #define TIF_NEED_RESCHED	1	/* rescheduling necessary */
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
 #define TIF_UPROBE		3	/* breakpointed or singlestepping */
-#define TIF_SYSCALL_TRACE	4	/* syscall trace active */
-#define TIF_SYSCALL_AUDIT	5	/* syscall auditing active */
-#define TIF_SYSCALL_TRACEPOINT	6	/* syscall tracepoint instrumentation */
-#define TIF_SECCOMP		7	/* seccomp syscall filtering active */
+#define TIF_FSCHECK		4	/* Check FS is USER_DS on return */
+#define TIF_SYSCALL_TRACE	5	/* syscall trace active */
+#define TIF_SYSCALL_AUDIT	6	/* syscall auditing active */
+#define TIF_SYSCALL_TRACEPOINT	7	/* syscall tracepoint instrumentation */
+#define TIF_SECCOMP		8	/* seccomp syscall filtering active */
 
 #define TIF_NOHZ		12	/* in adaptive nohz mode */
 #define TIF_USING_IWMMXT	17
@@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
+#define _TIF_FSCHECK		(1 << TIF_FSCHECK)
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
@@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 /*
  * Change these and you break ASM code in entry-common.S
  */
-#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
-				 _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING |	\
+				 _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
+				 _TIF_FSCHECK)
 
 #endif /* __KERNEL__ */
 #endif /* __ASM_ARM_THREAD_INFO_H */
diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h
index f555bb3664dc..683d9230984a 100644
--- a/arch/arm/include/asm/traps.h
+++ b/arch/arm/include/asm/traps.h
@@ -18,7 +18,6 @@ struct undef_hook {
 void register_undef_hook(struct undef_hook *hook);
 void unregister_undef_hook(struct undef_hook *hook);
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static inline int __in_irqentry_text(unsigned long ptr)
 {
 	extern char __irqentry_text_start[];
@@ -27,12 +26,6 @@ static inline int __in_irqentry_text(unsigned long ptr)
 	return ptr >= (unsigned long)&__irqentry_text_start &&
 	       ptr < (unsigned long)&__irqentry_text_end;
 }
-#else
-static inline int __in_irqentry_text(unsigned long ptr)
-{
-	return 0;
-}
-#endif
 
 static inline int in_exception_text(unsigned long ptr)
 {
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 0bf2347495f1..87936dd5d151 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -70,6 +70,8 @@ static inline void set_fs(mm_segment_t fs)
 {
 	current_thread_info()->addr_limit = fs;
 	modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
+	/* On user-mode return, check fs is correct */
+	set_thread_flag(TIF_FSCHECK);
 }
 
 #define segment_eq(a, b)	((a) == (b))
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..e33c32d56193 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -41,7 +41,9 @@ ret_fast_syscall:
  UNWIND(.cantunwind	)
 	disable_irq_notrace			@ disable interrupts
 	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
-	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+	tst	r1, #_TIF_SYSCALL_WORK
+	bne	fast_work_pending
+	tst	r1, #_TIF_WORK_MASK
 	bne	fast_work_pending
 
 	/* perform architecture specific actions before user return */
@@ -67,12 +69,15 @@ ret_fast_syscall:
 	str	r0, [sp, #S_R0 + S_OFF]!	@ save returned r0
 	disable_irq_notrace			@ disable interrupts
 	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
-	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+	tst	r1, #_TIF_SYSCALL_WORK
+	bne	fast_work_pending
+	tst	r1, #_TIF_WORK_MASK
 	beq	no_work_pending
  UNWIND(.fnend		)
 ENDPROC(ret_fast_syscall)
 
 	/* Slower path - fall through to work_pending */
+fast_work_pending:
 #endif
 
 	tst	r1, #_TIF_SYSCALL_WORK
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 5814298ef0b7..e2de50bf8742 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -14,6 +14,7 @@
 #include <linux/uaccess.h>
 #include <linux/tracehook.h>
 #include <linux/uprobes.h>
+#include <linux/syscalls.h>
 
 #include <asm/elf.h>
 #include <asm/cacheflush.h>
@@ -613,6 +614,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 	 * Update the trace code with the current status.
 	 */
 	trace_hardirqs_off();
+
+	/* Check valid user FS if needed */
+	addr_limit_user_check();
+
 	do {
 		if (likely(thread_flags & _TIF_NEED_RESCHED)) {
 			schedule();
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index ae4241ab19a8..95ad7102b63c 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -26,58 +26,6 @@
  * The memory barriers are implicit with the load-acquire and store-release
  * instructions.
  */
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	unsigned int tmp;
-	arch_spinlock_t lockval;
-	u32 owner;
-
-	/*
-	 * Ensure prior spin_lock operations to other locks have completed
-	 * on this CPU before we test whether "lock" is locked.
-	 */
-	smp_mb();
-	owner = READ_ONCE(lock->owner) << 16;
-
-	asm volatile(
-"	sevl\n"
-"1:	wfe\n"
-"2:	ldaxr	%w0, %2\n"
-	/* Is the lock free? */
-"	eor	%w1, %w0, %w0, ror #16\n"
-"	cbz	%w1, 3f\n"
-	/* Lock taken -- has there been a subsequent unlock->lock transition? */
-"	eor	%w1, %w3, %w0, lsl #16\n"
-"	cbz	%w1, 1b\n"
-	/*
-	 * The owner has been updated, so there was an unlock->lock
-	 * transition that we missed. That means we can rely on the
-	 * store-release of the unlock operation paired with the
-	 * load-acquire of the lock operation to publish any of our
-	 * previous stores to the new lock owner and therefore don't
-	 * need to bother with the writeback below.
-	 */
-"	b	4f\n"
-"3:\n"
-	/*
-	 * Serialise against any concurrent lockers by writing back the
-	 * unlocked lock value
-	 */
-	ARM64_LSE_ATOMIC_INSN(
-	/* LL/SC */
-"	stxr	%w1, %w0, %2\n"
-	__nops(2),
-	/* LSE atomics */
-"	mov	%w1, %w0\n"
-"	cas	%w0, %w0, %2\n"
-"	eor	%w1, %w1, %w0\n")
-	/* Somebody else wrote to the lock, GOTO 10 and reload the value */
-"	cbnz	%w1, 2b\n"
-"4:"
-	: "=&r" (lockval), "=&r" (tmp), "+Q" (*lock)
-	: "r" (owner)
-	: "memory");
-}
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
@@ -176,7 +124,11 @@ static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-	smp_mb(); /* See arch_spin_unlock_wait */
+	/*
+	 * Ensure prior spin_lock operations to other locks have completed
+	 * on this CPU before we test whether "lock" is locked.
+	 */
+	smp_mb(); /* ^^^ */
 	return !arch_spin_value_unlocked(READ_ONCE(*lock));
 }
 
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 46c3b93cf865..c5ba565544ee 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -86,6 +86,7 @@ struct thread_info {
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
 #define TIF_FOREIGN_FPSTATE	3	/* CPU's FP state is not current's */
 #define TIF_UPROBE		4	/* uprobe breakpoint or singlestep */
+#define TIF_FSCHECK		5	/* Check FS is USER_DS on return */
 #define TIF_NOHZ		7
 #define TIF_SYSCALL_TRACE	8
 #define TIF_SYSCALL_AUDIT	9
@@ -107,11 +108,12 @@ struct thread_info {
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
+#define _TIF_FSCHECK		(1 << TIF_FSCHECK)
 #define _TIF_32BIT		(1 << TIF_32BIT)
 
 #define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
 				 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
-				 _TIF_UPROBE)
+				 _TIF_UPROBE | _TIF_FSCHECK)
 
 #define _TIF_SYSCALL_WORK	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 				 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h
index 02e9035b0685..47a9066f7c86 100644
--- a/arch/arm64/include/asm/traps.h
+++ b/arch/arm64/include/asm/traps.h
@@ -37,18 +37,11 @@ void unregister_undef_hook(struct undef_hook *hook);
 
 void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr);
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static inline int __in_irqentry_text(unsigned long ptr)
 {
 	return ptr >= (unsigned long)&__irqentry_text_start &&
 	       ptr < (unsigned long)&__irqentry_text_end;
 }
-#else
-static inline int __in_irqentry_text(unsigned long ptr)
-{
-	return 0;
-}
-#endif
 
 static inline int in_exception_text(unsigned long ptr)
 {
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index fab46a0ea223..a801a48a7972 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -45,6 +45,9 @@ static inline void set_fs(mm_segment_t fs)
 {
 	current_thread_info()->addr_limit = fs;
 
+	/* On user-mode return, check fs is correct */
+	set_thread_flag(TIF_FSCHECK);
+
 	/*
 	 * Enable/disable UAO so that copy_to_user() etc can access
 	 * kernel memory with the unprivileged instructions.
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 659ae8094ed5..c8f7d98d8cb9 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -360,6 +360,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
 	/*
 	 * Complete any pending TLB or cache maintenance on this CPU in case
 	 * the thread migrates to a different CPU.
+	 * This full barrier is also required by the membarrier system
+	 * call.
 	 */
 	dsb(ish);
 
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 089c3747995d..e3e3293d1123 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -29,6 +29,7 @@
 #include <linux/string.h>
 #include <linux/tracehook.h>
 #include <linux/ratelimit.h>
+#include <linux/syscalls.h>
 
 #include <asm/debug-monitors.h>
 #include <asm/elf.h>
@@ -749,6 +750,10 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 	 * Update the trace code with the current status.
 	 */
 	trace_hardirqs_off();
+
+	/* Check valid user FS if needed */
+	addr_limit_user_check();
+
 	do {
 		if (thread_flags & _TIF_NEED_RESCHED) {
 			schedule();
diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h
index c58f4a83ed6f..f6431439d15d 100644
--- a/arch/blackfin/include/asm/spinlock.h
+++ b/arch/blackfin/include/asm/spinlock.h
@@ -48,11 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	__raw_spin_unlock_asm(&lock->lock);
 }
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
 static inline int arch_read_can_lock(arch_rwlock_t *rw)
 {
 	return __raw_uncached_fetch_asm(&rw->lock) > 0;
diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c
index 0188c933b155..15af5768c403 100644
--- a/arch/blackfin/kernel/module.c
+++ b/arch/blackfin/kernel/module.c
@@ -4,8 +4,6 @@
  * Licensed under the GPL-2 or later
  */
 
-#define pr_fmt(fmt) "module %s: " fmt, mod->name
-
 #include <linux/moduleloader.h>
 #include <linux/elf.h>
 #include <linux/vmalloc.h>
@@ -16,6 +14,11 @@
 #include <asm/cacheflush.h>
 #include <linux/uaccess.h>
 
+#define mod_err(mod, fmt, ...)						\
+	pr_err("module %s: " fmt, (mod)->name, ##__VA_ARGS__)
+#define mod_debug(mod, fmt, ...)					\
+	pr_debug("module %s: " fmt, (mod)->name, ##__VA_ARGS__)
+
 /* Transfer the section to the L1 memory */
 int
 module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
@@ -44,7 +47,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_inst_sram_alloc(s->sh_size);
 			mod->arch.text_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 inst memory allocation failed\n");
+				mod_err(mod, "L1 inst memory allocation failed\n");
 				return -1;
 			}
 			dma_memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -56,7 +59,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_sram_alloc(s->sh_size);
 			mod->arch.data_a_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -68,7 +71,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_sram_zalloc(s->sh_size);
 			mod->arch.bss_a_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 
@@ -77,7 +80,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_B_sram_alloc(s->sh_size);
 			mod->arch.data_b_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -87,7 +90,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_B_sram_alloc(s->sh_size);
 			mod->arch.bss_b_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memset(dest, 0, s->sh_size);
@@ -99,7 +102,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_alloc(s->sh_size);
 			mod->arch.text_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -111,7 +114,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_alloc(s->sh_size);
 			mod->arch.data_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -123,7 +126,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_zalloc(s->sh_size);
 			mod->arch.bss_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 
@@ -157,8 +160,8 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 	Elf32_Sym *sym;
 	unsigned long location, value, size;
 
-	pr_debug("applying relocate section %u to %u\n",
-		relsec, sechdrs[relsec].sh_info);
+	mod_debug(mod, "applying relocate section %u to %u\n",
+		  relsec, sechdrs[relsec].sh_info);
 
 	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
 		/* This is where to make the change */
@@ -174,14 +177,14 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 
 #ifdef CONFIG_SMP
 		if (location >= COREB_L1_DATA_A_START) {
-			pr_err("cannot relocate in L1: %u (SMP kernel)\n",
+			mod_err(mod, "cannot relocate in L1: %u (SMP kernel)\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 		}
 #endif
 
-		pr_debug("location is %lx, value is %lx type is %d\n",
-			location, value, ELF32_R_TYPE(rel[i].r_info));
+		mod_debug(mod, "location is %lx, value is %lx type is %d\n",
+			  location, value, ELF32_R_TYPE(rel[i].r_info));
 
 		switch (ELF32_R_TYPE(rel[i].r_info)) {
 
@@ -200,12 +203,12 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 		case R_BFIN_PCREL12_JUMP:
 		case R_BFIN_PCREL12_JUMP_S:
 		case R_BFIN_PCREL10:
-			pr_err("unsupported relocation: %u (no -mlong-calls?)\n",
+			mod_err(mod, "unsupported relocation: %u (no -mlong-calls?)\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 
 		default:
-			pr_err("unknown relocation: %u\n",
+			mod_err(mod, "unknown relocation: %u\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 		}
@@ -222,7 +225,7 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 			isram_memcpy((void *)location, &value, size);
 			break;
 		default:
-			pr_err("invalid relocation for %#lx\n", location);
+			mod_err(mod, "invalid relocation for %#lx\n", location);
 			return -ENOEXEC;
 		}
 	}
diff --git a/arch/cris/arch-v32/mach-a3/arbiter.c b/arch/cris/arch-v32/mach-a3/arbiter.c
index ab5c421a4de8..735a9b0abdb8 100644
--- a/arch/cris/arch-v32/mach-a3/arbiter.c
+++ b/arch/cris/arch-v32/mach-a3/arbiter.c
@@ -227,7 +227,7 @@ static void crisv32_arbiter_config(int arbiter, int region, int unused_slots)
 	}
 }
 
-extern char _stext, _etext;
+extern char _stext[], _etext[];
 
 static void crisv32_arbiter_init(void)
 {
@@ -265,7 +265,7 @@ static void crisv32_arbiter_init(void)
 
 #ifndef CONFIG_ETRAX_KGDB
 	/* Global watch for writes to kernel text segment. */
-	crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext,
+	crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext,
 		MARB_CLIENTS(arbiter_all_clients, arbiter_bar_all_clients),
 			      arbiter_all_write, NULL);
 #endif
diff --git a/arch/cris/arch-v32/mach-fs/arbiter.c b/arch/cris/arch-v32/mach-fs/arbiter.c
index c97f4d8120f9..047c70bdbb23 100644
--- a/arch/cris/arch-v32/mach-fs/arbiter.c
+++ b/arch/cris/arch-v32/mach-fs/arbiter.c
@@ -158,7 +158,7 @@ static void crisv32_arbiter_config(int region, int unused_slots)
 	}
 }
 
-extern char _stext, _etext;
+extern char _stext[], _etext[];
 
 static void crisv32_arbiter_init(void)
 {
@@ -190,7 +190,7 @@ static void crisv32_arbiter_init(void)
 
 #ifndef CONFIG_ETRAX_KGDB
 	/* Global watch for writes to kernel text segment. */
-	crisv32_arbiter_watch(virt_to_phys(&_stext), &_etext - &_stext,
+	crisv32_arbiter_watch(virt_to_phys(_stext), _etext - _stext,
 			      arbiter_all_clients, arbiter_all_write, NULL);
 #endif
 }
diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c
index a01636a12a6e..d98131c45bb5 100644
--- a/arch/cris/kernel/traps.c
+++ b/arch/cris/kernel/traps.c
@@ -42,7 +42,7 @@ void (*nmi_handler)(struct pt_regs *);
 void show_trace(unsigned long *stack)
 {
 	unsigned long addr, module_start, module_end;
-	extern char _stext, _etext;
+	extern char _stext[], _etext[];
 	int i;
 
 	pr_err("\nCall Trace: ");
@@ -69,8 +69,8 @@ void show_trace(unsigned long *stack)
 		 * down the cause of the crash will be able to figure
 		 * out the call path that was taken.
 		 */
-		if (((addr >= (unsigned long)&_stext) &&
-		     (addr <= (unsigned long)&_etext)) ||
+		if (((addr >= (unsigned long)_stext) &&
+		     (addr <= (unsigned long)_etext)) ||
 		    ((addr >= module_start) && (addr <= module_end))) {
 #ifdef CONFIG_KALLSYMS
 			print_ip_sym(addr);
diff --git a/arch/h8300/include/asm/traps.h b/arch/h8300/include/asm/traps.h
index 15e701130b27..1c5a30ec2df8 100644
--- a/arch/h8300/include/asm/traps.h
+++ b/arch/h8300/include/asm/traps.h
@@ -33,9 +33,9 @@ extern unsigned long *_interrupt_redirect_table;
 #define TRAP2_VEC 10
 #define TRAP3_VEC 11
 
-extern char _start, _etext;
+extern char _start[], _etext[];
 #define check_kernel_text(addr) \
-	((addr >= (unsigned long)(&_start)) && \
-	 (addr <  (unsigned long)(&_etext)) && !(addr & 1))
+	((addr >= (unsigned long)(_start)) && \
+	 (addr <  (unsigned long)(_etext)) && !(addr & 1))
 
 #endif /* _H8300_TRAPS_H */
diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h
index a1c55788c5d6..53a8d5885887 100644
--- a/arch/hexagon/include/asm/spinlock.h
+++ b/arch/hexagon/include/asm/spinlock.h
@@ -179,11 +179,6 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock)
  */
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
 #define arch_spin_is_locked(x) ((x)->lock != 0)
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h
index ca9e76149a4a..df2c121164b8 100644
--- a/arch/ia64/include/asm/spinlock.h
+++ b/arch/ia64/include/asm/spinlock.h
@@ -76,22 +76,6 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
 	ACCESS_ONCE(*p) = (tmp + 2) & ~1;
 }
 
-static __always_inline void __ticket_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	int	*p = (int *)&lock->lock, ticket;
-
-	ia64_invala();
-
-	for (;;) {
-		asm volatile ("ld4.c.nc %0=[%1]" : "=r"(ticket) : "r"(p) : "memory");
-		if (!(((ticket >> TICKET_SHIFT) ^ ticket) & TICKET_MASK))
-			return;
-		cpu_relax();
-	}
-
-	smp_acquire__after_ctrl_dep();
-}
-
 static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
 {
 	long tmp = ACCESS_ONCE(lock->lock);
@@ -143,11 +127,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
 	arch_spin_lock(lock);
 }
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	__ticket_spin_unlock_wait(lock);
-}
-
 #define arch_read_can_lock(rw)		(*(volatile int *)(rw) >= 0)
 #define arch_write_can_lock(rw)	(*(volatile int *)(rw) == 0)
 
diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h
index 323c7fc953cd..a56825592b90 100644
--- a/arch/m32r/include/asm/spinlock.h
+++ b/arch/m32r/include/asm/spinlock.h
@@ -30,11 +30,6 @@
 #define arch_spin_is_locked(x)		(*(volatile int *)(&(x)->slock) <= 0)
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->slock, VAL > 0);
-}
-
 /**
  * arch_spin_trylock - Try spin lock and return a result
  * @lock: Pointer to the lock variable
diff --git a/arch/metag/include/asm/spinlock.h b/arch/metag/include/asm/spinlock.h
index c0c7a22be1ae..ddf7fe5708a6 100644
--- a/arch/metag/include/asm/spinlock.h
+++ b/arch/metag/include/asm/spinlock.h
@@ -15,11 +15,6 @@
  * locked.
  */
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
 #define	arch_read_lock_flags(lock, flags) arch_read_lock(lock)
diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h
index 9c7b8f7942d8..fe413b41df6c 100644
--- a/arch/mn10300/include/asm/spinlock.h
+++ b/arch/mn10300/include/asm/spinlock.h
@@ -26,11 +26,6 @@
 
 #define arch_spin_is_locked(x)	(*(volatile signed char *)(&(x)->slock) != 0)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->slock, !VAL);
-}
-
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
 	asm volatile(
diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
index e32936cd7f10..55bfe4affca3 100644
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -14,13 +14,6 @@ static inline int arch_spin_is_locked(arch_spinlock_t *x)
 
 #define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *x)
-{
-	volatile unsigned int *a = __ldcw_align(x);
-
-	smp_cond_load_acquire(a, VAL);
-}
-
 static inline void arch_spin_lock_flags(arch_spinlock_t *x,
 					 unsigned long flags)
 {
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index c1b1ec94b06c..edbe571bcc54 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -170,39 +170,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	lock->slock = 0;
 }
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	arch_spinlock_t lock_val;
-
-	smp_mb();
-
-	/*
-	 * Atomically load and store back the lock value (unchanged). This
-	 * ensures that our observation of the lock value is ordered with
-	 * respect to other lock operations.
-	 */
-	__asm__ __volatile__(
-"1:	" PPC_LWARX(%0, 0, %2, 0) "\n"
-"	stwcx. %0, 0, %2\n"
-"	bne- 1b\n"
-	: "=&r" (lock_val), "+m" (*lock)
-	: "r" (lock)
-	: "cr0", "xer");
-
-	if (arch_spin_value_unlocked(lock_val))
-		goto out;
-
-	while (lock->slock) {
-		HMT_low();
-		if (SHARED_PROCESSOR)
-			__spin_yield(lock);
-	}
-	HMT_medium();
-
-out:
-	smp_mb();
-}
-
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6c2d4168daec..2e3eb7431571 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2039,7 +2039,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 
 		perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
 
-		if (event->attr.sample_type & PERF_SAMPLE_ADDR)
+		if (event->attr.sample_type &
+		    (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
 			perf_get_data_addr(regs, &data.addr);
 
 		if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index f7838ecd83c6..217ee5210c32 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -98,13 +98,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp)
 		: "cc", "memory");
 }
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	while (arch_spin_is_locked(lock))
-		arch_spin_relax(lock);
-	smp_acquire__after_ctrl_dep();
-}
-
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h
index c46e8cc7b515..5ed7dbbd94ff 100644
--- a/arch/sh/include/asm/spinlock-cas.h
+++ b/arch/sh/include/asm/spinlock-cas.h
@@ -29,11 +29,6 @@ static inline unsigned __sl_cas(volatile unsigned *p, unsigned old, unsigned new
 #define arch_spin_is_locked(x)		((x)->lock <= 0)
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, VAL > 0);
-}
-
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	while (!__sl_cas(&lock->lock, 1, 0));
diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h
index cec78143fa83..f77263aae760 100644
--- a/arch/sh/include/asm/spinlock-llsc.h
+++ b/arch/sh/include/asm/spinlock-llsc.h
@@ -21,11 +21,6 @@
 #define arch_spin_is_locked(x)		((x)->lock <= 0)
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, VAL > 0);
-}
-
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h
index 8011e79f59c9..67345b2dc408 100644
--- a/arch/sparc/include/asm/spinlock_32.h
+++ b/arch/sparc/include/asm/spinlock_32.h
@@ -14,11 +14,6 @@
 
 #define arch_spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	__asm__ __volatile__(
diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h
index b14b1ba5bf9c..cba8ba9b8da6 100644
--- a/arch/tile/include/asm/spinlock_32.h
+++ b/arch/tile/include/asm/spinlock_32.h
@@ -64,8 +64,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	lock->current_ticket = old_ticket + TICKET_QUANTUM;
 }
 
-void arch_spin_unlock_wait(arch_spinlock_t *lock);
-
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h
index b9718fb4e74a..9a2c2d605752 100644
--- a/arch/tile/include/asm/spinlock_64.h
+++ b/arch/tile/include/asm/spinlock_64.h
@@ -58,8 +58,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	__insn_fetchadd4(&lock->lock, 1U << __ARCH_SPIN_CURRENT_SHIFT);
 }
 
-void arch_spin_unlock_wait(arch_spinlock_t *lock);
-
 void arch_spin_lock_slow(arch_spinlock_t *lock, u32 val);
 
 /* Grab the "next" ticket number and bump it atomically.
diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c
index 076c6cc43113..db9333f2447c 100644
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -62,29 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock)
 }
 EXPORT_SYMBOL(arch_spin_trylock);
 
-void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	u32 iterations = 0;
-	int curr = READ_ONCE(lock->current_ticket);
-	int next = READ_ONCE(lock->next_ticket);
-
-	/* Return immediately if unlocked. */
-	if (next == curr)
-		return;
-
-	/* Wait until the current locker has released the lock. */
-	do {
-		delay_backoff(iterations++);
-	} while (READ_ONCE(lock->current_ticket) == curr);
-
-	/*
-	 * The TILE architecture doesn't do read speculation; therefore
-	 * a control dependency guarantees a LOAD->{LOAD,STORE} order.
-	 */
-	barrier();
-}
-EXPORT_SYMBOL(arch_spin_unlock_wait);
-
 /*
  * The low byte is always reserved to be the marker for a "tns" operation
  * since the low bit is set to "1" by a tns.  The next seven bits are
diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c
index a4b5b2cbce93..de414c22892f 100644
--- a/arch/tile/lib/spinlock_64.c
+++ b/arch/tile/lib/spinlock_64.c
@@ -62,28 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock)
 }
 EXPORT_SYMBOL(arch_spin_trylock);
 
-void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	u32 iterations = 0;
-	u32 val = READ_ONCE(lock->lock);
-	u32 curr = arch_spin_current(val);
-
-	/* Return immediately if unlocked. */
-	if (arch_spin_next(val) == curr)
-		return;
-
-	/* Wait until the current locker has released the lock. */
-	do {
-		delay_backoff(iterations++);
-	} while (arch_spin_current(READ_ONCE(lock->lock)) == curr);
-
-	/*
-	 * The TILE architecture doesn't do read speculation; therefore
-	 * a control dependency guarantees a LOAD->{LOAD,STORE} order.
-	 */
-	barrier();
-}
-EXPORT_SYMBOL(arch_spin_unlock_wait);
 
 /*
  * If the read lock fails due to a writer, we retry periodically
diff --git a/arch/um/include/asm/unwind.h b/arch/um/include/asm/unwind.h
new file mode 100644
index 000000000000..7ffa5437b761
--- /dev/null
+++ b/arch/um/include/asm/unwind.h
@@ -0,0 +1,8 @@
+#ifndef _ASM_UML_UNWIND_H
+#define _ASM_UML_UNWIND_H
+
+static inline void
+unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+		   void *orc, size_t orc_size) {}
+
+#endif /* _ASM_UML_UNWIND_H */
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 586b786b3edf..f65a804b86f0 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/
 # Hyper-V paravirtualization support
 obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/
 
-# lguest paravirtualization support
-obj-$(CONFIG_LGUEST_GUEST) += lguest/
-
 obj-y += realmode/
 obj-y += kernel/
 obj-y += mm/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index efd9df53b9e7..cce15191e9e9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -75,7 +75,6 @@ config X86
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANTS_THP_SWAP		if X86_64
 	select BUILDTIME_EXTABLE_SORT
@@ -160,6 +159,7 @@ config X86
 	select HAVE_MEMBLOCK
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_MIXED_BREAKPOINTS_REGS
+	select HAVE_MOD_ARCH_SPECIFIC
 	select HAVE_NMI
 	select HAVE_OPROFILE
 	select HAVE_OPTPROBES
@@ -170,7 +170,7 @@ config X86
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
-	select HAVE_RELIABLE_STACKTRACE		if X86_64 && FRAME_POINTER && STACK_VALIDATION
+	select HAVE_RELIABLE_STACKTRACE		if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
 	select HAVE_STACK_VALIDATION		if X86_64
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNSTABLE_SCHED_CLOCK
@@ -780,8 +780,6 @@ config KVM_DEBUG_FS
 	  Statistics are displayed in debugfs filesystem. Enabling this option
 	  may incur significant overhead.
 
-source "arch/x86/lguest/Kconfig"
-
 config PARAVIRT_TIME_ACCOUNTING
 	bool "Paravirtual steal time accounting"
 	depends on PARAVIRT
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 1fc519f3c49e..71a48a30fc84 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -356,4 +356,61 @@ config PUNIT_ATOM_DEBUG
 	  The current power state can be read from
 	  /sys/kernel/debug/punit_atom/dev_power_state
 
+choice
+	prompt "Choose kernel unwinder"
+	default FRAME_POINTER_UNWINDER
+	---help---
+	  This determines which method will be used for unwinding kernel stack
+	  traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
+	  livepatch, lockdep, and more.
+
+config FRAME_POINTER_UNWINDER
+	bool "Frame pointer unwinder"
+	select FRAME_POINTER
+	---help---
+	  This option enables the frame pointer unwinder for unwinding kernel
+	  stack traces.
+
+	  The unwinder itself is fast and it uses less RAM than the ORC
+	  unwinder, but the kernel text size will grow by ~3% and the kernel's
+	  overall performance will degrade by roughly 5-10%.
+
+	  This option is recommended if you want to use the livepatch
+	  consistency model, as this is currently the only way to get a
+	  reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
+
+config ORC_UNWINDER
+	bool "ORC unwinder"
+	depends on X86_64
+	select STACK_VALIDATION
+	---help---
+	  This option enables the ORC (Oops Rewind Capability) unwinder for
+	  unwinding kernel stack traces.  It uses a custom data format which is
+	  a simplified version of the DWARF Call Frame Information standard.
+
+	  This unwinder is more accurate across interrupt entry frames than the
+	  frame pointer unwinder.  It also enables a 5-10% performance
+	  improvement across the entire kernel compared to frame pointers.
+
+	  Enabling this option will increase the kernel's runtime memory usage
+	  by roughly 2-4MB, depending on your kernel config.
+
+config GUESS_UNWINDER
+	bool "Guess unwinder"
+	depends on EXPERT
+	---help---
+	  This option enables the "guess" unwinder for unwinding kernel stack
+	  traces.  It scans the stack and reports every kernel text address it
+	  finds.  Some of the addresses it reports may be incorrect.
+
+	  While this option often produces false positives, it can still be
+	  useful in many cases.  Unlike the other unwinders, it has no runtime
+	  overhead.
+
+endchoice
+
+config FRAME_POINTER
+	depends on !ORC_UNWINDER && !GUESS_UNWINDER
+	bool
+
 endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1e902f926be3..6276572259c8 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -14,9 +14,11 @@ endif
 # For gcc stack alignment is specified with -mpreferred-stack-boundary,
 # clang has the option -mstack-alignment for that purpose.
 ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
-        cc_stack_align_opt := -mpreferred-stack-boundary
-else ifneq ($(call cc-option, -mstack-alignment=4),)
-        cc_stack_align_opt := -mstack-alignment
+      cc_stack_align4 := -mpreferred-stack-boundary=2
+      cc_stack_align8 := -mpreferred-stack-boundary=3
+else ifneq ($(call cc-option, -mstack-alignment=16),)
+      cc_stack_align4 := -mstack-alignment=4
+      cc_stack_align8 := -mstack-alignment=8
 endif
 
 # How to compile the 16-bit code.  Note we always compile for -march=i386;
@@ -36,7 +38,7 @@ REALMODE_CFLAGS	:= $(M16_CFLAGS) -g -Os -D__KERNEL__ \
 
 REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
 REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
-REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align_opt)=2)
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4))
 export REALMODE_CFLAGS
 
 # BITS is used as extension for files which are available in a 32 bit
@@ -76,7 +78,7 @@ ifeq ($(CONFIG_X86_32),y)
         # Align the stack to the register width instead of using the default
         # alignment of 16 bytes. This reduces stack usage and the number of
         # alignment instructions.
-        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=2)
+        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4))
 
         # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
         # a lot more stack due to the lack of sharing of stacklots:
@@ -115,7 +117,7 @@ else
         # default alignment which keep the stack *mis*aligned.
         # Furthermore an alignment to the register width reduces stack usage
         # and the number of alignment instructions.
-        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align_opt)=3)
+        KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8))
 
 	# Use -mskip-rax-setup if supported.
 	KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
@@ -232,9 +234,6 @@ KBUILD_CFLAGS += -Wno-sign-compare
 #
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 
-KBUILD_CFLAGS += $(mflags-y)
-KBUILD_AFLAGS += $(mflags-y)
-
 archscripts: scripts_basic
 	$(Q)$(MAKE) $(build)=arch/x86/tools relocs
 
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index c3e869eaef0c..e007887a33b0 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -767,7 +767,7 @@ static efi_status_t setup_e820(struct boot_params *params,
 		m |= (u64)efi->efi_memmap_hi << 32;
 #endif
 
-		d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size));
+		d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i);
 		switch (d->type) {
 		case EFI_RESERVED_TYPE:
 		case EFI_RUNTIME_SERVICES_CODE:
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index d85b9625e836..11c68cf53d4e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -61,71 +61,6 @@
 
 	__HEAD
 ENTRY(startup_32)
-#ifdef CONFIG_EFI_STUB
-	jmp	preferred_addr
-
-	/*
-	 * We don't need the return address, so set up the stack so
-	 * efi_main() can find its arguments.
-	 */
-ENTRY(efi_pe_entry)
-	add	$0x4, %esp
-
-	call	1f
-1:	popl	%esi
-	subl	$1b, %esi
-
-	popl	%ecx
-	movl	%ecx, efi32_config(%esi)	/* Handle */
-	popl	%ecx
-	movl	%ecx, efi32_config+8(%esi)	/* EFI System table pointer */
-
-	/* Relocate efi_config->call() */
-	leal	efi32_config(%esi), %eax
-	add	%esi, 40(%eax)
-	pushl	%eax
-
-	call	make_boot_params
-	cmpl	$0, %eax
-	je	fail
-	movl	%esi, BP_code32_start(%eax)
-	popl	%ecx
-	pushl	%eax
-	pushl	%ecx
-	jmp	2f		/* Skip efi_config initialization */
-
-ENTRY(efi32_stub_entry)
-	add	$0x4, %esp
-	popl	%ecx
-	popl	%edx
-
-	call	1f
-1:	popl	%esi
-	subl	$1b, %esi
-
-	movl	%ecx, efi32_config(%esi)	/* Handle */
-	movl	%edx, efi32_config+8(%esi)	/* EFI System table pointer */
-
-	/* Relocate efi_config->call() */
-	leal	efi32_config(%esi), %eax
-	add	%esi, 40(%eax)
-	pushl	%eax
-2:
-	call	efi_main
-	cmpl	$0, %eax
-	movl	%eax, %esi
-	jne	2f
-fail:
-	/* EFI init failed, so hang. */
-	hlt
-	jmp	fail
-2:
-	movl	BP_code32_start(%esi), %eax
-	leal	preferred_addr(%eax), %eax
-	jmp	*%eax
-
-preferred_addr:
-#endif
 	cld
 	/*
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
@@ -208,6 +143,70 @@ preferred_addr:
 	jmp	*%eax
 ENDPROC(startup_32)
 
+#ifdef CONFIG_EFI_STUB
+/*
+ * We don't need the return address, so set up the stack so efi_main() can find
+ * its arguments.
+ */
+ENTRY(efi_pe_entry)
+	add	$0x4, %esp
+
+	call	1f
+1:	popl	%esi
+	subl	$1b, %esi
+
+	popl	%ecx
+	movl	%ecx, efi32_config(%esi)	/* Handle */
+	popl	%ecx
+	movl	%ecx, efi32_config+8(%esi)	/* EFI System table pointer */
+
+	/* Relocate efi_config->call() */
+	leal	efi32_config(%esi), %eax
+	add	%esi, 40(%eax)
+	pushl	%eax
+
+	call	make_boot_params
+	cmpl	$0, %eax
+	je	fail
+	movl	%esi, BP_code32_start(%eax)
+	popl	%ecx
+	pushl	%eax
+	pushl	%ecx
+	jmp	2f		/* Skip efi_config initialization */
+ENDPROC(efi_pe_entry)
+
+ENTRY(efi32_stub_entry)
+	add	$0x4, %esp
+	popl	%ecx
+	popl	%edx
+
+	call	1f
+1:	popl	%esi
+	subl	$1b, %esi
+
+	movl	%ecx, efi32_config(%esi)	/* Handle */
+	movl	%edx, efi32_config+8(%esi)	/* EFI System table pointer */
+
+	/* Relocate efi_config->call() */
+	leal	efi32_config(%esi), %eax
+	add	%esi, 40(%eax)
+	pushl	%eax
+2:
+	call	efi_main
+	cmpl	$0, %eax
+	movl	%eax, %esi
+	jne	2f
+fail:
+	/* EFI init failed, so hang. */
+	hlt
+	jmp	fail
+2:
+	movl	BP_code32_start(%esi), %eax
+	leal	startup_32(%eax), %eax
+	jmp	*%eax
+ENDPROC(efi32_stub_entry)
+#endif
+
 	.text
 relocated:
 
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index fbf4c32d0b62..b4a5d284391c 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -243,65 +243,6 @@ ENTRY(startup_64)
 	 * that maps our entire kernel(text+data+bss+brk), zero page
 	 * and command line.
 	 */
-#ifdef CONFIG_EFI_STUB
-	/*
-	 * The entry point for the PE/COFF executable is efi_pe_entry, so
-	 * only legacy boot loaders will execute this jmp.
-	 */
-	jmp	preferred_addr
-
-ENTRY(efi_pe_entry)
-	movq	%rcx, efi64_config(%rip)	/* Handle */
-	movq	%rdx, efi64_config+8(%rip) /* EFI System table pointer */
-
-	leaq	efi64_config(%rip), %rax
-	movq	%rax, efi_config(%rip)
-
-	call	1f
-1:	popq	%rbp
-	subq	$1b, %rbp
-
-	/*
-	 * Relocate efi_config->call().
-	 */
-	addq	%rbp, efi64_config+40(%rip)
-
-	movq	%rax, %rdi
-	call	make_boot_params
-	cmpq	$0,%rax
-	je	fail
-	mov	%rax, %rsi
-	leaq	startup_32(%rip), %rax
-	movl	%eax, BP_code32_start(%rsi)
-	jmp	2f		/* Skip the relocation */
-
-handover_entry:
-	call	1f
-1:	popq	%rbp
-	subq	$1b, %rbp
-
-	/*
-	 * Relocate efi_config->call().
-	 */
-	movq	efi_config(%rip), %rax
-	addq	%rbp, 40(%rax)
-2:
-	movq	efi_config(%rip), %rdi
-	call	efi_main
-	movq	%rax,%rsi
-	cmpq	$0,%rax
-	jne	2f
-fail:
-	/* EFI init failed, so hang. */
-	hlt
-	jmp	fail
-2:
-	movl	BP_code32_start(%esi), %eax
-	leaq	preferred_addr(%rax), %rax
-	jmp	*%rax
-
-preferred_addr:
-#endif
 
 	/* Setup data segments. */
 	xorl	%eax, %eax
@@ -413,6 +354,59 @@ lvl5:
 	jmp	*%rax
 
 #ifdef CONFIG_EFI_STUB
+
+/* The entry point for the PE/COFF executable is efi_pe_entry. */
+ENTRY(efi_pe_entry)
+	movq	%rcx, efi64_config(%rip)	/* Handle */
+	movq	%rdx, efi64_config+8(%rip) /* EFI System table pointer */
+
+	leaq	efi64_config(%rip), %rax
+	movq	%rax, efi_config(%rip)
+
+	call	1f
+1:	popq	%rbp
+	subq	$1b, %rbp
+
+	/*
+	 * Relocate efi_config->call().
+	 */
+	addq	%rbp, efi64_config+40(%rip)
+
+	movq	%rax, %rdi
+	call	make_boot_params
+	cmpq	$0,%rax
+	je	fail
+	mov	%rax, %rsi
+	leaq	startup_32(%rip), %rax
+	movl	%eax, BP_code32_start(%rsi)
+	jmp	2f		/* Skip the relocation */
+
+handover_entry:
+	call	1f
+1:	popq	%rbp
+	subq	$1b, %rbp
+
+	/*
+	 * Relocate efi_config->call().
+	 */
+	movq	efi_config(%rip), %rax
+	addq	%rbp, 40(%rax)
+2:
+	movq	efi_config(%rip), %rdi
+	call	efi_main
+	movq	%rax,%rsi
+	cmpq	$0,%rax
+	jne	2f
+fail:
+	/* EFI init failed, so hang. */
+	hlt
+	jmp	fail
+2:
+	movl	BP_code32_start(%esi), %eax
+	leaq	startup_64(%rax), %rax
+	jmp	*%rax
+ENDPROC(efi_pe_entry)
+
 	.org 0x390
 ENTRY(efi64_stub_entry)
 	movq	%rdi, efi64_config(%rip)	/* Handle */
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 91f27ab970ef..17818ba6906f 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -37,7 +37,9 @@
 #include <linux/uts.h>
 #include <linux/utsname.h>
 #include <linux/ctype.h>
+#include <linux/efi.h>
 #include <generated/utsrelease.h>
+#include <asm/efi.h>
 
 /* Macros used by the included decompressor code below. */
 #define STATIC
@@ -479,35 +481,31 @@ static unsigned long slots_fetch_random(void)
 	return 0;
 }
 
-static void process_e820_entry(struct boot_e820_entry *entry,
+static void process_mem_region(struct mem_vector *entry,
 			       unsigned long minimum,
 			       unsigned long image_size)
 {
 	struct mem_vector region, overlap;
 	struct slot_area slot_area;
 	unsigned long start_orig, end;
-	struct boot_e820_entry cur_entry;
-
-	/* Skip non-RAM entries. */
-	if (entry->type != E820_TYPE_RAM)
-		return;
+	struct mem_vector cur_entry;
 
 	/* On 32-bit, ignore entries entirely above our maximum. */
-	if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE)
+	if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
 		return;
 
 	/* Ignore entries entirely below our minimum. */
-	if (entry->addr + entry->size < minimum)
+	if (entry->start + entry->size < minimum)
 		return;
 
 	/* Ignore entries above memory limit */
-	end = min(entry->size + entry->addr, mem_limit);
-	if (entry->addr >= end)
+	end = min(entry->size + entry->start, mem_limit);
+	if (entry->start >= end)
 		return;
-	cur_entry.addr = entry->addr;
-	cur_entry.size = end - entry->addr;
+	cur_entry.start = entry->start;
+	cur_entry.size = end - entry->start;
 
-	region.start = cur_entry.addr;
+	region.start = cur_entry.start;
 	region.size = cur_entry.size;
 
 	/* Give up if slot area array is full. */
@@ -521,8 +519,8 @@ static void process_e820_entry(struct boot_e820_entry *entry,
 		/* Potentially raise address to meet alignment needs. */
 		region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
 
-		/* Did we raise the address above this e820 region? */
-		if (region.start > cur_entry.addr + cur_entry.size)
+		/* Did we raise the address above the passed in memory entry? */
+		if (region.start > cur_entry.start + cur_entry.size)
 			return;
 
 		/* Reduce size by any delta from the original address. */
@@ -562,31 +560,126 @@ static void process_e820_entry(struct boot_e820_entry *entry,
 	}
 }
 
-static unsigned long find_random_phys_addr(unsigned long minimum,
-					   unsigned long image_size)
+#ifdef CONFIG_EFI
+/*
+ * Returns true if the EFI memmap was processed (the caller then skips the
+ * e820 table), false when there is no EFI memmap this code can use.
+ */
+static bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
 {
+	struct efi_info *e = &boot_params->efi_info;
+	bool efi_mirror_found = false;
+	struct mem_vector region;
+	efi_memory_desc_t *md;
+	unsigned long pmap;
+	char *signature;
+	u32 nr_desc;
 	int i;
-	unsigned long addr;
 
-	/* Check if we had too many memmaps. */
-	if (memmap_too_large) {
-		debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n");
-		return 0;
+	signature = (char *)&e->efi_loader_signature;
+	if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+	    strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
+		return false;	/* no EFI loader signature */
+
+#ifdef CONFIG_X86_32
+	/* Can't handle data above 4GB at this time */
+	if (e->efi_memmap_hi) {
+		warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
+		return false;
 	}
+	pmap =  e->efi_memmap;
+#else
+	pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
+#endif
 
-	/* Make sure minimum is aligned. */
-	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+	nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
+	for (i = 0; i < nr_desc; i++) {	/* pass 1: any MORE_RELIABLE entry? */
+		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+		if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+			efi_mirror_found = true;
+			break;
+		}
+	}
+
+	for (i = 0; i < nr_desc; i++) {	/* pass 2: process regions */
+		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+
+		/*
+		 * Here we are more conservative in picking free memory than
+		 * the EFI spec allows:
+		 *
+		 * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also
+		 * free memory and thus available to place the kernel image into,
+		 * but in practice there's firmware where using that memory leads
+		 * to crashes.
+		 *
+		 * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free.
+		 */
+		if (md->type != EFI_CONVENTIONAL_MEMORY)
+			continue;
+
+		if (efi_mirror_found &&
+		    !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
+			continue;	/* prefer MORE_RELIABLE regions */
+
+		region.start = md->phys_addr;
+		region.size = md->num_pages << EFI_PAGE_SHIFT;
+		process_mem_region(&region, minimum, image_size);
+		if (slot_area_index == MAX_SLOT_AREA) {
+			debug_putstr("Aborted EFI scan (slot_areas full)!\n");
+			break;
+		}
+	}
+	return true;
+}
+#else
+static inline bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
+{
+	return false;
+}
+#endif
+
+static void process_e820_entries(unsigned long minimum,
+				 unsigned long image_size)
+{
+	int i;
+	struct mem_vector region;
+	struct boot_e820_entry *entry;
 
 	/* Verify potential e820 positions, appending to slots list. */
 	for (i = 0; i < boot_params->e820_entries; i++) {
-		process_e820_entry(&boot_params->e820_table[i], minimum,
-				   image_size);
+		entry = &boot_params->e820_table[i];
+		/* Skip non-RAM entries. */
+		if (entry->type != E820_TYPE_RAM)
+			continue;
+		region.start = entry->addr;
+		region.size = entry->size;
+		process_mem_region(&region, minimum, image_size);
 		if (slot_area_index == MAX_SLOT_AREA) {
 			debug_putstr("Aborted e820 scan (slot_areas full)!\n");
 			break;
 		}
 	}
+}
+
+static unsigned long find_random_phys_addr(unsigned long minimum,
+					   unsigned long image_size)
+{
+	/* Check if we had too many memmaps. */
+	if (memmap_too_large) {
+		debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
+		return 0;	/* 0: no random address chosen */
+	}
+
+	/* Make sure minimum is aligned. */
+	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+
+	if (process_efi_entries(minimum, image_size))
+		return slots_fetch_random();	/* EFI memmap was used */
 
+	process_e820_entries(minimum, image_size);	/* fallback: e820 */
 	return slots_fetch_random();
 }
 
@@ -645,7 +738,7 @@ void choose_random_location(unsigned long input,
 	 */
 	min_addr = min(*output, 512UL << 20);
 
-	/* Walk e820 and find a random address. */
+	/* Walk available memory entries to find a random address. */
 	random_addr = find_random_phys_addr(min_addr, output_size);
 	if (!random_addr) {
 		warn("Physical KASLR disabled: no suitable memory region!");
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
index 4b429df40d7a..550cd5012b73 100644
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
@@ -1,3 +1,5 @@
 CONFIG_NOHIGHMEM=y
 # CONFIG_HIGHMEM4G is not set
 # CONFIG_HIGHMEM64G is not set
+CONFIG_GUESS_UNWINDER=y
+# CONFIG_FRAME_POINTER_UNWINDER is not set
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index cdefcfdd9e63..03505ffbe1b6 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -23,6 +23,7 @@
 #include <linux/user-return-notifier.h>
 #include <linux/uprobes.h>
 #include <linux/livepatch.h>
+#include <linux/syscalls.h>
 
 #include <asm/desc.h>
 #include <asm/traps.h>
@@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	struct thread_info *ti = current_thread_info();
 	u32 cached_flags;
 
+	addr_limit_user_check();
+
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
 		local_irq_disable();
 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 64b233ab7cad..ca0b250eefc4 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,14 +142,8 @@ ENTRY(entry_SYSCALL_64)
 	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
 	 * it is too small to ever cause noticeable irq latency.
 	 */
-	SWAPGS_UNSAFE_STACK
-	/*
-	 * A hypervisor implementation might want to use a label
-	 * after the swapgs, so that it can do the swapgs
-	 * for the guest and jump here on syscall.
-	 */
-GLOBAL(entry_SYSCALL_64_after_swapgs)
 
+	swapgs
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
@@ -161,6 +155,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
 	pushq	%r11				/* pt_regs->flags */
 	pushq	$__USER_CS			/* pt_regs->cs */
 	pushq	%rcx				/* pt_regs->ip */
+GLOBAL(entry_SYSCALL_64_after_hwframe)
 	pushq	%rax				/* pt_regs->orig_ax */
 	pushq	%rdi				/* pt_regs->di */
 	pushq	%rsi				/* pt_regs->si */
@@ -766,13 +761,8 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym)
 #endif
 
 /* Make sure APIC interrupt handlers end up in the irqentry section: */
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
-# define PUSH_SECTION_IRQENTRY	.pushsection .irqentry.text, "ax"
-# define POP_SECTION_IRQENTRY	.popsection
-#else
-# define PUSH_SECTION_IRQENTRY
-# define POP_SECTION_IRQENTRY
-#endif
+#define PUSH_SECTION_IRQENTRY	.pushsection .irqentry.text, "ax"
+#define POP_SECTION_IRQENTRY	.popsection
 
 .macro apicinterrupt num sym do_sym
 PUSH_SECTION_IRQENTRY
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721dafbcb1..4b86d8da3ea3 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat)
  */
 ENTRY(entry_SYSCALL_compat)
 	/* Interrupts are off on entry. */
-	SWAPGS_UNSAFE_STACK
+	swapgs
 
 	/* Stash user ESP and switch to the kernel stack. */
 	movl	%esp, %r8d
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
-	/* Zero-extending 32-bit regs, do not remove */
-	movl	%eax, %eax
-
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER32_DS		/* pt_regs->ss */
 	pushq	%r8			/* pt_regs->sp */
 	pushq	%r11			/* pt_regs->flags */
 	pushq	$__USER32_CS		/* pt_regs->cs */
 	pushq	%rcx			/* pt_regs->ip */
+GLOBAL(entry_SYSCALL_compat_after_hwframe)
+	movl	%eax, %eax		/* discard orig_ax high bits */
 	pushq	%rax			/* pt_regs->orig_ax */
 	pushq	%rdi			/* pt_regs->di */
 	pushq	%rsi			/* pt_regs->si */
@@ -342,8 +341,7 @@ ENTRY(entry_INT80_compat)
 	jmp	restore_regs_and_iret
 END(entry_INT80_compat)
 
-	ALIGN
-GLOBAL(stub32_clone)
+ENTRY(stub32_clone)
 	/*
 	 * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
 	 * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
@@ -353,3 +351,4 @@ GLOBAL(stub32_clone)
 	 */
 	xchg	%r8, %rcx
 	jmp	sys_clone
+ENDPROC(stub32_clone)
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index ad44af0dd667..f5cbbba99283 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -400,11 +400,24 @@ static int amd_uncore_cpu_starting(unsigned int cpu)
 
 	if (amd_uncore_llc) {
 		unsigned int apicid = cpu_data(cpu).apicid;
-		unsigned int nshared;
+		unsigned int nshared, subleaf, prev_eax = 0;
 
 		uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
-		cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
-		nshared = ((eax >> 14) & 0xfff) + 1;
+		/*
+		 * Iterate over Cache Topology Definition leaves until no
+		 * more cache descriptions are available.
+		 */
+		for (subleaf = 0; subleaf < 5; subleaf++) {
+			cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx);
+
+			/* EAX[0:4] gives type of cache */
+			if (!(eax & 0x1f))
+				break;
+
+			prev_eax = eax;
+		}
+		nshared = ((prev_eax >> 14) & 0xfff) + 1;
+
 		uncore->id = apicid - (apicid % nshared);
 
 		uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
@@ -555,7 +568,7 @@ static int __init amd_uncore_init(void)
 		ret = 0;
 	}
 
-	if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) {
+	if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
 		amd_uncore_llc = alloc_percpu(struct amd_uncore *);
 		if (!amd_uncore_llc) {
 			ret = -ENOMEM;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 939050169d12..80534d3c2480 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -487,22 +487,28 @@ static inline int precise_br_compat(struct perf_event *event)
 	return m == b;
 }
 
-int x86_pmu_hw_config(struct perf_event *event)
+int x86_pmu_max_precise(void)
 {
-	if (event->attr.precise_ip) {
-		int precise = 0;
+	int precise = 0;
+
+	/* Support for constant skid */
+	if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+		precise++;
 
-		/* Support for constant skid */
-		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+		/* Support for IP fixup */
+		if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
 			precise++;
 
-			/* Support for IP fixup */
-			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
-				precise++;
+		if (x86_pmu.pebs_prec_dist)
+			precise++;
+	}
+	return precise;	/* 0 (PEBS inactive or broken) up to 3 */
+}
 
-			if (x86_pmu.pebs_prec_dist)
-				precise++;
-		}
+int x86_pmu_hw_config(struct perf_event *event)
+{
+	if (event->attr.precise_ip) {
+		int precise = x86_pmu_max_precise();
 
 		if (event->attr.precise_ip > precise)
 			return -EOPNOTSUPP;
@@ -1751,6 +1757,7 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
 }
 
 static struct attribute_group x86_pmu_attr_group;
+static struct attribute_group x86_pmu_caps_group;
 
 static int __init init_hw_perf_events(void)
 {
@@ -1799,6 +1806,14 @@ static int __init init_hw_perf_events(void)
 
 	x86_pmu_format_group.attrs = x86_pmu.format_attrs;
 
+	if (x86_pmu.caps_attrs) {
+		struct attribute **tmp;
+
+		tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs);
+		if (!WARN_ON(!tmp))
+			x86_pmu_caps_group.attrs = tmp;
+	}
+
 	if (x86_pmu.event_attrs)
 		x86_pmu_events_group.attrs = x86_pmu.event_attrs;
 
@@ -2213,10 +2228,30 @@ static struct attribute_group x86_pmu_attr_group = {
 	.attrs = x86_pmu_attrs,
 };
 
+static ssize_t max_precise_show(struct device *cdev,
+				  struct device_attribute *attr,
+				  char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
+}
+
+static DEVICE_ATTR_RO(max_precise);
+
+static struct attribute *x86_pmu_caps_attrs[] = {
+	&dev_attr_max_precise.attr,
+	NULL
+};
+
+static struct attribute_group x86_pmu_caps_group = {
+	.name = "caps",
+	.attrs = x86_pmu_caps_attrs,
+};
+
 static const struct attribute_group *x86_pmu_attr_groups[] = {
 	&x86_pmu_attr_group,
 	&x86_pmu_format_group,
 	&x86_pmu_events_group,
+	&x86_pmu_caps_group,
 	NULL,
 };
 
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index ddd8d3516bfc..16076eb34699 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -268,7 +268,7 @@ static void bts_event_start(struct perf_event *event, int flags)
 	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
 	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
 
-	event->hw.itrace_started = 1;
+	perf_event_itrace_started(event);
 	event->hw.state = 0;
 
 	__bts_event_start(event);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 98b0f0729527..829e89cfcee2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3415,12 +3415,26 @@ static struct attribute *intel_arch3_formats_attr[] = {
 	&format_attr_any.attr,
 	&format_attr_inv.attr,
 	&format_attr_cmask.attr,
+	NULL,
+};
+
+static struct attribute *hsw_format_attr[] = {
 	&format_attr_in_tx.attr,
 	&format_attr_in_tx_cp.attr,
+	&format_attr_offcore_rsp.attr,
+	&format_attr_ldlat.attr,
+	NULL
+};
 
-	&format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
-	&format_attr_ldlat.attr, /* PEBS load latency */
-	NULL,
+static struct attribute *nhm_format_attr[] = {
+	&format_attr_offcore_rsp.attr,
+	&format_attr_ldlat.attr,
+	NULL
+};
+
+static struct attribute *slm_format_attr[] = {
+	&format_attr_offcore_rsp.attr,
+	NULL
 };
 
 static struct attribute *skl_format_attr[] = {
@@ -3781,6 +3795,36 @@ done:
 
 static DEVICE_ATTR_RW(freeze_on_smi);
 
+static ssize_t branches_show(struct device *cdev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
+}
+
+static DEVICE_ATTR_RO(branches);
+
+static struct attribute *lbr_attrs[] = {
+	&dev_attr_branches.attr,
+	NULL
+};
+
+static char pmu_name_str[30];
+
+static ssize_t pmu_name_show(struct device *cdev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str);
+}
+
+static DEVICE_ATTR_RO(pmu_name);
+
+static struct attribute *intel_pmu_caps_attrs[] = {
+       &dev_attr_pmu_name.attr,
+       NULL
+};
+
 static struct attribute *intel_pmu_attrs[] = {
 	&dev_attr_freeze_on_smi.attr,
 	NULL,
@@ -3795,6 +3839,8 @@ __init int intel_pmu_init(void)
 	unsigned int unused;
 	struct extra_reg *er;
 	int version, i;
+	struct attribute **extra_attr = NULL;
+	char *name;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
 		switch (boot_cpu_data.x86) {
@@ -3862,6 +3908,7 @@ __init int intel_pmu_init(void)
 	switch (boot_cpu_data.x86_model) {
 	case INTEL_FAM6_CORE_YONAH:
 		pr_cont("Core events, ");
+		name = "core";
 		break;
 
 	case INTEL_FAM6_CORE2_MEROM:
@@ -3877,6 +3924,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.event_constraints = intel_core2_event_constraints;
 		x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
 		pr_cont("Core2 events, ");
+		name = "core2";
 		break;
 
 	case INTEL_FAM6_NEHALEM:
@@ -3905,8 +3953,11 @@ __init int intel_pmu_init(void)
 
 		intel_pmu_pebs_data_source_nhm();
 		x86_add_quirk(intel_nehalem_quirk);
+		x86_pmu.pebs_no_tlb = 1;
+		extra_attr = nhm_format_attr;
 
 		pr_cont("Nehalem events, ");
+		name = "nehalem";
 		break;
 
 	case INTEL_FAM6_ATOM_PINEVIEW:
@@ -3923,6 +3974,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
 		pr_cont("Atom events, ");
+		name = "bonnell";
 		break;
 
 	case INTEL_FAM6_ATOM_SILVERMONT1:
@@ -3940,7 +3992,9 @@ __init int intel_pmu_init(void)
 		x86_pmu.extra_regs = intel_slm_extra_regs;
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.cpu_events = slm_events_attrs;
+		extra_attr = slm_format_attr;
 		pr_cont("Silvermont events, ");
+		name = "silvermont";
 		break;
 
 	case INTEL_FAM6_ATOM_GOLDMONT:
@@ -3965,7 +4019,9 @@ __init int intel_pmu_init(void)
 		x86_pmu.lbr_pt_coexist = true;
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.cpu_events = glm_events_attrs;
+		extra_attr = slm_format_attr;
 		pr_cont("Goldmont events, ");
+		name = "goldmont";
 		break;
 
 	case INTEL_FAM6_ATOM_GEMINI_LAKE:
@@ -3991,7 +4047,9 @@ __init int intel_pmu_init(void)
 		x86_pmu.cpu_events = glm_events_attrs;
 		/* Goldmont Plus has 4-wide pipeline */
 		event_attr_td_total_slots_scale_glm.event_str = "4";
+		extra_attr = slm_format_attr;
 		pr_cont("Goldmont plus events, ");
+		name = "goldmont_plus";
 		break;
 
 	case INTEL_FAM6_WESTMERE:
@@ -4020,7 +4078,9 @@ __init int intel_pmu_init(void)
 			X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
 
 		intel_pmu_pebs_data_source_nhm();
+		extra_attr = nhm_format_attr;
 		pr_cont("Westmere events, ");
+		name = "westmere";
 		break;
 
 	case INTEL_FAM6_SANDYBRIDGE:
@@ -4056,7 +4116,10 @@ __init int intel_pmu_init(void)
 		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
 			X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
 
+		extra_attr = nhm_format_attr;
+
 		pr_cont("SandyBridge events, ");
+		name = "sandybridge";
 		break;
 
 	case INTEL_FAM6_IVYBRIDGE:
@@ -4090,7 +4153,10 @@ __init int intel_pmu_init(void)
 		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
 			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
 
+		extra_attr = nhm_format_attr;
+
 		pr_cont("IvyBridge events, ");
+		name = "ivybridge";
 		break;
 
 
@@ -4118,7 +4184,10 @@ __init int intel_pmu_init(void)
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
 		x86_pmu.lbr_double_abort = true;
+		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+			hsw_format_attr : nhm_format_attr;
 		pr_cont("Haswell events, ");
+		name = "haswell";
 		break;
 
 	case INTEL_FAM6_BROADWELL_CORE:
@@ -4154,7 +4223,10 @@ __init int intel_pmu_init(void)
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
 		x86_pmu.limit_period = bdw_limit_period;
+		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+			hsw_format_attr : nhm_format_attr;
 		pr_cont("Broadwell events, ");
+		name = "broadwell";
 		break;
 
 	case INTEL_FAM6_XEON_PHI_KNL:
@@ -4172,8 +4244,9 @@ __init int intel_pmu_init(void)
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
-
+		extra_attr = slm_format_attr;
 		pr_cont("Knights Landing/Mill events, ");
+		name = "knights-landing";
 		break;
 
 	case INTEL_FAM6_SKYLAKE_MOBILE:
@@ -4203,11 +4276,14 @@ __init int intel_pmu_init(void)
 
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
-		x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
-						  skl_format_attr);
-		WARN_ON(!x86_pmu.format_attrs);
+		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+			hsw_format_attr : nhm_format_attr;
+		extra_attr = merge_attr(extra_attr, skl_format_attr);
 		x86_pmu.cpu_events = hsw_events_attrs;
+		intel_pmu_pebs_data_source_skl(
+			boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
 		pr_cont("Skylake events, ");
+		name = "skylake";
 		break;
 
 	default:
@@ -4215,6 +4291,7 @@ __init int intel_pmu_init(void)
 		case 1:
 			x86_pmu.event_constraints = intel_v1_event_constraints;
 			pr_cont("generic architected perfmon v1, ");
+			name = "generic_arch_v1";
 			break;
 		default:
 			/*
@@ -4222,10 +4299,19 @@ __init int intel_pmu_init(void)
 			 */
 			x86_pmu.event_constraints = intel_gen_event_constraints;
 			pr_cont("generic architected perfmon, ");
+			name = "generic_arch_v2+";
 			break;
 		}
 	}
 
+	snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name);
+
+	if (version >= 2 && extra_attr) {
+		x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
+						  extra_attr);
+		WARN_ON(!x86_pmu.format_attrs);
+	}
+
 	if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
 		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
 		     x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
@@ -4272,8 +4358,13 @@ __init int intel_pmu_init(void)
 			x86_pmu.lbr_nr = 0;
 	}
 
-	if (x86_pmu.lbr_nr)
+	x86_pmu.caps_attrs = intel_pmu_caps_attrs;
+
+	if (x86_pmu.lbr_nr) {
+		x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs);
 		pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
+	}
+
 	/*
 	 * Access extra MSR may cause #GP under certain circumstances.
 	 * E.g. KVM doesn't support offcore event
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index a322fed5f8ed..e1965e5ff570 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -49,34 +49,47 @@ union intel_x86_pebs_dse {
  */
 #define P(a, b) PERF_MEM_S(a, b)
 #define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define LEVEL(x) P(LVLNUM, x)
+#define REM P(REMOTE, REMOTE)
 #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
 
 /* Version for Sandy Bridge and later */
 static u64 pebs_data_source[] = {
-	P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
-	OP_LH | P(LVL, L1)  | P(SNOOP, NONE),	/* 0x01: L1 local */
-	OP_LH | P(LVL, LFB) | P(SNOOP, NONE),	/* 0x02: LFB hit */
-	OP_LH | P(LVL, L2)  | P(SNOOP, NONE),	/* 0x03: L2 hit */
-	OP_LH | P(LVL, L3)  | P(SNOOP, NONE),	/* 0x04: L3 hit */
-	OP_LH | P(LVL, L3)  | P(SNOOP, MISS),	/* 0x05: L3 hit, snoop miss */
-	OP_LH | P(LVL, L3)  | P(SNOOP, HIT),	/* 0x06: L3 hit, snoop hit */
-	OP_LH | P(LVL, L3)  | P(SNOOP, HITM),	/* 0x07: L3 hit, snoop hitm */
-	OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
-	OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
-	OP_LH | P(LVL, LOC_RAM)  | P(SNOOP, HIT),  /* 0x0a: L3 miss, shared */
-	OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
-	OP_LH | P(LVL, LOC_RAM)  | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
-	OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
-	OP_LH | P(LVL, IO)  | P(SNOOP, NONE), /* 0x0e: I/O */
-	OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
+	P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+	OP_LH | P(LVL, L1)  | LEVEL(L1) | P(SNOOP, NONE),  /* 0x01: L1 local */
+	OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, NONE),  /* 0x03: L2 hit */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, NONE),  /* 0x04: L3 hit */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, MISS),  /* 0x05: L3 hit, snoop miss */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, HIT),   /* 0x06: L3 hit, snoop hit */
+	OP_LH | P(LVL, L3)  | LEVEL(L3) | P(SNOOP, HITM),  /* 0x07: L3 hit, snoop hitm */
+	OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT),  /* 0x08: L3 miss snoop hit */
+	OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+	OP_LH | P(LVL, LOC_RAM)  | LEVEL(RAM) | P(SNOOP, HIT),       /* 0x0a: L3 miss, shared */
+	OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT),  /* 0x0b: L3 miss, shared */
+	OP_LH | P(LVL, LOC_RAM)  | LEVEL(RAM) | SNOOP_NONE_MISS,     /* 0x0c: L3 miss, excl */
+	OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */
+	OP_LH | P(LVL, IO)  | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */
+	OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
 };
 
 /* Patch up minor differences in the bits */
 void __init intel_pmu_pebs_data_source_nhm(void)
 {
-	pebs_data_source[0x05] = OP_LH | P(LVL, L3)  | P(SNOOP, HIT);
-	pebs_data_source[0x06] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
-	pebs_data_source[0x07] = OP_LH | P(LVL, L3)  | P(SNOOP, HITM);
+	pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
+	pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+	pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_skl(bool pmem)
+{
+	u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
+
+	pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+	pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
+	pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+	pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
+	pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
 }
 
 static u64 precise_store_data(u64 status)
@@ -149,8 +162,6 @@ static u64 load_latency_data(u64 status)
 {
 	union intel_x86_pebs_dse dse;
 	u64 val;
-	int model = boot_cpu_data.x86_model;
-	int fam = boot_cpu_data.x86;
 
 	dse.val = status;
 
@@ -162,8 +173,7 @@ static u64 load_latency_data(u64 status)
 	/*
 	 * Nehalem models do not support TLB, Lock infos
 	 */
-	if (fam == 0x6 && (model == 26 || model == 30
-	    || model == 31 || model == 46)) {
+	if (x86_pmu.pebs_no_tlb) {
 		val |= P(TLB, NA) | P(LOCK, NA);
 		return val;
 	}
@@ -1175,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	else
 		regs->flags &= ~PERF_EFLAGS_EXACT;
 
-	if ((sample_type & PERF_SAMPLE_ADDR) &&
+	if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
 	    x86_pmu.intel_cap.pebs_format >= 1)
 		data->addr = pebs->dla;
 
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 955457a30197..8a6bbacd17dc 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -109,6 +109,9 @@ enum {
 	X86_BR_ZERO_CALL	= 1 << 15,/* zero length call */
 	X86_BR_CALL_STACK	= 1 << 16,/* call stack */
 	X86_BR_IND_JMP		= 1 << 17,/* indirect jump */
+
+	X86_BR_TYPE_SAVE	= 1 << 18,/* indicate to save branch type */
+
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -514,6 +517,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 		cpuc->lbr_entries[i].in_tx	= 0;
 		cpuc->lbr_entries[i].abort	= 0;
 		cpuc->lbr_entries[i].cycles	= 0;
+		cpuc->lbr_entries[i].type	= 0;
 		cpuc->lbr_entries[i].reserved	= 0;
 	}
 	cpuc->lbr_stack.nr = i;
@@ -600,6 +604,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		cpuc->lbr_entries[out].in_tx	 = in_tx;
 		cpuc->lbr_entries[out].abort	 = abort;
 		cpuc->lbr_entries[out].cycles	 = cycles;
+		cpuc->lbr_entries[out].type	 = 0;
 		cpuc->lbr_entries[out].reserved	 = 0;
 		out++;
 	}
@@ -677,6 +682,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 
 	if (br_type & PERF_SAMPLE_BRANCH_CALL)
 		mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+	if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+		mask |= X86_BR_TYPE_SAVE;
+
 	/*
 	 * stash actual user request into reg, it may
 	 * be used by fixup code for some CPU
@@ -930,6 +939,43 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
 	return ret;
 }
 
+#define X86_BR_TYPE_MAP_MAX	16
+
+static int branch_map[X86_BR_TYPE_MAP_MAX] = {
+	PERF_BR_CALL,		/* X86_BR_CALL */
+	PERF_BR_RET,		/* X86_BR_RET */
+	PERF_BR_SYSCALL,	/* X86_BR_SYSCALL */
+	PERF_BR_SYSRET,		/* X86_BR_SYSRET */
+	PERF_BR_UNKNOWN,	/* X86_BR_INT */
+	PERF_BR_UNKNOWN,	/* X86_BR_IRET */
+	PERF_BR_COND,		/* X86_BR_JCC */
+	PERF_BR_UNCOND,		/* X86_BR_JMP */
+	PERF_BR_UNKNOWN,	/* X86_BR_IRQ */
+	PERF_BR_IND_CALL,	/* X86_BR_IND_CALL */
+	PERF_BR_UNKNOWN,	/* X86_BR_ABORT */
+	PERF_BR_UNKNOWN,	/* X86_BR_IN_TX */
+	PERF_BR_UNKNOWN,	/* X86_BR_NO_TX */
+	PERF_BR_CALL,		/* X86_BR_ZERO_CALL */
+	PERF_BR_UNKNOWN,	/* X86_BR_CALL_STACK */
+	PERF_BR_IND,		/* X86_BR_IND_JMP */
+};
+
+static int
+common_branch_type(int type)
+{
+	int i;
+
+	type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
+
+	if (type) {
+		i = __ffs(type);	/* lowest set bit selects the map slot */
+		if (i < X86_BR_TYPE_MAP_MAX)
+			return branch_map[i];
+	}
+
+	return PERF_BR_UNKNOWN;	/* no type bit, or none in branch_map */
+}
+
 /*
  * implement actual branch filter based on user demand.
  * Hardware may not exactly satisfy that request, thus
@@ -946,7 +992,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 	bool compress = false;
 
 	/* if sampling all branches, then nothing to filter */
-	if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
+	if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+	    ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
 		return;
 
 	for (i = 0; i < cpuc->lbr_stack.nr; i++) {
@@ -967,6 +1014,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 			cpuc->lbr_entries[i].from = 0;
 			compress = true;
 		}
+
+		if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
+			cpuc->lbr_entries[i].type = common_branch_type(type);
 	}
 
 	if (!compress)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index ae8324d65e61..81fd41d5a0d9 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -471,8 +471,9 @@ static void pt_config(struct perf_event *event)
 	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	u64 reg;
 
-	if (!event->hw.itrace_started) {
-		event->hw.itrace_started = 1;
+	/* First round: clear STATUS, in particular the PSB byte counter. */
+	if (!event->hw.config) {
+		perf_event_itrace_started(event);
 		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
 	}
 
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 476aec3a4cab..4196f81ec0e1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -91,7 +91,7 @@ struct amd_nb {
 	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
 	PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
 	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
-	PERF_SAMPLE_TRANSACTION)
+	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
 
 /*
  * A debug store configuration.
@@ -558,6 +558,7 @@ struct x86_pmu {
 	int		attr_rdpmc;
 	struct attribute **format_attrs;
 	struct attribute **event_attrs;
+	struct attribute **caps_attrs;
 
 	ssize_t		(*events_sysfs_show)(char *page, u64 config);
 	struct attribute **cpu_events;
@@ -591,7 +592,8 @@ struct x86_pmu {
 			pebs		:1,
 			pebs_active	:1,
 			pebs_broken	:1,
-			pebs_prec_dist	:1;
+			pebs_prec_dist	:1,
+			pebs_no_tlb	:1;
 	int		pebs_record_size;
 	int		pebs_buffer_size;
 	void		(*drain_pebs)(struct pt_regs *regs);
@@ -741,6 +743,8 @@ int x86_reserve_hardware(void);
 
 void x86_release_hardware(void);
 
+int x86_pmu_max_precise(void);
+
 void hw_perf_lbr_event_destroy(struct perf_event *event);
 
 int x86_setup_perfctr(struct perf_event *event);
@@ -947,6 +951,8 @@ void intel_pmu_lbr_init_knl(void);
 
 void intel_pmu_pebs_data_source_nhm(void);
 
+void intel_pmu_pebs_data_source_skl(bool pmem);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
 void intel_pt_interrupt(void);
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 724153797209..e0bb46c02857 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -226,7 +226,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 	if (ksig->ka.sa.sa_flags & SA_ONSTACK)
 		sp = sigsp(sp, ksig);
 	/* This is the legacy signal stack switching. */
-	else if ((regs->ss & 0xffff) != __USER32_DS &&
+	else if (regs->ss != __USER32_DS &&
 		!(ksig->ka.sa.sa_flags & SA_RESTORER) &&
 		 ksig->ka.sa.sa_restorer)
 		sp = (unsigned long) ksig->ka.sa.sa_restorer;
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 5a28e8e55e36..8ea315a11fe0 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -177,7 +177,7 @@
 #define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */
 #define X86_FEATURE_BPEXT	(6*32+26) /* data breakpoint extension */
 #define X86_FEATURE_PTSC	( 6*32+27) /* performance time-stamp counter */
-#define X86_FEATURE_PERFCTR_L2	( 6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_PERFCTR_LLC	( 6*32+28) /* Last Level Cache performance counter extensions */
 #define X86_FEATURE_MWAITX	( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
 
 /*
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9aeb91935ce0..bda9f94bcb10 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -126,15 +126,15 @@ do {						\
 	pr_reg[4] = regs->di;			\
 	pr_reg[5] = regs->bp;			\
 	pr_reg[6] = regs->ax;			\
-	pr_reg[7] = regs->ds & 0xffff;		\
-	pr_reg[8] = regs->es & 0xffff;		\
-	pr_reg[9] = regs->fs & 0xffff;		\
+	pr_reg[7] = regs->ds;			\
+	pr_reg[8] = regs->es;			\
+	pr_reg[9] = regs->fs;			\
 	pr_reg[11] = regs->orig_ax;		\
 	pr_reg[12] = regs->ip;			\
-	pr_reg[13] = regs->cs & 0xffff;		\
+	pr_reg[13] = regs->cs;			\
 	pr_reg[14] = regs->flags;		\
 	pr_reg[15] = regs->sp;			\
-	pr_reg[16] = regs->ss & 0xffff;		\
+	pr_reg[16] = regs->ss;			\
 } while (0);
 
 #define ELF_CORE_COPY_REGS(pr_reg, regs)	\
@@ -204,6 +204,7 @@ void set_personality_ia32(bool);
 
 #define ELF_CORE_COPY_REGS(pr_reg, regs)			\
 do {								\
+	unsigned long base;					\
 	unsigned v;						\
 	(pr_reg)[0] = (regs)->r15;				\
 	(pr_reg)[1] = (regs)->r14;				\
@@ -226,8 +227,8 @@ do {								\
 	(pr_reg)[18] = (regs)->flags;				\
 	(pr_reg)[19] = (regs)->sp;				\
 	(pr_reg)[20] = (regs)->ss;				\
-	(pr_reg)[21] = current->thread.fsbase;			\
-	(pr_reg)[22] = current->thread.gsbase;			\
+	rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base;		\
+	rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base;	\
 	asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v;	\
 	asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;	\
 	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;	\
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
deleted file mode 100644
index 73d0c9b92087..000000000000
--- a/arch/x86/include/asm/lguest.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef _ASM_X86_LGUEST_H
-#define _ASM_X86_LGUEST_H
-
-#define GDT_ENTRY_LGUEST_CS	10
-#define GDT_ENTRY_LGUEST_DS	11
-#define LGUEST_CS		(GDT_ENTRY_LGUEST_CS * 8)
-#define LGUEST_DS		(GDT_ENTRY_LGUEST_DS * 8)
-
-#ifndef __ASSEMBLY__
-#include <asm/desc.h>
-
-#define GUEST_PL 1
-
-/* Page for Switcher text itself, then two pages per cpu */
-#define SWITCHER_TEXT_PAGES (1)
-#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
-#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
-
-/* Where we map the Switcher, in both Host and Guest. */
-extern unsigned long switcher_addr;
-
-/* Found in switcher.S */
-extern unsigned long default_idt_entries[];
-
-/* Declarations for definitions in arch/x86/lguest/head_32.S */
-extern char lguest_noirq_iret[];
-extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_pushf[], lgend_pushf[];
-
-extern void lguest_iret(void);
-extern void lguest_init(void);
-
-struct lguest_regs {
-	/* Manually saved part. */
-	unsigned long eax, ebx, ecx, edx;
-	unsigned long esi, edi, ebp;
-	unsigned long gs;
-	unsigned long fs, ds, es;
-	unsigned long trapnum, errcode;
-	/* Trap pushed part */
-	unsigned long eip;
-	unsigned long cs;
-	unsigned long eflags;
-	unsigned long esp;
-	unsigned long ss;
-};
-
-/* This is a guest-specific page (mapped ro) into the guest. */
-struct lguest_ro_state {
-	/* Host information we need to restore when we switch back. */
-	u32 host_cr3;
-	struct desc_ptr host_idt_desc;
-	struct desc_ptr host_gdt_desc;
-	u32 host_sp;
-
-	/* Fields which are used when guest is running. */
-	struct desc_ptr guest_idt_desc;
-	struct desc_ptr guest_gdt_desc;
-	struct x86_hw_tss guest_tss;
-	struct desc_struct guest_idt[IDT_ENTRIES];
-	struct desc_struct guest_gdt[GDT_ENTRIES];
-};
-
-struct lg_cpu_arch {
-	/* The GDT entries copied into lguest_ro_state when running. */
-	struct desc_struct gdt[GDT_ENTRIES];
-
-	/* The IDT entries: some copied into lguest_ro_state when running. */
-	struct desc_struct idt[IDT_ENTRIES];
-
-	/* The address of the last guest-visible pagefault (ie. cr2). */
-	unsigned long last_pagefault;
-};
-
-static inline void lguest_set_ts(void)
-{
-	u32 cr0;
-
-	cr0 = read_cr0();
-	if (!(cr0 & 8))
-		write_cr0(cr0 | 8);
-}
-
-/* Full 4G segment descriptors, suitable for CS and DS. */
-#define FULL_EXEC_SEGMENT \
-	((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff))
-#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff))
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
deleted file mode 100644
index 6c119cfae218..000000000000
--- a/arch/x86/include/asm/lguest_hcall.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Architecture specific portion of the lguest hypercalls */
-#ifndef _ASM_X86_LGUEST_HCALL_H
-#define _ASM_X86_LGUEST_HCALL_H
-
-#define LHCALL_FLUSH_ASYNC	0
-#define LHCALL_LGUEST_INIT	1
-#define LHCALL_SHUTDOWN		2
-#define LHCALL_NEW_PGTABLE	4
-#define LHCALL_FLUSH_TLB	5
-#define LHCALL_LOAD_IDT_ENTRY	6
-#define LHCALL_SET_STACK	7
-#define LHCALL_SET_CLOCKEVENT	9
-#define LHCALL_HALT		10
-#define LHCALL_SET_PMD		13
-#define LHCALL_SET_PTE		14
-#define LHCALL_SET_PGD		15
-#define LHCALL_LOAD_TLS		16
-#define LHCALL_LOAD_GDT_ENTRY	18
-#define LHCALL_SEND_INTERRUPTS	19
-
-#define LGUEST_TRAP_ENTRY 0x1F
-
-/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */
-#define LGUEST_SHUTDOWN_POWEROFF	1
-#define LGUEST_SHUTDOWN_RESTART		2
-
-#ifndef __ASSEMBLY__
-#include <asm/hw_irq.h>
-
-/*G:030
- * But first, how does our Guest contact the Host to ask for privileged
- * operations?  There are two ways: the direct way is to make a "hypercall",
- * to make requests of the Host Itself.
- *
- * Our hypercall mechanism uses the highest unused trap code (traps 32 and
- * above are used by real hardware interrupts).  Seventeen hypercalls are
- * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
- * If a return value makes sense, it's returned in %eax.
- *
- * Grossly invalid calls result in Sudden Death at the hands of the vengeful
- * Host, rather than returning failure.  This reflects Winston Churchill's
- * definition of a gentleman: "someone who is only rude intentionally".
- */
-static inline unsigned long
-hcall(unsigned long call,
-      unsigned long arg1, unsigned long arg2, unsigned long arg3,
-      unsigned long arg4)
-{
-	/* "int" is the Intel instruction to trigger a trap. */
-	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
-		     /* The call in %eax (aka "a") might be overwritten */
-		     : "=a"(call)
-		       /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
-		     : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
-		       /* "memory" means this might write somewhere in memory.
-			* This isn't true for all calls, but it's safe to tell
-			* gcc that it might happen so it doesn't get clever. */
-		     : "memory");
-	return call;
-}
-/*:*/
-
-/* Can't use our min() macro here: needs to be a constant */
-#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
-
-#define LHCALL_RING_SIZE 64
-struct hcall_args {
-	/* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */
-	unsigned long arg0, arg1, arg2, arg3, arg4;
-};
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_X86_LGUEST_HCALL_H */
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index e3b7819caeef..9eb7c718aaf8 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -2,6 +2,15 @@
 #define _ASM_X86_MODULE_H
 
 #include <asm-generic/module.h>
+#include <asm/orc_types.h>
+
+struct mod_arch_specific {
+#ifdef CONFIG_ORC_UNWINDER
+	unsigned int num_orcs;
+	int *orc_unwind_ip;
+	struct orc_entry *orc_unwind;
+#endif
+};
 
 #ifdef CONFIG_X86_64
 /* X86_64 does not define MODULE_PROC_FAMILY */
diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h
new file mode 100644
index 000000000000..91c8d868424d
--- /dev/null
+++ b/arch/x86/include/asm/orc_lookup.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _ORC_LOOKUP_H
+#define _ORC_LOOKUP_H
+
+/*
+ * This is a lookup table for speeding up access to the .orc_unwind table.
+ * Given an input address offset, the corresponding lookup table entry
+ * specifies a subset of the .orc_unwind table to search.
+ *
+ * Each block represents the end of the previous range and the start of the
+ * next range.  An extra block is added to give the last range an end.
+ *
+ * The block size should be a power of 2 to avoid a costly 'div' instruction.
+ *
+ * A block size of 256 was chosen because it roughly doubles unwinder
+ * performance while only adding ~5% to the ORC data footprint.
+ */
+#define LOOKUP_BLOCK_ORDER	8
+#define LOOKUP_BLOCK_SIZE	(1 << LOOKUP_BLOCK_ORDER)
+
+#ifndef LINKER_SCRIPT
+
+extern unsigned int orc_lookup[];
+extern unsigned int orc_lookup_end[];
+
+#define LOOKUP_START_IP		(unsigned long)_stext
+#define LOOKUP_STOP_IP		(unsigned long)_etext
+
+#endif /* LINKER_SCRIPT */
+
+#endif /* _ORC_LOOKUP_H */
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index 7dc777a6cb40..9c9dc579bd7d 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -88,7 +88,7 @@ struct orc_entry {
 	unsigned	sp_reg:4;
 	unsigned	bp_reg:4;
 	unsigned	type:2;
-};
+} __packed;
 
 /*
  * This struct is used by asm and inline asm code to manually annotate the
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 0b03d655db7c..abc99b9c7ffd 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -662,7 +662,7 @@ static inline void sync_core(void)
 	 * In case NMI unmasking or performance ever becomes a problem,
 	 * the next best option appears to be MOV-to-CR2 and an
 	 * unconditional jump.  That sequence also works on all CPUs,
-	 * but it will fault at CPL3 (i.e. Xen PV and lguest).
+	 * but it will fault at CPL3 (i.e. Xen PV).
 	 *
 	 * CPUID is the conventional way, but it's nasty: it doesn't
 	 * exist on some 486-like CPUs, and it usually exits to a
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 2b5d686ea9f3..91c04c8e67fa 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -9,6 +9,20 @@
 #ifdef __i386__
 
 struct pt_regs {
+	/*
+	 * NB: 32-bit x86 CPUs are inconsistent as to what happens in the
+	 * following cases (where %seg represents a segment register):
+	 *
+	 * - pushl %seg: some do a 16-bit write and leave the high
+	 *   bits alone
+	 * - movl %seg, [mem]: some do a 16-bit write despite the movl
+	 * - IDT entry: some (e.g. 486) will leave the high bits of CS
+	 *   and (if applicable) SS undefined.
+	 *
+	 * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
+	 * so we can just treat all of the segment registers as 16-bit
+	 * values.
+	 */
 	unsigned long bx;
 	unsigned long cx;
 	unsigned long dx;
@@ -16,16 +30,22 @@ struct pt_regs {
 	unsigned long di;
 	unsigned long bp;
 	unsigned long ax;
-	unsigned long ds;
-	unsigned long es;
-	unsigned long fs;
-	unsigned long gs;
+	unsigned short ds;
+	unsigned short __dsh;
+	unsigned short es;
+	unsigned short __esh;
+	unsigned short fs;
+	unsigned short __fsh;
+	unsigned short gs;
+	unsigned short __gsh;
 	unsigned long orig_ax;
 	unsigned long ip;
-	unsigned long cs;
+	unsigned short cs;
+	unsigned short __csh;
 	unsigned long flags;
 	unsigned long sp;
-	unsigned long ss;
+	unsigned short ss;
+	unsigned short __ssh;
 };
 
 #else /* __i386__ */
@@ -176,6 +196,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs,
 	if (offset == offsetof(struct pt_regs, sp) &&
 	    regs->cs == __KERNEL_CS)
 		return kernel_stack_pointer(regs);
+
+	/* The selector fields are 16-bit. */
+	if (offset == offsetof(struct pt_regs, cs) ||
+	    offset == offsetof(struct pt_regs, ss) ||
+	    offset == offsetof(struct pt_regs, ds) ||
+	    offset == offsetof(struct pt_regs, es) ||
+	    offset == offsetof(struct pt_regs, fs) ||
+	    offset == offsetof(struct pt_regs, gs)) {
+		return *(u16 *)((unsigned long)regs + offset);
+
+	}
 #endif
 	return *(unsigned long *)((unsigned long)regs + offset);
 }
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e00e1bd6e7b3..5161da1a0fa0 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -98,6 +98,7 @@ struct thread_info {
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 #define TIF_ADDR32		29	/* 32-bit address space on 64 bits */
 #define TIF_X32			30	/* 32-bit native x86-64 binary */
+#define TIF_FSCHECK		31	/* Check FS is USER_DS on return */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -122,6 +123,7 @@ struct thread_info {
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_ADDR32		(1 << TIF_ADDR32)
 #define _TIF_X32		(1 << TIF_X32)
+#define _TIF_FSCHECK		(1 << TIF_FSCHECK)
 
 /*
  * work to do in syscall_trace_enter().  Also includes TIF_NOHZ for
@@ -137,7 +139,8 @@ struct thread_info {
 	(_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING |	\
 	 _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU |	\
 	 _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE |	\
-	 _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT)
+	 _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT |	\
+	 _TIF_FSCHECK)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 6358a85e2270..c1d2a9892352 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
 
 extern void setup_node_to_cpumask_map(void);
 
-/*
- * Returns the number of the node containing Node 'node'. This
- * architecture is flat, so it is a pretty simple function!
- */
-#define parent_node(node) (node)
-
 #define pcibus_to_node(bus) __pcibus_to_node(bus)
 
 extern int __node_distance(int, int);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 30269dafec47..184eb9894dae 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -26,7 +26,12 @@
 
 #define get_ds()	(KERNEL_DS)
 #define get_fs()	(current->thread.addr_limit)
-#define set_fs(x)	(current->thread.addr_limit = (x))
+static inline void set_fs(mm_segment_t fs)
+{
+	current->thread.addr_limit = fs;
+	/* On user-mode return, check fs is correct */
+	set_thread_flag(TIF_FSCHECK);
+}
 
 #define segment_eq(a, b)	((a).seg == (b).seg)
 
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index e6676495b125..e9f793e2df7a 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -12,11 +12,14 @@ struct unwind_state {
 	struct task_struct *task;
 	int graph_idx;
 	bool error;
-#ifdef CONFIG_FRAME_POINTER
+#if defined(CONFIG_ORC_UNWINDER)
+	bool signal, full_regs;
+	unsigned long sp, bp, ip;
+	struct pt_regs *regs;
+#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
 	bool got_irq;
-	unsigned long *bp, *orig_sp;
+	unsigned long *bp, *orig_sp, ip;
 	struct pt_regs *regs;
-	unsigned long ip;
 #else
 	unsigned long *sp;
 #endif
@@ -24,41 +27,30 @@ struct unwind_state {
 
 void __unwind_start(struct unwind_state *state, struct task_struct *task,
 		    struct pt_regs *regs, unsigned long *first_frame);
-
 bool unwind_next_frame(struct unwind_state *state);
-
 unsigned long unwind_get_return_address(struct unwind_state *state);
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);
 
 static inline bool unwind_done(struct unwind_state *state)
 {
 	return state->stack_info.type == STACK_TYPE_UNKNOWN;
 }
 
-static inline
-void unwind_start(struct unwind_state *state, struct task_struct *task,
-		  struct pt_regs *regs, unsigned long *first_frame)
-{
-	first_frame = first_frame ? : get_stack_pointer(task, regs);
-
-	__unwind_start(state, task, regs, first_frame);
-}
-
 static inline bool unwind_error(struct unwind_state *state)
 {
 	return state->error;
 }
 
-#ifdef CONFIG_FRAME_POINTER
-
 static inline
-unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+void unwind_start(struct unwind_state *state, struct task_struct *task,
+		  struct pt_regs *regs, unsigned long *first_frame)
 {
-	if (unwind_done(state))
-		return NULL;
+	first_frame = first_frame ? : get_stack_pointer(task, regs);
 
-	return state->regs ? &state->regs->ip : state->bp + 1;
+	__unwind_start(state, task, regs, first_frame);
 }
 
+#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
 static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
 	if (unwind_done(state))
@@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 
 	return state->regs;
 }
-
-#else /* !CONFIG_FRAME_POINTER */
-
-static inline
-unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+#else
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
 	return NULL;
 }
+#endif
 
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+#ifdef CONFIG_ORC_UNWINDER
+void unwind_init(void);
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+			void *orc, size_t orc_size);
+#else
+static inline void unwind_init(void) {}
+static inline
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+			void *orc, size_t orc_size) {}
+#endif
+
+/*
+ * This disables KASAN checking when reading a value from another task's stack,
+ * since the other task could be running on another CPU and could have poisoned
+ * the stack in the meantime.
+ */
+#define READ_ONCE_TASK_STACK(task, x)			\
+({							\
+	unsigned long val;				\
+	if (task == current)				\
+		val = READ_ONCE(x);			\
+	else						\
+		val = READ_ONCE_NOCHECK(x);		\
+	val;						\
+})
+
+static inline bool task_on_another_cpu(struct task_struct *task)
 {
-	return NULL;
+#ifdef CONFIG_SMP
+	return task != current && task->on_cpu;
+#else
+	return false;
+#endif
 }
 
-#endif /* CONFIG_FRAME_POINTER */
-
 #endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 5e02b11c9b86..bae46fc6b9de 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -44,10 +44,12 @@
 .endm
 
 .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0
-	.if \base == %rsp && \indirect
-		.set sp_reg, ORC_REG_SP_INDIRECT
-	.elseif \base == %rsp
-		.set sp_reg, ORC_REG_SP
+	.if \base == %rsp
+		.if \indirect
+			.set sp_reg, ORC_REG_SP_INDIRECT
+		.else
+			.set sp_reg, ORC_REG_SP
+		.endif
 	.elseif \base == %rbp
 		.set sp_reg, ORC_REG_BP
 	.elseif \base == %rdi
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index ddef37b16af2..66b8f93333d1 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -201,7 +201,7 @@ struct boot_params {
  *
  * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
  *	PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
- * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest
+ * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
  * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
  * 	which start at asm startup_xen() entry point and later jump to the C
  * 	xen_start_kernel() entry point. Both domU and dom0 type of guests are
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a01892bdd61a..287eac7d207f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -126,11 +126,9 @@ obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
 obj-$(CONFIG_SCHED_MC_PRIO)		+= itmt.o
 
-ifdef CONFIG_FRAME_POINTER
-obj-y					+= unwind_frame.o
-else
-obj-y					+= unwind_guess.o
-endif
+obj-$(CONFIG_ORC_UNWINDER)		+= unwind_orc.o
+obj-$(CONFIG_FRAME_POINTER_UNWINDER)	+= unwind_frame.o
+obj-$(CONFIG_GUESS_UNWINDER)		+= unwind_guess.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 32e14d137416..3344d3382e91 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr;
 
 int poke_int3_handler(struct pt_regs *regs)
 {
-	/* bp_patching_in_progress */
+	/*
+	 * Having observed our INT3 instruction, we now must observe
+	 * bp_patching_in_progress.
+	 *
+	 * 	in_progress = TRUE		INT3
+	 * 	WMB				RMB
+	 * 	write INT3			if (in_progress)
+	 *
+	 * The same applies to bp_int3_handler.
+	 */
 	smp_rmb();
 
 	if (likely(!bp_patching_in_progress))
@@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
 	bp_int3_addr = (u8 *)addr + sizeof(int3);
 	bp_patching_in_progress = true;
 	/*
-	 * Corresponding read barrier in int3 notifier for
-	 * making sure the in_progress flags is correctly ordered wrt.
-	 * patching
+	 * Corresponding read barrier in int3 notifier for making sure the
+	 * in_progress and handler are correctly ordered wrt. patching.
 	 */
 	smp_wmb();
 
@@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
 	text_poke(addr, opcode, sizeof(int3));
 
 	on_each_cpu(do_sync_core, NULL, 1);
-
+	/*
+	 * sync_core() implies an smp_mb() and orders this store against
+	 * the writing of the new instruction.
+	 */
 	bp_patching_in_progress = false;
-	smp_wmb();
 
 	return addr;
 }
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 880aa093268d..710edab9e644 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -4,9 +4,6 @@
 
 #include <asm/ucontext.h>
 
-#include <linux/lguest.h>
-#include "../../../drivers/lguest/lg.h"
-
 #define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
 static char syscalls[] = {
 #include <asm/syscalls_32.h>
@@ -62,23 +59,6 @@ void foo(void)
 	OFFSET(stack_canary_offset, stack_canary, canary);
 #endif
 
-#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
-	BLANK();
-	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
-	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-
-	BLANK();
-	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
-	OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
-	OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
-	OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
-	OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc);
-	OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc);
-	OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
-	OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
-	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
-	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
-#endif
 	BLANK();
 	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
 	DEFINE(NR_syscalls, sizeof(syscalls));
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3b9e220621f8..e44338dd62dd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -297,13 +297,29 @@ static int nearby_node(int apicid)
 }
 #endif
 
+#ifdef CONFIG_SMP
+/*
+ * Fix up cpu_core_id for pre-F17h systems to be in the
+ * [0 .. cores_per_node - 1] range. Not really needed but
+ * kept so as not to break existing setups.
+ */
+static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
+{
+	u32 cus_per_node;
+
+	if (c->x86 >= 0x17)
+		return;
+
+	cus_per_node = c->x86_max_cores / nodes_per_socket;
+	c->cpu_core_id %= cus_per_node;
+}
+
 /*
  * Fixup core topology information for
  * (1) AMD multi-node processors
  *     Assumption: Number of cores in each internal node is the same.
  * (2) AMD processors supporting compute units
  */
-#ifdef CONFIG_SMP
 static void amd_get_topology(struct cpuinfo_x86 *c)
 {
 	u8 node_id;
@@ -354,15 +370,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 	} else
 		return;
 
-	/* fixup multi-node processor information */
 	if (nodes_per_socket > 1) {
-		u32 cus_per_node;
-
 		set_cpu_cap(c, X86_FEATURE_AMD_DCM);
-		cus_per_node = c->x86_max_cores / nodes_per_socket;
-
-		/* core id has to be in the [0 .. cores_per_node - 1] range */
-		c->cpu_core_id %= cus_per_node;
+		legacy_fixup_core_id(c);
 	}
 }
 #endif
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c55fb2cb2acc..24f749324c0f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -811,7 +811,24 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
 	struct cacheinfo *this_leaf;
 	int i, sibling;
 
-	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+	/*
+	 * For L3, always use the pre-calculated cpu_llc_shared_mask
+	 * to derive shared_cpu_map.
+	 */
+	if (index == 3) {
+		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
+			this_cpu_ci = get_cpu_cacheinfo(i);
+			if (!this_cpu_ci->info_list)
+				continue;
+			this_leaf = this_cpu_ci->info_list + index;
+			for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
+				if (!cpu_online(sibling))
+					continue;
+				cpumask_set_cpu(sibling,
+						&this_leaf->shared_cpu_map);
+			}
+		}
+	} else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
 		unsigned int apicid, nshared, first, last;
 
 		this_leaf = this_cpu_ci->info_list + index;
@@ -839,19 +856,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
 						&this_leaf->shared_cpu_map);
 			}
 		}
-	} else if (index == 3) {
-		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
-			this_cpu_ci = get_cpu_cacheinfo(i);
-			if (!this_cpu_ci->info_list)
-				continue;
-			this_leaf = this_cpu_ci->info_list + index;
-			for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
-				if (!cpu_online(sibling))
-					continue;
-				cpumask_set_cpu(sibling,
-						&this_leaf->shared_cpu_map);
-			}
-		}
 	} else
 		return 0;
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 9e314bcf67cc..5ce1a5689162 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -201,8 +201,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
 		wrmsr(smca_config, low, high);
 	}
 
-	/* Collect bank_info using CPU 0 for now. */
-	if (cpu)
+	/* Return early if this bank was already initialized. */
+	if (smca_banks[bank].hwid)
 		return;
 
 	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
@@ -216,11 +216,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
 	for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
 		s_hwid = &smca_hwid_mcatypes[i];
 		if (hwid_mcatype == s_hwid->hwid_mcatype) {
-
-			WARN(smca_banks[bank].hwid,
-			     "Bank %s already initialized!\n",
-			     smca_get_name(s_hwid->bank_type));
-
 			smca_banks[bank].hwid = s_hwid;
 			smca_banks[bank].id = low;
 			smca_banks[bank].sysfs_id = s_hwid->count++;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 21b185793c80..c6daec4bdba5 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -400,9 +400,12 @@ static void update_cache(struct ucode_patch *new_patch)
 
 	list_for_each_entry(p, &microcode_cache, plist) {
 		if (p->equiv_cpu == new_patch->equiv_cpu) {
-			if (p->patch_id >= new_patch->patch_id)
+			if (p->patch_id >= new_patch->patch_id) {
 				/* we already have the latest patch */
+				kfree(new_patch->data);
+				kfree(new_patch);
 				return;
+			}
 
 			list_replace(&p->plist, &new_patch->plist);
 			kfree(p->data);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 59edbe9d4ccb..8f7a9bbad514 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -146,18 +146,18 @@ static bool microcode_matches(struct microcode_header_intel *mc_header,
 	return false;
 }
 
-static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
+static struct ucode_patch *memdup_patch(void *data, unsigned int size)
 {
 	struct ucode_patch *p;
 
 	p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
 	if (!p)
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 
 	p->data = kmemdup(data, size, GFP_KERNEL);
 	if (!p->data) {
 		kfree(p);
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 	}
 
 	return p;
@@ -183,8 +183,8 @@ static void save_microcode_patch(void *data, unsigned int size)
 			if (mc_hdr->rev <= mc_saved_hdr->rev)
 				continue;
 
-			p = __alloc_microcode_buf(data, size);
-			if (IS_ERR(p))
+			p = memdup_patch(data, size);
+			if (!p)
 				pr_err("Error allocating buffer %p\n", data);
 			else
 				list_replace(&iter->plist, &p->plist);
@@ -196,24 +196,25 @@ static void save_microcode_patch(void *data, unsigned int size)
 	 * newly found.
 	 */
 	if (!prev_found) {
-		p = __alloc_microcode_buf(data, size);
-		if (IS_ERR(p))
+		p = memdup_patch(data, size);
+		if (!p)
 			pr_err("Error allocating buffer for %p\n", data);
 		else
 			list_add_tail(&p->plist, &microcode_cache);
 	}
 
+	if (!p)
+		return;
+
 	/*
 	 * Save for early loading. On 32-bit, that needs to be a physical
 	 * address as the APs are running from physical addresses, before
 	 * paging has been enabled.
 	 */
-	if (p) {
-		if (IS_ENABLED(CONFIG_X86_32))
-			intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
-		else
-			intel_ucode_patch = p->data;
-	}
+	if (IS_ENABLED(CONFIG_X86_32))
+		intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+	else
+		intel_ucode_patch = p->data;
 }
 
 static int microcode_sanity_check(void *mc, int print_err)
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index bd265a4cf108..f13b4c00a5de 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -267,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
 #ifdef CONFIG_X86_32
 	if (user_mode(regs)) {
 		sp = regs->sp;
-		ss = regs->ss & 0xffff;
+		ss = regs->ss;
 	} else {
 		sp = kernel_stack_pointer(regs);
 		savesegment(ss, ss);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 1f85ee8f9439..29da9599fec0 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -155,7 +155,6 @@ ENTRY(startup_32)
 	jmp *%eax
 
 .Lbad_subarch:
-WEAK(lguest_entry)
 WEAK(xen_entry)
 	/* Unknown implementation; there's really
 	   nothing we can do at this point. */
@@ -165,7 +164,6 @@ WEAK(xen_entry)
 
 subarch_entries:
 	.long .Ldefault_entry		/* normal x86/PC */
-	.long lguest_entry		/* lguest hypervisor */
 	.long xen_entry			/* Xen hypervisor */
 	.long .Ldefault_entry		/* Moorestown MID */
 num_subarch_entries = (. - subarch_entries) / 4
@@ -457,12 +455,9 @@ early_idt_handler_common:
 	/* The vector number is in pt_regs->gs */
 
 	cld
-	pushl	%fs		/* pt_regs->fs */
-	movw	$0, 2(%esp)	/* clear high bits (some CPUs leave garbage) */
-	pushl	%es		/* pt_regs->es */
-	movw	$0, 2(%esp)	/* clear high bits (some CPUs leave garbage) */
-	pushl	%ds		/* pt_regs->ds */
-	movw	$0, 2(%esp)	/* clear high bits (some CPUs leave garbage) */
+	pushl	%fs		/* pt_regs->fs (__fsh varies by model) */
+	pushl	%es		/* pt_regs->es (__esh varies by model) */
+	pushl	%ds		/* pt_regs->ds (__dsh varies by model) */
 	pushl	%eax		/* pt_regs->ax */
 	pushl	%ebp		/* pt_regs->bp */
 	pushl	%edi		/* pt_regs->di */
@@ -479,9 +474,8 @@ early_idt_handler_common:
 	/* Load the vector number into EDX */
 	movl	PT_GS(%esp), %edx
 
-	/* Load GS into pt_regs->gs and clear high bits */
+	/* Load GS into pt_regs->gs (and maybe clobber __gsh) */
 	movw	%gs, PT_GS(%esp)
-	movw	$0, PT_GS+2(%esp)
 
 	movl	%esp, %eax	/* args are pt_regs (EAX), trapnr (EDX) */
 	call	early_fixup_exception
@@ -493,10 +487,10 @@ early_idt_handler_common:
 	popl	%edi		/* pt_regs->di */
 	popl	%ebp		/* pt_regs->bp */
 	popl	%eax		/* pt_regs->ax */
-	popl	%ds		/* pt_regs->ds */
-	popl	%es		/* pt_regs->es */
-	popl	%fs		/* pt_regs->fs */
-	popl	%gs		/* pt_regs->gs */
+	popl	%ds		/* pt_regs->ds (always ignores __dsh) */
+	popl	%es		/* pt_regs->es (always ignores __esh) */
+	popl	%fs		/* pt_regs->fs (always ignores __fsh) */
+	popl	%gs		/* pt_regs->gs (always ignores __gsh) */
 	decl	%ss:early_recursion_flag
 	addl	$4, %esp	/* pop pt_regs->orig_ax */
 	iret
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 69ea0bc1cfa3..4f98aad38237 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -39,6 +39,7 @@
 #include <asm/insn.h>
 #include <asm/debugreg.h>
 #include <asm/set_memory.h>
+#include <asm/sections.h>
 
 #include "common.h"
 
@@ -251,10 +252,12 @@ static int can_optimize(unsigned long paddr)
 
 	/*
 	 * Do not optimize in the entry code due to the unstable
-	 * stack handling.
+	 * stack handling and register setup.
 	 */
-	if ((paddr >= (unsigned long)__entry_text_start) &&
-	    (paddr <  (unsigned long)__entry_text_end))
+	if (((paddr >= (unsigned long)__entry_text_start) &&
+	     (paddr <  (unsigned long)__entry_text_end)) ||
+	    ((paddr >= (unsigned long)__irqentry_text_start) &&
+	     (paddr <  (unsigned long)__irqentry_text_end)))
 		return 0;
 
 	/* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a870910c8565..f0e64db18ac8 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -21,6 +21,25 @@
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
 
+static void refresh_ldt_segments(void)
+{
+#ifdef CONFIG_X86_64
+	unsigned short sel;
+
+	/*
+	 * Reload DS and ES when they hold LDT selectors, so that the cached
+	 * descriptors match the updated LDT.
+	 */
+	savesegment(ds, sel);
+	if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+		loadsegment(ds, sel);
+
+	savesegment(es, sel);
+	if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+		loadsegment(es, sel);
+#endif /* CONFIG_X86_64 */
+}
+
 /* context.lock is held for us, so we don't need any locking. */
 static void flush_ldt(void *__mm)
 {
@@ -32,6 +51,8 @@ static void flush_ldt(void *__mm)
 
 	pc = &mm->context;
 	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+
+	refresh_ldt_segments();
 }
 
 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f67bd3205df7..62e7d70aadd5 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -35,6 +35,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/setup.h>
+#include <asm/unwind.h>
 
 #if 0
 #define DEBUGP(fmt, ...)				\
@@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr,
 		    struct module *me)
 {
 	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
-		*para = NULL;
+		*para = NULL, *orc = NULL, *orc_ip = NULL;
 	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
 	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr,
 			locks = s;
 		if (!strcmp(".parainstructions", secstrings + s->sh_name))
 			para = s;
+		if (!strcmp(".orc_unwind", secstrings + s->sh_name))
+			orc = s;
+		if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
+			orc_ip = s;
 	}
 
 	if (alt) {
@@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr,
 	/* make jump label nops */
 	jump_label_apply_nops(me);
 
+	if (orc && orc_ip)
+		unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+				   (void *)orc->sh_addr, orc->sh_size);
+
 	return 0;
 }
 
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 446c8aa09b9b..35aafc95e4b8 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -39,26 +39,26 @@
 #include <trace/events/nmi.h>
 
 struct nmi_desc {
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	struct list_head head;
 };
 
 static struct nmi_desc nmi_desc[NMI_MAX] = 
 {
 	{
-		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
+		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
 		.head = LIST_HEAD_INIT(nmi_desc[0].head),
 	},
 	{
-		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
+		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
 		.head = LIST_HEAD_INIT(nmi_desc[1].head),
 	},
 	{
-		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
 		.head = LIST_HEAD_INIT(nmi_desc[2].head),
 	},
 	{
-		.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
 		.head = LIST_HEAD_INIT(nmi_desc[3].head),
 	},
 
@@ -163,7 +163,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
 
 	init_irq_work(&action->irq_work, nmi_max_handler);
 
-	spin_lock_irqsave(&desc->lock, flags);
+	raw_spin_lock_irqsave(&desc->lock, flags);
 
 	/*
 	 * Indicate if there are multiple registrations on the
@@ -181,7 +181,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
 	else
 		list_add_tail_rcu(&action->list, &desc->head);
 	
-	spin_unlock_irqrestore(&desc->lock, flags);
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
 }
 EXPORT_SYMBOL(__register_nmi_handler);
@@ -192,7 +192,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
 	struct nmiaction *n;
 	unsigned long flags;
 
-	spin_lock_irqsave(&desc->lock, flags);
+	raw_spin_lock_irqsave(&desc->lock, flags);
 
 	list_for_each_entry_rcu(n, &desc->head, list) {
 		/*
@@ -207,7 +207,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
 		}
 	}
 
-	spin_unlock_irqrestore(&desc->lock, flags);
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(unregister_nmi_handler);
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index 91271122f0df..502a77d0adb0 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void)
 		x86_platform.legacy.reserve_bios_regions = 1;
 		break;
 	case X86_SUBARCH_XEN:
-	case X86_SUBARCH_LGUEST:
 		x86_platform.legacy.devices.pnpbios = 0;
 		x86_platform.legacy.rtc = 0;
 		break;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c6d6dc5f8bb2..efc5eeb58292 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
 	if (user_mode(regs)) {
 		sp = regs->sp;
-		ss = regs->ss & 0xffff;
+		ss = regs->ss;
 		gs = get_user_gs(regs);
 	} else {
 		sp = kernel_stack_pointer(regs);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 2987e3991c2b..c85269a76511 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all)
 	unsigned int fsindex, gsindex;
 	unsigned int ds, cs, es;
 
-	printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
-		(void *)regs->ip);
+	printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
 	printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
 		regs->sp, regs->flags);
 	if (regs->orig_ax != -1)
@@ -149,6 +148,123 @@ void release_thread(struct task_struct *dead_task)
 	}
 }
 
+enum which_selector {
+	FS,	/* act on the FS segment */
+	GS	/* act on the GS segment */
+};
+
+/*
+ * Update the saved FS or GS base (selected by @which) of outgoing thread
+ * @prev_p from @selector alone, without reading the base MSR: a zero selector
+ * keeps the saved base, any other selector zeroes it.  Meant for CPUs without
+ * FSGSBASE; forcibly inlined because this function is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+					     unsigned short selector,
+					     enum which_selector which)
+{
+	if (likely(selector == 0)) {
+		/*
+		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
+		 * be the pre-existing saved base or it could be zero.  On AMD
+		 * (with X86_BUG_NULL_SEG), the segment base could be almost
+		 * anything.
+		 *
+		 * This branch is very hot (it's hit twice on almost every
+		 * context switch between 64-bit programs), and avoiding
+		 * the RDMSR helps a lot, so we just assume that whatever
+		 * value is already saved is correct.  This matches historical
+		 * Linux behavior, so it won't break existing applications.
+		 *
+		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+		 * report that the base is zero, it needs to actually be zero:
+		 * see the corresponding logic in load_seg_legacy.
+		 */
+	} else {
+		/*
+		 * If the selector is 1, 2, or 3, then the base is zero on
+		 * !X86_BUG_NULL_SEG CPUs and could be anything on
+		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
+		 * has never attempted to preserve the base across context
+		 * switches.
+		 *
+		 * If selector > 3, then it refers to a real segment, and
+		 * saving the base isn't necessary.
+		 */
+		if (which == FS)
+			prev_p->thread.fsbase = 0;
+		else
+			prev_p->thread.gsbase = 0;
+	}
+}
+
+static __always_inline void save_fsgs(struct task_struct *task)
+{
+	savesegment(fs, task->thread.fsindex);	/* live FS selector */
+	savesegment(gs, task->thread.gsindex);	/* live GS selector */
+	save_base_legacy(task, task->thread.fsindex, FS);
+	save_base_legacy(task, task->thread.gsindex, GS);
+}
+
+static __always_inline void loadseg(enum which_selector which,
+				    unsigned short sel)
+{
+	if (which == FS)
+		loadsegment(fs, sel);
+	else
+		load_gs_index(sel);	/* GS goes through its own helper */
+}
+
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+					    unsigned long prev_base,
+					    unsigned short next_index,
+					    unsigned long next_base,
+					    enum which_selector which)
+{
+	if (likely(next_index <= 3)) {
+		/*
+		 * The next task is using 64-bit TLS, is not using this
+		 * segment at all, or is having fun with arcane CPU features.
+		 */
+		if (next_base == 0) {
+			/*
+			 * Nasty case: on AMD CPUs, we need to forcibly zero
+			 * the base.
+			 */
+			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+				loadseg(which, __USER_DS);
+				loadseg(which, next_index);
+			} else {
+				/*
+				 * We could try to exhaustively detect cases
+				 * under which we can skip the segment load,
+				 * but there's really only one case that matters
+				 * for performance: if both the previous and
+				 * next states are fully zeroed, we can skip
+				 * the load.
+				 *
+				 * (This assumes that prev_base == 0 has no
+				 * false positives.  This is the case on
+				 * Intel-style CPUs.)
+				 */
+				if (likely(prev_index | next_index | prev_base))
+					loadseg(which, next_index);
+			}
+		} else { /* nonzero base: also program the base MSR */
+			if (prev_index != next_index)
+				loadseg(which, next_index);
+			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+			       next_base);
+		}
+	} else {
+		/*
+		 * The next task is using a real segment.  Loading the selector
+		 * is sufficient.
+		 */
+		loadseg(which, next_index);
+	}
+}
+
 int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 		unsigned long arg, struct task_struct *p, unsigned long tls)
 {
@@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 		    unsigned long new_sp,
 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
 {
+	WARN_ON_ONCE(regs != current_pt_regs());
+
+	if (static_cpu_has(X86_BUG_NULL_SEG)) {
+		/* Loading zero below won't clear the base. */
+		loadsegment(fs, __USER_DS);
+		load_gs_index(__USER_DS);
+	}
+
 	loadsegment(fs, 0);
 	loadsegment(es, _ds);
 	loadsegment(ds, _ds);
 	load_gs_index(0);
+
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
 	regs->cs		= _cs;
@@ -277,7 +402,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
-	unsigned prev_fsindex, prev_gsindex;
 
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
 		     this_cpu_read(irq_count) != -1);
@@ -289,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 *
 	 * (e.g. xen_load_tls())
 	 */
-	savesegment(fs, prev_fsindex);
-	savesegment(gs, prev_gsindex);
+	save_fsgs(prev_p);
 
 	/*
 	 * Load TLS before restoring any segments so that segment loads
@@ -329,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (unlikely(next->ds | prev->ds))
 		loadsegment(ds, next->ds);
 
-	/*
-	 * Switch FS and GS.
-	 *
-	 * These are even more complicated than DS and ES: they have
-	 * 64-bit bases are that controlled by arch_prctl.  The bases
-	 * don't necessarily match the selectors, as user code can do
-	 * any number of things to cause them to be inconsistent.
-	 *
-	 * We don't promise to preserve the bases if the selectors are
-	 * nonzero.  We also don't promise to preserve the base if the
-	 * selector is zero and the base doesn't match whatever was
-	 * most recently passed to ARCH_SET_FS/GS.  (If/when the
-	 * FSGSBASE instructions are enabled, we'll need to offer
-	 * stronger guarantees.)
-	 *
-	 * As an invariant,
-	 * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
-	 * impossible.
-	 */
-	if (next->fsindex) {
-		/* Loading a nonzero value into FS sets the index and base. */
-		loadsegment(fs, next->fsindex);
-	} else {
-		if (next->fsbase) {
-			/* Next index is zero but next base is nonzero. */
-			if (prev_fsindex)
-				loadsegment(fs, 0);
-			wrmsrl(MSR_FS_BASE, next->fsbase);
-		} else {
-			/* Next base and index are both zero. */
-			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
-				/*
-				 * We don't know the previous base and can't
-				 * find out without RDMSR.  Forcibly clear it.
-				 */
-				loadsegment(fs, __USER_DS);
-				loadsegment(fs, 0);
-			} else {
-				/*
-				 * If the previous index is zero and ARCH_SET_FS
-				 * didn't change the base, then the base is
-				 * also zero and we don't need to do anything.
-				 */
-				if (prev->fsbase || prev_fsindex)
-					loadsegment(fs, 0);
-			}
-		}
-	}
-	/*
-	 * Save the old state and preserve the invariant.
-	 * NB: if prev_fsindex == 0, then we can't reliably learn the base
-	 * without RDMSR because Intel user code can zero it without telling
-	 * us and AMD user code can program any 32-bit value without telling
-	 * us.
-	 */
-	if (prev_fsindex)
-		prev->fsbase = 0;
-	prev->fsindex = prev_fsindex;
-
-	if (next->gsindex) {
-		/* Loading a nonzero value into GS sets the index and base. */
-		load_gs_index(next->gsindex);
-	} else {
-		if (next->gsbase) {
-			/* Next index is zero but next base is nonzero. */
-			if (prev_gsindex)
-				load_gs_index(0);
-			wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
-		} else {
-			/* Next base and index are both zero. */
-			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
-				/*
-				 * We don't know the previous base and can't
-				 * find out without RDMSR.  Forcibly clear it.
-				 *
-				 * This contains a pointless SWAPGS pair.
-				 * Fixing it would involve an explicit check
-				 * for Xen or a new pvop.
-				 */
-				load_gs_index(__USER_DS);
-				load_gs_index(0);
-			} else {
-				/*
-				 * If the previous index is zero and ARCH_SET_GS
-				 * didn't change the base, then the base is
-				 * also zero and we don't need to do anything.
-				 */
-				if (prev->gsbase || prev_gsindex)
-					load_gs_index(0);
-			}
-		}
-	}
-	/*
-	 * Save the old state and preserve the invariant.
-	 * NB: if prev_gsindex == 0, then we can't reliably learn the base
-	 * without RDMSR because Intel user code can zero it without telling
-	 * us and AMD user code can program any 32-bit value without telling
-	 * us.
-	 */
-	if (prev_gsindex)
-		prev->gsbase = 0;
-	prev->gsindex = prev_gsindex;
+	load_seg_legacy(prev->fsindex, prev->fsbase,
+			next->fsindex, next->fsbase, FS);
+	load_seg_legacy(prev->gsindex, prev->gsbase,
+			next->gsindex, next->gsbase, GS);
 
 	switch_fpu_finish(next_fpu, cpu);
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3486d0498800..ecab32282f0f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -115,6 +115,7 @@
 #include <asm/microcode.h>
 #include <asm/mmu_context.h>
 #include <asm/kaslr.h>
+#include <asm/unwind.h>
 
 /*
  * max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -1310,6 +1311,8 @@ void __init setup_arch(char **cmdline_p)
 	if (efi_enabled(EFI_BOOT))
 		efi_apply_memmap_quirks();
 #endif
+
+	unwind_init();
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cc30a74e4adb..e04442345fc0 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -256,7 +256,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 			sp = current->sas_ss_sp + current->sas_ss_size;
 	} else if (IS_ENABLED(CONFIG_X86_32) &&
 		   !onsigstack &&
-		   (regs->ss & 0xffff) != __USER_DS &&
+		   regs->ss != __USER_DS &&
 		   !(ka->sa.sa_flags & SA_RESTORER) &&
 		   ka->sa.sa_restorer) {
 		/* This is the legacy signal stack switching. */
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 5f25cfbd952e..5ee663836c08 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -13,7 +13,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 	unsigned long addr, seg;
 
 	addr = regs->ip;
-	seg = regs->cs & 0xffff;
+	seg = regs->cs;
 	if (v8086_mode(regs)) {
 		addr = (addr & 0xffff) + (seg << 4);
 		return addr;
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index b9389d72b2f7..d145a0b1f529 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -10,20 +10,22 @@
 
 #define FRAME_HEADER_SIZE (sizeof(long) * 2)
 
-/*
- * This disables KASAN checking when reading a value from another task's stack,
- * since the other task could be running on another CPU and could have poisoned
- * the stack in the meantime.
- */
-#define READ_ONCE_TASK_STACK(task, x)			\
-({							\
-	unsigned long val;				\
-	if (task == current)				\
-		val = READ_ONCE(x);			\
-	else						\
-		val = READ_ONCE_NOCHECK(x);		\
-	val;						\
-})
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+	/* Report 0 once unwinding is done or if ip isn't kernel text. */
+	if (unwind_done(state) || !__kernel_text_address(state->ip))
+		return 0;
+	return state->ip;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+	/* Points at regs->ip when regs are available, otherwise at bp[1]. */
+	if (unwind_done(state))
+		return NULL;
+	return state->regs ? &state->regs->ip : state->bp + 1;
+}
 
 static void unwind_dump(struct unwind_state *state)
 {
@@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state)
 	}
 }
 
-unsigned long unwind_get_return_address(struct unwind_state *state)
-{
-	if (unwind_done(state))
-		return 0;
-
-	return __kernel_text_address(state->ip) ? state->ip : 0;
-}
-EXPORT_SYMBOL_GPL(unwind_get_return_address);
-
 static size_t regs_size(struct pt_regs *regs)
 {
 	/* x86_32 regs from kernel mode are two words shorter: */
@@ -91,10 +84,8 @@ static bool in_entry_code(unsigned long ip)
 	if (addr >= __entry_text_start && addr < __entry_text_end)
 		return true;
 
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
 	if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
 		return true;
-#endif
 
 	return false;
 }
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
index 039f36738e49..4f0e17b90463 100644
--- a/arch/x86/kernel/unwind_guess.c
+++ b/arch/x86/kernel/unwind_guess.c
@@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
 }
 EXPORT_SYMBOL_GPL(unwind_get_return_address);
 
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+	return NULL;	/* this unwinder never reports a return-address slot */
+}
+
 bool unwind_next_frame(struct unwind_state *state)
 {
 	struct stack_info *info = &state->stack_info;
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
new file mode 100644
index 000000000000..570b70d3f604
--- /dev/null
+++ b/arch/x86/kernel/unwind_orc.c
@@ -0,0 +1,582 @@
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+#include <asm/orc_types.h>
+#include <asm/orc_lookup.h>
+#include <asm/sections.h>
+
+#define orc_warn(fmt, ...) \
+	printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
+
+extern int __start_orc_unwind_ip[];
+extern int __stop_orc_unwind_ip[];
+extern struct orc_entry __start_orc_unwind[];
+extern struct orc_entry __stop_orc_unwind[];
+
+static DEFINE_MUTEX(sort_mutex);
+int *cur_orc_ip_table = __start_orc_unwind_ip;
+struct orc_entry *cur_orc_table = __start_orc_unwind;
+
+unsigned int lookup_num_blocks;
+bool orc_init;
+
+static inline unsigned long orc_ip(const int *ip)
+{
+	return (unsigned long)ip + *ip;	/* entries are self-relative offsets */
+}
+
+static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
+				    unsigned int num_entries, unsigned long ip)
+{
+	int *first = ip_table;
+	int *last = ip_table + num_entries - 1;
+	int *mid = first, *found = first;
+
+	if (!num_entries)
+		return NULL;
+
+	/*
+	 * Binary search for the rightmost entry whose address is <= ip, which
+	 * also picks the rightmost duplicate of a starting address.  Section
+	 * terminators are "weak" gap-filling entries that must lose to a real
+	 * entry; if every address is > ip, the first entry is returned.
+	 */
+	while (first <= last) {
+		mid = first + ((last - first) / 2);
+
+		if (orc_ip(mid) <= ip) {
+			found = mid;
+			first = mid + 1;
+		} else
+			last = mid - 1;
+	}
+
+	return u_table + (found - ip_table);
+}
+
+#ifdef CONFIG_MODULES
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+	struct module *mod;
+
+	mod = __module_address(ip);
+	if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip)
+		return NULL;
+	return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind,
+			  mod->arch.num_orcs, ip);
+}
+#else /* !CONFIG_MODULES */
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+	return NULL;
+}
+#endif /* CONFIG_MODULES */
+
+/* Find the ORC entry covering @ip; NULL if ORC is unusable or @ip unknown. */
+static struct orc_entry *orc_find(unsigned long ip)
+{
+	if (!orc_init)
+		return NULL;
+
+	/* For non-init vmlinux addresses, use the fast lookup table: */
+	if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
+		unsigned int idx, start, stop;
+
+		idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
+		if (unlikely((idx >= lookup_num_blocks-1))) {
+			orc_warn("bad lookup idx: idx=%u num=%u ip=%lx\n",
+				 idx, lookup_num_blocks, ip);
+			return NULL;
+		}
+
+		start = orc_lookup[idx];
+		stop = orc_lookup[idx + 1] + 1;
+
+		if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
+			     (__start_orc_unwind + stop > __stop_orc_unwind))) {
+			orc_warn("bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n",
+				 idx, lookup_num_blocks, start, stop, ip);
+			return NULL;
+		}
+
+		return __orc_find(__start_orc_unwind_ip + start,
+				  __start_orc_unwind + start, stop - start, ip);
+	}
+
+	/* vmlinux .init slow lookup: */
+	if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext)
+		return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+				  __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
+
+	/* Module lookup: */
+	return orc_module_find(ip);
+}
+
+static void orc_sort_swap(void *_a, void *_b, int size)
+{
+	struct orc_entry *orc_a, *orc_b;
+	struct orc_entry orc_tmp;
+	int *a = _a, *b = _b, tmp;
+	int delta = _b - _a;
+
+	/* Swap the self-relative .orc_unwind_ip entries, rebasing by delta: */
+	tmp = *a;
+	*a = *b + delta;
+	*b = tmp - delta;
+
+	/* Swap the corresponding .orc_unwind entries: */
+	orc_a = cur_orc_table + (a - cur_orc_ip_table);
+	orc_b = cur_orc_table + (b - cur_orc_ip_table);
+	orc_tmp = *orc_a;
+	*orc_a = *orc_b;
+	*orc_b = orc_tmp;
+}
+
+/* sort() comparator: order by address; on ties, terminators sort first. */
+static int orc_sort_cmp(const void *_a, const void *_b)
+{
+	const int *a = _a, *b = _b;
+	unsigned long addr_a = orc_ip(a), addr_b = orc_ip(b);
+	struct orc_entry *entry_a;
+
+	if (addr_a != addr_b)
+		return addr_a > addr_b ? 1 : -1;
+
+	/*
+	 * Equal addresses: a "weak" section terminator must end up on the
+	 * left so that the lookup code prefers the real entry over it.
+	 * Terminators exist to cover gaps left by whitelisted .o files
+	 * which didn't get objtool generation.
+	 */
+	entry_a = cur_orc_table + (a - cur_orc_ip_table);
+	if (entry_a->sp_reg == ORC_REG_UNDEFINED)
+		return -1;
+	return 1;
+}
+
+#ifdef CONFIG_MODULES
+/* Sort and register @mod's ORC tables; malformed tables are left unused. */
+void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
+			void *_orc, size_t orc_size)
+{
+	int *orc_ip = _orc_ip;
+	struct orc_entry *orc = _orc;
+	unsigned int num_entries = orc_ip_size / sizeof(int);
+
+	if (WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
+			 orc_size % sizeof(*orc) != 0 ||
+			 num_entries != orc_size / sizeof(*orc)))
+		return;
+
+	/*
+	 * cur_orc_* let the sort callbacks pair each ip slot with its entry.
+	 */
+	mutex_lock(&sort_mutex);
+	cur_orc_ip_table = orc_ip;
+	cur_orc_table = orc;
+	sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap);
+	mutex_unlock(&sort_mutex);
+
+	mod->arch.orc_unwind_ip = orc_ip;
+	mod->arch.orc_unwind = orc;
+	mod->arch.num_orcs = num_entries;
+}
+#endif
+
+/* Sort the built-in ORC tables, fill orc_lookup[] and enable the unwinder. */
+void __init unwind_init(void)
+{
+	size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
+	size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
+	size_t num_entries = orc_ip_size / sizeof(int);
+	struct orc_entry *orc;
+	int i;
+
+	if (!num_entries || orc_ip_size % sizeof(int) != 0 ||
+	    orc_size % sizeof(struct orc_entry) != 0 ||
+	    num_entries != orc_size / sizeof(struct orc_entry)) {
+		orc_warn("Bad or missing .orc_unwind table.  Disabling unwinder.\n");
+		return;
+	}
+
+	/* Sort the .orc_unwind and .orc_unwind_ip tables: */
+	sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp,
+	     orc_sort_swap);
+
+	/* Initialize the fast lookup table: */
+	lookup_num_blocks = orc_lookup_end - orc_lookup;
+	for (i = 0; i < lookup_num_blocks-1; i++) {
+		orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+				 num_entries,
+				 LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i));
+		if (!orc) {
+			orc_warn("Corrupt .orc_unwind table.  Disabling unwinder.\n");
+			return;
+		}
+		orc_lookup[i] = orc - __start_orc_unwind;
+	}
+
+	/* Initialize the ending block: */
+	orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries,
+			 LOOKUP_STOP_IP);
+	if (!orc) {
+		orc_warn("Corrupt .orc_unwind table.  Disabling unwinder.\n");
+		return;
+	}
+	orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind;
+
+	orc_init = true;
+}
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+	/* Report 0 once unwinding is done or if ip isn't kernel text. */
+	if (unwind_done(state) || !__kernel_text_address(state->ip))
+		return 0;
+	return state->ip;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+/* Pointer to regs->ip, else to the word below sp; NULL if done/unknown. */
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+	unsigned long *slot = NULL;
+
+	if (unwind_done(state))
+		return NULL;
+	if (state->regs)
+		slot = &state->regs->ip;
+	else if (state->sp)
+		slot = (unsigned long *)state->sp - 1;
+	return slot;
+}
+
+/* True if on_stack() accepts @addr/@len, after hopping stacks as needed. */
+static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
+			    size_t len)
+{
+	struct stack_info *info = &state->stack_info;
+
+	/*
+	 * While the address isn't on the current stack, advance to the next
+	 * one.  Several hops may be needed: info->next_sp can point to an
+	 * empty stack while the address lives on a subsequent stack.
+	 */
+	for (;;) {
+		if (on_stack(info, (void *)addr, len))
+			return true;
+		if (get_stack_info(info->next_sp, state->task, info,
+				   &state->stack_mask))
+			return false;
+	}
+}
+
+static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
+			    unsigned long *val)
+{
+	/* Only read the word at @addr after stack_access_ok() accepts it. */
+	if (!stack_access_ok(state, addr, sizeof(long)))
+		return false;
+	*val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr);
+	return true;
+}
+
+#define REGS_SIZE (sizeof(struct pt_regs))
+#define SP_OFFSET (offsetof(struct pt_regs, sp))
+#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
+#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
+
+static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
+			     unsigned long *ip, unsigned long *sp, bool full)
+{
+	size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
+	size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
+	struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
+
+	if (IS_ENABLED(CONFIG_X86_64)) {
+		if (!stack_access_ok(state, addr, regs_size))
+			return false;
+
+		*ip = regs->ip;
+		*sp = regs->sp;
+
+		return true;
+	}
+	/* !X86_64: validate up to sp first; sp itself is handled per mode. */
+	if (!stack_access_ok(state, addr, sp_offset))
+		return false;
+
+	*ip = regs->ip;
+
+	if (user_mode(regs)) {
+		if (!stack_access_ok(state, addr + sp_offset,
+				     REGS_SIZE - SP_OFFSET))
+			return false;
+
+		*sp = regs->sp;
+	} else
+		*sp = (unsigned long)&regs->sp;
+
+	return true;
+}
+
+/* Unwind @state by one frame using ORC data; false at end or on error. */
+bool unwind_next_frame(struct unwind_state *state)
+{
+	unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
+	enum stack_type prev_type = state->stack_info.type;
+	struct orc_entry *orc;
+	struct pt_regs *ptregs;
+	bool indirect = false;
+
+	if (unwind_done(state))
+		return false;
+
+	/* Don't let modules unload while we're reading their ORC data. */
+	preempt_disable();
+
+	/* Have we reached the end? */
+	if (state->regs && user_mode(state->regs))
+		goto done;
+
+	/*
+	 * Find the orc_entry associated with the text address.
+	 *
+	 * Decrement call return addresses by one so they work for sibling
+	 * calls and calls to noreturn functions.
+	 */
+	orc = orc_find(state->signal ? state->ip : state->ip - 1);
+	if (!orc || orc->sp_reg == ORC_REG_UNDEFINED)
+		goto done;
+	orig_ip = state->ip;
+
+	/* Find the previous frame's stack: */
+	switch (orc->sp_reg) {
+	case ORC_REG_SP:
+		sp = state->sp + orc->sp_offset;
+		break;
+
+	case ORC_REG_BP:
+		sp = state->bp + orc->sp_offset;
+		break;
+
+	case ORC_REG_SP_INDIRECT:
+		sp = state->sp + orc->sp_offset;
+		indirect = true;
+		break;
+
+	case ORC_REG_BP_INDIRECT:
+		sp = state->bp + orc->sp_offset;
+		indirect = true;
+		break;
+
+	case ORC_REG_R10:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg R10 at ip %p\n",
+				 (void *)state->ip);
+			goto done;
+		}
+		sp = state->regs->r10;
+		break;
+
+	case ORC_REG_R13:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg R13 at ip %p\n",
+				 (void *)state->ip);
+			goto done;
+		}
+		sp = state->regs->r13;
+		break;
+
+	case ORC_REG_DI:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg DI at ip %p\n",
+				 (void *)state->ip);
+			goto done;
+		}
+		sp = state->regs->di;
+		break;
+
+	case ORC_REG_DX:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg DX at ip %p\n",
+				 (void *)state->ip);
+			goto done;
+		}
+		sp = state->regs->dx;
+		break;
+
+	default:
+		orc_warn("unknown SP base reg %d for ip %p\n",
+			 orc->sp_reg, (void *)state->ip);
+		goto done;
+	}
+
+	if (indirect && !deref_stack_reg(state, sp, &sp))
+		goto done;
+
+	/* Find IP, SP and possibly regs: */
+	switch (orc->type) {
+	case ORC_TYPE_CALL:
+		ip_p = sp - sizeof(long);
+
+		if (!deref_stack_reg(state, ip_p, &state->ip))
+			goto done;
+
+		state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+						  state->ip, (void *)ip_p);
+
+		state->sp = sp;
+		state->regs = NULL;
+		state->signal = false;
+		break;
+
+	case ORC_TYPE_REGS:
+		if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
+			orc_warn("can't dereference registers at %p for ip %p\n",
+				 (void *)sp, (void *)orig_ip);
+			goto done;
+		}
+
+		state->regs = (struct pt_regs *)sp;
+		state->full_regs = true;
+		state->signal = true;
+		break;
+
+	case ORC_TYPE_REGS_IRET:
+		if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
+			orc_warn("can't dereference iret registers at %p for ip %p\n",
+				 (void *)sp, (void *)orig_ip);
+			goto done;
+		}
+
+		ptregs = container_of((void *)sp, struct pt_regs, ip);
+		if ((unsigned long)ptregs >= prev_sp &&
+		    on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
+			state->regs = ptregs;
+			state->full_regs = false;
+		} else
+			state->regs = NULL;
+
+		state->signal = true;
+		break;
+
+	default:
+		/* ip/sp were not advanced: stop rather than reuse stale state */
+		orc_warn("unknown .orc_unwind entry type %d\n", orc->type);
+		goto done;
+	}
+
+	/* Find BP: */
+	switch (orc->bp_reg) {
+	case ORC_REG_UNDEFINED:
+		if (state->regs && state->full_regs)
+			state->bp = state->regs->bp;
+		break;
+
+	case ORC_REG_PREV_SP:
+		if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
+			goto done;
+		break;
+
+	case ORC_REG_BP:
+		if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
+			goto done;
+		break;
+
+	default:
+		orc_warn("unknown BP base reg %d for ip %p\n",
+			 orc->bp_reg, (void *)orig_ip);
+		goto done;
+	}
+
+	/* Prevent a recursive loop due to bad ORC data: */
+	if (state->stack_info.type == prev_type &&
+	    on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
+	    state->sp <= prev_sp) {
+		orc_warn("stack going in the wrong direction? ip=%p\n",
+			 (void *)orig_ip);
+		goto done;
+	}
+
+	preempt_enable();
+	return true;
+
+done:
+	preempt_enable();
+	state->stack_info.type = STACK_TYPE_UNKNOWN;
+	return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+		    struct pt_regs *regs, unsigned long *first_frame)
+{
+	memset(state, 0, sizeof(*state));
+	state->task = task;
+
+	/*
+	 * Refuse to unwind the stack of a task while it's executing on another
+	 * CPU.  This check is racy, but that's ok: the unwinder has other
+	 * checks to prevent it from going off the rails.
+	 */
+	if (task_on_another_cpu(task))
+		goto done;
+
+	if (regs) {	/* caller supplied a register snapshot: seed from it */
+		if (user_mode(regs))
+			goto done;
+
+		state->ip = regs->ip;
+		state->sp = kernel_stack_pointer(regs);
+		state->bp = regs->bp;
+		state->regs = regs;
+		state->full_regs = true;
+		state->signal = true;
+
+	} else if (task == current) {	/* unwinding ourselves: sample ip/sp/bp */
+		asm volatile("lea (%%rip), %0\n\t"
+			     "mov %%rsp, %1\n\t"
+			     "mov %%rbp, %2\n\t"
+			     : "=r" (state->ip), "=r" (state->sp),
+			       "=r" (state->bp));
+
+	} else {	/* other task: bp and ret_addr are read from thread.sp */
+		struct inactive_task_frame *frame = (void *)task->thread.sp;
+
+		state->sp = task->thread.sp;
+		state->bp = READ_ONCE_NOCHECK(frame->bp);
+		state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+	}
+
+	if (get_stack_info((unsigned long *)state->sp, state->task,
+			   &state->stack_info, &state->stack_mask))
+		return;
+
+	/*
+	 * The caller can provide the address of the first frame directly
+	 * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+	 * to start unwinding at.  Skip ahead until we reach it.
+	 */
+
+	/* When starting from regs, skip the regs frame: */
+	if (regs) {
+		unwind_next_frame(state);
+		return;
+	}
+
+	/* Otherwise, skip ahead to the user-specified starting frame: */
+	while (!unwind_done(state) &&
+	       (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+			state->sp <= (unsigned long)first_frame))
+		unwind_next_frame(state);
+
+	return;
+
+done:
+	state->stack_info.type = STACK_TYPE_UNKNOWN;
+	return;
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index c8a3b61be0aa..f05f00acac89 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -24,6 +24,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/page_types.h>
+#include <asm/orc_lookup.h>
 #include <asm/cache.h>
 #include <asm/boot.h>
 
@@ -148,6 +149,8 @@ SECTIONS
 
 	BUG_TABLE
 
+	ORC_UNWIND_TABLE
+
 	. = ALIGN(PAGE_SIZE);
 	__vvar_page = .;
 
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2688c7dc5323..3ea624452f93 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -89,6 +89,5 @@ config KVM_MMU_AUDIT
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
deleted file mode 100644
index 08f41caada45..000000000000
--- a/arch/x86/lguest/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-config LGUEST_GUEST
-	bool "Lguest guest support"
-	depends on X86_32 && PARAVIRT && PCI
-	select TTY
-	select VIRTUALIZATION
-	select VIRTIO
-	select VIRTIO_CONSOLE
-	help
-	  Lguest is a tiny in-kernel hypervisor.  Selecting this will
-	  allow your kernel to boot under lguest.  This option will increase
-	  your kernel size by about 10k.  If in doubt, say N.
-
-	  If you say Y here, make sure you say Y (or M) to the virtio block
-	  and net drivers which lguest needs.
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
deleted file mode 100644
index 8f38d577a2fa..000000000000
--- a/arch/x86/lguest/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-obj-y		:= head_32.o boot.o
-CFLAGS_boot.o	:= $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
deleted file mode 100644
index 99472698c931..000000000000
--- a/arch/x86/lguest/boot.c
+++ /dev/null
@@ -1,1558 +0,0 @@
-/*P:010
- * A hypervisor allows multiple Operating Systems to run on a single machine.
- * To quote David Wheeler: "Any problem in computer science can be solved with
- * another layer of indirection."
- *
- * We keep things simple in two ways.  First, we start with a normal Linux
- * kernel and insert a module (lg.ko) which allows us to run other Linux
- * kernels the same way we'd run processes.  We call the first kernel the Host,
- * and the others the Guests.  The program which sets up and configures Guests
- * (such as the example in tools/lguest/lguest.c) is called the Launcher.
- *
- * Secondly, we only run specially modified Guests, not normal kernels: setting
- * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows
- * how to be a Guest at boot time.  This means that you can use the same kernel
- * you boot normally (ie. as a Host) as a Guest.
- *
- * These Guests know that they cannot do privileged operations, such as disable
- * interrupts, and that they have to ask the Host to do such things explicitly.
- * This file consists of all the replacements for such low-level native
- * hardware operations: these special Guest versions call the Host.
- *
- * So how does the kernel know it's a Guest?  We'll see that later, but let's
- * just say that we end up here where we replace the native functions various
- * "paravirt" structures with our Guest versions, then boot like normal.
-:*/
-
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio_console.h>
-#include <linux/pm.h>
-#include <linux/export.h>
-#include <linux/pci.h>
-#include <linux/virtio_pci.h>
-#include <asm/acpi.h>
-#include <asm/apic.h>
-#include <asm/lguest.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/e820/api.h>
-#include <asm/mce.h>
-#include <asm/io.h>
-#include <asm/fpu/api.h>
-#include <asm/stackprotector.h>
-#include <asm/reboot.h>		/* for struct machine_ops */
-#include <asm/kvm_para.h>
-#include <asm/pci_x86.h>
-#include <asm/pci-direct.h>
-
-/*G:010
- * Welcome to the Guest!
- *
- * The Guest in our tale is a simple creature: identical to the Host but
- * behaving in simplified but equivalent ways.  In particular, the Guest is the
- * same kernel as the Host (or at least, built from the same source code).
-:*/
-
-struct lguest_data lguest_data = {
-	.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
-	.noirq_iret = (u32)lguest_noirq_iret,
-	.kernel_address = PAGE_OFFSET,
-	.blocked_interrupts = { 1 }, /* Block timer interrupts */
-	.syscall_vec = IA32_SYSCALL_VECTOR,
-};
-
-/*G:037
- * async_hcall() is pretty simple: I'm quite proud of it really.  We have a
- * ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall.  Each entry in the ring has 5 slots for the hypercall
- * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
- * and 255 once the Host has finished with it.
- *
- * If we come around to a slot which hasn't been finished, then the table is
- * full and we just make the hypercall directly.  This has the nice side
- * effect of causing the Host to run all the stored calls in the ring buffer
- * which empties it for next time!
- */
-static void async_hcall(unsigned long call, unsigned long arg1,
-			unsigned long arg2, unsigned long arg3,
-			unsigned long arg4)
-{
-	/* Note: This code assumes we're uniprocessor. */
-	static unsigned int next_call;
-	unsigned long flags;
-
-	/*
-	 * Disable interrupts if not already disabled: we don't want an
-	 * interrupt handler making a hypercall while we're already doing
-	 * one!
-	 */
-	local_irq_save(flags);
-	if (lguest_data.hcall_status[next_call] != 0xFF) {
-		/* Table full, so do normal hcall which will flush table. */
-		hcall(call, arg1, arg2, arg3, arg4);
-	} else {
-		lguest_data.hcalls[next_call].arg0 = call;
-		lguest_data.hcalls[next_call].arg1 = arg1;
-		lguest_data.hcalls[next_call].arg2 = arg2;
-		lguest_data.hcalls[next_call].arg3 = arg3;
-		lguest_data.hcalls[next_call].arg4 = arg4;
-		/* Arguments must all be written before we mark it to go */
-		wmb();
-		lguest_data.hcall_status[next_call] = 0;
-		if (++next_call == LHCALL_RING_SIZE)
-			next_call = 0;
-	}
-	local_irq_restore(flags);
-}
-
-/*G:035
- * Notice the lazy_hcall() above, rather than hcall().  This is our first real
- * optimization trick!
- *
- * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
- * them as a batch when lazy_mode is eventually turned off.  Because hypercalls
- * are reasonably expensive, batching them up makes sense.  For example, a
- * large munmap might update dozens of page table entries: that code calls
- * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
- * lguest_leave_lazy_mode().
- *
- * So, when we're in lazy mode, we call async_hcall() to store the call for
- * future processing:
- */
-static void lazy_hcall1(unsigned long call, unsigned long arg1)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, 0, 0, 0);
-	else
-		async_hcall(call, arg1, 0, 0, 0);
-}
-
-/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
-static void lazy_hcall2(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, 0, 0);
-	else
-		async_hcall(call, arg1, arg2, 0, 0);
-}
-
-static void lazy_hcall3(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2,
-			unsigned long arg3)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, arg3, 0);
-	else
-		async_hcall(call, arg1, arg2, arg3, 0);
-}
-
-#ifdef CONFIG_X86_PAE
-static void lazy_hcall4(unsigned long call,
-			unsigned long arg1,
-			unsigned long arg2,
-			unsigned long arg3,
-			unsigned long arg4)
-{
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
-		hcall(call, arg1, arg2, arg3, arg4);
-	else
-		async_hcall(call, arg1, arg2, arg3, arg4);
-}
-#endif
-
-/*G:036
- * When lazy mode is turned off, we issue the do-nothing hypercall to
- * flush any stored calls, and call the generic helper to reset the
- * per-cpu lazy mode variable.
- */
-static void lguest_leave_lazy_mmu_mode(void)
-{
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
-	paravirt_leave_lazy_mmu();
-}
-
-/*
- * We also catch the end of context switch; we enter lazy mode for much of
- * that too, so again we need to flush here.
- *
- * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
- * mode, but unlike Xen, lguest doesn't care about the difference).
- */
-static void lguest_end_context_switch(struct task_struct *next)
-{
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
-	paravirt_end_context_switch(next);
-}
-
-/*G:032
- * After that diversion we return to our first native-instruction
- * replacements: four functions for interrupt control.
- *
- * The simplest way of implementing these would be to have "turn interrupts
- * off" and "turn interrupts on" hypercalls.  Unfortunately, this is too slow:
- * these are by far the most commonly called functions of those we override.
- *
- * So instead we keep an "irq_enabled" field inside our "struct lguest_data",
- * which the Guest can update with a single instruction.  The Host knows to
- * check there before it tries to deliver an interrupt.
- */
-
-/*
- * save_flags() is expected to return the processor state (ie. "flags").  The
- * flags word contains all kind of stuff, but in practice Linux only cares
- * about the interrupt flag.  Our "save_flags()" just returns that.
- */
-asmlinkage __visible unsigned long lguest_save_fl(void)
-{
-	return lguest_data.irq_enabled;
-}
-
-/* Interrupts go off... */
-asmlinkage __visible void lguest_irq_disable(void)
-{
-	lguest_data.irq_enabled = 0;
-}
-
-/*
- * Let's pause a moment.  Remember how I said these are called so often?
- * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
- * break some rules.  In particular, these functions are assumed to save their
- * own registers if they need to: normal C functions assume they can trash the
- * eax register.  To use normal C functions, we use
- * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
- * C function, then restores it.
- */
-PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
-PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
-/*:*/
-
-/* These are in head_32.S */
-extern void lg_irq_enable(void);
-extern void lg_restore_fl(unsigned long flags);
-
-/*M:003
- * We could be more efficient in our checking of outstanding interrupts, rather
- * than using a branch.  One way would be to put the "irq_enabled" field in a
- * page by itself, and have the Host write-protect it when an interrupt comes
- * in when irqs are disabled.  There will then be a page fault as soon as
- * interrupts are re-enabled.
- *
- * A better method is to implement soft interrupt disable generally for x86:
- * instead of disabling interrupts, we set a flag.  If an interrupt does come
- * in, we then disable them for real.  This is uncommon, so we could simply use
- * a hypercall for interrupt control and not worry about efficiency.
-:*/
-
-/*G:034
- * The Interrupt Descriptor Table (IDT).
- *
- * The IDT tells the processor what to do when an interrupt comes in.  Each
- * entry in the table is a 64-bit descriptor: this holds the privilege level,
- * address of the handler, and... well, who cares?  The Guest just asks the
- * Host to make the change anyway, because the Host controls the real IDT.
- */
-static void lguest_write_idt_entry(gate_desc *dt,
-				   int entrynum, const gate_desc *g)
-{
-	/*
-	 * The gate_desc structure is 8 bytes long: we hand it to the Host in
-	 * two 32-bit chunks.  The whole 32-bit kernel used to hand descriptors
-	 * around like this; typesafety wasn't a big concern in Linux's early
-	 * years.
-	 */
-	u32 *desc = (u32 *)g;
-	/* Keep the local copy up to date. */
-	native_write_idt_entry(dt, entrynum, g);
-	/* Tell Host about this new entry. */
-	hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
-}
-
-/*
- * Changing to a different IDT is very rare: we keep the IDT up-to-date every
- * time it is written, so we can simply loop through all entries and tell the
- * Host about them.
- */
-static void lguest_load_idt(const struct desc_ptr *desc)
-{
-	unsigned int i;
-	struct desc_struct *idt = (void *)desc->address;
-
-	for (i = 0; i < (desc->size+1)/8; i++)
-		hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
-}
-
-/*
- * The Global Descriptor Table.
- *
- * The Intel architecture defines another table, called the Global Descriptor
- * Table (GDT).  You tell the CPU where it is (and its size) using the "lgdt"
- * instruction, and then several other instructions refer to entries in the
- * table.  There are three entries which the Switcher needs, so the Host simply
- * controls the entire thing and the Guest asks it to make changes using the
- * LOAD_GDT hypercall.
- *
- * This is the exactly like the IDT code.
- */
-static void lguest_load_gdt(const struct desc_ptr *desc)
-{
-	unsigned int i;
-	struct desc_struct *gdt = (void *)desc->address;
-
-	for (i = 0; i < (desc->size+1)/8; i++)
-		hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
-}
-
-/*
- * For a single GDT entry which changes, we simply change our copy and
- * then tell the host about it.
- */
-static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
-				   const void *desc, int type)
-{
-	native_write_gdt_entry(dt, entrynum, desc, type);
-	/* Tell Host about this new entry. */
-	hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
-	      dt[entrynum].a, dt[entrynum].b, 0);
-}
-
-/*
- * There are three "thread local storage" GDT entries which change
- * on every context switch (these three entries are how glibc implements
- * __thread variables).  As an optimization, we have a hypercall
- * specifically for this case.
- *
- * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall
- * which took a range of entries?
- */
-static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-	/*
-	 * There's one problem which normal hardware doesn't have: the Host
-	 * can't handle us removing entries we're currently using.  So we clear
-	 * the GS register here: if it's needed it'll be reloaded anyway.
-	 */
-	lazy_load_gs(0);
-	lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu);
-}
-
-/*G:038
- * That's enough excitement for now, back to ploughing through each of the
- * different pv_ops structures (we're about 1/3 of the way through).
- *
- * This is the Local Descriptor Table, another weird Intel thingy.  Linux only
- * uses this for some strange applications like Wine.  We don't do anything
- * here, so they'll get an informative and friendly Segmentation Fault.
- */
-static void lguest_set_ldt(const void *addr, unsigned entries)
-{
-}
-
-/*
- * This loads a GDT entry into the "Task Register": that entry points to a
- * structure called the Task State Segment.  Some comments scattered though the
- * kernel code indicate that this used for task switching in ages past, along
- * with blood sacrifice and astrology.
- *
- * Now there's nothing interesting in here that we don't get told elsewhere.
- * But the native version uses the "ltr" instruction, which makes the Host
- * complain to the Guest about a Segmentation Fault and it'll oops.  So we
- * override the native version with a do-nothing version.
- */
-static void lguest_load_tr_desc(void)
-{
-}
-
-/*
- * The "cpuid" instruction is a way of querying both the CPU identity
- * (manufacturer, model, etc) and its features.  It was introduced before the
- * Pentium in 1993 and keeps getting extended by both Intel, AMD and others.
- * As you might imagine, after a decade and a half this treatment, it is now a
- * giant ball of hair.  Its entry in the current Intel manual runs to 28 pages.
- *
- * This instruction even it has its own Wikipedia entry.  The Wikipedia entry
- * has been translated into 6 languages.  I am not making this up!
- *
- * We could get funky here and identify ourselves as "GenuineLguest", but
- * instead we just use the real "cpuid" instruction.  Then I pretty much turned
- * off feature bits until the Guest booted.  (Don't say that: you'll damage
- * lguest sales!)  Shut up, inner voice!  (Hey, just pointing out that this is
- * hardly future proof.)  No one's listening!  They don't like you anyway,
- * parenthetic weirdo!
- *
- * Replacing the cpuid so we can turn features off is great for the kernel, but
- * anyone (including userspace) can just use the raw "cpuid" instruction and
- * the Host won't even notice since it isn't privileged.  So we try not to get
- * too worked up about it.
- */
-static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
-			 unsigned int *cx, unsigned int *dx)
-{
-	int function = *ax;
-
-	native_cpuid(ax, bx, cx, dx);
-	switch (function) {
-	/*
-	 * CPUID 0 gives the highest legal CPUID number (and the ID string).
-	 * We futureproof our code a little by sticking to known CPUID values.
-	 */
-	case 0:
-		if (*ax > 5)
-			*ax = 5;
-		break;
-
-	/*
-	 * CPUID 1 is a basic feature request.
-	 *
-	 * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3
-	 * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE.
-	 */
-	case 1:
-		*cx &= 0x00002201;
-		*dx &= 0x07808151;
-		/*
-		 * The Host can do a nice optimization if it knows that the
-		 * kernel mappings (addresses above 0xC0000000 or whatever
-		 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
-		 * flush_tlb_user() for both user and kernel mappings unless
-		 * the Page Global Enable (PGE) feature bit is set.
-		 */
-		*dx |= 0x00002000;
-		/*
-		 * We also lie, and say we're family id 5.  6 or greater
-		 * leads to a rdmsr in early_init_intel which we can't handle.
-		 * Family ID is returned as bits 8-12 in ax.
-		 */
-		*ax &= 0xFFFFF0FF;
-		*ax |= 0x00000500;
-		break;
-
-	/*
-	 * This is used to detect if we're running under KVM.  We might be,
-	 * but that's a Host matter, not us.  So say we're not.
-	 */
-	case KVM_CPUID_SIGNATURE:
-		*bx = *cx = *dx = 0;
-		break;
-
-	/*
-	 * 0x80000000 returns the highest Extended Function, so we futureproof
-	 * like we do above by limiting it to known fields.
-	 */
-	case 0x80000000:
-		if (*ax > 0x80000008)
-			*ax = 0x80000008;
-		break;
-
-	/*
-	 * PAE systems can mark pages as non-executable.  Linux calls this the
-	 * NX bit.  Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
-	 * Virus Protection).  We just switch it off here, since we don't
-	 * support it.
-	 */
-	case 0x80000001:
-		*dx &= ~(1 << 20);
-		break;
-	}
-}
-
-/*
- * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
- * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
- * it.  The Host needs to know when the Guest wants to change them, so we have
- * a whole series of functions like read_cr0() and write_cr0().
- *
- * We start with cr0.  cr0 allows you to turn on and off all kinds of basic
- * features, but the only cr0 bit that Linux ever used at runtime was the
- * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8)
- *
- * What does the TS bit do?  Well, it causes the CPU to trap (interrupt 7) if
- * the floating point unit is used.  Which allows us to restore FPU state
- * lazily after a task switch if we wanted to, but wouldn't a name like
- * "FPUTRAP bit" be a little less cryptic?
- *
- * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore
- * cr0.
- */
-static void lguest_write_cr0(unsigned long val)
-{
-}
-
-static unsigned long lguest_read_cr0(void)
-{
-	return 0;
-}
-
-/*
- * cr2 is the virtual address of the last page fault, which the Guest only ever
- * reads.  The Host kindly writes this into our "struct lguest_data", so we
- * just read it out of there.
- */
-static unsigned long lguest_read_cr2(void)
-{
-	return lguest_data.cr2;
-}
-
-/* See lguest_set_pte() below. */
-static bool cr3_changed = false;
-static unsigned long current_cr3;
-
-/*
- * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0.  Keep a local copy, and tell the Host when it changes.
- */
-static void lguest_write_cr3(unsigned long cr3)
-{
-	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
-	current_cr3 = cr3;
-
-	/* These two page tables are simple, linear, and used during boot */
-	if (cr3 != __pa_symbol(swapper_pg_dir) &&
-	    cr3 != __pa_symbol(initial_page_table))
-		cr3_changed = true;
-}
-
-static unsigned long lguest_read_cr3(void)
-{
-	return current_cr3;
-}
-
-/* cr4 is used to enable and disable PGE, but we don't care. */
-static unsigned long lguest_read_cr4(void)
-{
-	return 0;
-}
-
-static void lguest_write_cr4(unsigned long val)
-{
-}
-
-/*
- * Page Table Handling.
- *
- * Now would be a good time to take a rest and grab a coffee or similarly
- * relaxing stimulant.  The easy parts are behind us, and the trek gradually
- * winds uphill from here.
- *
- * Quick refresher: memory is divided into "pages" of 4096 bytes each.  The CPU
- * maps virtual addresses to physical addresses using "page tables".  We could
- * use one huge index of 1 million entries: each address is 4 bytes, so that's
- * 1024 pages just to hold the page tables.   But since most virtual addresses
- * are unused, we use a two level index which saves space.  The cr3 register
- * contains the physical address of the top level "page directory" page, which
- * contains physical addresses of up to 1024 second-level pages.  Each of these
- * second level pages contains up to 1024 physical addresses of actual pages,
- * or Page Table Entries (PTEs).
- *
- * Here's a diagram, where arrows indicate physical addresses:
- *
- * cr3 ---> +---------+
- *	    |  	   --------->+---------+
- *	    |	      |	     | PADDR1  |
- *	  Mid-level   |	     | PADDR2  |
- *	  (PMD) page  |	     | 	       |
- *	    |	      |	   Lower-level |
- *	    |	      |	   (PTE) page  |
- *	    |	      |	     |	       |
- *	      ....    	     	 ....
- *
- * So to convert a virtual address to a physical address, we look up the top
- * level, which points us to the second level, which gives us the physical
- * address of that page.  If the top level entry was not present, or the second
- * level entry was not present, then the virtual address is invalid (we
- * say "the page was not mapped").
- *
- * Put another way, a 32-bit virtual address is divided up like so:
- *
- *  1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
- *    Index into top     Index into second      Offset within page
- *  page directory page    pagetable page
- *
- * Now, unfortunately, this isn't the whole story: Intel added Physical Address
- * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
- * These are held in 64-bit page table entries, so we can now only fit 512
- * entries in a page, and the neat three-level tree breaks down.
- *
- * The result is a four level page table:
- *
- * cr3 --> [ 4 Upper  ]
- *	   [   Level  ]
- *	   [  Entries ]
- *	   [(PUD Page)]---> +---------+
- *	 		    |  	   --------->+---------+
- *	 		    |	      |	     | PADDR1  |
- *	 		  Mid-level   |	     | PADDR2  |
- *	 		  (PMD) page  |	     | 	       |
- *	 		    |	      |	   Lower-level |
- *	 		    |	      |	   (PTE) page  |
- *	 		    |	      |	     |	       |
- *	 		      ....    	     	 ....
- *
- *
- * And the virtual address is decoded as:
- *
- *         1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
- *      |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
- * Index into    Index into mid    Index into lower    Offset within page
- * top entries   directory page     pagetable page
- *
- * It's too hard to switch between these two formats at runtime, so Linux only
- * supports one or the other depending on whether CONFIG_X86_PAE is set.  Many
- * distributions turn it on, and not just for people with silly amounts of
- * memory: the larger PTE entries allow room for the NX bit, which lets the
- * kernel disable execution of pages and increase security.
- *
- * This was a problem for lguest, which couldn't run on these distributions;
- * then Matias Zabaljauregui figured it all out and implemented it, and only a
- * handful of puppies were crushed in the process!
- *
- * Back to our point: the kernel spends a lot of time changing both the
- * top-level page directory and lower-level pagetable pages.  The Guest doesn't
- * know physical addresses, so while it maintains these page tables exactly
- * like normal, it also needs to keep the Host informed whenever it makes a
- * change: the Host will create the real page tables based on the Guests'.
- */
-
-/*
- * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space.  We tell the Host the toplevel and
- * address this corresponds to.  The Guest uses one pagetable per process, so
- * we need to tell the Host which one we're changing (mm->pgd).
- */
-static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
-			       pte_t *ptep)
-{
-#ifdef CONFIG_X86_PAE
-	/* PAE needs to hand a 64 bit page table entry, so it uses two args. */
-	lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
-		    ptep->pte_low, ptep->pte_high);
-#else
-	lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
-#endif
-}
-
-/* This is the "set and update" combo-meal-deal version. */
-static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep, pte_t pteval)
-{
-	native_set_pte(ptep, pteval);
-	lguest_pte_update(mm, addr, ptep);
-}
-
-/*
- * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
- * to set a middle-level entry when PAE is activated.
- *
- * Again, we set the entry then tell the Host which page we changed,
- * and the index of the entry we changed.
- */
-#ifdef CONFIG_X86_PAE
-static void lguest_set_pud(pud_t *pudp, pud_t pudval)
-{
-	native_set_pud(pudp, pudval);
-
-	/* 32 bytes aligned pdpt address and the index. */
-	lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
-		   (__pa(pudp) & 0x1F) / sizeof(pud_t));
-}
-
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	native_set_pmd(pmdp, pmdval);
-	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
-		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#else
-
-/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */
-static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-	native_set_pmd(pmdp, pmdval);
-	lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
-		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
-}
-#endif
-
-/*
- * There are a couple of legacy places where the kernel sets a PTE, but we
- * don't know the top level any more.  This is useless for us, since we don't
- * know which pagetable is changing or what address, so we just tell the Host
- * to forget all of them.  Fortunately, this is very rare.
- *
- * ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 48 seconds!  So we don't even tell
- * the Host anything changed until we've done the first real page table switch,
- * which brings boot back to 4.3 seconds.
- */
-static void lguest_set_pte(pte_t *ptep, pte_t pteval)
-{
-	native_set_pte(ptep, pteval);
-	if (cr3_changed)
-		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * With 64-bit PTE values, we need to be careful setting them: if we set 32
- * bits at a time, the hardware could see a weird half-set entry.  These
- * versions ensure we update all 64 bits at once.
- */
-static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
-	native_set_pte_atomic(ptep, pte);
-	if (cr3_changed)
-		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
-			     pte_t *ptep)
-{
-	native_pte_clear(mm, addr, ptep);
-	lguest_pte_update(mm, addr, ptep);
-}
-
-static void lguest_pmd_clear(pmd_t *pmdp)
-{
-	lguest_set_pmd(pmdp, __pmd(0));
-}
-#endif
-
-/*
- * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
- * native page table operations.  On native hardware you can set a new page
- * table entry whenever you want, but if you want to remove one you have to do
- * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
- *
- * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only
- * called when a valid entry is written, not when it's removed (ie. marked not
- * present).  Instead, this is where we come when the Guest wants to remove a
- * page table entry: we tell the Host to set that entry to 0 (ie. the present
- * bit is zero).
- */
-static void lguest_flush_tlb_single(unsigned long addr)
-{
-	/* Simply set it to zero: if it was not, it will fault back in. */
-	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
-}
-
-/*
- * This is what happens after the Guest has removed a large number of entries.
- * This tells the Host that any of the page table entries for userspace might
- * have changed, ie. virtual addresses below PAGE_OFFSET.
- */
-static void lguest_flush_tlb_user(void)
-{
-	lazy_hcall1(LHCALL_FLUSH_TLB, 0);
-}
-
-/*
- * This is called when the kernel page tables have changed.  That's not very
- * common (unless the Guest is using highmem, which makes the Guest extremely
- * slow), so it's worth separating this from the user flushing above.
- */
-static void lguest_flush_tlb_kernel(void)
-{
-	lazy_hcall1(LHCALL_FLUSH_TLB, 1);
-}
-
-/*
- * The Unadvanced Programmable Interrupt Controller.
- *
- * This is an attempt to implement the simplest possible interrupt controller.
- * I spent some time looking though routines like set_irq_chip_and_handler,
- * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and
- * I *think* this is as simple as it gets.
- *
- * We can tell the Host what interrupts we want blocked ready for using the
- * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as
- * simple as setting a bit.  We don't actually "ack" interrupts as such, we
- * just mask and unmask them.  I wonder if we should be cleverer?
- */
-static void disable_lguest_irq(struct irq_data *data)
-{
-	set_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-static void enable_lguest_irq(struct irq_data *data)
-{
-	clear_bit(data->irq, lguest_data.blocked_interrupts);
-}
-
-/* This structure describes the lguest IRQ controller. */
-static struct irq_chip lguest_irq_controller = {
-	.name		= "lguest",
-	.irq_mask	= disable_lguest_irq,
-	.irq_mask_ack	= disable_lguest_irq,
-	.irq_unmask	= enable_lguest_irq,
-};
-
-/*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-static int lguest_setup_irq(unsigned int irq)
-{
-	struct irq_desc *desc;
-	int err;
-
-	/* Returns -ve error or vector number. */
-	err = irq_alloc_desc_at(irq, 0);
-	if (err < 0 && err != -EEXIST)
-		return err;
-
-	/*
-	 * Tell the Linux infrastructure that the interrupt is
-	 * controlled by our level-based lguest interrupt controller.
-	 */
-	irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
-				      handle_level_irq, "level");
-
-	/* Some systems map "vectors" to interrupts weirdly.  Not us! */
-	desc = irq_to_desc(irq);
-	__this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
-	return 0;
-}
-
-static int lguest_enable_irq(struct pci_dev *dev)
-{
-	int err;
-	u8 line = 0;
-
-	/* We literally use the PCI interrupt line as the irq number. */
-	pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
-	err = lguest_setup_irq(line);
-	if (!err)
-		dev->irq = line;
-	return err;
-}
-
-/* We don't do hotplug PCI, so this shouldn't be called. */
-static void lguest_disable_irq(struct pci_dev *dev)
-{
-	WARN_ON(1);
-}
-
-/*
- * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls).
- */
-static void __init lguest_init_IRQ(void)
-{
-	unsigned int i;
-
-	for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
-		if (i != IA32_SYSCALL_VECTOR)
-			set_intr_gate(i, irq_entries_start +
-					8 * (i - FIRST_EXTERNAL_VECTOR));
-	}
-
-	/*
-	 * This call is required to set up for 4k stacks, where we have
-	 * separate stacks for hard and soft interrupts.
-	 */
-	irq_ctx_init(smp_processor_id());
-}
-
-/*
- * Time.
- *
- * It would be far better for everyone if the Guest had its own clock, but
- * until then the Host gives us the time on every interrupt.
- */
-static void lguest_get_wallclock(struct timespec *now)
-{
-	*now = lguest_data.time;
-}
-
-/*
- * The TSC is an Intel thing called the Time Stamp Counter.  The Host tells us
- * what speed it runs at, or 0 if it's unusable as a reliable clock source.
- * This matches what we want here: if we return 0 from this function, the x86
- * TSC clock will give up and not register itself.
- */
-static unsigned long lguest_tsc_khz(void)
-{
-	return lguest_data.tsc_khz;
-}
-
-/*
- * If we can't use the TSC, the kernel falls back to our lower-priority
- * "lguest_clock", where we read the time value given to us by the Host.
- */
-static u64 lguest_clock_read(struct clocksource *cs)
-{
-	unsigned long sec, nsec;
-
-	/*
-	 * Since the time is in two parts (seconds and nanoseconds), we risk
-	 * reading it just as it's changing from 99 & 0.999999999 to 100 and 0,
-	 * and getting 99 and 0.  As Linux tends to come apart under the stress
-	 * of time travel, we must be careful:
-	 */
-	do {
-		/* First we read the seconds part. */
-		sec = lguest_data.time.tv_sec;
-		/*
-		 * This read memory barrier tells the compiler and the CPU that
-		 * this can't be reordered: we have to complete the above
-		 * before going on.
-		 */
-		rmb();
-		/* Now we read the nanoseconds part. */
-		nsec = lguest_data.time.tv_nsec;
-		/* Make sure we've done that. */
-		rmb();
-		/* Now if the seconds part has changed, try again. */
-	} while (unlikely(lguest_data.time.tv_sec != sec));
-
-	/* Our lguest clock is in real nanoseconds. */
-	return sec*1000000000ULL + nsec;
-}
-
-/* This is the fallback clocksource: lower priority than the TSC clocksource. */
-static struct clocksource lguest_clock = {
-	.name		= "lguest",
-	.rating		= 200,
-	.read		= lguest_clock_read,
-	.mask		= CLOCKSOURCE_MASK(64),
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-/*
- * We also need a "struct clock_event_device": Linux asks us to set it to go
- * off some time in the future.  Actually, James Morris figured all this out, I
- * just applied the patch.
- */
-static int lguest_clockevent_set_next_event(unsigned long delta,
-                                           struct clock_event_device *evt)
-{
-	/* FIXME: I don't think this can ever happen, but James tells me he had
-	 * to put this code in.  Maybe we should remove it now.  Anyone? */
-	if (delta < LG_CLOCK_MIN_DELTA) {
-		if (printk_ratelimit())
-			printk(KERN_DEBUG "%s: small delta %lu ns\n",
-			       __func__, delta);
-		return -ETIME;
-	}
-
-	/* Please wake us this far in the future. */
-	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
-	return 0;
-}
-
-static int lguest_clockevent_shutdown(struct clock_event_device *evt)
-{
-	/* A 0 argument shuts the clock down. */
-	hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
-	return 0;
-}
-
-/* This describes our primitive timer chip. */
-static struct clock_event_device lguest_clockevent = {
-	.name                   = "lguest",
-	.features               = CLOCK_EVT_FEAT_ONESHOT,
-	.set_next_event         = lguest_clockevent_set_next_event,
-	.set_state_shutdown	= lguest_clockevent_shutdown,
-	.rating                 = INT_MAX,
-	.mult                   = 1,
-	.shift                  = 0,
-	.min_delta_ns           = LG_CLOCK_MIN_DELTA,
-	.min_delta_ticks        = LG_CLOCK_MIN_DELTA,
-	.max_delta_ns           = LG_CLOCK_MAX_DELTA,
-	.max_delta_ticks        = LG_CLOCK_MAX_DELTA,
-};
-
-/*
- * This is the Guest timer interrupt handler (hardware interrupt 0).  We just
- * call the clockevent infrastructure and it does whatever needs doing.
- */
-static void lguest_time_irq(struct irq_desc *desc)
-{
-	unsigned long flags;
-
-	/* Don't interrupt us while this is running. */
-	local_irq_save(flags);
-	lguest_clockevent.event_handler(&lguest_clockevent);
-	local_irq_restore(flags);
-}
-
-/*
- * At some point in the boot process, we get asked to set up our timing
- * infrastructure.  The kernel doesn't expect timer interrupts before this, but
- * we cleverly initialized the "blocked_interrupts" field of "struct
- * lguest_data" so that timer interrupts were blocked until now.
- */
-static void lguest_time_init(void)
-{
-	/* Set up the timer interrupt (0) to go to our simple timer routine */
-	if (lguest_setup_irq(0) != 0)
-		panic("Could not set up timer irq");
-	irq_set_handler(0, lguest_time_irq);
-
-	clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
-
-	/* We can't set cpumask in the initializer: damn C limitations!  Set it
-	 * here and register our timer device. */
-	lguest_clockevent.cpumask = cpumask_of(0);
-	clockevents_register_device(&lguest_clockevent);
-
-	/* Finally, we unblock the timer interrupt. */
-	clear_bit(0, lguest_data.blocked_interrupts);
-}
-
-/*
- * Miscellaneous bits and pieces.
- *
- * Here is an oddball collection of functions which the Guest needs for things
- * to work.  They're pretty simple.
- */
-
-/*
- * The Guest needs to tell the Host what stack it expects traps to use.  For
- * native hardware, this is part of the Task State Segment mentioned above in
- * lguest_load_tr_desc(), but to help hypervisors there's this special call.
- *
- * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data
- * segment), the privilege level (we're privilege level 1, the Host is 0 and
- * will not tolerate us trying to use that), the stack pointer, and the number
- * of pages in the stack.
- */
-static void lguest_load_sp0(struct tss_struct *tss,
-			    struct thread_struct *thread)
-{
-	lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
-		   THREAD_SIZE / PAGE_SIZE);
-	tss->x86_tss.sp0 = thread->sp0;
-}
-
-/* Let's just say, I wouldn't do debugging under a Guest. */
-static unsigned long lguest_get_debugreg(int regno)
-{
-	/* FIXME: Implement */
-	return 0;
-}
-
-static void lguest_set_debugreg(int regno, unsigned long value)
-{
-	/* FIXME: Implement */
-}
-
-/*
- * There are times when the kernel wants to make sure that no memory writes are
- * caught in the cache (that they've all reached real hardware devices).  This
- * doesn't matter for the Guest which has virtual hardware.
- *
- * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush
- * (clflush) instruction is available and the kernel uses that.  Otherwise, it
- * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction.
- * Unlike clflush, wbinvd can only be run at privilege level 0.  So we can
- * ignore clflush, but replace wbinvd.
- */
-static void lguest_wbinvd(void)
-{
-}
-
-/*
- * If the Guest expects to have an Advanced Programmable Interrupt Controller,
- * we play dumb by ignoring writes and returning 0 for reads.  So it's no
- * longer Programmable nor Controlling anything, and I don't think 8 lines of
- * code qualifies for Advanced.  It will also never interrupt anything.  It
- * does, however, allow us to get through the Linux boot code.
- */
-#ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(u32 reg, u32 v)
-{
-}
-
-static u32 lguest_apic_read(u32 reg)
-{
-	return 0;
-}
-
-static u64 lguest_apic_icr_read(void)
-{
-	return 0;
-}
-
-static void lguest_apic_icr_write(u32 low, u32 id)
-{
-	/* Warn to see if there's any stray references */
-	WARN_ON(1);
-}
-
-static void lguest_apic_wait_icr_idle(void)
-{
-	return;
-}
-
-static u32 lguest_apic_safe_wait_icr_idle(void)
-{
-	return 0;
-}
-
-static void set_lguest_basic_apic_ops(void)
-{
-	apic->read = lguest_apic_read;
-	apic->write = lguest_apic_write;
-	apic->icr_read = lguest_apic_icr_read;
-	apic->icr_write = lguest_apic_icr_write;
-	apic->wait_icr_idle = lguest_apic_wait_icr_idle;
-	apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
-};
-#endif
-
-/* STOP!  Until an interrupt comes in. */
-static void lguest_safe_halt(void)
-{
-	hcall(LHCALL_HALT, 0, 0, 0, 0);
-}
-
-/*
- * The SHUTDOWN hypercall takes a string to describe what's happening, and
- * an argument which says whether this to restart (reboot) the Guest or not.
- *
- * Note that the Host always prefers that the Guest speak in physical addresses
- * rather than virtual addresses, so we use __pa() here.
- */
-static void lguest_power_off(void)
-{
-	hcall(LHCALL_SHUTDOWN, __pa("Power down"),
-	      LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-}
-
-/*
- * Panicing.
- *
- * Don't.  But if you did, this is what happens.
- */
-static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
-{
-	hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
-	/* The hcall won't return, but to keep gcc happy, we're "done". */
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block paniced = {
-	.notifier_call = lguest_panic
-};
-
-/* Setting up memory is fairly easy. */
-static __init char *lguest_memory_setup(void)
-{
-	/*
-	 * The Linux bootloader header contains an "e820" memory map: the
-	 * Launcher populated the first entry with our memory limit.
-	 */
-	e820__range_add(boot_params.e820_table[0].addr,
-			  boot_params.e820_table[0].size,
-			  boot_params.e820_table[0].type);
-
-	/* This string is for the boot messages. */
-	return "LGUEST";
-}
-
-/* Offset within PCI config space of BAR access capability. */
-static int console_cfg_offset = 0;
-static int console_access_cap;
-
-/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */
-static void set_cfg_window(u32 cfg_offset, u32 off)
-{
-	write_pci_config_byte(0, 1, 0,
-			      cfg_offset + offsetof(struct virtio_pci_cap, bar),
-			      0);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + offsetof(struct virtio_pci_cap, length),
-			 4);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + offsetof(struct virtio_pci_cap, offset),
-			 off);
-}
-
-static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
-{
-	/*
-	 * We could set this up once, then leave it; nothing else in the *
-	 * kernel should touch these registers.  But if it went wrong, that
-	 * would be a horrible bug to find.
-	 */
-	set_cfg_window(cfg_offset, off);
-	write_pci_config(0, 1, 0,
-			 cfg_offset + sizeof(struct virtio_pci_cap), val);
-}
-
-static void probe_pci_console(void)
-{
-	u8 cap, common_cap = 0, device_cap = 0;
-	u32 device_len;
-
-	/* Avoid recursive printk into here. */
-	console_cfg_offset = -1;
-
-	if (!early_pci_allowed()) {
-		printk(KERN_ERR "lguest: early PCI access not allowed!\n");
-		return;
-	}
-
-	/* We expect a console PCI device at BUS0, slot 1. */
-	if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) {
-		printk(KERN_ERR "lguest: PCI device is %#x!\n",
-		       read_pci_config(0, 1, 0, 0));
-		return;
-	}
-
-	/* Find the capabilities we need (must be in bar0) */
-	cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST);
-	while (cap) {
-		u8 vndr = read_pci_config_byte(0, 1, 0, cap);
-		if (vndr == PCI_CAP_ID_VNDR) {
-			u8 type, bar;
-
-			type = read_pci_config_byte(0, 1, 0,
-			    cap + offsetof(struct virtio_pci_cap, cfg_type));
-			bar = read_pci_config_byte(0, 1, 0,
-			    cap + offsetof(struct virtio_pci_cap, bar));
-
-			switch (type) {
-			case VIRTIO_PCI_CAP_DEVICE_CFG:
-				if (bar == 0)
-					device_cap = cap;
-				break;
-			case VIRTIO_PCI_CAP_PCI_CFG:
-				console_access_cap = cap;
-				break;
-			}
-		}
-		cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT);
-	}
-	if (!device_cap || !console_access_cap) {
-		printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n",
-		       common_cap, device_cap, console_access_cap);
-		return;
-	}
-
-	/*
-	 * Note that we can't check features, until we've set the DRIVER
-	 * status bit.  We don't want to do that until we have a real driver,
-	 * so we just check that the device-specific config has room for
-	 * emerg_wr.  If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
-	 * it should ignore the access.
-	 */
-	device_len = read_pci_config(0, 1, 0,
-			device_cap + offsetof(struct virtio_pci_cap, length));
-	if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
-			  + sizeof(u32))) {
-		printk(KERN_ERR "lguest: console missing emerg_wr field\n");
-		return;
-	}
-
-	console_cfg_offset = read_pci_config(0, 1, 0,
-			device_cap + offsetof(struct virtio_pci_cap, offset));
-	printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
-}
-
-/*
- * We will eventually use the virtio console device to produce console output,
- * but before that is set up we use the virtio PCI console's backdoor mmio
- * access and the "emergency" write facility (which is legal even before the
- * device is configured).
- */
-static __init int early_put_chars(u32 vtermno, const char *buf, int count)
-{
-	/* If we couldn't find PCI console, forget it. */
-	if (console_cfg_offset < 0)
-		return count;
-
-	if (unlikely(!console_cfg_offset)) {
-		probe_pci_console();
-		if (console_cfg_offset < 0)
-			return count;
-	}
-
-	write_bar_via_cfg(console_access_cap,
-			  console_cfg_offset
-			  + offsetof(struct virtio_console_config, emerg_wr),
-			  buf[0]);
-	return 1;
-}
-
-/*
- * Rebooting also tells the Host we're finished, but the RESTART flag tells the
- * Launcher to reboot us.
- */
-static void lguest_restart(char *reason)
-{
-	hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
-}
-
-/*G:050
- * Patching (Powerfully Placating Performance Pedants)
- *
- * We have already seen that pv_ops structures let us replace simple native
- * instructions with calls to the appropriate back end all throughout the
- * kernel.  This allows the same kernel to run as a Guest and as a native
- * kernel, but it's slow because of all the indirect branches.
- *
- * Remember that David Wheeler quote about "Any problem in computer science can
- * be solved with another layer of indirection"?  The rest of that quote is
- * "... But that usually will create another problem."  This is the first of
- * those problems.
- *
- * Our current solution is to allow the paravirt back end to optionally patch
- * over the indirect calls to replace them with something more efficient.  We
- * patch two of the simplest of the most commonly called functions: disable
- * interrupts and save interrupts.  We usually have 6 or 10 bytes to patch
- * into: the Guest versions of these operations are small enough that we can
- * fit comfortably.
- *
- * First we need assembly templates of each of the patchable Guest operations,
- * and these are in head_32.S.
- */
-
-/*G:060 We construct a table from the assembler templates: */
-static const struct lguest_insns
-{
-	const char *start, *end;
-} lguest_insns[] = {
-	[PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
-	[PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
-};
-
-/*
- * Now our patch routine is fairly simple (based on the native one in
- * paravirt.c).  If we have a replacement, we copy it in and return how much of
- * the available space we used.
- */
-static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
-			     unsigned long addr, unsigned len)
-{
-	unsigned int insn_len;
-
-	/* Don't do anything special if we don't have a replacement */
-	if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
-		return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
-	insn_len = lguest_insns[type].end - lguest_insns[type].start;
-
-	/* Similarly if it can't fit (doesn't happen, but let's be thorough). */
-	if (len < insn_len)
-		return paravirt_patch_default(type, clobber, ibuf, addr, len);
-
-	/* Copy in our instructions. */
-	memcpy(ibuf, lguest_insns[type].start, insn_len);
-	return insn_len;
-}
-
-/*G:029
- * Once we get to lguest_init(), we know we're a Guest.  The various
- * pv_ops structures in the kernel provide points for (almost) every routine we
- * have to override to avoid privileged instructions.
- */
-__init void lguest_init(void)
-{
-	/* We're under lguest. */
-	pv_info.name = "lguest";
-	/* We're running at privilege level 1, not 0 as normal. */
-	pv_info.kernel_rpl = 1;
-	/* Everyone except Xen runs with this set. */
-	pv_info.shared_kernel_pmd = 1;
-
-	/*
-	 * We set up all the lguest overrides for sensitive operations.  These
-	 * are detailed with the operations themselves.
-	 */
-
-	/* Interrupt-related operations */
-	pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl);
-	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
-	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable);
-	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
-	pv_irq_ops.safe_halt = lguest_safe_halt;
-
-	/* Setup operations */
-	pv_init_ops.patch = lguest_patch;
-
-	/* Intercepts of various CPU instructions */
-	pv_cpu_ops.load_gdt = lguest_load_gdt;
-	pv_cpu_ops.cpuid = lguest_cpuid;
-	pv_cpu_ops.load_idt = lguest_load_idt;
-	pv_cpu_ops.iret = lguest_iret;
-	pv_cpu_ops.load_sp0 = lguest_load_sp0;
-	pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
-	pv_cpu_ops.set_ldt = lguest_set_ldt;
-	pv_cpu_ops.load_tls = lguest_load_tls;
-	pv_cpu_ops.get_debugreg = lguest_get_debugreg;
-	pv_cpu_ops.set_debugreg = lguest_set_debugreg;
-	pv_cpu_ops.read_cr0 = lguest_read_cr0;
-	pv_cpu_ops.write_cr0 = lguest_write_cr0;
-	pv_cpu_ops.read_cr4 = lguest_read_cr4;
-	pv_cpu_ops.write_cr4 = lguest_write_cr4;
-	pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
-	pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
-	pv_cpu_ops.wbinvd = lguest_wbinvd;
-	pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
-	pv_cpu_ops.end_context_switch = lguest_end_context_switch;
-
-	/* Pagetable management */
-	pv_mmu_ops.write_cr3 = lguest_write_cr3;
-	pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
-	pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
-	pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
-	pv_mmu_ops.set_pte = lguest_set_pte;
-	pv_mmu_ops.set_pte_at = lguest_set_pte_at;
-	pv_mmu_ops.set_pmd = lguest_set_pmd;
-#ifdef CONFIG_X86_PAE
-	pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
-	pv_mmu_ops.pte_clear = lguest_pte_clear;
-	pv_mmu_ops.pmd_clear = lguest_pmd_clear;
-	pv_mmu_ops.set_pud = lguest_set_pud;
-#endif
-	pv_mmu_ops.read_cr2 = lguest_read_cr2;
-	pv_mmu_ops.read_cr3 = lguest_read_cr3;
-	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
-	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
-	pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu;
-	pv_mmu_ops.pte_update = lguest_pte_update;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	/* APIC read/write intercepts */
-	set_lguest_basic_apic_ops();
-#endif
-
-	x86_init.resources.memory_setup = lguest_memory_setup;
-	x86_init.irqs.intr_init = lguest_init_IRQ;
-	x86_init.timers.timer_init = lguest_time_init;
-	x86_platform.calibrate_tsc = lguest_tsc_khz;
-	x86_platform.get_wallclock =  lguest_get_wallclock;
-
-	/*
-	 * Now is a good time to look at the implementations of these functions
-	 * before returning to the rest of lguest_init().
-	 */
-
-	/*G:070
-	 * Now we've seen all the paravirt_ops, we return to
-	 * lguest_init() where the rest of the fairly chaotic boot setup
-	 * occurs.
-	 */
-
-	/*
-	 * The stack protector is a weird thing where gcc places a canary
-	 * value on the stack and then checks it on return.  This file is
-	 * compiled with -fno-stack-protector it, so we got this far without
-	 * problems.  The value of the canary is kept at offset 20 from the
-	 * %gs register, so we need to set that up before calling C functions
-	 * in other files.
-	 */
-	setup_stack_canary_segment(0);
-
-	/*
-	 * We could just call load_stack_canary_segment(), but we might as well
-	 * call switch_to_new_gdt() which loads the whole table and sets up the
-	 * per-cpu segment descriptor register %fs as well.
-	 */
-	switch_to_new_gdt(0);
-
-	/*
-	 * The Host<->Guest Switcher lives at the top of our address space, and
-	 * the Host told us how big it is when we made LGUEST_INIT hypercall:
-	 * it put the answer in lguest_data.reserve_mem
-	 */
-	reserve_top_address(lguest_data.reserve_mem);
-
-	/* Hook in our special panic hypercall code. */
-	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
-
-	/*
-	 * This is messy CPU setup stuff which the native boot code does before
-	 * start_kernel, so we have to do, too:
-	 */
-	cpu_detect(&new_cpu_data);
-	/* head.S usually sets up the first capability word, so do it here. */
-	new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
-
-	/* Math is always hard! */
-	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
-
-	/* We don't have features.  We have puppies!  Puppies! */
-#ifdef CONFIG_X86_MCE
-	mca_cfg.disabled = true;
-#endif
-#ifdef CONFIG_ACPI
-	acpi_disabled = 1;
-#endif
-
-	/*
-	 * We set the preferred console to "hvc".  This is the "hypervisor
-	 * virtual console" driver written by the PowerPC people, which we also
-	 * adapted for lguest's use.
-	 */
-	add_preferred_console("hvc", 0, NULL);
-
-	/* Register our very early console. */
-	virtio_cons_early_init(early_put_chars);
-
-	/* Don't let ACPI try to control our PCI interrupts. */
-	disable_acpi();
-
-	/* We control them ourselves, by overriding these two hooks. */
-	pcibios_enable_irq = lguest_enable_irq;
-	pcibios_disable_irq = lguest_disable_irq;
-
-	/*
-	 * Last of all, we set the power management poweroff hook to point to
-	 * the Guest routine to power off, and the reboot hook to our restart
-	 * routine.
-	 */
-	pm_power_off = lguest_power_off;
-	machine_ops.restart = lguest_restart;
-
-	/*
-	 * Now we're set up, call i386_start_kernel() in head32.c and we proceed
-	 * to boot as normal.  It never returns.
-	 */
-	i386_start_kernel();
-}
-/*
- * This marks the end of stage II of our journey, The Guest.
- *
- * It is now time for us to explore the layer of virtual drivers and complete
- * our understanding of the Guest in "make Drivers".
- */
diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S
deleted file mode 100644
index d5ae63f5ec5d..000000000000
--- a/arch/x86/lguest/head_32.S
+++ /dev/null
@@ -1,192 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/lguest.h>
-#include <asm/lguest_hcall.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/processor-flags.h>
-
-/*G:020
-
- * Our story starts with the bzImage: booting starts at startup_32 in
- * arch/x86/boot/compressed/head_32.S.  This merely uncompresses the real
- * kernel in place and then jumps into it: startup_32 in
- * arch/x86/kernel/head_32.S.  Both routines expects a boot header in the %esi
- * register, which is created by the bootloader (the Launcher in our case).
- *
- * The startup_32 function does very little: it clears the uninitialized global
- * C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe, and populates some initial
- * page tables.  Finally it checks the 'hardware_subarch' field.  This was
- * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
- * assigned number), then it calls us here.
- *
- * WARNING: be very careful here!  We're running at addresses equal to physical
- * addresses (around 0), not above PAGE_OFFSET as most code expects
- * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
- * data without remembering to subtract __PAGE_OFFSET!
- *
- * The .section line puts this code in .init.text so it will be discarded after
- * boot.
- */
-.section .init.text, "ax", @progbits
-ENTRY(lguest_entry)
-	/*
-	 * We make the "initialization" hypercall now to tell the Host where
-	 * our lguest_data struct is.
-	 */
-	movl $LHCALL_LGUEST_INIT, %eax
-	movl $lguest_data - __PAGE_OFFSET, %ebx
-	int $LGUEST_TRAP_ENTRY
-
-	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
-	movl $LHCALL_NEW_PGTABLE, %eax
-	movl $(initial_page_table - __PAGE_OFFSET), %ebx
-	int $LGUEST_TRAP_ENTRY
-
-	/* Set up the initial stack so we can run C code. */
-	movl $(init_thread_union+THREAD_SIZE),%esp
-
-	/* Jumps are relative: we're running __PAGE_OFFSET too low. */
-	jmp lguest_init+__PAGE_OFFSET
-
-/*G:055
- * We create a macro which puts the assembler code between lgstart_ and lgend_
- * markers.  These templates are put in the .text section: they can't be
- * discarded after boot as we may need to patch modules, too.
- */
-.text
-#define LGUEST_PATCH(name, insns...)			\
-	lgstart_##name:	insns; lgend_##name:;		\
-	.globl lgstart_##name; .globl lgend_##name
-
-LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-
-/*G:033
- * But using those wrappers is inefficient (we'll see why that doesn't matter
- * for save_fl and irq_disable later).  If we write our routines carefully in
- * assembler, we can avoid clobbering any registers and avoid jumping through
- * the wrapper functions.
- *
- * I skipped over our first piece of assembler, but this one is worth studying
- * in a bit more detail so I'll describe in easy stages.  First, the routine to
- * enable interrupts:
- */
-ENTRY(lg_irq_enable)
-	/*
-	 * The reverse of irq_disable, this sets lguest_data.irq_enabled to
-	 * X86_EFLAGS_IF (ie. "Interrupts enabled").
-	 */
-	movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
-	/*
-	 * But now we need to check if the Host wants to know: there might have
-	 * been interrupts waiting to be delivered, in which case it will have
-	 * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
-	 * jump to send_interrupts, otherwise we're done.
-	 */
-	cmpl $0, lguest_data+LGUEST_DATA_irq_pending
-	jnz send_interrupts
-	/*
-	 * One cool thing about x86 is that you can do many things without using
-	 * a register.  In this case, the normal path hasn't needed to save or
-	 * restore any registers at all!
-	 */
-	ret
-send_interrupts:
-	/*
-	 * OK, now we need a register: eax is used for the hypercall number,
-	 * which is LHCALL_SEND_INTERRUPTS.
-	 *
-	 * We used not to bother with this pending detection at all, which was
-	 * much simpler.  Sooner or later the Host would realize it had to
-	 * send us an interrupt.  But that turns out to make performance 7
-	 * times worse on a simple tcp benchmark.  So now we do this the hard
-	 * way.
-	 */
-	pushl %eax
-	movl $LHCALL_SEND_INTERRUPTS, %eax
-	/* This is the actual hypercall trap. */
-	int  $LGUEST_TRAP_ENTRY
-	/* Put eax back the way we found it. */
-	popl %eax
-	ret
-
-/*
- * Finally, the "popf" or "restore flags" routine.  The %eax register holds the
- * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
- * enabling interrupts again, if it's 0 we're leaving them off.
- */
-ENTRY(lg_restore_fl)
-	/* This is just "lguest_data.irq_enabled = flags;" */
-	movl %eax, lguest_data+LGUEST_DATA_irq_enabled
-	/*
-	 * Now, if the %eax value has enabled interrupts and
-	 * lguest_data.irq_pending is set, we want to tell the Host so it can
-	 * deliver any outstanding interrupts.  Fortunately, both values will
-	 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
-	 * instruction will AND them together for us.  If both are set, we
-	 * jump to send_interrupts.
-	 */
-	testl lguest_data+LGUEST_DATA_irq_pending, %eax
-	jnz send_interrupts
-	/* Again, the normal path has used no extra registers.  Clever, huh? */
-	ret
-/*:*/
-
-/* These demark the EIP where host should never deliver interrupts. */
-.global lguest_noirq_iret
-
-/*M:004
- * When the Host reflects a trap or injects an interrupt into the Guest, it
- * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled,
- * so the Guest iret logic does the right thing when restoring it.  However,
- * when the Host sets the Guest up for direct traps, such as system calls, the
- * processor is the one to push eflags onto the stack, and the interrupt bit
- * will be 1 (in reality, interrupts are always enabled in the Guest).
- *
- * This turns out to be harmless: the only trap which should happen under Linux
- * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc
- * regions), which has to be reflected through the Host anyway.  If another
- * trap *does* go off when interrupts are disabled, the Guest will panic, and
- * we'll never get to this iret!
-:*/
-
-/*G:045
- * There is one final paravirt_op that the Guest implements, and glancing at it
- * you can see why I left it to last.  It's *cool*!  It's in *assembler*!
- *
- * The "iret" instruction is used to return from an interrupt or trap.  The
- * stack looks like this:
- *   old address
- *   old code segment & privilege level
- *   old processor flags ("eflags")
- *
- * The "iret" instruction pops those values off the stack and restores them all
- * at once.  The only problem is that eflags includes the Interrupt Flag which
- * the Guest can't change: the CPU will simply ignore it when we do an "iret".
- * So we have to copy eflags from the stack to lguest_data.irq_enabled before
- * we do the "iret".
- *
- * There are two problems with this: firstly, we can't clobber any registers
- * and secondly, the whole thing needs to be atomic.  The first problem
- * is solved by using "push memory"/"pop memory" instruction pair for copying.
- *
- * The second is harder: copying eflags to lguest_data.irq_enabled will turn
- * interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever.  Our solution to this is to tell the
- * Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled. (It's not necessary to protect pop instruction, since
- * data gets updated only after it completes, so we only need to protect
- * one instruction, iret).
- */
-ENTRY(lguest_iret)
-	pushl	2*4(%esp)
-	/*
-	 * Note the %ss: segment prefix here.  Normal data accesses use the
-	 * "ds" segment, but that will have already been restored for whatever
-	 * we're returning to (such as userspace): we can't trust it.  The %ss:
-	 * prefix makes sure we use the stack segment, which is still valid.
-	 */
-	popl	%ss:lguest_data+LGUEST_DATA_irq_enabled
-lguest_noirq_iret:
-	iret
diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S
index f77ba3058b31..066996dba6a2 100644
--- a/arch/x86/math-emu/div_Xsig.S
+++ b/arch/x86/math-emu/div_Xsig.S
@@ -363,3 +363,4 @@ L_bugged_2:
 	pop	%ebx
 	jmp	L_exit
 #endif /* PARANOID */ 
+ENDPROC(div_Xsig)
diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S
index 47099628fa4c..2c71527bd917 100644
--- a/arch/x86/math-emu/div_small.S
+++ b/arch/x86/math-emu/div_small.S
@@ -44,4 +44,4 @@ ENTRY(FPU_div_small)
 
 	leave
 	ret
-
+ENDPROC(FPU_div_small)
diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S
index 717785a53eb4..22e0631bb85a 100644
--- a/arch/x86/math-emu/mul_Xsig.S
+++ b/arch/x86/math-emu/mul_Xsig.S
@@ -62,6 +62,7 @@ ENTRY(mul32_Xsig)
 	popl %esi
 	leave
 	ret
+ENDPROC(mul32_Xsig)
 
 
 ENTRY(mul64_Xsig)
@@ -114,6 +115,7 @@ ENTRY(mul64_Xsig)
 	popl %esi
 	leave
 	ret
+ENDPROC(mul64_Xsig)
 
 
 
@@ -173,4 +175,4 @@ ENTRY(mul_Xsig_Xsig)
 	popl %esi
 	leave
 	ret
-
+ENDPROC(mul_Xsig_Xsig)
diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S
index 17315c89ff3d..a9aaf414135d 100644
--- a/arch/x86/math-emu/polynom_Xsig.S
+++ b/arch/x86/math-emu/polynom_Xsig.S
@@ -133,3 +133,4 @@ L_accum_done:
 	popl	%esi
 	leave
 	ret
+ENDPROC(polynomial_Xsig)
diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S
index 8b6352efceef..53ac1a343c69 100644
--- a/arch/x86/math-emu/reg_norm.S
+++ b/arch/x86/math-emu/reg_norm.S
@@ -94,6 +94,7 @@ L_overflow:
 	call	arith_overflow
 	pop	%ebx
 	jmp	L_exit
+ENDPROC(FPU_normalize)
 
 
 
@@ -145,3 +146,4 @@ L_exit_nuo_zero:
 	popl	%ebx
 	leave
 	ret
+ENDPROC(FPU_normalize_nuo)
diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S
index d1d4e48b4f67..41af5b208d88 100644
--- a/arch/x86/math-emu/reg_round.S
+++ b/arch/x86/math-emu/reg_round.S
@@ -706,3 +706,5 @@ L_exception_exit:
 	mov	$-1,%eax
 	jmp	fpu_reg_round_special_exit
 #endif /* PARANOID */ 
+
+ENDPROC(FPU_round)
diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S
index 47c4c2434d85..3b1bc5e9b2f6 100644
--- a/arch/x86/math-emu/reg_u_add.S
+++ b/arch/x86/math-emu/reg_u_add.S
@@ -165,3 +165,4 @@ L_exit:
 	leave
 	ret
 #endif /* PARANOID */
+ENDPROC(FPU_u_add)
diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S
index cc00654b6f9a..796eb5ab921b 100644
--- a/arch/x86/math-emu/reg_u_div.S
+++ b/arch/x86/math-emu/reg_u_div.S
@@ -469,3 +469,5 @@ L_exit:
 	leave
 	ret
 #endif /* PARANOID */ 
+
+ENDPROC(FPU_u_div)
diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S
index 973f12af97df..6196f68cf3c1 100644
--- a/arch/x86/math-emu/reg_u_mul.S
+++ b/arch/x86/math-emu/reg_u_mul.S
@@ -146,3 +146,4 @@ L_exit:
 	ret
 #endif /* PARANOID */ 
 
+ENDPROC(FPU_u_mul)
diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S
index 1b6c24801d22..d115b900919a 100644
--- a/arch/x86/math-emu/reg_u_sub.S
+++ b/arch/x86/math-emu/reg_u_sub.S
@@ -270,3 +270,4 @@ L_exit:
 	popl	%esi
 	leave
 	ret
+ENDPROC(FPU_u_sub)
diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S
index bbe0e87718e4..87c99749a495 100644
--- a/arch/x86/math-emu/round_Xsig.S
+++ b/arch/x86/math-emu/round_Xsig.S
@@ -78,7 +78,7 @@ L_exit:
 	popl	%ebx
 	leave
 	ret
-
+ENDPROC(round_Xsig)
 
 
 
@@ -138,4 +138,4 @@ L_n_exit:
 	popl	%ebx
 	leave
 	ret
-
+ENDPROC(norm_Xsig)
diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S
index 31cdd118e918..c8552edeec75 100644
--- a/arch/x86/math-emu/shr_Xsig.S
+++ b/arch/x86/math-emu/shr_Xsig.S
@@ -85,3 +85,4 @@ L_more_than_95:
 	popl	%esi
 	leave
 	ret
+ENDPROC(shr_Xsig)
diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S
index 518428317985..340dd6897f85 100644
--- a/arch/x86/math-emu/wm_shrx.S
+++ b/arch/x86/math-emu/wm_shrx.S
@@ -92,6 +92,7 @@ L_more_than_95:
 	popl	%esi
 	leave
 	ret
+ENDPROC(FPU_shrx)
 
 
 /*---------------------------------------------------------------------------+
@@ -202,3 +203,4 @@ Ls_more_than_95:
 	popl	%esi
 	leave
 	ret
+ENDPROC(FPU_shrxs)
diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S
index d258f59564e1..695afae38fdf 100644
--- a/arch/x86/math-emu/wm_sqrt.S
+++ b/arch/x86/math-emu/wm_sqrt.S
@@ -468,3 +468,4 @@ sqrt_more_prec_large:
 /* Our estimate is too large */
 	movl	$0x7fffff00,%eax
 	jmp	sqrt_round_result
+ENDPROC(wm_sqrt)
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 761fc88cd820..c076f710de4c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -184,7 +184,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
 	 * undefined.  I'm not sure which CPUs do this, but at least
 	 * the 486 DX works this way.
 	 */
-	if ((regs->cs & 0xFFFF) != __KERNEL_CS)
+	if (regs->cs != __KERNEL_CS)
 		goto fail;
 
 	/*
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index a8f90ce3dedf..d805162e6045 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -75,13 +75,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
 
 /*
  * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr.  The return value is the number of nodes allocated.
+ * to max_addr.
+ *
+ * Returns zero on success or negative on error.
  */
 static int __init split_nodes_interleave(struct numa_meminfo *ei,
 					 struct numa_meminfo *pi,
 					 u64 addr, u64 max_addr, int nr_nodes)
 {
-	nodemask_t physnode_mask = NODE_MASK_NONE;
+	nodemask_t physnode_mask = numa_nodes_parsed;
 	u64 size;
 	int big;
 	int nid = 0;
@@ -116,9 +118,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
 		return -1;
 	}
 
-	for (i = 0; i < pi->nr_blks; i++)
-		node_set(pi->blk[i].nid, physnode_mask);
-
 	/*
 	 * Continue to fill physical nodes with fake nodes until there is no
 	 * memory left on any of them.
@@ -200,13 +199,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
 
 /*
  * Sets up fake nodes of `size' interleaved over physical nodes ranging from
- * `addr' to `max_addr'.  The return value is the number of nodes allocated.
+ * `addr' to `max_addr'.
+ *
+ * Returns zero on success or negative on error.
  */
 static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 					      struct numa_meminfo *pi,
 					      u64 addr, u64 max_addr, u64 size)
 {
-	nodemask_t physnode_mask = NODE_MASK_NONE;
+	nodemask_t physnode_mask = numa_nodes_parsed;
 	u64 min_size;
 	int nid = 0;
 	int i, ret;
@@ -231,9 +232,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 	}
 	size &= FAKE_NODE_MIN_HASH_MASK;
 
-	for (i = 0; i < pi->nr_blks; i++)
-		node_set(pi->blk[i].nid, physnode_mask);
-
 	/*
 	 * Fill physical nodes with fake nodes of size until there is no memory
 	 * left on any of them.
@@ -280,6 +278,30 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 	return 0;
 }
 
+/*
+ * Scan emu_nid_to_phys[] for emulated nodes that are mapped to a physical
+ * node.  The physical nid of the first mapped entry is stored in
+ * *dfl_phys_nid; if no entry is mapped, *dfl_phys_nid is left as
+ * NUMA_NO_NODE and no warning is issued here.
+ *
+ * Returns the highest mapped emulated nid, or 0 if none is mapped.
+ */
+static int __init setup_emu2phys_nid(int *dfl_phys_nid)
+{
+	int i, max_emu_nid = 0;
+
+	*dfl_phys_nid = NUMA_NO_NODE;
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+			max_emu_nid = i;
+			if (*dfl_phys_nid == NUMA_NO_NODE)
+				*dfl_phys_nid = emu_nid_to_phys[i];
+		}
+	}
+
+	return max_emu_nid;
+}
+
 /**
  * numa_emulation - Emulate NUMA nodes
  * @numa_meminfo: NUMA configuration to massage
@@ -376,23 +398,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 	 * Determine the max emulated nid and the default phys nid to use
 	 * for unmapped nodes.
 	 */
-	max_emu_nid = 0;
-	dfl_phys_nid = NUMA_NO_NODE;
-	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
-		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
-			max_emu_nid = i;
-			if (dfl_phys_nid == NUMA_NO_NODE)
-				dfl_phys_nid = emu_nid_to_phys[i];
-		}
-	}
-	if (dfl_phys_nid == NUMA_NO_NODE) {
-		pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
-		goto no_emu;
-	}
+	max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
 
 	/* commit */
 	*numa_meminfo = ei;
 
+	/* Make sure numa_nodes_parsed only contains emulated nodes */
+	nodes_clear(numa_nodes_parsed);
+	for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
+		if (ei.blk[i].start != ei.blk[i].end &&
+		    ei.blk[i].nid != NUMA_NO_NODE)
+			node_set(ei.blk[i].nid, numa_nodes_parsed);
+
 	/*
 	 * Transform __apicid_to_node table to use emulated nids by
 	 * reverse-mapping phys_nid.  The maps should always exist but fall
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 811e4ddb3f37..98491521bb43 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -981,59 +981,6 @@ void __ref xen_setup_vcpu_info_placement(void)
 	}
 }
 
-static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
-			  unsigned long addr, unsigned len)
-{
-	char *start, *end, *reloc;
-	unsigned ret;
-
-	start = end = reloc = NULL;
-
-#define SITE(op, x)							\
-	case PARAVIRT_PATCH(op.x):					\
-	if (xen_have_vcpu_info_placement) {				\
-		start = (char *)xen_##x##_direct;			\
-		end = xen_##x##_direct_end;				\
-		reloc = xen_##x##_direct_reloc;				\
-	}								\
-	goto patch_site
-
-	switch (type) {
-		SITE(pv_irq_ops, irq_enable);
-		SITE(pv_irq_ops, irq_disable);
-		SITE(pv_irq_ops, save_fl);
-		SITE(pv_irq_ops, restore_fl);
-#undef SITE
-
-	patch_site:
-		if (start == NULL || (end-start) > len)
-			goto default_patch;
-
-		ret = paravirt_patch_insns(insnbuf, len, start, end);
-
-		/* Note: because reloc is assigned from something that
-		   appears to be an array, gcc assumes it's non-null,
-		   but doesn't know its relationship with start and
-		   end. */
-		if (reloc > start && reloc < end) {
-			int reloc_off = reloc - start;
-			long *relocp = (long *)(insnbuf + reloc_off);
-			long delta = start - (char *)addr;
-
-			*relocp += delta;
-		}
-		break;
-
-	default_patch:
-	default:
-		ret = paravirt_patch_default(type, clobbers, insnbuf,
-					     addr, len);
-		break;
-	}
-
-	return ret;
-}
-
 static const struct pv_info xen_info __initconst = {
 	.shared_kernel_pmd = 0,
 
@@ -1043,10 +990,6 @@ static const struct pv_info xen_info __initconst = {
 	.name = "Xen",
 };
 
-static const struct pv_init_ops xen_init_ops __initconst = {
-	.patch = xen_patch,
-};
-
 static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.cpuid = xen_cpuid,
 
@@ -1244,7 +1187,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 
 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
-	pv_init_ops = xen_init_ops;
+	pv_init_ops.patch = paravirt_patch_default;
 	pv_cpu_ops = xen_cpu_ops;
 
 	x86_platform.get_nmi_reason = xen_get_nmi_reason;
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index eff224df813f..dcd31fa39b5d 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -1,14 +1,8 @@
 /*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining.  The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
  *
  * We only bother with direct forms (ie, vcpu in percpu data) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
  */
 
 #include <asm/asm-offsets.h>
@@ -16,7 +10,7 @@
 #include <asm/processor-flags.h>
 #include <asm/frame.h>
 
-#include "xen-asm.h"
+#include <linux/linkage.h>
 
 /*
  * Enable events.  This clears the event mask and tests the pending
@@ -38,13 +32,11 @@ ENTRY(xen_irq_enable_direct)
 	testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
 	jz 1f
 
-2:	call check_events
+	call check_events
 1:
-ENDPATCH(xen_irq_enable_direct)
 	FRAME_END
 	ret
 	ENDPROC(xen_irq_enable_direct)
-	RELOC(xen_irq_enable_direct, 2b+1)
 
 
 /*
@@ -53,10 +45,8 @@ ENDPATCH(xen_irq_enable_direct)
  */
 ENTRY(xen_irq_disable_direct)
 	movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
 	ret
-	ENDPROC(xen_irq_disable_direct)
-	RELOC(xen_irq_disable_direct, 0)
+ENDPROC(xen_irq_disable_direct)
 
 /*
  * (xen_)save_fl is used to get the current interrupt enable status.
@@ -71,10 +61,8 @@ ENTRY(xen_save_fl_direct)
 	testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
 	setz %ah
 	addb %ah, %ah
-ENDPATCH(xen_save_fl_direct)
 	ret
 	ENDPROC(xen_save_fl_direct)
-	RELOC(xen_save_fl_direct, 0)
 
 
 /*
@@ -101,13 +89,11 @@ ENTRY(xen_restore_fl_direct)
 	/* check for unmasked and pending */
 	cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
 	jnz 1f
-2:	call check_events
+	call check_events
 1:
-ENDPATCH(xen_restore_fl_direct)
 	FRAME_END
 	ret
 	ENDPROC(xen_restore_fl_direct)
-	RELOC(xen_restore_fl_direct, 2b+1)
 
 
 /*
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
deleted file mode 100644
index 465276467a47..000000000000
--- a/arch/x86/xen/xen-asm.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _XEN_XEN_ASM_H
-#define _XEN_XEN_ASM_H
-
-#include <linux/linkage.h>
-
-#define RELOC(x, v)	.globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x)	.globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI	0x80000000
-
-#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index feb6d40a0860..1200e262a116 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,14 +1,8 @@
 /*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining.  The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
  *
  * We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
  */
 
 #include <asm/thread_info.h>
@@ -18,21 +12,10 @@
 
 #include <xen/interface/xen.h>
 
-#include "xen-asm.h"
+#include <linux/linkage.h>
 
-/*
- * Force an event check by making a hypercall, but preserve regs
- * before making the call.
- */
-check_events:
-	push %eax
-	push %ecx
-	push %edx
-	call xen_force_evtchn_callback
-	pop %edx
-	pop %ecx
-	pop %eax
-	ret
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI  0x80000000
 
 /*
  * This is run where a normal iret would be run, with the same stack setup:
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index c3df43141e70..3a3b6a211584 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,14 +1,8 @@
 /*
- * Asm versions of Xen pv-ops, suitable for either direct use or
- * inlining.  The inline versions are the same as the direct-use
- * versions, with the pre- and post-amble chopped off.
- *
- * This code is encoded for size rather than absolute efficiency, with
- * a view to being able to inline as much as possible.
+ * Asm versions of Xen pv-ops, suitable for direct use.
  *
  * We only bother with direct forms (ie, vcpu in pda) of the
- * operations here; the indirect forms are better handled in C, since
- * they're generally too large to inline anyway.
+ * operations here; the indirect forms are better handled in C.
  */
 
 #include <asm/errno.h>
@@ -20,7 +14,7 @@
 
 #include <xen/interface/xen.h>
 
-#include "xen-asm.h"
+#include <linux/linkage.h>
 
 ENTRY(xen_adjust_exception_frame)
 	mov 8+0(%rsp), %rcx
@@ -46,9 +40,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
  */
 ENTRY(xen_iret)
 	pushq $0
-1:	jmp hypercall_iret
-ENDPATCH(xen_iret)
-RELOC(xen_iret, 1b+1)
+	jmp hypercall_iret
 
 ENTRY(xen_sysret64)
 	/*
@@ -65,9 +57,7 @@ ENTRY(xen_sysret64)
 	pushq %rcx
 
 	pushq $VGCF_in_syscall
-1:	jmp hypercall_iret
-ENDPATCH(xen_sysret64)
-RELOC(xen_sysret64, 1b+1)
+	jmp hypercall_iret
 
 /*
  * Xen handles syscall callbacks much like ordinary exceptions, which
@@ -82,34 +72,47 @@ RELOC(xen_sysret64, 1b+1)
  *	rip
  *	r11
  * rsp->rcx
- *
- * In all the entrypoints, we undo all that to make it look like a
- * CPU-generated syscall/sysenter and jump to the normal entrypoint.
  */
 
-.macro undo_xen_syscall
-	mov 0*8(%rsp), %rcx
-	mov 1*8(%rsp), %r11
-	mov 5*8(%rsp), %rsp
-.endm
-
 /* Normal 64-bit system call target */
 ENTRY(xen_syscall_target)
-	undo_xen_syscall
-	jmp entry_SYSCALL_64_after_swapgs
+	popq %rcx
+	popq %r11
+
+	/*
+	 * Neither Xen nor the kernel really knows what the old SS and
+	 * CS were.  The kernel expects __USER_DS and __USER_CS, so
+	 * report those values even though Xen will guess its own values.
+	 */
+	movq $__USER_DS, 4*8(%rsp)
+	movq $__USER_CS, 1*8(%rsp)
+
+	jmp entry_SYSCALL_64_after_hwframe
 ENDPROC(xen_syscall_target)
 
 #ifdef CONFIG_IA32_EMULATION
 
 /* 32-bit compat syscall target */
 ENTRY(xen_syscall32_target)
-	undo_xen_syscall
-	jmp entry_SYSCALL_compat
+	popq %rcx
+	popq %r11
+
+	/*
+	 * Neither Xen nor the kernel really knows what the old SS and
+	 * CS were.  The kernel expects __USER32_DS and __USER32_CS, so
+	 * report those values even though Xen will guess its own values.
+	 */
+	movq $__USER32_DS, 4*8(%rsp)
+	movq $__USER32_CS, 1*8(%rsp)
+
+	jmp entry_SYSCALL_compat_after_hwframe
 ENDPROC(xen_syscall32_target)
 
 /* 32-bit compat sysenter target */
 ENTRY(xen_sysenter_target)
-	undo_xen_syscall
+	mov 0*8(%rsp), %rcx
+	mov 1*8(%rsp), %r11
+	mov 5*8(%rsp), %rsp
 	jmp entry_SYSENTER_compat
 ENDPROC(xen_sysenter_target)
 
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 0d5004477db6..70301ac0d414 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -129,17 +129,10 @@ static inline void __init xen_efi_init(void)
 }
 #endif
 
-/* Declare an asm function, along with symbols needed to make it
-   inlineable */
-#define DECL_ASM(ret, name, ...)		\
-	__visible ret name(__VA_ARGS__);	\
-	extern char name##_end[] __visible;	\
-	extern char name##_reloc[] __visible
-
-DECL_ASM(void, xen_irq_enable_direct, void);
-DECL_ASM(void, xen_irq_disable_direct, void);
-DECL_ASM(unsigned long, xen_save_fl_direct, void);
-DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+__visible void xen_irq_enable_direct(void);
+__visible void xen_irq_disable_direct(void);
+__visible unsigned long xen_save_fl_direct(void);
+__visible void xen_restore_fl_direct(unsigned long);
 
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h
index a36221cf6363..3bb49681ee24 100644
--- a/arch/xtensa/include/asm/spinlock.h
+++ b/arch/xtensa/include/asm/spinlock.h
@@ -33,11 +33,6 @@
 
 #define arch_spin_is_locked(x) ((x)->slock != 0)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->slock, !VAL);
-}
-
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
 static inline void arch_spin_lock(arch_spinlock_t *lock)
diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c
index 33bfa5270d95..08175df7a69e 100644
--- a/arch/xtensa/kernel/setup.c
+++ b/arch/xtensa/kernel/setup.c
@@ -273,8 +273,8 @@ void __init init_arch(bp_tag_t *bp_start)
  * Initialize system. Setup memory and reserve regions.
  */
 
-extern char _end;
-extern char _stext;
+extern char _end[];
+extern char _stext[];
 extern char _WindowVectors_text_start;
 extern char _WindowVectors_text_end;
 extern char _DebugInterruptVector_literal_start;
@@ -333,7 +333,7 @@ void __init setup_arch(char **cmdline_p)
 	}
 #endif
 
-	mem_reserve(__pa(&_stext), __pa(&_end));
+	mem_reserve(__pa(_stext), __pa(_end));
 
 #ifdef CONFIG_VECTORS_OFFSET
 	mem_reserve(__pa(&_WindowVectors_text_start),
diff --git a/drivers/Makefile b/drivers/Makefile
index dfdcda00bfe3..d90fdc413648 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -125,7 +125,6 @@ obj-$(CONFIG_ACCESSIBILITY)	+= accessibility/
 obj-$(CONFIG_ISDN)		+= isdn/
 obj-$(CONFIG_EDAC)		+= edac/
 obj-$(CONFIG_EISA)		+= eisa/
-obj-y				+= lguest/
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_CPU_IDLE)		+= cpuidle/
 obj-y				+= mmc/
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 3dbd05532c09..e4effef0c83f 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -645,12 +645,11 @@ void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap,
 	 * completions are honored.  A scmd is determined to have
 	 * timed out iff its associated qc is active and not failed.
 	 */
+	spin_lock_irqsave(ap->lock, flags);
 	if (ap->ops->error_handler) {
 		struct scsi_cmnd *scmd, *tmp;
 		int nr_timedout = 0;
 
-		spin_lock_irqsave(ap->lock, flags);
-
 		/* This must occur under the ap->lock as we don't want
 		   a polled recovery to race the real interrupt handler
 
@@ -700,12 +699,11 @@ void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap,
 		if (nr_timedout)
 			__ata_port_freeze(ap);
 
-		spin_unlock_irqrestore(ap->lock, flags);
 
 		/* initialize eh_tries */
 		ap->eh_tries = ATA_EH_MAX_TRIES;
-	} else
-		spin_unlock_wait(ap->lock);
+	}
+	spin_unlock_irqrestore(ap->lock, flags);
 
 }
 EXPORT_SYMBOL(ata_scsi_cmd_error_handler);
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 8ddc98279c8f..80aaf3420e12 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -470,7 +470,7 @@ config VIRTIO_BLK
 	depends on VIRTIO
 	---help---
 	  This is the virtual block driver for virtio.  It can be used with
-          lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
+          QEMU-based VMMs (like KVM or Xen).  Say Y or M.
 
 config VIRTIO_BLK_SCSI
 	bool "SCSI passthrough request for the Virtio block driver"
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index ccd239ab879f..623714344600 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -161,7 +161,7 @@ config VIRTIO_CONSOLE
 	depends on VIRTIO && TTY
 	select HVC_DRIVER
 	help
-	  Virtio console for use with lguest and other hypervisors.
+	  Virtio console for use with hypervisors.
 
 	  Also serves as a general-purpose serial device for data
 	  transfer between the guest and host.  Character devices at
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index ad843eb02ae7..4d229dde6522 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -1130,7 +1130,7 @@ static const struct file_operations port_fops = {
  * We turn the characters into a scatter-gather list, add it to the
  * output queue and then kick the Host.  Then we sit here waiting for
  * it to finish: inefficient in theory, but in practice
- * implementations will do it immediately (lguest's Launcher does).
+ * implementations will do it immediately.
  */
 static int put_chars(u32 vtermno, const char *buf, int count)
 {
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index b0184360efc6..50a9cab5a834 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -205,7 +205,7 @@ again:
 		unsigned long m = (unsigned long)map;
 		u64 start, end;
 
-		desc = (efi_memory_desc_t *)(m + (i * desc_size));
+		desc = efi_early_memdesc_ptr(m, desc_size, i);
 		if (desc->type != EFI_CONVENTIONAL_MEMORY)
 			continue;
 
@@ -298,7 +298,7 @@ efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg,
 		unsigned long m = (unsigned long)map;
 		u64 start, end;
 
-		desc = (efi_memory_desc_t *)(m + (i * desc_size));
+		desc = efi_early_memdesc_ptr(m, desc_size, i);
 
 		if (desc->type != EFI_CONVENTIONAL_MEMORY)
 			continue;
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
deleted file mode 100644
index 169172d2ba05..000000000000
--- a/drivers/lguest/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-config LGUEST
-	tristate "Linux hypervisor example code"
-	depends on X86_32 && EVENTFD && TTY && PCI_DIRECT
-	select HVC_DRIVER
-	---help---
-	  This is a very simple module which allows you to run
-	  multiple instances of the same Linux kernel, using the
-	  "lguest" command found in the tools/lguest directory.
-
-	  Note that "lguest" is pronounced to rhyme with "fell quest",
-	  not "rustyvisor". See tools/lguest/lguest.txt.
-
-	  If unsure, say N.  If curious, say M.  If masochistic, say Y.
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
deleted file mode 100644
index 16f52ee73994..000000000000
--- a/drivers/lguest/Makefile
+++ /dev/null
@@ -1,26 +0,0 @@
-# Host requires the other files, which can be a module.
-obj-$(CONFIG_LGUEST)	+= lg.o
-lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
-	segments.o lguest_user.o
-
-lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o
-
-Preparation Preparation!: PREFIX=P
-Guest: PREFIX=G
-Drivers: PREFIX=D
-Launcher: PREFIX=L
-Host: PREFIX=H
-Switcher: PREFIX=S
-Mastery: PREFIX=M
-Beer:
-	@for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}"
-Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery:
-	@sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'`
-Puppy:
-	@clear
-	@printf "      __  \n (___()'\`;\n /,    /\`\n \\\\\\\"--\\\\\\   \n"
-	@sleep 2; clear; printf "\n\n   Sit!\n\n"; sleep 1; clear
-	@printf "    __    \n   ()'\`;  \n   /\\|\` \n  /  |  \n(/_)_|_   \n"
-	@sleep 2; clear; printf "\n\n  Stand!\n\n"; sleep 1; clear
-	@printf "    __    \n   ()'\`;  \n   /\\|\` \n  /._.= \n /| /     \n(_\_)_    \n"
-	@sleep 2; clear; printf "\n\n  Good puppy!\n\n"; sleep 1; clear
diff --git a/drivers/lguest/README b/drivers/lguest/README
deleted file mode 100644
index b7db39a64c66..000000000000
--- a/drivers/lguest/README
+++ /dev/null
@@ -1,47 +0,0 @@
-Welcome, friend reader, to lguest.
-
-Lguest is an adventure, with you, the reader, as Hero.  I can't think of many
-5000-line projects which offer both such capability and glimpses of future
-potential; it is an exciting time to be delving into the source!
-
-But be warned; this is an arduous journey of several hours or more!  And as we
-know, all true Heroes are driven by a Noble Goal.  Thus I offer a Beer (or
-equivalent) to anyone I meet who has completed this documentation.
-
-So get comfortable and keep your wits about you (both quick and humorous).
-Along your way to the Noble Goal, you will also gain masterly insight into
-lguest, and hypervisors and x86 virtualization in general.
-
-Our Quest is in seven parts: (best read with C highlighting turned on)
-
-I) Preparation
-	- In which our potential hero is flown quickly over the landscape for a
-	  taste of its scope.  Suitable for the armchair coders and other such
-	  persons of faint constitution.
-
-II) Guest
-	- Where we encounter the first tantalising wisps of code, and come to
-	  understand the details of the life of a Guest kernel.
-
-III) Drivers
-	- Whereby the Guest finds its voice and become useful, and our
-	  understanding of the Guest is completed.
-
-IV) Launcher
-	- Where we trace back to the creation of the Guest, and thus begin our
-	  understanding of the Host.
-
-V) Host
-	- Where we master the Host code, through a long and tortuous journey.
-	  Indeed, it is here that our hero is tested in the Bit of Despair.
-
-VI) Switcher
-	- Where our understanding of the intertwined nature of Guests and Hosts
-	  is completed.
-
-VII) Mastery
-	- Where our fully fledged hero grapples with the Great Question:
-	  "What next?"
-
-make Preparation!
-Rusty Russell.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
deleted file mode 100644
index 395ed1961dbf..000000000000
--- a/drivers/lguest/core.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*P:400
- * This contains run_guest() which actually calls into the Host<->Guest
- * Switcher and analyzes the return, such as determining if the Guest wants the
- * Host to do something.  This file also contains useful helper routines.
-:*/
-#include <linux/module.h>
-#include <linux/stringify.h>
-#include <linux/stddef.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/vmalloc.h>
-#include <linux/cpu.h>
-#include <linux/freezer.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <asm/paravirt.h>
-#include <asm/pgtable.h>
-#include <linux/uaccess.h>
-#include <asm/poll.h>
-#include <asm/asm-offsets.h>
-#include "lg.h"
-
-unsigned long switcher_addr;
-struct page **lg_switcher_pages;
-static struct vm_struct *switcher_text_vma;
-static struct vm_struct *switcher_stacks_vma;
-
-/* This One Big lock protects all inter-guest data structures. */
-DEFINE_MUTEX(lguest_lock);
-
-/*H:010
- * We need to set up the Switcher at a high virtual address.  Remember the
- * Switcher is a few hundred bytes of assembler code which actually changes the
- * CPU to run the Guest, and then changes back to the Host when a trap or
- * interrupt happens.
- *
- * The Switcher code must be at the same virtual address in the Guest as the
- * Host since it will be running as the switchover occurs.
- *
- * Trying to map memory at a particular address is an unusual thing to do, so
- * it's not a simple one-liner.
- */
-static __init int map_switcher(void)
-{
-	int i, err;
-
-	/*
-	 * Map the Switcher in to high memory.
-	 *
-	 * It turns out that if we choose the address 0xFFC00000 (4MB under the
-	 * top virtual address), it makes setting up the page tables really
-	 * easy.
-	 */
-
-	/* We assume Switcher text fits into a single page. */
-	if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
-		printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
-		       end_switcher_text - start_switcher_text);
-		return -EINVAL;
-	}
-
-	/*
-	 * We allocate an array of struct page pointers.  map_vm_area() wants
-	 * this, rather than just an array of pages.
-	 */
-	lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
-				    * TOTAL_SWITCHER_PAGES,
-				    GFP_KERNEL);
-	if (!lg_switcher_pages) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	/*
-	 * Now we actually allocate the pages.  The Guest will see these pages,
-	 * so we make sure they're zeroed.
-	 */
-	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-		lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (!lg_switcher_pages[i]) {
-			err = -ENOMEM;
-			goto free_some_pages;
-		}
-	}
-
-	/*
-	 * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
-	 * It goes in the first page, which we map in momentarily.
-	 */
-	memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
-	       end_switcher_text - start_switcher_text);
-	kunmap(lg_switcher_pages[0]);
-
-	/*
-	 * We place the Switcher underneath the fixmap area, which is the
-	 * highest virtual address we can get.  This is important, since we
-	 * tell the Guest it can't access this memory, so we want its ceiling
-	 * as high as possible.
-	 */
-	switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
-
-	/*
-	 * Now we reserve the "virtual memory area"s we want.  We might
-	 * not get them in theory, but in practice it's worked so far.
-	 *
-	 * We want the switcher text to be read-only and executable, and
-	 * the stacks to be read-write and non-executable.
-	 */
-	switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
-					  switcher_addr,
-					  switcher_addr + PAGE_SIZE);
-
-	if (!switcher_text_vma) {
-		err = -ENOMEM;
-		printk("lguest: could not map switcher pages high\n");
-		goto free_pages;
-	}
-
-	switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
-					    VM_ALLOC|VM_NO_GUARD,
-					    switcher_addr + PAGE_SIZE,
-					    switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
-	if (!switcher_stacks_vma) {
-		err = -ENOMEM;
-		printk("lguest: could not map switcher pages high\n");
-		goto free_text_vma;
-	}
-
-	/*
-	 * This code actually sets up the pages we've allocated to appear at
-	 * switcher_addr.  map_vm_area() takes the vma we allocated above, the
-	 * kind of pages we're mapping (kernel text pages and kernel writable
-	 * pages respectively), and a pointer to our array of struct pages.
-	 */
-	err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
-	if (err) {
-		printk("lguest: text map_vm_area failed: %i\n", err);
-		goto free_vmas;
-	}
-
-	err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
-			  lg_switcher_pages + SWITCHER_TEXT_PAGES);
-	if (err) {
-		printk("lguest: stacks map_vm_area failed: %i\n", err);
-		goto free_vmas;
-	}
-
-	/*
-	 * Now the Switcher is mapped at the right address, we can't fail!
-	 */
-	printk(KERN_INFO "lguest: mapped switcher at %p\n",
-	       switcher_text_vma->addr);
-	/* And we succeeded... */
-	return 0;
-
-free_vmas:
-	/* Undoes map_vm_area and __get_vm_area */
-	vunmap(switcher_stacks_vma->addr);
-free_text_vma:
-	vunmap(switcher_text_vma->addr);
-free_pages:
-	i = TOTAL_SWITCHER_PAGES;
-free_some_pages:
-	for (--i; i >= 0; i--)
-		__free_pages(lg_switcher_pages[i], 0);
-	kfree(lg_switcher_pages);
-out:
-	return err;
-}
-/*:*/
-
-/* Cleaning up the mapping when the module is unloaded is almost... too easy. */
-static void unmap_switcher(void)
-{
-	unsigned int i;
-
-	/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
-	vunmap(switcher_text_vma->addr);
-	vunmap(switcher_stacks_vma->addr);
-	/* Now we just need to free the pages we copied the switcher into */
-	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
-		__free_pages(lg_switcher_pages[i], 0);
-	kfree(lg_switcher_pages);
-}
-
-/*H:032
- * Dealing With Guest Memory.
- *
- * Before we go too much further into the Host, we need to grok the routines
- * we use to deal with Guest memory.
- *
- * When the Guest gives us (what it thinks is) a physical address, we can use
- * the normal copy_from_user() & copy_to_user() on the corresponding place in
- * the memory region allocated by the Launcher.
- *
- * But we can't trust the Guest: it might be trying to access the Launcher
- * code.  We have to check that the range is below the pfn_limit the Launcher
- * gave us.  We have to make sure that addr + len doesn't give us a false
- * positive by overflowing, too.
- */
-bool lguest_address_ok(const struct lguest *lg,
-		       unsigned long addr, unsigned long len)
-{
-	return addr+len <= lg->pfn_limit * PAGE_SIZE && (addr+len >= addr);
-}
-
-/*
- * This routine copies memory from the Guest.  Here we can see how useful the
- * kill_lguest() routine we met in the Launcher can be: we return a random
- * value (all zeroes) instead of needing to return an error.
- */
-void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
-{
-	if (!lguest_address_ok(cpu->lg, addr, bytes)
-	    || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
-		/* copy_from_user should do this, but as we rely on it... */
-		memset(b, 0, bytes);
-		kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
-	}
-}
-
-/* This is the write (copy into Guest) version. */
-void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
-	       unsigned bytes)
-{
-	if (!lguest_address_ok(cpu->lg, addr, bytes)
-	    || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
-		kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
-}
-/*:*/
-
-/*H:030
- * Let's jump straight to the the main loop which runs the Guest.
- * Remember, this is called by the Launcher reading /dev/lguest, and we keep
- * going around and around until something interesting happens.
- */
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
-{
-	/* If the launcher asked for a register with LHREQ_GETREG */
-	if (cpu->reg_read) {
-		if (put_user(*cpu->reg_read, user))
-			return -EFAULT;
-		cpu->reg_read = NULL;
-		return sizeof(*cpu->reg_read);
-	}
-
-	/* We stop running once the Guest is dead. */
-	while (!cpu->lg->dead) {
-		unsigned int irq;
-		bool more;
-
-		/* First we run any hypercalls the Guest wants done. */
-		if (cpu->hcall)
-			do_hypercalls(cpu);
-
-		/* Do we have to tell the Launcher about a trap? */
-		if (cpu->pending.trap) {
-			if (copy_to_user(user, &cpu->pending,
-					 sizeof(cpu->pending)))
-				return -EFAULT;
-			return sizeof(cpu->pending);
-		}
-
-		/*
-		 * All long-lived kernel loops need to check with this horrible
-		 * thing called the freezer.  If the Host is trying to suspend,
-		 * it stops us.
-		 */
-		try_to_freeze();
-
-		/* Check for signals */
-		if (signal_pending(current))
-			return -ERESTARTSYS;
-
-		/*
-		 * Check if there are any interrupts which can be delivered now:
-		 * if so, this sets up the hander to be executed when we next
-		 * run the Guest.
-		 */
-		irq = interrupt_pending(cpu, &more);
-		if (irq < LGUEST_IRQS)
-			try_deliver_interrupt(cpu, irq, more);
-
-		/*
-		 * Just make absolutely sure the Guest is still alive.  One of
-		 * those hypercalls could have been fatal, for example.
-		 */
-		if (cpu->lg->dead)
-			break;
-
-		/*
-		 * If the Guest asked to be stopped, we sleep.  The Guest's
-		 * clock timer will wake us.
-		 */
-		if (cpu->halted) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			/*
-			 * Just before we sleep, make sure no interrupt snuck in
-			 * which we should be doing.
-			 */
-			if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
-				set_current_state(TASK_RUNNING);
-			else
-				schedule();
-			continue;
-		}
-
-		/*
-		 * OK, now we're ready to jump into the Guest.  First we put up
-		 * the "Do Not Disturb" sign:
-		 */
-		local_irq_disable();
-
-		/* Actually run the Guest until something happens. */
-		lguest_arch_run_guest(cpu);
-
-		/* Now we're ready to be interrupted or moved to other CPUs */
-		local_irq_enable();
-
-		/* Now we deal with whatever happened to the Guest. */
-		lguest_arch_handle_trap(cpu);
-	}
-
-	/* Special case: Guest is 'dead' but wants a reboot. */
-	if (cpu->lg->dead == ERR_PTR(-ERESTART))
-		return -ERESTART;
-
-	/* The Guest is dead => "No such file or directory" */
-	return -ENOENT;
-}
-
-/*H:000
- * Welcome to the Host!
- *
- * By this point your brain has been tickled by the Guest code and numbed by
- * the Launcher code; prepare for it to be stretched by the Host code.  This is
- * the heart.  Let's begin at the initialization routine for the Host's lg
- * module.
- */
-static int __init init(void)
-{
-	int err;
-
-	/* Lguest can't run under Xen, VMI or itself.  It does Tricky Stuff. */
-	if (get_kernel_rpl() != 0) {
-		printk("lguest is afraid of being a guest\n");
-		return -EPERM;
-	}
-
-	/* First we put the Switcher up in very high virtual memory. */
-	err = map_switcher();
-	if (err)
-		goto out;
-
-	/* We might need to reserve an interrupt vector. */
-	err = init_interrupts();
-	if (err)
-		goto unmap;
-
-	/* /dev/lguest needs to be registered. */
-	err = lguest_device_init();
-	if (err)
-		goto free_interrupts;
-
-	/* Finally we do some architecture-specific setup. */
-	lguest_arch_host_init();
-
-	/* All good! */
-	return 0;
-
-free_interrupts:
-	free_interrupts();
-unmap:
-	unmap_switcher();
-out:
-	return err;
-}
-
-/* Cleaning up is just the same code, backwards.  With a little French. */
-static void __exit fini(void)
-{
-	lguest_device_remove();
-	free_interrupts();
-	unmap_switcher();
-
-	lguest_arch_host_fini();
-}
-/*:*/
-
-/*
- * The Host side of lguest can be a module.  This is a nice way for people to
- * play with it.
- */
-module_init(init);
-module_exit(fini);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
deleted file mode 100644
index 601f81c04873..000000000000
--- a/drivers/lguest/hypercalls.c
+++ /dev/null
@@ -1,304 +0,0 @@
-/*P:500
- * Just as userspace programs request kernel operations through a system
- * call, the Guest requests Host operations through a "hypercall".  You might
- * notice this nomenclature doesn't really follow any logic, but the name has
- * been around for long enough that we're stuck with it.  As you'd expect, this
- * code is basically a one big switch statement.
-:*/
-
-/*  Copyright (C) 2006 Rusty Russell IBM Corporation
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
-*/
-#include <linux/uaccess.h>
-#include <linux/syscalls.h>
-#include <linux/mm.h>
-#include <linux/ktime.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include "lg.h"
-
-/*H:120
- * This is the core hypercall routine: where the Guest gets what it wants.
- * Or gets killed.  Or, in the case of LHCALL_SHUTDOWN, both.
- */
-static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
-{
-	switch (args->arg0) {
-	case LHCALL_FLUSH_ASYNC:
-		/*
-		 * This call does nothing, except by breaking out of the Guest
-		 * it makes us process all the asynchronous hypercalls.
-		 */
-		break;
-	case LHCALL_SEND_INTERRUPTS:
-		/*
-		 * This call does nothing too, but by breaking out of the Guest
-		 * it makes us process any pending interrupts.
-		 */
-		break;
-	case LHCALL_LGUEST_INIT:
-		/*
-		 * You can't get here unless you're already initialized.  Don't
-		 * do that.
-		 */
-		kill_guest(cpu, "already have lguest_data");
-		break;
-	case LHCALL_SHUTDOWN: {
-		char msg[128];
-		/*
-		 * Shutdown is such a trivial hypercall that we do it in five
-		 * lines right here.
-		 *
-		 * If the lgread fails, it will call kill_guest() itself; the
-		 * kill_guest() with the message will be ignored.
-		 */
-		__lgread(cpu, msg, args->arg1, sizeof(msg));
-		msg[sizeof(msg)-1] = '\0';
-		kill_guest(cpu, "CRASH: %s", msg);
-		if (args->arg2 == LGUEST_SHUTDOWN_RESTART)
-			cpu->lg->dead = ERR_PTR(-ERESTART);
-		break;
-	}
-	case LHCALL_FLUSH_TLB:
-		/* FLUSH_TLB comes in two flavors, depending on the argument: */
-		if (args->arg1)
-			guest_pagetable_clear_all(cpu);
-		else
-			guest_pagetable_flush_user(cpu);
-		break;
-
-	/*
-	 * All these calls simply pass the arguments through to the right
-	 * routines.
-	 */
-	case LHCALL_NEW_PGTABLE:
-		guest_new_pagetable(cpu, args->arg1);
-		break;
-	case LHCALL_SET_STACK:
-		guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
-		break;
-	case LHCALL_SET_PTE:
-#ifdef CONFIG_X86_PAE
-		guest_set_pte(cpu, args->arg1, args->arg2,
-				__pte(args->arg3 | (u64)args->arg4 << 32));
-#else
-		guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
-#endif
-		break;
-	case LHCALL_SET_PGD:
-		guest_set_pgd(cpu->lg, args->arg1, args->arg2);
-		break;
-#ifdef CONFIG_X86_PAE
-	case LHCALL_SET_PMD:
-		guest_set_pmd(cpu->lg, args->arg1, args->arg2);
-		break;
-#endif
-	case LHCALL_SET_CLOCKEVENT:
-		guest_set_clockevent(cpu, args->arg1);
-		break;
-	case LHCALL_HALT:
-		/* Similarly, this sets the halted flag for run_guest(). */
-		cpu->halted = 1;
-		break;
-	default:
-		/* It should be an architecture-specific hypercall. */
-		if (lguest_arch_do_hcall(cpu, args))
-			kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
-	}
-}
-
-/*H:124
- * Asynchronous hypercalls are easy: we just look in the array in the
- * Guest's "struct lguest_data" to see if any new ones are marked "ready".
- *
- * We are careful to do these in order: obviously we respect the order the
- * Guest put them in the ring, but we also promise the Guest that they will
- * happen before any normal hypercall (which is why we check this before
- * checking for a normal hcall).
- */
-static void do_async_hcalls(struct lg_cpu *cpu)
-{
-	unsigned int i;
-	u8 st[LHCALL_RING_SIZE];
-
-	/* For simplicity, we copy the entire call status array in at once. */
-	if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st)))
-		return;
-
-	/* We process "struct lguest_data"s hcalls[] ring once. */
-	for (i = 0; i < ARRAY_SIZE(st); i++) {
-		struct hcall_args args;
-		/*
-		 * We remember where we were up to from last time.  This makes
-		 * sure that the hypercalls are done in the order the Guest
-		 * places them in the ring.
-		 */
-		unsigned int n = cpu->next_hcall;
-
-		/* 0xFF means there's no call here (yet). */
-		if (st[n] == 0xFF)
-			break;
-
-		/*
-		 * OK, we have hypercall.  Increment the "next_hcall" cursor,
-		 * and wrap back to 0 if we reach the end.
-		 */
-		if (++cpu->next_hcall == LHCALL_RING_SIZE)
-			cpu->next_hcall = 0;
-
-		/*
-		 * Copy the hypercall arguments into a local copy of the
-		 * hcall_args struct.
-		 */
-		if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
-				   sizeof(struct hcall_args))) {
-			kill_guest(cpu, "Fetching async hypercalls");
-			break;
-		}
-
-		/* Do the hypercall, same as a normal one. */
-		do_hcall(cpu, &args);
-
-		/* Mark the hypercall done. */
-		if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) {
-			kill_guest(cpu, "Writing result for async hypercall");
-			break;
-		}
-
-		/*
-		 * Stop doing hypercalls if they want to notify the Launcher:
-		 * it needs to service this first.
-		 */
-		if (cpu->pending.trap)
-			break;
-	}
-}
-
-/*
- * Last of all, we look at what happens first of all.  The very first time the
- * Guest makes a hypercall, we end up here to set things up:
- */
-static void initialize(struct lg_cpu *cpu)
-{
-	/*
-	 * You can't do anything until you're initialized.  The Guest knows the
-	 * rules, so we're unforgiving here.
-	 */
-	if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
-		kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
-		return;
-	}
-
-	if (lguest_arch_init_hypercalls(cpu))
-		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
-	/*
-	 * The Guest tells us where we're not to deliver interrupts by putting
-	 * the instruction address into "struct lguest_data".
-	 */
-	if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret))
-		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
-	/*
-	 * We write the current time into the Guest's data page once so it can
-	 * set its clock.
-	 */
-	write_timestamp(cpu);
-
-	/* page_tables.c will also do some setup. */
-	page_table_guest_data_init(cpu);
-
-	/*
-	 * This is the one case where the above accesses might have been the
-	 * first write to a Guest page.  This may have caused a copy-on-write
-	 * fault, but the old page might be (read-only) in the Guest
-	 * pagetable.
-	 */
-	guest_pagetable_clear_all(cpu);
-}
-/*:*/
-
-/*M:013
- * If a Guest reads from a page (so creates a mapping) that it has never
- * written to, and then the Launcher writes to it (ie. the output of a virtual
- * device), the Guest will still see the old page.  In practice, this never
- * happens: why would the Guest read a page which it has never written to?  But
- * a similar scenario might one day bite us, so it's worth mentioning.
- *
- * Note that if we used a shared anonymous mapping in the Launcher instead of
- * mapping /dev/zero private, we wouldn't worry about cop-on-write.  And we
- * need that to switch the Launcher to processes (away from threads) anyway.
-:*/
-
-/*H:100
- * Hypercalls
- *
- * Remember from the Guest, hypercalls come in two flavors: normal and
- * asynchronous.  This file handles both of types.
- */
-void do_hypercalls(struct lg_cpu *cpu)
-{
-	/* Not initialized yet?  This hypercall must do it. */
-	if (unlikely(!cpu->lg->lguest_data)) {
-		/* Set up the "struct lguest_data" */
-		initialize(cpu);
-		/* Hcall is done. */
-		cpu->hcall = NULL;
-		return;
-	}
-
-	/*
-	 * The Guest has initialized.
-	 *
-	 * Look in the hypercall ring for the async hypercalls:
-	 */
-	do_async_hcalls(cpu);
-
-	/*
-	 * If we stopped reading the hypercall ring because the Guest did a
-	 * NOTIFY to the Launcher, we want to return now.  Otherwise we do
-	 * the hypercall.
-	 */
-	if (!cpu->pending.trap) {
-		do_hcall(cpu, cpu->hcall);
-		/*
-		 * Tricky point: we reset the hcall pointer to mark the
-		 * hypercall as "done".  We use the hcall pointer rather than
-		 * the trap number to indicate a hypercall is pending.
-		 * Normally it doesn't matter: the Guest will run again and
-		 * update the trap number before we come back here.
-		 *
-		 * However, if we are signalled or the Guest sends I/O to the
-		 * Launcher, the run_guest() loop will exit without running the
-		 * Guest.  When it comes back it would try to re-run the
-		 * hypercall.  Finding that bug sucked.
-		 */
-		cpu->hcall = NULL;
-	}
-}
-
-/*
- * This routine supplies the Guest with time: it's used for wallclock time at
- * initial boot and as a rough time source if the TSC isn't available.
- */
-void write_timestamp(struct lg_cpu *cpu)
-{
-	struct timespec now;
-	ktime_get_real_ts(&now);
-	if (copy_to_user(&cpu->lg->lguest_data->time,
-			 &now, sizeof(struct timespec)))
-		kill_guest(cpu, "Writing timestamp");
-}
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
deleted file mode 100644
index 67392b6ab845..000000000000
--- a/drivers/lguest/interrupts_and_traps.c
+++ /dev/null
@@ -1,706 +0,0 @@
-/*P:800
- * Interrupts (traps) are complicated enough to earn their own file.
- * There are three classes of interrupts:
- *
- * 1) Real hardware interrupts which occur while we're running the Guest,
- * 2) Interrupts for virtual devices attached to the Guest, and
- * 3) Traps and faults from the Guest.
- *
- * Real hardware interrupts must be delivered to the Host, not the Guest.
- * Virtual interrupts must be delivered to the Guest, but we make them look
- * just like real hardware would deliver them.  Traps from the Guest can be set
- * up to go directly back into the Guest, but sometimes the Host wants to see
- * them first, so we also have a way of "reflecting" them into the Guest as if
- * they had been delivered to it directly.
-:*/
-#include <linux/uaccess.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include "lg.h"
-
-/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
-static unsigned int syscall_vector = IA32_SYSCALL_VECTOR;
-module_param(syscall_vector, uint, 0444);
-
-/* The address of the interrupt handler is split into two bits: */
-static unsigned long idt_address(u32 lo, u32 hi)
-{
-	return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
-}
-
-/*
- * The "type" of the interrupt handler is a 4 bit field: we only support a
- * couple of types.
- */
-static int idt_type(u32 lo, u32 hi)
-{
-	return (hi >> 8) & 0xF;
-}
-
-/* An IDT entry can't be used unless the "present" bit is set. */
-static bool idt_present(u32 lo, u32 hi)
-{
-	return (hi & 0x8000);
-}
-
-/*
- * We need a helper to "push" a value onto the Guest's stack, since that's a
- * big part of what delivering an interrupt does.
- */
-static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
-{
-	/* Stack grows upwards: move stack then write value. */
-	*gstack -= 4;
-	lgwrite(cpu, *gstack, u32, val);
-}
-
-/*H:210
- * The push_guest_interrupt_stack() routine saves Guest state on the stack for
- * an interrupt or trap.  The mechanics of delivering traps and interrupts to
- * the Guest are the same, except some traps have an "error code" which gets
- * pushed onto the stack as well: the caller tells us if this is one.
- *
- * We set up the stack just like the CPU does for a real interrupt, so it's
- * identical for the Guest (and the standard "iret" instruction will undo
- * it).
- */
-static void push_guest_interrupt_stack(struct lg_cpu *cpu, bool has_err)
-{
-	unsigned long gstack, origstack;
-	u32 eflags, ss, irq_enable;
-	unsigned long virtstack;
-
-	/*
-	 * There are two cases for interrupts: one where the Guest is already
-	 * in the kernel, and a more complex one where the Guest is in
-	 * userspace.  We check the privilege level to find out.
-	 */
-	if ((cpu->regs->ss&0x3) != GUEST_PL) {
-		/*
-		 * The Guest told us their kernel stack with the SET_STACK
-		 * hypercall: both the virtual address and the segment.
-		 */
-		virtstack = cpu->esp1;
-		ss = cpu->ss1;
-
-		origstack = gstack = guest_pa(cpu, virtstack);
-		/*
-		 * We push the old stack segment and pointer onto the new
-		 * stack: when the Guest does an "iret" back from the interrupt
-		 * handler the CPU will notice they're dropping privilege
-		 * levels and expect these here.
-		 */
-		push_guest_stack(cpu, &gstack, cpu->regs->ss);
-		push_guest_stack(cpu, &gstack, cpu->regs->esp);
-	} else {
-		/* We're staying on the same Guest (kernel) stack. */
-		virtstack = cpu->regs->esp;
-		ss = cpu->regs->ss;
-
-		origstack = gstack = guest_pa(cpu, virtstack);
-	}
-
-	/*
-	 * Remember that we never let the Guest actually disable interrupts, so
-	 * the "Interrupt Flag" bit is always set.  We copy that bit from the
-	 * Guest's "irq_enabled" field into the eflags word: we saw the Guest
-	 * copy it back in "lguest_iret".
-	 */
-	eflags = cpu->regs->eflags;
-	if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
-	    && !(irq_enable & X86_EFLAGS_IF))
-		eflags &= ~X86_EFLAGS_IF;
-
-	/*
-	 * An interrupt is expected to push three things on the stack: the old
-	 * "eflags" word, the old code segment, and the old instruction
-	 * pointer.
-	 */
-	push_guest_stack(cpu, &gstack, eflags);
-	push_guest_stack(cpu, &gstack, cpu->regs->cs);
-	push_guest_stack(cpu, &gstack, cpu->regs->eip);
-
-	/* For the six traps which supply an error code, we push that, too. */
-	if (has_err)
-		push_guest_stack(cpu, &gstack, cpu->regs->errcode);
-
-	/* Adjust the stack pointer and stack segment. */
-	cpu->regs->ss = ss;
-	cpu->regs->esp = virtstack + (gstack - origstack);
-}
-
-/*
- * This actually makes the Guest start executing the given interrupt/trap
- * handler.
- *
- * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this
- * interrupt or trap.  It's split into two parts for traditional reasons: gcc
- * on i386 used to be frightened by 64 bit numbers.
- */
-static void guest_run_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi)
-{
-	/* If we're already in the kernel, we don't change stacks. */
-	if ((cpu->regs->ss&0x3) != GUEST_PL)
-		cpu->regs->ss = cpu->esp1;
-
-	/*
-	 * Set the code segment and the address to execute.
-	 */
-	cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
-	cpu->regs->eip = idt_address(lo, hi);
-
-	/*
-	 * Trapping always clears these flags:
-	 * TF: Trap flag
-	 * VM: Virtual 8086 mode
-	 * RF: Resume
-	 * NT: Nested task.
-	 */
-	cpu->regs->eflags &=
-		~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
-
-	/*
-	 * There are two kinds of interrupt handlers: 0xE is an "interrupt
-	 * gate" which expects interrupts to be disabled on entry.
-	 */
-	if (idt_type(lo, hi) == 0xE)
-		if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
-			kill_guest(cpu, "Disabling interrupts");
-}
-
-/* This restores the eflags word which was pushed on the stack by a trap */
-static void restore_eflags(struct lg_cpu *cpu)
-{
-	/* This is the physical address of the stack. */
-	unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp);
-
-	/*
-	 * Stack looks like this:
-	 * Address	Contents
-	 * esp		EIP
-	 * esp + 4	CS
-	 * esp + 8	EFLAGS
-	 */
-	cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32);
-	cpu->regs->eflags &=
-		~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT);
-}
-
-/*H:205
- * Virtual Interrupts.
- *
- * interrupt_pending() returns the first pending interrupt which isn't blocked
- * by the Guest.  It is called before every entry to the Guest, and just before
- * we go to sleep when the Guest has halted itself.
- */
-unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
-{
-	unsigned int irq;
-	DECLARE_BITMAP(blk, LGUEST_IRQS);
-
-	/* If the Guest hasn't even initialized yet, we can do nothing. */
-	if (!cpu->lg->lguest_data)
-		return LGUEST_IRQS;
-
-	/*
-	 * Take our "irqs_pending" array and remove any interrupts the Guest
-	 * wants blocked: the result ends up in "blk".
-	 */
-	if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
-			   sizeof(blk)))
-		return LGUEST_IRQS;
-	bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
-
-	/* Find the first interrupt. */
-	irq = find_first_bit(blk, LGUEST_IRQS);
-	*more = find_next_bit(blk, LGUEST_IRQS, irq+1);
-
-	return irq;
-}
-
-/*
- * This actually diverts the Guest to running an interrupt handler, once an
- * interrupt has been identified by interrupt_pending().
- */
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
-{
-	struct desc_struct *idt;
-
-	BUG_ON(irq >= LGUEST_IRQS);
-
-	/* If they're halted, interrupts restart them. */
-	if (cpu->halted) {
-		/* Re-enable interrupts. */
-		if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled))
-			kill_guest(cpu, "Re-enabling interrupts");
-		cpu->halted = 0;
-	} else {
-		/* Otherwise we check if they have interrupts disabled. */
-		u32 irq_enabled;
-		if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
-			irq_enabled = 0;
-		if (!irq_enabled) {
-			/* Make sure they know an IRQ is pending. */
-			put_user(X86_EFLAGS_IF,
-				 &cpu->lg->lguest_data->irq_pending);
-			return;
-		}
-	}
-
-	/*
-	 * Look at the IDT entry the Guest gave us for this interrupt.  The
-	 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
-	 * over them.
-	 */
-	idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
-	/* If they don't have a handler (yet?), we just ignore it */
-	if (idt_present(idt->a, idt->b)) {
-		/* OK, mark it no longer pending and deliver it. */
-		clear_bit(irq, cpu->irqs_pending);
-
-		/*
-		 * They may be about to iret, where they asked us never to
-		 * deliver interrupts.  In this case, we can emulate that iret
-		 * then immediately deliver the interrupt.  This is basically
-		 * a noop: the iret would pop the interrupt frame and restore
-		 * eflags, and then we'd set it up again.  So just restore the
-		 * eflags word and jump straight to the handler in this case.
-		 *
-		 * Denys Vlasenko points out that this isn't quite right: if
-		 * the iret was returning to userspace, then that interrupt
-		 * would reset the stack pointer (which the Guest told us
-		 * about via LHCALL_SET_STACK).  But unless the Guest is being
-		 * *really* weird, that will be the same as the current stack
-		 * anyway.
-		 */
-		if (cpu->regs->eip == cpu->lg->noirq_iret) {
-			restore_eflags(cpu);
-		} else {
-			/*
-			 * set_guest_interrupt() takes a flag to say whether
-			 * this interrupt pushes an error code onto the stack
-			 * as well: virtual interrupts never do.
-			 */
-			push_guest_interrupt_stack(cpu, false);
-		}
-		/* Actually make Guest cpu jump to handler. */
-		guest_run_interrupt(cpu, idt->a, idt->b);
-	}
-
-	/*
-	 * Every time we deliver an interrupt, we update the timestamp in the
-	 * Guest's lguest_data struct.  It would be better for the Guest if we
-	 * did this more often, but it can actually be quite slow: doing it
-	 * here is a compromise which means at least it gets updated every
-	 * timer interrupt.
-	 */
-	write_timestamp(cpu);
-
-	/*
-	 * If there are no other interrupts we want to deliver, clear
-	 * the pending flag.
-	 */
-	if (!more)
-		put_user(0, &cpu->lg->lguest_data->irq_pending);
-}
-
-/* And this is the routine when we want to set an interrupt for the Guest. */
-void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
-{
-	/*
-	 * Next time the Guest runs, the core code will see if it can deliver
-	 * this interrupt.
-	 */
-	set_bit(irq, cpu->irqs_pending);
-
-	/*
-	 * Make sure it sees it; it might be asleep (eg. halted), or running
-	 * the Guest right now, in which case kick_process() will knock it out.
-	 */
-	if (!wake_up_process(cpu->tsk))
-		kick_process(cpu->tsk);
-}
-/*:*/
-
-/*
- * Linux uses trap 128 for system calls.  Plan9 uses 64, and Ron Minnich sent
- * me a patch, so we support that too.  It'd be a big step for lguest if half
- * the Plan 9 user base were to start using it.
- *
- * Actually now I think of it, it's possible that Ron *is* half the Plan 9
- * userbase.  Oh well.
- */
-bool could_be_syscall(unsigned int num)
-{
-	/* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */
-	return num == IA32_SYSCALL_VECTOR || num == syscall_vector;
-}
-
-/* The syscall vector it wants must be unused by Host. */
-bool check_syscall_vector(struct lguest *lg)
-{
-	u32 vector;
-
-	if (get_user(vector, &lg->lguest_data->syscall_vec))
-		return false;
-
-	return could_be_syscall(vector);
-}
-
-int init_interrupts(void)
-{
-	/* If they want some strange system call vector, reserve it now */
-	if (syscall_vector != IA32_SYSCALL_VECTOR) {
-		if (test_bit(syscall_vector, used_vectors) ||
-		    vector_used_by_percpu_irq(syscall_vector)) {
-			printk(KERN_ERR "lg: couldn't reserve syscall %u\n",
-				 syscall_vector);
-			return -EBUSY;
-		}
-		set_bit(syscall_vector, used_vectors);
-	}
-
-	return 0;
-}
-
-void free_interrupts(void)
-{
-	if (syscall_vector != IA32_SYSCALL_VECTOR)
-		clear_bit(syscall_vector, used_vectors);
-}
-
-/*H:220
- * Now we've got the routines to deliver interrupts, delivering traps like
- * page fault is easy.  The only trick is that Intel decided that some traps
- * should have error codes:
- */
-static bool has_err(unsigned int trap)
-{
-	return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
-}
-
-/* deliver_trap() returns true if it could deliver the trap. */
-bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
-{
-	/*
-	 * Trap numbers are always 8 bit, but we set an impossible trap number
-	 * for traps inside the Switcher, so check that here.
-	 */
-	if (num >= ARRAY_SIZE(cpu->arch.idt))
-		return false;
-
-	/*
-	 * Early on the Guest hasn't set the IDT entries (or maybe it put a
-	 * bogus one in): if we fail here, the Guest will be killed.
-	 */
-	if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
-		return false;
-	push_guest_interrupt_stack(cpu, has_err(num));
-	guest_run_interrupt(cpu, cpu->arch.idt[num].a,
-			    cpu->arch.idt[num].b);
-	return true;
-}
-
-/*H:250
- * Here's the hard part: returning to the Host every time a trap happens
- * and then calling deliver_trap() and re-entering the Guest is slow.
- * Particularly because Guest userspace system calls are traps (usually trap
- * 128).
- *
- * So we'd like to set up the IDT to tell the CPU to deliver traps directly
- * into the Guest.  This is possible, but the complexities cause the size of
- * this file to double!  However, 150 lines of code is worth writing for taking
- * system calls down from 1750ns to 270ns.  Plus, if lguest didn't do it, all
- * the other hypervisors would beat it up at lunchtime.
- *
- * This routine indicates if a particular trap number could be delivered
- * directly.
- *
- * Unfortunately, Linux 4.6 started using an interrupt gate instead of a
- * trap gate for syscalls, so this trick is ineffective.  See Mastery for
- * how we could do this anyway...
- */
-static bool direct_trap(unsigned int num)
-{
-	/*
-	 * Hardware interrupts don't go to the Guest at all (except system
-	 * call).
-	 */
-	if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
-		return false;
-
-	/*
-	 * The Host needs to see page faults (for shadow paging and to save the
-	 * fault address), general protection faults (in/out emulation) and
-	 * device not available (TS handling) and of course, the hypercall trap.
-	 */
-	return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
-}
-/*:*/
-
-/*M:005
- * The Guest has the ability to turn its interrupt gates into trap gates,
- * if it is careful.  The Host will let trap gates can go directly to the
- * Guest, but the Guest needs the interrupts atomically disabled for an
- * interrupt gate.  The Host could provide a mechanism to register more
- * "no-interrupt" regions, and the Guest could point the trap gate at
- * instructions within that region, where it can safely disable interrupts.
- */
-
-/*M:006
- * The Guests do not use the sysenter (fast system call) instruction,
- * because it's hardcoded to enter privilege level 0 and so can't go direct.
- * It's about twice as fast as the older "int 0x80" system call, so it might
- * still be worthwhile to handle it in the Switcher and lcall down to the
- * Guest.  The sysenter semantics are hairy tho: search for that keyword in
- * entry.S
-:*/
-
-/*H:260
- * When we make traps go directly into the Guest, we need to make sure
- * the kernel stack is valid (ie. mapped in the page tables).  Otherwise, the
- * CPU trying to deliver the trap will fault while trying to push the interrupt
- * words on the stack: this is called a double fault, and it forces us to kill
- * the Guest.
- *
- * Which is deeply unfair, because (literally!) it wasn't the Guests' fault.
- */
-void pin_stack_pages(struct lg_cpu *cpu)
-{
-	unsigned int i;
-
-	/*
-	 * Depending on the CONFIG_4KSTACKS option, the Guest can have one or
-	 * two pages of stack space.
-	 */
-	for (i = 0; i < cpu->lg->stack_pages; i++)
-		/*
-		 * The stack grows *upwards*, so the address we're given is the
-		 * start of the page after the kernel stack.  Subtract one to
-		 * get back onto the first stack page, and keep subtracting to
-		 * get to the rest of the stack pages.
-		 */
-		pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
-}
-
-/*
- * Direct traps also mean that we need to know whenever the Guest wants to use
- * a different kernel stack, so we can change the guest TSS to use that
- * stack.  The TSS entries expect a virtual address, so unlike most addresses
- * the Guest gives us, the "esp" (stack pointer) value here is virtual, not
- * physical.
- *
- * In Linux each process has its own kernel stack, so this happens a lot: we
- * change stacks on each context switch.
- */
-void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
-{
-	/*
-	 * You're not allowed a stack segment with privilege level 0: bad Guest!
-	 */
-	if ((seg & 0x3) != GUEST_PL)
-		kill_guest(cpu, "bad stack segment %i", seg);
-	/* We only expect one or two stack pages. */
-	if (pages > 2)
-		kill_guest(cpu, "bad stack pages %u", pages);
-	/* Save where the stack is, and how many pages */
-	cpu->ss1 = seg;
-	cpu->esp1 = esp;
-	cpu->lg->stack_pages = pages;
-	/* Make sure the new stack pages are mapped */
-	pin_stack_pages(cpu);
-}
-
-/*
- * All this reference to mapping stacks leads us neatly into the other complex
- * part of the Host: page table handling.
- */
-
-/*H:235
- * This is the routine which actually checks the Guest's IDT entry and
- * transfers it into the entry in "struct lguest":
- */
-static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
-		     unsigned int num, u32 lo, u32 hi)
-{
-	u8 type = idt_type(lo, hi);
-
-	/* We zero-out a not-present entry */
-	if (!idt_present(lo, hi)) {
-		trap->a = trap->b = 0;
-		return;
-	}
-
-	/* We only support interrupt and trap gates. */
-	if (type != 0xE && type != 0xF)
-		kill_guest(cpu, "bad IDT type %i", type);
-
-	/*
-	 * We only copy the handler address, present bit, privilege level and
-	 * type.  The privilege level controls where the trap can be triggered
-	 * manually with an "int" instruction.  This is usually GUEST_PL,
-	 * except for system calls which userspace can use.
-	 */
-	trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
-	trap->b = (hi&0xFFFFEF00);
-}
-
-/*H:230
- * While we're here, dealing with delivering traps and interrupts to the
- * Guest, we might as well complete the picture: how the Guest tells us where
- * it wants them to go.  This would be simple, except making traps fast
- * requires some tricks.
- *
- * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
- * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here.
- */
-void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
-{
-	/*
-	 * Guest never handles: NMI, doublefault, spurious interrupt or
-	 * hypercall.  We ignore when it tries to set them.
-	 */
-	if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
-		return;
-
-	/*
-	 * Mark the IDT as changed: next time the Guest runs we'll know we have
-	 * to copy this again.
-	 */
-	cpu->changed |= CHANGED_IDT;
-
-	/* Check that the Guest doesn't try to step outside the bounds. */
-	if (num >= ARRAY_SIZE(cpu->arch.idt))
-		kill_guest(cpu, "Setting idt entry %u", num);
-	else
-		set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
-}
-
-/*
- * The default entry for each interrupt points into the Switcher routines which
- * simply return to the Host.  The run_guest() loop will then call
- * deliver_trap() to bounce it back into the Guest.
- */
-static void default_idt_entry(struct desc_struct *idt,
-			      int trap,
-			      const unsigned long handler,
-			      const struct desc_struct *base)
-{
-	/* A present interrupt gate. */
-	u32 flags = 0x8e00;
-
-	/*
-	 * Set the privilege level on the entry for the hypercall: this allows
-	 * the Guest to use the "int" instruction to trigger it.
-	 */
-	if (trap == LGUEST_TRAP_ENTRY)
-		flags |= (GUEST_PL << 13);
-	else if (base)
-		/*
-		 * Copy privilege level from what Guest asked for.  This allows
-		 * debug (int 3) traps from Guest userspace, for example.
-		 */
-		flags |= (base->b & 0x6000);
-
-	/* Now pack it into the IDT entry in its weird format. */
-	idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF);
-	idt->b = (handler&0xFFFF0000) | flags;
-}
-
-/* When the Guest first starts, we put default entries into the IDT. */
-void setup_default_idt_entries(struct lguest_ro_state *state,
-			       const unsigned long *def)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
-		default_idt_entry(&state->guest_idt[i], i, def[i], NULL);
-}
-
-/*H:240
- * We don't use the IDT entries in the "struct lguest" directly, instead
- * we copy them into the IDT which we've set up for Guests on this CPU, just
- * before we run the Guest.  This routine does that copy.
- */
-void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
-		const unsigned long *def)
-{
-	unsigned int i;
-
-	/*
-	 * We can simply copy the direct traps, otherwise we use the default
-	 * ones in the Switcher: they will return to the Host.
-	 */
-	for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
-		const struct desc_struct *gidt = &cpu->arch.idt[i];
-
-		/* If no Guest can ever override this trap, leave it alone. */
-		if (!direct_trap(i))
-			continue;
-
-		/*
-		 * Only trap gates (type 15) can go direct to the Guest.
-		 * Interrupt gates (type 14) disable interrupts as they are
-		 * entered, which we never let the Guest do.  Not present
-		 * entries (type 0x0) also can't go direct, of course.
-		 *
-		 * If it can't go direct, we still need to copy the priv. level:
-		 * they might want to give userspace access to a software
-		 * interrupt.
-		 */
-		if (idt_type(gidt->a, gidt->b) == 0xF)
-			idt[i] = *gidt;
-		else
-			default_idt_entry(&idt[i], i, def[i], gidt);
-	}
-}
-
-/*H:200
- * The Guest Clock.
- *
- * There are two sources of virtual interrupts.  We saw one in lguest_user.c:
- * the Launcher sending interrupts for virtual devices.  The other is the Guest
- * timer interrupt.
- *
- * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to
- * the next timer interrupt (in nanoseconds).  We use the high-resolution timer
- * infrastructure to set a callback at that time.
- *
- * 0 means "turn off the clock".
- */
-void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
-{
-	ktime_t expires;
-
-	if (unlikely(delta == 0)) {
-		/* Clock event device is shutting down. */
-		hrtimer_cancel(&cpu->hrt);
-		return;
-	}
-
-	/*
-	 * We use wallclock time here, so the Guest might not be running for
-	 * all the time between now and the timer interrupt it asked for.  This
-	 * is almost always the right thing to do.
-	 */
-	expires = ktime_add_ns(ktime_get_real(), delta);
-	hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
-}
-
-/* This is the function called when the Guest's timer expires. */
-static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
-{
-	struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
-
-	/* Remember the first interrupt is the timer interrupt. */
-	set_interrupt(cpu, 0);
-	return HRTIMER_NORESTART;
-}
-
-/* This sets up the timer for this Guest. */
-void init_clockdev(struct lg_cpu *cpu)
-{
-	hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
-	cpu->hrt.function = clockdev_fn;
-}
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
deleted file mode 100644
index 2356a2318034..000000000000
--- a/drivers/lguest/lg.h
+++ /dev/null
@@ -1,258 +0,0 @@
-#ifndef _LGUEST_H
-#define _LGUEST_H
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/stringify.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <linux/wait.h>
-#include <linux/hrtimer.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-#include <asm/lguest.h>
-
-struct pgdir {
-	unsigned long gpgdir;
-	bool switcher_mapped;
-	int last_host_cpu;
-	pgd_t *pgdir;
-};
-
-/* We have two pages shared with guests, per cpu.  */
-struct lguest_pages {
-	/* This is the stack page mapped rw in guest */
-	char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
-	struct lguest_regs regs;
-
-	/* This is the host state & guest descriptor page, ro in guest */
-	struct lguest_ro_state state;
-} __attribute__((aligned(PAGE_SIZE)));
-
-#define CHANGED_IDT		1
-#define CHANGED_GDT		2
-#define CHANGED_GDT_TLS		4 /* Actually a subset of CHANGED_GDT */
-#define CHANGED_ALL	        3
-
-struct lg_cpu {
-	unsigned int id;
-	struct lguest *lg;
-	struct task_struct *tsk;
-	struct mm_struct *mm; 	/* == tsk->mm, but that becomes NULL on exit */
-
-	u32 cr2;
-	u32 esp1;
-	u16 ss1;
-
-	/* Bitmap of what has changed: see CHANGED_* above. */
-	int changed;
-
-	/* Pending operation. */
-	struct lguest_pending pending;
-
-	unsigned long *reg_read; /* register from LHREQ_GETREG */
-
-	/* At end of a page shared mapped over lguest_pages in guest. */
-	unsigned long regs_page;
-	struct lguest_regs *regs;
-
-	struct lguest_pages *last_pages;
-
-	/* Initialization mode: linear map everything. */
-	bool linear_pages;
-	int cpu_pgd; /* Which pgd this cpu is currently using */
-
-	/* If a hypercall was asked for, this points to the arguments. */
-	struct hcall_args *hcall;
-	u32 next_hcall;
-
-	/* Virtual clock device */
-	struct hrtimer hrt;
-
-	/* Did the Guest tell us to halt? */
-	int halted;
-
-	/* Pending virtual interrupts */
-	DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
-
-	struct lg_cpu_arch arch;
-};
-
-/* The private info the thread maintains about the guest. */
-struct lguest {
-	struct lguest_data __user *lguest_data;
-	struct lg_cpu cpus[NR_CPUS];
-	unsigned int nr_cpus;
-
-	/* Valid guest memory pages must be < this. */
-	u32 pfn_limit;
-
-	/* Device memory is >= pfn_limit and < device_limit. */
-	u32 device_limit;
-
-	/*
-	 * This provides the offset to the base of guest-physical memory in the
-	 * Launcher.
-	 */
-	void __user *mem_base;
-	unsigned long kernel_address;
-
-	struct pgdir pgdirs[4];
-
-	unsigned long noirq_iret;
-
-	unsigned int stack_pages;
-	u32 tsc_khz;
-
-	/* Dead? */
-	const char *dead;
-};
-
-extern struct mutex lguest_lock;
-
-/* core.c: */
-bool lguest_address_ok(const struct lguest *lg,
-		       unsigned long addr, unsigned long len);
-void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
-void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
-extern struct page **lg_switcher_pages;
-
-/*H:035
- * Using memory-copy operations like that is usually inconvient, so we
- * have the following helper macros which read and write a specific type (often
- * an unsigned long).
- *
- * This reads into a variable of the given type then returns that.
- */
-#define lgread(cpu, addr, type)						\
-	({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
-
-/* This checks that the variable is of the given type, then writes it out. */
-#define lgwrite(cpu, addr, type, val)				\
-	do {							\
-		typecheck(type, val);				\
-		__lgwrite((cpu), (addr), &(val), sizeof(val));	\
-	} while(0)
-/* (end of memory access helper routines) :*/
-
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
-
-/*
- * Helper macros to obtain the first 12 or the last 20 bits, this is only the
- * first step in the migration to the kernel types.  pte_pfn is already defined
- * in the kernel.
- */
-#define pgd_flags(x)	(pgd_val(x) & ~PAGE_MASK)
-#define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
-#define pmd_flags(x)    (pmd_val(x) & ~PAGE_MASK)
-#define pmd_pfn(x)	(pmd_val(x) >> PAGE_SHIFT)
-
-/* interrupts_and_traps.c: */
-unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
-void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
-void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
-bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
-void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
-			  u32 low, u32 hi);
-void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages);
-void pin_stack_pages(struct lg_cpu *cpu);
-void setup_default_idt_entries(struct lguest_ro_state *state,
-			       const unsigned long *def);
-void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
-		const unsigned long *def);
-void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
-bool send_notify_to_eventfd(struct lg_cpu *cpu);
-void init_clockdev(struct lg_cpu *cpu);
-bool check_syscall_vector(struct lguest *lg);
-bool could_be_syscall(unsigned int num);
-int init_interrupts(void);
-void free_interrupts(void);
-
-/* segments.c: */
-void setup_default_gdt_entries(struct lguest_ro_state *state);
-void setup_guest_gdt(struct lg_cpu *cpu);
-void load_guest_gdt_entry(struct lg_cpu *cpu, unsigned int i,
-			  u32 low, u32 hi);
-void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array);
-void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
-void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
-
-/* page_tables.c: */
-int init_guest_pagetable(struct lguest *lg);
-void free_guest_pagetable(struct lguest *lg);
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
-void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
-#ifdef CONFIG_X86_PAE
-void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
-#endif
-void guest_pagetable_clear_all(struct lg_cpu *cpu);
-void guest_pagetable_flush_user(struct lg_cpu *cpu);
-void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
-		   unsigned long vaddr, pte_t val);
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
-bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
-		 unsigned long *iomem);
-void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
-bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
-void page_table_guest_data_init(struct lg_cpu *cpu);
-
-/* <arch>/core.c: */
-void lguest_arch_host_init(void);
-void lguest_arch_host_fini(void);
-void lguest_arch_run_guest(struct lg_cpu *cpu);
-void lguest_arch_handle_trap(struct lg_cpu *cpu);
-int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
-int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
-void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
-unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
-
-/* <arch>/switcher.S: */
-extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
-
-/* lguest_user.c: */
-int lguest_device_init(void);
-void lguest_device_remove(void);
-
-/* hypercalls.c: */
-void do_hypercalls(struct lg_cpu *cpu);
-void write_timestamp(struct lg_cpu *cpu);
-
-/*L:035
- * Let's step aside for the moment, to study one important routine that's used
- * widely in the Host code.
- *
- * There are many cases where the Guest can do something invalid, like pass crap
- * to a hypercall.  Since only the Guest kernel can make hypercalls, it's quite
- * acceptable to simply terminate the Guest and give the Launcher a nicely
- * formatted reason.  It's also simpler for the Guest itself, which doesn't
- * need to check most hypercalls for "success"; if you're still running, it
- * succeeded.
- *
- * Once this is called, the Guest will never run again, so most Host code can
- * call this then continue as if nothing had happened.  This means many
- * functions don't have to explicitly return an error code, which keeps the
- * code simple.
- *
- * It also means that this can be called more than once: only the first one is
- * remembered.  The only trick is that we still need to kill the Guest even if
- * we can't allocate memory to store the reason.  Linux has a neat way of
- * packing error codes into invalid pointers, so we use that here.
- *
- * Like any macro which uses an "if", it is safely wrapped in a run-once "do {
- * } while(0)".
- */
-#define kill_guest(cpu, fmt...)					\
-do {								\
-	if (!(cpu)->lg->dead) {					\
-		(cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt);	\
-		if (!(cpu)->lg->dead)				\
-			(cpu)->lg->dead = ERR_PTR(-ENOMEM);	\
-	}							\
-} while(0)
-/* (End of aside) :*/
-
-#endif	/* __ASSEMBLY__ */
-#endif	/* _LGUEST_H */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
deleted file mode 100644
index 1a6787bc9386..000000000000
--- a/drivers/lguest/lguest_user.c
+++ /dev/null
@@ -1,446 +0,0 @@
-/*P:200 This contains all the /dev/lguest code, whereby the userspace
- * launcher controls and communicates with the Guest.  For example,
- * the first write will tell us the Guest's memory layout and entry
- * point.  A read will run the Guest until something happens, such as
- * a signal or the Guest accessing a device.
-:*/
-#include <linux/uaccess.h>
-#include <linux/miscdevice.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/sched/mm.h>
-#include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include "lg.h"
-
-/*L:052
-  The Launcher can get the registers, and also set some of them.
-*/
-static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long which;
-
-	/* We re-use the ptrace structure to specify which register to read. */
-	if (get_user(which, input) != 0)
-		return -EFAULT;
-
-	/*
-	 * We set up the cpu register pointer, and their next read will
-	 * actually get the value (instead of running the guest).
-	 *
-	 * The last argument 'true' says we can access any register.
-	 */
-	cpu->reg_read = lguest_arch_regptr(cpu, which, true);
-	if (!cpu->reg_read)
-		return -ENOENT;
-
-	/* And because this is a write() call, we return the length used. */
-	return sizeof(unsigned long) * 2;
-}
-
-static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long which, value, *reg;
-
-	/* We re-use the ptrace structure to specify which register to read. */
-	if (get_user(which, input) != 0)
-		return -EFAULT;
-	input++;
-	if (get_user(value, input) != 0)
-		return -EFAULT;
-
-	/* The last argument 'false' means we can't access all registers. */
-	reg = lguest_arch_regptr(cpu, which, false);
-	if (!reg)
-		return -ENOENT;
-
-	*reg = value;
-
-	/* And because this is a write() call, we return the length used. */
-	return sizeof(unsigned long) * 3;
-}
-
-/*L:050
- * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
- * number to /dev/lguest.
- */
-static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long irq;
-
-	if (get_user(irq, input) != 0)
-		return -EFAULT;
-	if (irq >= LGUEST_IRQS)
-		return -EINVAL;
-
-	/*
-	 * Next time the Guest runs, the core code will see if it can deliver
-	 * this interrupt.
-	 */
-	set_interrupt(cpu, irq);
-	return 0;
-}
-
-/*L:053
- * Deliver a trap: this is used by the Launcher if it can't emulate
- * an instruction.
- */
-static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
-{
-	unsigned long trapnum;
-
-	if (get_user(trapnum, input) != 0)
-		return -EFAULT;
-
-	if (!deliver_trap(cpu, trapnum))
-		return -EINVAL;
-
-	return 0;
-}
-
-/*L:040
- * Once our Guest is initialized, the Launcher makes it run by reading
- * from /dev/lguest.
- */
-static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
-{
-	struct lguest *lg = file->private_data;
-	struct lg_cpu *cpu;
-	unsigned int cpu_id = *o;
-
-	/* You must write LHREQ_INITIALIZE first! */
-	if (!lg)
-		return -EINVAL;
-
-	/* Watch out for arbitrary vcpu indexes! */
-	if (cpu_id >= lg->nr_cpus)
-		return -EINVAL;
-
-	cpu = &lg->cpus[cpu_id];
-
-	/* If you're not the task which owns the Guest, go away. */
-	if (current != cpu->tsk)
-		return -EPERM;
-
-	/* If the Guest is already dead, we indicate why */
-	if (lg->dead) {
-		size_t len;
-
-		/* lg->dead either contains an error code, or a string. */
-		if (IS_ERR(lg->dead))
-			return PTR_ERR(lg->dead);
-
-		/* We can only return as much as the buffer they read with. */
-		len = min(size, strlen(lg->dead)+1);
-		if (copy_to_user(user, lg->dead, len) != 0)
-			return -EFAULT;
-		return len;
-	}
-
-	/*
-	 * If we returned from read() last time because the Guest sent I/O,
-	 * clear the flag.
-	 */
-	if (cpu->pending.trap)
-		cpu->pending.trap = 0;
-
-	/* Run the Guest until something interesting happens. */
-	return run_guest(cpu, (unsigned long __user *)user);
-}
-
-/*L:025
- * This actually initializes a CPU.  For the moment, a Guest is only
- * uniprocessor, so "id" is always 0.
- */
-static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
-{
-	/* We have a limited number of CPUs in the lguest struct. */
-	if (id >= ARRAY_SIZE(cpu->lg->cpus))
-		return -EINVAL;
-
-	/* Set up this CPU's id, and pointer back to the lguest struct. */
-	cpu->id = id;
-	cpu->lg = container_of(cpu, struct lguest, cpus[id]);
-	cpu->lg->nr_cpus++;
-
-	/* Each CPU has a timer it can set. */
-	init_clockdev(cpu);
-
-	/*
-	 * We need a complete page for the Guest registers: they are accessible
-	 * to the Guest and we can only grant it access to whole pages.
-	 */
-	cpu->regs_page = get_zeroed_page(GFP_KERNEL);
-	if (!cpu->regs_page)
-		return -ENOMEM;
-
-	/* We actually put the registers at the end of the page. */
-	cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
-
-	/*
-	 * Now we initialize the Guest's registers, handing it the start
-	 * address.
-	 */
-	lguest_arch_setup_regs(cpu, start_ip);
-
-	/*
-	 * We keep a pointer to the Launcher task (ie. current task) for when
-	 * other Guests want to wake this one (eg. console input).
-	 */
-	cpu->tsk = current;
-
-	/*
-	 * We need to keep a pointer to the Launcher's memory map, because if
-	 * the Launcher dies we need to clean it up.  If we don't keep a
-	 * reference, it is destroyed before close() is called.
-	 */
-	cpu->mm = get_task_mm(cpu->tsk);
-
-	/*
-	 * We remember which CPU's pages this Guest used last, for optimization
-	 * when the same Guest runs on the same CPU twice.
-	 */
-	cpu->last_pages = NULL;
-
-	/* No error == success. */
-	return 0;
-}
-
-/*L:020
- * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in
- * addition to the LHREQ_INITIALIZE value).  These are:
- *
- * base: The start of the Guest-physical memory inside the Launcher memory.
- *
- * pfnlimit: The highest (Guest-physical) page number the Guest should be
- * allowed to access.  The Guest memory lives inside the Launcher, so it sets
- * this to ensure the Guest can only reach its own memory.
- *
- * start: The first instruction to execute ("eip" in x86-speak).
- */
-static int initialize(struct file *file, const unsigned long __user *input)
-{
-	/* "struct lguest" contains all we (the Host) know about a Guest. */
-	struct lguest *lg;
-	int err;
-	unsigned long args[4];
-
-	/*
-	 * We grab the Big Lguest lock, which protects against multiple
-	 * simultaneous initializations.
-	 */
-	mutex_lock(&lguest_lock);
-	/* You can't initialize twice!  Close the device and start again... */
-	if (file->private_data) {
-		err = -EBUSY;
-		goto unlock;
-	}
-
-	if (copy_from_user(args, input, sizeof(args)) != 0) {
-		err = -EFAULT;
-		goto unlock;
-	}
-
-	lg = kzalloc(sizeof(*lg), GFP_KERNEL);
-	if (!lg) {
-		err = -ENOMEM;
-		goto unlock;
-	}
-
-	/* Populate the easy fields of our "struct lguest" */
-	lg->mem_base = (void __user *)args[0];
-	lg->pfn_limit = args[1];
-	lg->device_limit = args[3];
-
-	/* This is the first cpu (cpu 0) and it will start booting at args[2] */
-	err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
-	if (err)
-		goto free_lg;
-
-	/*
-	 * Initialize the Guest's shadow page tables.  This allocates
-	 * memory, so can fail.
-	 */
-	err = init_guest_pagetable(lg);
-	if (err)
-		goto free_regs;
-
-	/* We keep our "struct lguest" in the file's private_data. */
-	file->private_data = lg;
-
-	mutex_unlock(&lguest_lock);
-
-	/* And because this is a write() call, we return the length used. */
-	return sizeof(args);
-
-free_regs:
-	/* FIXME: This should be in free_vcpu */
-	free_page(lg->cpus[0].regs_page);
-free_lg:
-	kfree(lg);
-unlock:
-	mutex_unlock(&lguest_lock);
-	return err;
-}
-
-/*L:010
- * The first operation the Launcher does must be a write.  All writes
- * start with an unsigned long number: for the first write this must be
- * LHREQ_INITIALIZE to set up the Guest.  After that the Launcher can use
- * writes of other values to send interrupts or set up receipt of notifications.
- *
- * Note that we overload the "offset" in the /dev/lguest file to indicate what
- * CPU number we're dealing with.  Currently this is always 0 since we only
- * support uniprocessor Guests, but you can see the beginnings of SMP support
- * here.
- */
-static ssize_t write(struct file *file, const char __user *in,
-		     size_t size, loff_t *off)
-{
-	/*
-	 * Once the Guest is initialized, we hold the "struct lguest" in the
-	 * file private data.
-	 */
-	struct lguest *lg = file->private_data;
-	const unsigned long __user *input = (const unsigned long __user *)in;
-	unsigned long req;
-	struct lg_cpu *uninitialized_var(cpu);
-	unsigned int cpu_id = *off;
-
-	/* The first value tells us what this request is. */
-	if (get_user(req, input) != 0)
-		return -EFAULT;
-	input++;
-
-	/* If you haven't initialized, you must do that first. */
-	if (req != LHREQ_INITIALIZE) {
-		if (!lg || (cpu_id >= lg->nr_cpus))
-			return -EINVAL;
-		cpu = &lg->cpus[cpu_id];
-
-		/* Once the Guest is dead, you can only read() why it died. */
-		if (lg->dead)
-			return -ENOENT;
-	}
-
-	switch (req) {
-	case LHREQ_INITIALIZE:
-		return initialize(file, input);
-	case LHREQ_IRQ:
-		return user_send_irq(cpu, input);
-	case LHREQ_GETREG:
-		return getreg_setup(cpu, input);
-	case LHREQ_SETREG:
-		return setreg(cpu, input);
-	case LHREQ_TRAP:
-		return trap(cpu, input);
-	default:
-		return -EINVAL;
-	}
-}
-
-static int open(struct inode *inode, struct file *file)
-{
-	file->private_data = NULL;
-
-	return 0;
-}
-
-/*L:060
- * The final piece of interface code is the close() routine.  It reverses
- * everything done in initialize().  This is usually called because the
- * Launcher exited.
- *
- * Note that the close routine returns 0 or a negative error number: it can't
- * really fail, but it can whine.  I blame Sun for this wart, and K&R C for
- * letting them do it.
-:*/
-static int close(struct inode *inode, struct file *file)
-{
-	struct lguest *lg = file->private_data;
-	unsigned int i;
-
-	/* If we never successfully initialized, there's nothing to clean up */
-	if (!lg)
-		return 0;
-
-	/*
-	 * We need the big lock, to protect from inter-guest I/O and other
-	 * Launchers initializing guests.
-	 */
-	mutex_lock(&lguest_lock);
-
-	/* Free up the shadow page tables for the Guest. */
-	free_guest_pagetable(lg);
-
-	for (i = 0; i < lg->nr_cpus; i++) {
-		/* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
-		hrtimer_cancel(&lg->cpus[i].hrt);
-		/* We can free up the register page we allocated. */
-		free_page(lg->cpus[i].regs_page);
-		/*
-		 * Now all the memory cleanups are done, it's safe to release
-		 * the Launcher's memory management structure.
-		 */
-		mmput(lg->cpus[i].mm);
-	}
-
-	/*
-	 * If lg->dead doesn't contain an error code it will be NULL or a
-	 * kmalloc()ed string, either of which is ok to hand to kfree().
-	 */
-	if (!IS_ERR(lg->dead))
-		kfree(lg->dead);
-	/* Free the memory allocated to the lguest_struct */
-	kfree(lg);
-	/* Release lock and exit. */
-	mutex_unlock(&lguest_lock);
-
-	return 0;
-}
-
-/*L:000
- * Welcome to our journey through the Launcher!
- *
- * The Launcher is the Host userspace program which sets up, runs and services
- * the Guest.  In fact, many comments in the Drivers which refer to "the Host"
- * doing things are inaccurate: the Launcher does all the device handling for
- * the Guest, but the Guest can't know that.
- *
- * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
- * shall see more of that later.
- *
- * We begin our understanding with the Host kernel interface which the Launcher
- * uses: reading and writing a character device called /dev/lguest.  All the
- * work happens in the read(), write() and close() routines:
- */
-static const struct file_operations lguest_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = open,
-	.release = close,
-	.write	 = write,
-	.read	 = read,
-	.llseek  = default_llseek,
-};
-/*:*/
-
-/*
- * This is a textbook example of a "misc" character device.  Populate a "struct
- * miscdevice" and register it with misc_register().
- */
-static struct miscdevice lguest_dev = {
-	.minor	= MISC_DYNAMIC_MINOR,
-	.name	= "lguest",
-	.fops	= &lguest_fops,
-};
-
-int __init lguest_device_init(void)
-{
-	return misc_register(&lguest_dev);
-}
-
-void __exit lguest_device_remove(void)
-{
-	misc_deregister(&lguest_dev);
-}
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
deleted file mode 100644
index 0bc127e9f16a..000000000000
--- a/drivers/lguest/page_tables.c
+++ /dev/null
@@ -1,1239 +0,0 @@
-/*P:700
- * The pagetable code, on the other hand, still shows the scars of
- * previous encounters.  It's functional, and as neat as it can be in the
- * circumstances, but be wary, for these things are subtle and break easily.
- * The Guest provides a virtual to physical mapping, but we can neither trust
- * it nor use it: we verify and convert it here then point the CPU to the
- * converted Guest pages when running the Guest.
-:*/
-
-/* Copyright (C) Rusty Russell IBM Corporation 2013.
- * GPL v2 and any later version */
-#include <linux/mm.h>
-#include <linux/gfp.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/random.h>
-#include <linux/percpu.h>
-#include <asm/tlbflush.h>
-#include <linux/uaccess.h>
-#include "lg.h"
-
-/*M:008
- * We hold reference to pages, which prevents them from being swapped.
- * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
- * to swap out.  If we had this, and a shrinker callback to trim PTE pages, we
- * could probably consider launching Guests as non-root.
-:*/
-
-/*H:300
- * The Page Table Code
- *
- * We use two-level page tables for the Guest, or three-level with PAE.  If
- * you're not entirely comfortable with virtual addresses, physical addresses
- * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
- * Table Handling" (with diagrams!).
- *
- * The Guest keeps page tables, but we maintain the actual ones here: these are
- * called "shadow" page tables.  Which is a very Guest-centric name: these are
- * the real page tables the CPU uses, although we keep them up to date to
- * reflect the Guest's.  (See what I mean about weird naming?  Since when do
- * shadows reflect anything?)
- *
- * Anyway, this is the most complicated part of the Host code.  There are seven
- * parts to this:
- *  (i) Looking up a page table entry when the Guest faults,
- *  (ii) Making sure the Guest stack is mapped,
- *  (iii) Setting up a page table entry when the Guest tells us one has changed,
- *  (iv) Switching page tables,
- *  (v) Flushing (throwing away) page tables,
- *  (vi) Mapping the Switcher when the Guest is about to run,
- *  (vii) Setting up the page tables initially.
-:*/
-
-/*
- * The Switcher uses the complete top PTE page.  That's 1024 PTE entries (4MB)
- * or 512 PTE entries with PAE (2MB).
- */
-#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
-
-/*
- * For PAE we need the PMD index as well. We use the last 2MB, so we
- * will need the last pmd entry of the last pmd page.
- */
-#ifdef CONFIG_X86_PAE
-#define CHECK_GPGD_MASK		_PAGE_PRESENT
-#else
-#define CHECK_GPGD_MASK		_PAGE_TABLE
-#endif
-
-/*H:320
- * The page table code is curly enough to need helper functions to keep it
- * clear and clean.  The kernel itself provides many of them; one advantage
- * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting.
- *
- * There are two functions which return pointers to the shadow (aka "real")
- * page tables.
- *
- * spgd_addr() takes the virtual address and returns a pointer to the top-level
- * page directory entry (PGD) for that address.  Since we keep track of several
- * page tables, the "i" argument tells us which one we're interested in (it's
- * usually the current one).
- */
-static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
-{
-	unsigned int index = pgd_index(vaddr);
-
-	/* Return a pointer index'th pgd entry for the i'th page table. */
-	return &cpu->lg->pgdirs[i].pgdir[index];
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * This routine then takes the PGD entry given above, which contains the
- * address of the PMD page.  It then returns a pointer to the PMD entry for the
- * given address.
- */
-static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
-{
-	unsigned int index = pmd_index(vaddr);
-	pmd_t *page;
-
-	/* You should never call this if the PGD entry wasn't valid */
-	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
-	page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
-
-	return &page[index];
-}
-#endif
-
-/*
- * This routine then takes the page directory entry returned above, which
- * contains the address of the page table entry (PTE) page.  It then returns a
- * pointer to the PTE entry for the given address.
- */
-static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
-{
-#ifdef CONFIG_X86_PAE
-	pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
-	pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);
-
-	/* You should never call this if the PMD entry wasn't valid */
-	BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
-#else
-	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
-	/* You should never call this if the PGD entry wasn't valid */
-	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
-#endif
-
-	return &page[pte_index(vaddr)];
-}
-
-/*
- * These functions are just like the above, except they access the Guest
- * page tables.  Hence they return a Guest address.
- */
-static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	unsigned int index = vaddr >> (PGDIR_SHIFT);
-	return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
-}
-
-#ifdef CONFIG_X86_PAE
-/* Follow the PGD to the PMD. */
-static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
-{
-	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
-	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
-}
-
-/* Follow the PMD to the PTE. */
-static unsigned long gpte_addr(struct lg_cpu *cpu,
-			       pmd_t gpmd, unsigned long vaddr)
-{
-	unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
-
-	BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
-	return gpage + pte_index(vaddr) * sizeof(pte_t);
-}
-#else
-/* Follow the PGD to the PTE (no mid-level for !PAE). */
-static unsigned long gpte_addr(struct lg_cpu *cpu,
-				pgd_t gpgd, unsigned long vaddr)
-{
-	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
-
-	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
-	return gpage + pte_index(vaddr) * sizeof(pte_t);
-}
-#endif
-/*:*/
-
-/*M:007
- * get_pfn is slow: we could probably try to grab batches of pages here as
- * an optimization (ie. pre-faulting).
-:*/
-
-/*H:350
- * This routine takes a page number given by the Guest and converts it to
- * an actual, physical page number.  It can fail for several reasons: the
- * virtual address might not be mapped by the Launcher, the write flag is set
- * and the page is read-only, or the write flag was set and the page was
- * shared so had to be copied, but we ran out of memory.
- *
- * This holds a reference to the page, so release_pte() is careful to put that
- * back.
- */
-static unsigned long get_pfn(unsigned long virtpfn, int write)
-{
-	struct page *page;
-
-	/* gup me one page at this address please! */
-	if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1)
-		return page_to_pfn(page);
-
-	/* This value indicates failure. */
-	return -1UL;
-}
-
-/*H:340
- * Converting a Guest page table entry to a shadow (ie. real) page table
- * entry can be a little tricky.  The flags are (almost) the same, but the
- * Guest PTE contains a virtual page number: the CPU needs the real page
- * number.
- */
-static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
-{
-	unsigned long pfn, base, flags;
-
-	/*
-	 * The Guest sets the global flag, because it thinks that it is using
-	 * PGE.  We only told it to use PGE so it would tell us whether it was
-	 * flushing a kernel mapping or a userspace mapping.  We don't actually
-	 * use the global bit, so throw it away.
-	 */
-	flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
-
-	/* The Guest's pages are offset inside the Launcher. */
-	base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
-
-	/*
-	 * We need a temporary "unsigned long" variable to hold the answer from
-	 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
-	 * fit in spte.pfn.  get_pfn() finds the real physical number of the
-	 * page, given the virtual number.
-	 */
-	pfn = get_pfn(base + pte_pfn(gpte), write);
-	if (pfn == -1UL) {
-		kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
-		/*
-		 * When we destroy the Guest, we'll go through the shadow page
-		 * tables and release_pte() them.  Make sure we don't think
-		 * this one is valid!
-		 */
-		flags = 0;
-	}
-	/* Now we assemble our shadow PTE from the page number and flags. */
-	return pfn_pte(pfn, __pgprot(flags));
-}
-
-/*H:460 And to complete the chain, release_pte() looks like this: */
-static void release_pte(pte_t pte)
-{
-	/*
-	 * Remember that get_user_pages_fast() took a reference to the page, in
-	 * get_pfn()?  We have to put it back now.
-	 */
-	if (pte_flags(pte) & _PAGE_PRESENT)
-		put_page(pte_page(pte));
-}
-/*:*/
-
-static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte)
-{
-	/* We don't handle large pages. */
-	if (pte_flags(gpte) & _PAGE_PSE)
-		return false;
-
-	return (pte_pfn(gpte) >= cpu->lg->pfn_limit
-		&& pte_pfn(gpte) < cpu->lg->device_limit);
-}
-
-static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
-{
-	if ((pte_flags(gpte) & _PAGE_PSE) ||
-	    pte_pfn(gpte) >= cpu->lg->pfn_limit) {
-		kill_guest(cpu, "bad page table entry");
-		return false;
-	}
-	return true;
-}
-
-static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
-{
-	if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
-	    (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
-		kill_guest(cpu, "bad page directory entry");
-		return false;
-	}
-	return true;
-}
-
-#ifdef CONFIG_X86_PAE
-static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
-{
-	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
-	    (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
-		kill_guest(cpu, "bad page middle directory entry");
-		return false;
-	}
-	return true;
-}
-#endif
-
-/*H:331
- * This is the core routine to walk the shadow page tables and find the page
- * table entry for a specific address.
- *
- * If allocate is set, then we allocate any missing levels, setting the flags
- * on the new page directory and mid-level directories using the arguments
- * (which are copied from the Guest's page table entries).
- */
-static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
-			int pgd_flags, int pmd_flags)
-{
-	pgd_t *spgd;
-	/* Mid level for PAE. */
-#ifdef CONFIG_X86_PAE
-	pmd_t *spmd;
-#endif
-
-	/* Get top level entry. */
-	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
-	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
-		/* No shadow entry: allocate a new shadow PTE page. */
-		unsigned long ptepage;
-
-		/* If they didn't want us to allocate anything, stop. */
-		if (!allocate)
-			return NULL;
-
-		ptepage = get_zeroed_page(GFP_KERNEL);
-		/*
-		 * This is not really the Guest's fault, but killing it is
-		 * simple for this corner case.
-		 */
-		if (!ptepage) {
-			kill_guest(cpu, "out of memory allocating pte page");
-			return NULL;
-		}
-		/*
-		 * And we copy the flags to the shadow PGD entry.  The page
-		 * number in the shadow PGD is the page we just allocated.
-		 */
-		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
-	}
-
-	/*
-	 * Intel's Physical Address Extension actually uses three levels of
-	 * page tables, so we need to look in the mid-level.
-	 */
-#ifdef CONFIG_X86_PAE
-	/* Now look at the mid-level shadow entry. */
-	spmd = spmd_addr(cpu, *spgd, vaddr);
-
-	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
-		/* No shadow entry: allocate a new shadow PTE page. */
-		unsigned long ptepage;
-
-		/* If they didn't want us to allocate anything, stop. */
-		if (!allocate)
-			return NULL;
-
-		ptepage = get_zeroed_page(GFP_KERNEL);
-
-		/*
-		 * This is not really the Guest's fault, but killing it is
-		 * simple for this corner case.
-		 */
-		if (!ptepage) {
-			kill_guest(cpu, "out of memory allocating pmd page");
-			return NULL;
-		}
-
-		/*
-		 * And we copy the flags to the shadow PMD entry.  The page
-		 * number in the shadow PMD is the page we just allocated.
-		 */
-		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
-	}
-#endif
-
-	/* Get the pointer to the shadow PTE entry we're going to set. */
-	return spte_addr(cpu, *spgd, vaddr);
-}
-
-/*H:330
- * (i) Looking up a page table entry when the Guest faults.
- *
- * We saw this call in run_guest(): when we see a page fault in the Guest, we
- * come here.  That's because we only set up the shadow page tables lazily as
- * they're needed, so we get page faults all the time and quietly fix them up
- * and return to the Guest without it knowing.
- *
- * If we fixed up the fault (ie. we mapped the address), this routine returns
- * true.  Otherwise, it was a real fault and we need to tell the Guest.
- *
- * There's a corner case: they're trying to access memory between
- * pfn_limit and device_limit, which is I/O memory.  In this case, we
- * return false and set @iomem to the physical address, so the the
- * Launcher can handle the instruction manually.
- */
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
-		 unsigned long *iomem)
-{
-	unsigned long gpte_ptr;
-	pte_t gpte;
-	pte_t *spte;
-	pmd_t gpmd;
-	pgd_t gpgd;
-
-	*iomem = 0;
-
-	/* We never demand page the Switcher, so trying is a mistake. */
-	if (vaddr >= switcher_addr)
-		return false;
-
-	/* First step: get the top-level Guest page table entry. */
-	if (unlikely(cpu->linear_pages)) {
-		/* Faking up a linear mapping. */
-		gpgd = __pgd(CHECK_GPGD_MASK);
-	} else {
-		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-		/* Toplevel not present?  We can't map it in. */
-		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-			return false;
-
-		/* 
-		 * This kills the Guest if it has weird flags or tries to
-		 * refer to a "physical" address outside the bounds.
-		 */
-		if (!check_gpgd(cpu, gpgd))
-			return false;
-	}
-
-	/* This "mid-level" entry is only used for non-linear, PAE mode. */
-	gpmd = __pmd(_PAGE_TABLE);
-
-#ifdef CONFIG_X86_PAE
-	if (likely(!cpu->linear_pages)) {
-		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-		/* Middle level not present?  We can't map it in. */
-		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-			return false;
-
-		/* 
-		 * This kills the Guest if it has weird flags or tries to
-		 * refer to a "physical" address outside the bounds.
-		 */
-		if (!check_gpmd(cpu, gpmd))
-			return false;
-	}
-
-	/*
-	 * OK, now we look at the lower level in the Guest page table: keep its
-	 * address, because we might update it later.
-	 */
-	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
-#else
-	/*
-	 * OK, now we look at the lower level in the Guest page table: keep its
-	 * address, because we might update it later.
-	 */
-	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
-#endif
-
-	if (unlikely(cpu->linear_pages)) {
-		/* Linear?  Make up a PTE which points to same page. */
-		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
-	} else {
-		/* Read the actual PTE value. */
-		gpte = lgread(cpu, gpte_ptr, pte_t);
-	}
-
-	/* If this page isn't in the Guest page tables, we can't page it in. */
-	if (!(pte_flags(gpte) & _PAGE_PRESENT))
-		return false;
-
-	/*
-	 * Check they're not trying to write to a page the Guest wants
-	 * read-only (bit 2 of errcode == write).
-	 */
-	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
-		return false;
-
-	/* User access to a kernel-only page? (bit 3 == user access) */
-	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
-		return false;
-
-	/* If they're accessing io memory, we expect a fault. */
-	if (gpte_in_iomem(cpu, gpte)) {
-		*iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
-		return false;
-	}
-
-	/*
-	 * Check that the Guest PTE flags are OK, and the page number is below
-	 * the pfn_limit (ie. not mapping the Launcher binary).
-	 */
-	if (!check_gpte(cpu, gpte))
-		return false;
-
-	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
-	gpte = pte_mkyoung(gpte);
-	if (errcode & 2)
-		gpte = pte_mkdirty(gpte);
-
-	/* Get the pointer to the shadow PTE entry we're going to set. */
-	spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
-	if (!spte)
-		return false;
-
-	/*
-	 * If there was a valid shadow PTE entry here before, we release it.
-	 * This can happen with a write to a previously read-only entry.
-	 */
-	release_pte(*spte);
-
-	/*
-	 * If this is a write, we insist that the Guest page is writable (the
-	 * final arg to gpte_to_spte()).
-	 */
-	if (pte_dirty(gpte))
-		*spte = gpte_to_spte(cpu, gpte, 1);
-	else
-		/*
-		 * If this is a read, don't set the "writable" bit in the page
-		 * table entry, even if the Guest says it's writable.  That way
-		 * we will come back here when a write does actually occur, so
-		 * we can update the Guest's _PAGE_DIRTY flag.
-		 */
-		set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
-
-	/*
-	 * Finally, we write the Guest PTE entry back: we've set the
-	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
-	 */
-	if (likely(!cpu->linear_pages))
-		lgwrite(cpu, gpte_ptr, pte_t, gpte);
-
-	/*
-	 * The fault is fixed, the page table is populated, the mapping
-	 * manipulated, the result returned and the code complete.  A small
-	 * delay and a trace of alliteration are the only indications the Guest
-	 * has that a page fault occurred at all.
-	 */
-	return true;
-}
-
-/*H:360
- * (ii) Making sure the Guest stack is mapped.
- *
- * Remember that direct traps into the Guest need a mapped Guest kernel stack.
- * pin_stack_pages() calls us here: we could simply call demand_page(), but as
- * we've seen that logic is quite long, and usually the stack pages are already
- * mapped, so it's overkill.
- *
- * This is a quick version which answers the question: is this virtual address
- * mapped by the shadow page tables, and is it writable?
- */
-static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	pte_t *spte;
-	unsigned long flags;
-
-	/* You can't put your stack in the Switcher! */
-	if (vaddr >= switcher_addr)
-		return false;
-
-	/* If there's no shadow PTE, it's not writable. */
-	spte = find_spte(cpu, vaddr, false, 0, 0);
-	if (!spte)
-		return false;
-
-	/*
-	 * Check the flags on the pte entry itself: it must be present and
-	 * writable.
-	 */
-	flags = pte_flags(*spte);
-	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
-}
-
-/*
- * So, when pin_stack_pages() asks us to pin a page, we check if it's already
- * in the page tables, and if not, we call demand_page() with error code 2
- * (meaning "write").
- */
-void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	unsigned long iomem;
-
-	if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem))
-		kill_guest(cpu, "bad stack page %#lx", vaddr);
-}
-/*:*/
-
-#ifdef CONFIG_X86_PAE
-static void release_pmd(pmd_t *spmd)
-{
-	/* If the entry's not present, there's nothing to release. */
-	if (pmd_flags(*spmd) & _PAGE_PRESENT) {
-		unsigned int i;
-		pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
-		/* For each entry in the page, we might need to release it. */
-		for (i = 0; i < PTRS_PER_PTE; i++)
-			release_pte(ptepage[i]);
-		/* Now we can free the page of PTEs */
-		free_page((long)ptepage);
-		/* And zero out the PMD entry so we never release it twice. */
-		set_pmd(spmd, __pmd(0));
-	}
-}
-
-static void release_pgd(pgd_t *spgd)
-{
-	/* If the entry's not present, there's nothing to release. */
-	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-		unsigned int i;
-		pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-
-		for (i = 0; i < PTRS_PER_PMD; i++)
-			release_pmd(&pmdpage[i]);
-
-		/* Now we can free the page of PMDs */
-		free_page((long)pmdpage);
-		/* And zero out the PGD entry so we never release it twice. */
-		set_pgd(spgd, __pgd(0));
-	}
-}
-
-#else /* !CONFIG_X86_PAE */
-/*H:450
- * If we chase down the release_pgd() code, the non-PAE version looks like
- * this.  The PAE version is almost identical, but instead of calling
- * release_pte it calls release_pmd(), which looks much like this.
- */
-static void release_pgd(pgd_t *spgd)
-{
-	/* If the entry's not present, there's nothing to release. */
-	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-		unsigned int i;
-		/*
-		 * Converting the pfn to find the actual PTE page is easy: turn
-		 * the page number into a physical address, then convert to a
-		 * virtual address (easy for kernel pages like this one).
-		 */
-		pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-		/* For each entry in the page, we might need to release it. */
-		for (i = 0; i < PTRS_PER_PTE; i++)
-			release_pte(ptepage[i]);
-		/* Now we can free the page of PTEs */
-		free_page((long)ptepage);
-		/* And zero out the PGD entry so we never release it twice. */
-		*spgd = __pgd(0);
-	}
-}
-#endif
-
-/*H:445
- * We saw flush_user_mappings() twice: once from the flush_user_mappings()
- * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
- * It simply releases every PTE page from 0 up to the Guest's kernel address.
- */
-static void flush_user_mappings(struct lguest *lg, int idx)
-{
-	unsigned int i;
-	/* Release every pgd entry up to the kernel's address. */
-	for (i = 0; i < pgd_index(lg->kernel_address); i++)
-		release_pgd(lg->pgdirs[idx].pgdir + i);
-}
-
-/*H:440
- * (v) Flushing (throwing away) page tables,
- *
- * The Guest has a hypercall to throw away the page tables: it's used when a
- * large number of mappings have been changed.
- */
-void guest_pagetable_flush_user(struct lg_cpu *cpu)
-{
-	/* Drop the userspace part of the current page table. */
-	flush_user_mappings(cpu->lg, cpu->cpu_pgd);
-}
-/*:*/
-
-/* We walk down the guest page tables to get a guest-physical address */
-bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
-{
-	pgd_t gpgd;
-	pte_t gpte;
-#ifdef CONFIG_X86_PAE
-	pmd_t gpmd;
-#endif
-
-	/* Still not set up?  Just map 1:1. */
-	if (unlikely(cpu->linear_pages)) {
-		*paddr = vaddr;
-		return true;
-	}
-
-	/* First step: get the top-level Guest page table entry. */
-	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-	/* Toplevel not present?  We can't map it in. */
-	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-		goto fail;
-
-#ifdef CONFIG_X86_PAE
-	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-		goto fail;
-	gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
-#else
-	gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
-#endif
-	if (!(pte_flags(gpte) & _PAGE_PRESENT))
-		goto fail;
-
-	*paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
-	return true;
-
-fail:
-	*paddr = -1UL;
-	return false;
-}
-
-/*
- * This is the version we normally use: kills the Guest if it uses a
- * bad address
- */
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
-{
-	unsigned long paddr;
-
-	if (!__guest_pa(cpu, vaddr, &paddr))
-		kill_guest(cpu, "Bad address %#lx", vaddr);
-	return paddr;
-}
-
-/*
- * We keep several page tables.  This is a simple routine to find the page
- * table (if any) corresponding to this top-level address the Guest has given
- * us.
- */
-static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
-{
-	unsigned int i;
-	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable)
-			break;
-	return i;
-}
-
-/*H:435
- * And this is us, creating the new page directory.  If we really do
- * allocate a new one (and so the kernel parts are not there), we set
- * blank_pgdir.
- */
-static unsigned int new_pgdir(struct lg_cpu *cpu,
-			      unsigned long gpgdir,
-			      int *blank_pgdir)
-{
-	unsigned int next;
-
-	/*
-	 * We pick one entry at random to throw out.  Choosing the Least
-	 * Recently Used might be better, but this is easy.
-	 */
-	next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs);
-	/* If it's never been allocated at all before, try now. */
-	if (!cpu->lg->pgdirs[next].pgdir) {
-		cpu->lg->pgdirs[next].pgdir =
-					(pgd_t *)get_zeroed_page(GFP_KERNEL);
-		/* If the allocation fails, just keep using the one we have */
-		if (!cpu->lg->pgdirs[next].pgdir)
-			next = cpu->cpu_pgd;
-		else {
-			/*
-			 * This is a blank page, so there are no kernel
-			 * mappings: caller must map the stack!
-			 */
-			*blank_pgdir = 1;
-		}
-	}
-	/* Record which Guest toplevel this shadows. */
-	cpu->lg->pgdirs[next].gpgdir = gpgdir;
-	/* Release all the non-kernel mappings. */
-	flush_user_mappings(cpu->lg, next);
-
-	/* This hasn't run on any CPU at all. */
-	cpu->lg->pgdirs[next].last_host_cpu = -1;
-
-	return next;
-}
-
-/*H:501
- * We do need the Switcher code mapped at all times, so we allocate that
- * part of the Guest page table here.  We map the Switcher code immediately,
- * but defer mapping of the guest register page and IDT/LDT etc page until
- * just before we run the guest in map_switcher_in_guest().
- *
- * We *could* do this setup in map_switcher_in_guest(), but at that point
- * we've interrupts disabled, and allocating pages like that is fraught: we
- * can't sleep if we need to free up some memory.
- */
-static bool allocate_switcher_mapping(struct lg_cpu *cpu)
-{
-	int i;
-
-	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
-		pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
-				       CHECK_GPGD_MASK, _PAGE_TABLE);
-		if (!pte)
-			return false;
-
-		/*
-		 * Map the switcher page if not already there.  It might
-		 * already be there because we call allocate_switcher_mapping()
-		 * in guest_set_pgd() just in case it did discard our Switcher
-		 * mapping, but it probably didn't.
-		 */
-		if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
-			/* Get a reference to the Switcher page. */
-			get_page(lg_switcher_pages[0]);
-			/* Create a read-only, exectuable, kernel-style PTE */
-			set_pte(pte,
-				mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
-		}
-	}
-	cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
-	return true;
-}
-
-/*H:470
- * Finally, a routine which throws away everything: all PGD entries in all
- * the shadow page tables, including the Guest's kernel mappings.  This is used
- * when we destroy the Guest.
- */
-static void release_all_pagetables(struct lguest *lg)
-{
-	unsigned int i, j;
-
-	/* Every shadow pagetable this Guest has */
-	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
-		if (!lg->pgdirs[i].pgdir)
-			continue;
-
-		/* Every PGD entry. */
-		for (j = 0; j < PTRS_PER_PGD; j++)
-			release_pgd(lg->pgdirs[i].pgdir + j);
-		lg->pgdirs[i].switcher_mapped = false;
-		lg->pgdirs[i].last_host_cpu = -1;
-	}
-}
-
-/*
- * We also throw away everything when a Guest tells us it's changed a kernel
- * mapping.  Since kernel mappings are in every page table, it's easiest to
- * throw them all away.  This traps the Guest in amber for a while as
- * everything faults back in, but it's rare.
- */
-void guest_pagetable_clear_all(struct lg_cpu *cpu)
-{
-	release_all_pagetables(cpu->lg);
-	/* We need the Guest kernel stack mapped again. */
-	pin_stack_pages(cpu);
-	/* And we need Switcher allocated. */
-	if (!allocate_switcher_mapping(cpu))
-		kill_guest(cpu, "Cannot populate switcher mapping");
-}
-
-/*H:430
- * (iv) Switching page tables
- *
- * Now we've seen all the page table setting and manipulation, let's see
- * what happens when the Guest changes page tables (ie. changes the top-level
- * pgdir).  This occurs on almost every context switch.
- */
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
-{
-	int newpgdir, repin = 0;
-
-	/*
-	 * The very first time they call this, we're actually running without
-	 * any page tables; we've been making it up.  Throw them away now.
-	 */
-	if (unlikely(cpu->linear_pages)) {
-		release_all_pagetables(cpu->lg);
-		cpu->linear_pages = false;
-		/* Force allocation of a new pgdir. */
-		newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
-	} else {
-		/* Look to see if we have this one already. */
-		newpgdir = find_pgdir(cpu->lg, pgtable);
-	}
-
-	/*
-	 * If not, we allocate or mug an existing one: if it's a fresh one,
-	 * repin gets set to 1.
-	 */
-	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
-		newpgdir = new_pgdir(cpu, pgtable, &repin);
-	/* Change the current pgd index to the new one. */
-	cpu->cpu_pgd = newpgdir;
-	/*
-	 * If it was completely blank, we map in the Guest kernel stack and
-	 * the Switcher.
-	 */
-	if (repin)
-		pin_stack_pages(cpu);
-
-	if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
-		if (!allocate_switcher_mapping(cpu))
-			kill_guest(cpu, "Cannot populate switcher mapping");
-	}
-}
-/*:*/
-
-/*M:009
- * Since we throw away all mappings when a kernel mapping changes, our
- * performance sucks for guests using highmem.  In fact, a guest with
- * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
- * usually slower than a Guest with less memory.
- *
- * This, of course, cannot be fixed.  It would take some kind of... well, I
- * don't know, but the term "puissant code-fu" comes to mind.
-:*/
-
-/*H:420
- * This is the routine which actually sets the page table entry for then
- * "idx"'th shadow page table.
- *
- * Normally, we can just throw out the old entry and replace it with 0: if they
- * use it demand_page() will put the new entry in.  We need to do this anyway:
- * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
- * is read from, and _PAGE_DIRTY when it's written to.
- *
- * But Avi Kivity pointed out that most Operating Systems (Linux included) set
- * these bits on PTEs immediately anyway.  This is done to save the CPU from
- * having to update them, but it helps us the same way: if they set
- * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
- * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
- */
-static void __guest_set_pte(struct lg_cpu *cpu, int idx,
-		       unsigned long vaddr, pte_t gpte)
-{
-	/* Look up the matching shadow page directory entry. */
-	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
-#ifdef CONFIG_X86_PAE
-	pmd_t *spmd;
-#endif
-
-	/* If the top level isn't present, there's no entry to update. */
-	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-#ifdef CONFIG_X86_PAE
-		spmd = spmd_addr(cpu, *spgd, vaddr);
-		if (pmd_flags(*spmd) & _PAGE_PRESENT) {
-#endif
-			/* Otherwise, start by releasing the existing entry. */
-			pte_t *spte = spte_addr(cpu, *spgd, vaddr);
-			release_pte(*spte);
-
-			/*
-			 * If they're setting this entry as dirty or accessed,
-			 * we might as well put that entry they've given us in
-			 * now.  This shaves 10% off a copy-on-write
-			 * micro-benchmark.
-			 */
-			if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))
-			    && !gpte_in_iomem(cpu, gpte)) {
-				if (!check_gpte(cpu, gpte))
-					return;
-				set_pte(spte,
-					gpte_to_spte(cpu, gpte,
-						pte_flags(gpte) & _PAGE_DIRTY));
-			} else {
-				/*
-				 * Otherwise kill it and we can demand_page()
-				 * it in later.
-				 */
-				set_pte(spte, __pte(0));
-			}
-#ifdef CONFIG_X86_PAE
-		}
-#endif
-	}
-}
-
-/*H:410
- * Updating a PTE entry is a little trickier.
- *
- * We keep track of several different page tables (the Guest uses one for each
- * process, so it makes sense to cache at least a few).  Each of these have
- * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
- * all processes.  So when the page table above that address changes, we update
- * all the page tables, not just the current one.  This is rare.
- *
- * The benefit is that when we have to track a new page table, we can keep all
- * the kernel mappings.  This speeds up context switch immensely.
- */
-void guest_set_pte(struct lg_cpu *cpu,
-		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
-{
-	/* We don't let you remap the Switcher; we need it to get back! */
-	if (vaddr >= switcher_addr) {
-		kill_guest(cpu, "attempt to set pte into Switcher pages");
-		return;
-	}
-
-	/*
-	 * Kernel mappings must be changed on all top levels.  Slow, but doesn't
-	 * happen often.
-	 */
-	if (vaddr >= cpu->lg->kernel_address) {
-		unsigned int i;
-		for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
-			if (cpu->lg->pgdirs[i].pgdir)
-				__guest_set_pte(cpu, i, vaddr, gpte);
-	} else {
-		/* Is this page table one we have a shadow for? */
-		int pgdir = find_pgdir(cpu->lg, gpgdir);
-		if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
-			/* If so, do the update. */
-			__guest_set_pte(cpu, pgdir, vaddr, gpte);
-	}
-}
-
-/*H:400
- * (iii) Setting up a page table entry when the Guest tells us one has changed.
- *
- * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
- * with the other side of page tables while we're here: what happens when the
- * Guest asks for a page table to be updated?
- *
- * We already saw that demand_page() will fill in the shadow page tables when
- * needed, so we can simply remove shadow page table entries whenever the Guest
- * tells us they've changed.  When the Guest tries to use the new entry it will
- * fault and demand_page() will fix it up.
- *
- * So with that in mind here's our code to update a (top-level) PGD entry:
- */
-void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
-{
-	int pgdir;
-
-	if (idx > PTRS_PER_PGD) {
-		kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
-			   idx, PTRS_PER_PGD);
-		return;
-	}
-
-	/* If they're talking about a page table we have a shadow for... */
-	pgdir = find_pgdir(lg, gpgdir);
-	if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
-		/* ... throw it away. */
-		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
-		/* That might have been the Switcher mapping, remap it. */
-		if (!allocate_switcher_mapping(&lg->cpus[0])) {
-			kill_guest(&lg->cpus[0],
-				   "Cannot populate switcher mapping");
-		}
-		lg->pgdirs[pgdir].last_host_cpu = -1;
-	}
-}
-
-#ifdef CONFIG_X86_PAE
-/* For setting a mid-level, we just throw everything away.  It's easy. */
-void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
-{
-	guest_pagetable_clear_all(&lg->cpus[0]);
-}
-#endif
-
-/*H:500
- * (vii) Setting up the page tables initially.
- *
- * When a Guest is first created, set initialize a shadow page table which
- * we will populate on future faults.  The Guest doesn't have any actual
- * pagetables yet, so we set linear_pages to tell demand_page() to fake it
- * for the moment.
- *
- * We do need the Switcher to be mapped at all times, so we allocate that
- * part of the Guest page table here.
- */
-int init_guest_pagetable(struct lguest *lg)
-{
-	struct lg_cpu *cpu = &lg->cpus[0];
-	int allocated = 0;
-
-	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
-	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
-	if (!allocated)
-		return -ENOMEM;
-
-	/* We start with a linear mapping until the initialize. */
-	cpu->linear_pages = true;
-
-	/* Allocate the page tables for the Switcher. */
-	if (!allocate_switcher_mapping(cpu)) {
-		release_all_pagetables(lg);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
-void page_table_guest_data_init(struct lg_cpu *cpu)
-{
-	/*
-	 * We tell the Guest that it can't use the virtual addresses
-	 * used by the Switcher.  This trick is equivalent to 4GB -
-	 * switcher_addr.
-	 */
-	u32 top = ~switcher_addr + 1;
-
-	/* We get the kernel address: above this is all kernel memory. */
-	if (get_user(cpu->lg->kernel_address,
-		     &cpu->lg->lguest_data->kernel_address)
-		/*
-		 * We tell the Guest that it can't use the top virtual
-		 * addresses (used by the Switcher).
-		 */
-	    || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
-		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-		return;
-	}
-
-	/*
-	 * In flush_user_mappings() we loop from 0 to
-	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
-	 * Switcher mappings, so check that now.
-	 */
-	if (cpu->lg->kernel_address >= switcher_addr)
-		kill_guest(cpu, "bad kernel address %#lx",
-				 cpu->lg->kernel_address);
-}
-
-/* When a Guest dies, our cleanup is fairly simple. */
-void free_guest_pagetable(struct lguest *lg)
-{
-	unsigned int i;
-
-	/* Throw away all page table pages. */
-	release_all_pagetables(lg);
-	/* Now free the top levels: free_page() can handle 0 just fine. */
-	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
-		free_page((long)lg->pgdirs[i].pgdir);
-}
-
-/*H:481
- * This clears the Switcher mappings for cpu #i.
- */
-static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
-{
-	unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
-	pte_t *pte;
-
-	/* Clear the mappings for both pages. */
-	pte = find_spte(cpu, base, false, 0, 0);
-	release_pte(*pte);
-	set_pte(pte, __pte(0));
-
-	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
-	release_pte(*pte);
-	set_pte(pte, __pte(0));
-}
-
-/*H:480
- * (vi) Mapping the Switcher when the Guest is about to run.
- *
- * The Switcher and the two pages for this CPU need to be visible in the Guest
- * (and not the pages for other CPUs).
- *
- * The pages for the pagetables have all been allocated before: we just need
- * to make sure the actual PTEs are up-to-date for the CPU we're about to run
- * on.
- */
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-	unsigned long base;
-	struct page *percpu_switcher_page, *regs_page;
-	pte_t *pte;
-	struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
-
-	/* Switcher page should always be mapped by now! */
-	BUG_ON(!pgdir->switcher_mapped);
-
-	/* 
-	 * Remember that we have two pages for each Host CPU, so we can run a
-	 * Guest on each CPU without them interfering.  We need to make sure
-	 * those pages are mapped correctly in the Guest, but since we usually
-	 * run on the same CPU, we cache that, and only update the mappings
-	 * when we move.
-	 */
-	if (pgdir->last_host_cpu == raw_smp_processor_id())
-		return;
-
-	/* -1 means unknown so we remove everything. */
-	if (pgdir->last_host_cpu == -1) {
-		unsigned int i;
-		for_each_possible_cpu(i)
-			remove_switcher_percpu_map(cpu, i);
-	} else {
-		/* We know exactly what CPU mapping to remove. */
-		remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
-	}
-
-	/*
-	 * When we're running the Guest, we want the Guest's "regs" page to
-	 * appear where the first Switcher page for this CPU is.  This is an
-	 * optimization: when the Switcher saves the Guest registers, it saves
-	 * them into the first page of this CPU's "struct lguest_pages": if we
-	 * make sure the Guest's register page is already mapped there, we
-	 * don't have to copy them out again.
-	 */
-	/* Find the shadow PTE for this regs page. */
-	base = switcher_addr + PAGE_SIZE
-		+ raw_smp_processor_id() * sizeof(struct lguest_pages);
-	pte = find_spte(cpu, base, false, 0, 0);
-	regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
-	get_page(regs_page);
-	set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
-
-	/*
-	 * We map the second page of the struct lguest_pages read-only in
-	 * the Guest: the IDT, GDT and other things it's not supposed to
-	 * change.
-	 */
-	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
-	percpu_switcher_page
-		= lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
-	get_page(percpu_switcher_page);
-	set_pte(pte, mk_pte(percpu_switcher_page,
-			    __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
-
-	pgdir->last_host_cpu = raw_smp_processor_id();
-}
-
-/*H:490
- * We've made it through the page table code.  Perhaps our tired brains are
- * still processing the details, or perhaps we're simply glad it's over.
- *
- * If nothing else, note that all this complexity in juggling shadow page tables
- * in sync with the Guest's page tables is for one reason: for most Guests this
- * page table dance determines how bad performance will be.  This is why Xen
- * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
- * have implemented shadow page table support directly into hardware.
- *
- * There is just one file remaining in the Host.
- */
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
deleted file mode 100644
index c4fb424dfddb..000000000000
--- a/drivers/lguest/segments.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/*P:600
- * The x86 architecture has segments, which involve a table of descriptors
- * which can be used to do funky things with virtual address interpretation.
- * We originally used to use segments so the Guest couldn't alter the
- * Guest<->Host Switcher, and then we had to trim Guest segments, and restore
- * for userspace per-thread segments, but trim again for on userspace->kernel
- * transitions...  This nightmarish creation was contained within this file,
- * where we knew not to tread without heavy armament and a change of underwear.
- *
- * In these modern times, the segment handling code consists of simple sanity
- * checks, and the worst you'll experience reading this code is butterfly-rash
- * from frolicking through its parklike serenity.
-:*/
-#include "lg.h"
-
-/*H:600
- * Segments & The Global Descriptor Table
- *
- * (That title sounds like a bad Nerdcore group.  Not to suggest that there are
- * any good Nerdcore groups, but in high school a friend of mine had a band
- * called Joe Fish and the Chips, so there are definitely worse band names).
- *
- * To refresh: the GDT is a table of 8-byte values describing segments.  Once
- * set up, these segments can be loaded into one of the 6 "segment registers".
- *
- * GDT entries are passed around as "struct desc_struct"s, which like IDT
- * entries are split into two 32-bit members, "a" and "b".  One day, someone
- * will clean that up, and be declared a Hero.  (No pressure, I'm just saying).
- *
- * Anyway, the GDT entry contains a base (the start address of the segment), a
- * limit (the size of the segment - 1), and some flags.  Sounds simple, and it
- * would be, except those zany Intel engineers decided that it was too boring
- * to put the base at one end, the limit at the other, and the flags in
- * between.  They decided to shotgun the bits at random throughout the 8 bytes,
- * like so:
- *
- * 0               16                     40       48  52  56     63
- * [ limit part 1 ][     base part 1     ][ flags ][li][fl][base ]
- *                                                  mit ags part 2
- *                                                part 2
- *
- * As a result, this file contains a certain amount of magic numeracy.  Let's
- * begin.
- */
-
-/*
- * There are several entries we don't let the Guest set.  The TSS entry is the
- * "Task State Segment" which controls all kinds of delicate things.  The
- * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
- * the Guest can't be trusted to deal with double faults.
- */
-static bool ignored_gdt(unsigned int num)
-{
-	return (num == GDT_ENTRY_TSS
-		|| num == GDT_ENTRY_LGUEST_CS
-		|| num == GDT_ENTRY_LGUEST_DS
-		|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
-}
-
-/*H:630
- * Once the Guest gave us new GDT entries, we fix them up a little.  We
- * don't care if they're invalid: the worst that can happen is a General
- * Protection Fault in the Switcher when it restores a Guest segment register
- * which tries to use that entry.  Then we kill the Guest for causing such a
- * mess: the message will be "unhandled trap 256".
- */
-static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
-{
-	unsigned int i;
-
-	for (i = start; i < end; i++) {
-		/*
-		 * We never copy these ones to real GDT, so we don't care what
-		 * they say
-		 */
-		if (ignored_gdt(i))
-			continue;
-
-		/*
-		 * Segment descriptors contain a privilege level: the Guest is
-		 * sometimes careless and leaves this as 0, even though it's
-		 * running at privilege level 1.  If so, we fix it here.
-		 */
-		if (cpu->arch.gdt[i].dpl == 0)
-			cpu->arch.gdt[i].dpl |= GUEST_PL;
-
-		/*
-		 * Each descriptor has an "accessed" bit.  If we don't set it
-		 * now, the CPU will try to set it when the Guest first loads
-		 * that entry into a segment register.  But the GDT isn't
-		 * writable by the Guest, so bad things can happen.
-		 */
-		cpu->arch.gdt[i].type |= 0x1;
-	}
-}
-
-/*H:610
- * Like the IDT, we never simply use the GDT the Guest gives us.  We keep
- * a GDT for each CPU, and copy across the Guest's entries each time we want to
- * run the Guest on that CPU.
- *
- * This routine is called at boot or modprobe time for each CPU to set up the
- * constant GDT entries: the ones which are the same no matter what Guest we're
- * running.
- */
-void setup_default_gdt_entries(struct lguest_ro_state *state)
-{
-	struct desc_struct *gdt = state->guest_gdt;
-	unsigned long tss = (unsigned long)&state->guest_tss;
-
-	/* The Switcher segments are full 0-4G segments, privilege level 0 */
-	gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
-	gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
-
-	/*
-	 * The TSS segment refers to the TSS entry for this particular CPU.
-	 */
-	gdt[GDT_ENTRY_TSS].a = 0;
-	gdt[GDT_ENTRY_TSS].b = 0;
-
-	gdt[GDT_ENTRY_TSS].limit0 = 0x67;
-	gdt[GDT_ENTRY_TSS].base0  = tss & 0xFFFF;
-	gdt[GDT_ENTRY_TSS].base1  = (tss >> 16) & 0xFF;
-	gdt[GDT_ENTRY_TSS].base2  = tss >> 24;
-	gdt[GDT_ENTRY_TSS].type   = 0x9; /* 32-bit TSS (available) */
-	gdt[GDT_ENTRY_TSS].p      = 0x1; /* Entry is present */
-	gdt[GDT_ENTRY_TSS].dpl    = 0x0; /* Privilege level 0 */
-	gdt[GDT_ENTRY_TSS].s      = 0x0; /* system segment */
-
-}
-
-/*
- * This routine sets up the initial Guest GDT for booting.  All entries start
- * as 0 (unusable).
- */
-void setup_guest_gdt(struct lg_cpu *cpu)
-{
-	/*
-	 * Start with full 0-4G segments...except the Guest is allowed to use
-	 * them, so set the privilege level appropriately in the flags.
-	 */
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL;
-}
-
-/*H:650
- * An optimization of copy_gdt(), for just the three "thead-local storage"
- * entries.
- */
-void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
-	unsigned int i;
-
-	for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
-		gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:640
- * When the Guest is run on a different CPU, or the GDT entries have changed,
- * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's
- * GDT.
- */
-void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
-	unsigned int i;
-
-	/*
-	 * The default entries from setup_default_gdt_entries() are not
-	 * replaced.  See ignored_gdt() above.
-	 */
-	for (i = 0; i < GDT_ENTRIES; i++)
-		if (!ignored_gdt(i))
-			gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:620
- * This is where the Guest asks us to load a new GDT entry
- * (LHCALL_LOAD_GDT_ENTRY).  We tweak the entry and copy it in.
- */
-void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
-{
-	/*
-	 * We assume the Guest has the same number of GDT entries as the
-	 * Host, otherwise we'd have to dynamically allocate the Guest GDT.
-	 */
-	if (num >= ARRAY_SIZE(cpu->arch.gdt)) {
-		kill_guest(cpu, "too many gdt entries %i", num);
-		return;
-	}
-
-	/* Set it up, then fix it. */
-	cpu->arch.gdt[num].a = lo;
-	cpu->arch.gdt[num].b = hi;
-	fixup_gdt_table(cpu, num, num+1);
-	/*
-	 * Mark that the GDT changed so the core knows it has to copy it again,
-	 * even if the Guest is run on the same CPU.
-	 */
-	cpu->changed |= CHANGED_GDT;
-}
-
-/*
- * This is the fast-track version for just changing the three TLS entries.
- * Remember that this happens on every context switch, so it's worth
- * optimizing.  But wouldn't it be neater to have a single hypercall to cover
- * both cases?
- */
-void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
-{
-	struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
-
-	__lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
-	fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
-	/* Note that just the TLS entries have changed. */
-	cpu->changed |= CHANGED_GDT_TLS;
-}
-
-/*H:660
- * With this, we have finished the Host.
- *
- * Five of the seven parts of our task are complete.  You have made it through
- * the Bit of Despair (I think that's somewhere in the page table code,
- * myself).
- *
- * Next, we examine "make Switcher".  It's short, but intense.
- */
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
deleted file mode 100644
index b4f79b923aea..000000000000
--- a/drivers/lguest/x86/core.c
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
- * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-/*P:450
- * This file contains the x86-specific lguest code.  It used to be all
- * mixed in with drivers/lguest/core.c but several foolhardy code slashers
- * wrestled most of the dependencies out to here in preparation for porting
- * lguest to other architectures (see what I mean by foolhardy?).
- *
- * This also contains a couple of non-obvious setup and teardown pieces which
- * were implemented after days of debugging pain.
-:*/
-#include <linux/kernel.h>
-#include <linux/start_kernel.h>
-#include <linux/string.h>
-#include <linux/console.h>
-#include <linux/screen_info.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-#include <linux/cpu.h>
-#include <linux/lguest.h>
-#include <linux/lguest_launcher.h>
-#include <asm/paravirt.h>
-#include <asm/param.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/setup.h>
-#include <asm/lguest.h>
-#include <linux/uaccess.h>
-#include <asm/fpu/internal.h>
-#include <asm/tlbflush.h>
-#include "../lg.h"
-
-static int cpu_had_pge;
-
-static struct {
-	unsigned long offset;
-	unsigned short segment;
-} lguest_entry;
-
-/* Offset from where switcher.S was compiled to where we've copied it */
-static unsigned long switcher_offset(void)
-{
-	return switcher_addr - (unsigned long)start_switcher_text;
-}
-
-/* This cpu's struct lguest_pages (after the Switcher text page) */
-static struct lguest_pages *lguest_pages(unsigned int cpu)
-{
-	return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
-}
-
-static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
-
-/*S:010
- * We approach the Switcher.
- *
- * Remember that each CPU has two pages which are visible to the Guest when it
- * runs on that CPU.  This has to contain the state for that Guest: we copy the
- * state in just before we run the Guest.
- *
- * Each Guest has "changed" flags which indicate what has changed in the Guest
- * since it last ran.  We saw this set in interrupts_and_traps.c and
- * segments.c.
- */
-static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-	/*
-	 * Copying all this data can be quite expensive.  We usually run the
-	 * same Guest we ran last time (and that Guest hasn't run anywhere else
-	 * meanwhile).  If that's not the case, we pretend everything in the
-	 * Guest has changed.
-	 */
-	if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) {
-		__this_cpu_write(lg_last_cpu, cpu);
-		cpu->last_pages = pages;
-		cpu->changed = CHANGED_ALL;
-	}
-
-	/*
-	 * These copies are pretty cheap, so we do them unconditionally: */
-	/* Save the current Host top-level page directory.
-	 */
-	pages->state.host_cr3 = __pa(current->mm->pgd);
-	/*
-	 * Set up the Guest's page tables to see this CPU's pages (and no
-	 * other CPU's pages).
-	 */
-	map_switcher_in_guest(cpu, pages);
-	/*
-	 * Set up the two "TSS" members which tell the CPU what stack to use
-	 * for traps which do directly into the Guest (ie. traps at privilege
-	 * level 1).
-	 */
-	pages->state.guest_tss.sp1 = cpu->esp1;
-	pages->state.guest_tss.ss1 = cpu->ss1;
-
-	/* Copy direct-to-Guest trap entries. */
-	if (cpu->changed & CHANGED_IDT)
-		copy_traps(cpu, pages->state.guest_idt, default_idt_entries);
-
-	/* Copy all GDT entries which the Guest can change. */
-	if (cpu->changed & CHANGED_GDT)
-		copy_gdt(cpu, pages->state.guest_gdt);
-	/* If only the TLS entries have changed, copy them. */
-	else if (cpu->changed & CHANGED_GDT_TLS)
-		copy_gdt_tls(cpu, pages->state.guest_gdt);
-
-	/* Mark the Guest as unchanged for next time. */
-	cpu->changed = 0;
-}
-
-/* Finally: the code to actually call into the Switcher to run the Guest. */
-static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
-{
-	/* This is a dummy value we need for GCC's sake. */
-	unsigned int clobber;
-
-	/*
-	 * Copy the guest-specific information into this CPU's "struct
-	 * lguest_pages".
-	 */
-	copy_in_guest_info(cpu, pages);
-
-	/*
-	 * Set the trap number to 256 (impossible value).  If we fault while
-	 * switching to the Guest (bad segment registers or bug), this will
-	 * cause us to abort the Guest.
-	 */
-	cpu->regs->trapnum = 256;
-
-	/*
-	 * Now: we push the "eflags" register on the stack, then do an "lcall".
-	 * This is how we change from using the kernel code segment to using
-	 * the dedicated lguest code segment, as well as jumping into the
-	 * Switcher.
-	 *
-	 * The lcall also pushes the old code segment (KERNEL_CS) onto the
-	 * stack, then the address of this call.  This stack layout happens to
-	 * exactly match the stack layout created by an interrupt...
-	 */
-	asm volatile("pushf; lcall *%4"
-		     /*
-		      * This is how we tell GCC that %eax ("a") and %ebx ("b")
-		      * are changed by this routine.  The "=" means output.
-		      */
-		     : "=a"(clobber), "=b"(clobber)
-		     /*
-		      * %eax contains the pages pointer.  ("0" refers to the
-		      * 0-th argument above, ie "a").  %ebx contains the
-		      * physical address of the Guest's top-level page
-		      * directory.
-		      */
-		     : "0"(pages), 
-		       "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)),
-		       "m"(lguest_entry)
-		     /*
-		      * We tell gcc that all these registers could change,
-		      * which means we don't have to save and restore them in
-		      * the Switcher.
-		      */
-		     : "memory", "%edx", "%ecx", "%edi", "%esi");
-}
-/*:*/
-
-unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
-{
-	switch (reg_off) {
-	case offsetof(struct pt_regs, bx):
-		return &cpu->regs->ebx;
-	case offsetof(struct pt_regs, cx):
-		return &cpu->regs->ecx;
-	case offsetof(struct pt_regs, dx):
-		return &cpu->regs->edx;
-	case offsetof(struct pt_regs, si):
-		return &cpu->regs->esi;
-	case offsetof(struct pt_regs, di):
-		return &cpu->regs->edi;
-	case offsetof(struct pt_regs, bp):
-		return &cpu->regs->ebp;
-	case offsetof(struct pt_regs, ax):
-		return &cpu->regs->eax;
-	case offsetof(struct pt_regs, ip):
-		return &cpu->regs->eip;
-	case offsetof(struct pt_regs, sp):
-		return &cpu->regs->esp;
-	}
-
-	/* Launcher can read these, but we don't allow any setting. */
-	if (any) {
-		switch (reg_off) {
-		case offsetof(struct pt_regs, ds):
-			return &cpu->regs->ds;
-		case offsetof(struct pt_regs, es):
-			return &cpu->regs->es;
-		case offsetof(struct pt_regs, fs):
-			return &cpu->regs->fs;
-		case offsetof(struct pt_regs, gs):
-			return &cpu->regs->gs;
-		case offsetof(struct pt_regs, cs):
-			return &cpu->regs->cs;
-		case offsetof(struct pt_regs, flags):
-			return &cpu->regs->eflags;
-		case offsetof(struct pt_regs, ss):
-			return &cpu->regs->ss;
-		}
-	}
-
-	return NULL;
-}
-
-/*M:002
- * There are hooks in the scheduler which we can register to tell when we
- * get kicked off the CPU (preempt_notifier_register()).  This would allow us
- * to lazily disable SYSENTER which would regain some performance, and should
- * also simplify copy_in_guest_info().  Note that we'd still need to restore
- * things when we exit to Launcher userspace, but that's fairly easy.
- *
- * We could also try using these hooks for PGE, but that might be too expensive.
- *
- * The hooks were designed for KVM, but we can also put them to good use.
-:*/
-
-/*H:040
- * This is the i386-specific code to setup and run the Guest.  Interrupts
- * are disabled: we own the CPU.
- */
-void lguest_arch_run_guest(struct lg_cpu *cpu)
-{
-	/*
-	 * SYSENTER is an optimized way of doing system calls.  We can't allow
-	 * it because it always jumps to privilege level 0.  A normal Guest
-	 * won't try it because we don't advertise it in CPUID, but a malicious
-	 * Guest (or malicious Guest userspace program) could, so we tell the
-	 * CPU to disable it before running the Guest.
-	 */
-	if (boot_cpu_has(X86_FEATURE_SEP))
-		wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
-
-	/*
-	 * Now we actually run the Guest.  It will return when something
-	 * interesting happens, and we can examine its registers to see what it
-	 * was doing.
-	 */
-	run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
-
-	/*
-	 * Note that the "regs" structure contains two extra entries which are
-	 * not really registers: a trap number which says what interrupt or
-	 * trap made the switcher code come back, and an error code which some
-	 * traps set.
-	 */
-
-	 /* Restore SYSENTER if it's supposed to be on. */
-	 if (boot_cpu_has(X86_FEATURE_SEP))
-		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
-
-	/*
-	 * If the Guest page faulted, then the cr2 register will tell us the
-	 * bad virtual address.  We have to grab this now, because once we
-	 * re-enable interrupts an interrupt could fault and thus overwrite
-	 * cr2, or we could even move off to a different CPU.
-	 */
-	if (cpu->regs->trapnum == 14)
-		cpu->arch.last_pagefault = read_cr2();
-	/*
-	 * Similarly, if we took a trap because the Guest used the FPU,
-	 * we have to restore the FPU it expects to see.
-	 * fpu__restore() may sleep and we may even move off to
-	 * a different CPU. So all the critical stuff should be done
-	 * before this.
-	 */
-	else if (cpu->regs->trapnum == 7 && !fpregs_active())
-		fpu__restore(&current->thread.fpu);
-}
-
-/*H:130
- * Now we've examined the hypercall code; our Guest can make requests.
- * Our Guest is usually so well behaved; it never tries to do things it isn't
- * allowed to, and uses hypercalls instead.  Unfortunately, Linux's paravirtual
- * infrastructure isn't quite complete, because it doesn't contain replacements
- * for the Intel I/O instructions.  As a result, the Guest sometimes fumbles
- * across one during the boot process as it probes for various things which are
- * usually attached to a PC.
- *
- * When the Guest uses one of these instructions, we get a trap (General
- * Protection Fault) and come here.  We queue this to be sent out to the
- * Launcher to handle.
- */
-
-/*
- * The eip contains the *virtual* address of the Guest's instruction:
- * we copy the instruction here so the Launcher doesn't have to walk
- * the page tables to decode it.  We handle the case (eg. in a kernel
- * module) where the instruction is over two pages, and the pages are
- * virtually but not physically contiguous.
- *
- * The longest possible x86 instruction is 15 bytes, but we don't handle
- * anything that strange.
- */
-static void copy_from_guest(struct lg_cpu *cpu,
-			    void *dst, unsigned long vaddr, size_t len)
-{
-	size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
-	unsigned long paddr;
-
-	BUG_ON(len > PAGE_SIZE);
-
-	/* If it goes over a page, copy in two parts. */
-	if (len > to_page_end) {
-		/* But make sure the next page is mapped! */
-		if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
-			copy_from_guest(cpu, dst + to_page_end,
-					vaddr + to_page_end,
-					len - to_page_end);
-		else
-			/* Otherwise fill with zeroes. */
-			memset(dst + to_page_end, 0, len - to_page_end);
-		len = to_page_end;
-	}
-
-	/* This will kill the guest if it isn't mapped, but that
-	 * shouldn't happen. */
-	__lgread(cpu, dst, guest_pa(cpu, vaddr), len);
-}
-
-
-static void setup_emulate_insn(struct lg_cpu *cpu)
-{
-	cpu->pending.trap = 13;
-	copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
-			sizeof(cpu->pending.insn));
-}
-
-static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
-{
-	cpu->pending.trap = 14;
-	cpu->pending.addr = iomem_addr;
-	copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
-			sizeof(cpu->pending.insn));
-}
-
-/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
-void lguest_arch_handle_trap(struct lg_cpu *cpu)
-{
-	unsigned long iomem_addr;
-
-	switch (cpu->regs->trapnum) {
-	case 13: /* We've intercepted a General Protection Fault. */
-		/* Hand to Launcher to emulate those pesky IN and OUT insns */
-		if (cpu->regs->errcode == 0) {
-			setup_emulate_insn(cpu);
-			return;
-		}
-		break;
-	case 14: /* We've intercepted a Page Fault. */
-		/*
-		 * The Guest accessed a virtual address that wasn't mapped.
-		 * This happens a lot: we don't actually set up most of the page
-		 * tables for the Guest at all when we start: as it runs it asks
-		 * for more and more, and we set them up as required. In this
-		 * case, we don't even tell the Guest that the fault happened.
-		 *
-		 * The errcode tells whether this was a read or a write, and
-		 * whether kernel or userspace code.
-		 */
-		if (demand_page(cpu, cpu->arch.last_pagefault,
-				cpu->regs->errcode, &iomem_addr))
-			return;
-
-		/* Was this an access to memory mapped IO? */
-		if (iomem_addr) {
-			/* Tell Launcher, let it handle it. */
-			setup_iomem_insn(cpu, iomem_addr);
-			return;
-		}
-
-		/*
-		 * OK, it's really not there (or not OK): the Guest needs to
-		 * know.  We write out the cr2 value so it knows where the
-		 * fault occurred.
-		 *
-		 * Note that if the Guest were really messed up, this could
-		 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
-		 * lg->lguest_data could be NULL
-		 */
-		if (cpu->lg->lguest_data &&
-		    put_user(cpu->arch.last_pagefault,
-			     &cpu->lg->lguest_data->cr2))
-			kill_guest(cpu, "Writing cr2");
-		break;
-	case 7: /* We've intercepted a Device Not Available fault. */
-		/* No special handling is needed here. */
-		break;
-	case 32 ... 255:
-		/* This might be a syscall. */
-		if (could_be_syscall(cpu->regs->trapnum))
-			break;
-
-		/*
-		 * Other values mean a real interrupt occurred, in which case
-		 * the Host handler has already been run. We just do a
-		 * friendly check if another process should now be run, then
-		 * return to run the Guest again.
-		 */
-		cond_resched();
-		return;
-	case LGUEST_TRAP_ENTRY:
-		/*
-		 * Our 'struct hcall_args' maps directly over our regs: we set
-		 * up the pointer now to indicate a hypercall is pending.
-		 */
-		cpu->hcall = (struct hcall_args *)cpu->regs;
-		return;
-	}
-
-	/* We didn't handle the trap, so it needs to go to the Guest. */
-	if (!deliver_trap(cpu, cpu->regs->trapnum))
-		/*
-		 * If the Guest doesn't have a handler (either it hasn't
-		 * registered any yet, or it's one of the faults we don't let
-		 * it handle), it dies with this cryptic error message.
-		 */
-		kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
-			   cpu->regs->trapnum, cpu->regs->eip,
-			   cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
-			   : cpu->regs->errcode);
-}
-
-/*
- * Now we can look at each of the routines this calls, in increasing order of
- * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
- * deliver_trap() and demand_page().  After all those, we'll be ready to
- * examine the Switcher, and our philosophical understanding of the Host/Guest
- * duality will be complete.
-:*/
-static void adjust_pge(void *on)
-{
-	if (on)
-		cr4_set_bits(X86_CR4_PGE);
-	else
-		cr4_clear_bits(X86_CR4_PGE);
-}
-
-/*H:020
- * Now the Switcher is mapped and every thing else is ready, we need to do
- * some more i386-specific initialization.
- */
-void __init lguest_arch_host_init(void)
-{
-	int i;
-
-	/*
-	 * Most of the x86/switcher_32.S doesn't care that it's been moved; on
-	 * Intel, jumps are relative, and it doesn't access any references to
-	 * external code or data.
-	 *
-	 * The only exception is the interrupt handlers in switcher.S: their
-	 * addresses are placed in a table (default_idt_entries), so we need to
-	 * update the table with the new addresses.  switcher_offset() is a
-	 * convenience function which returns the distance between the
-	 * compiled-in switcher code and the high-mapped copy we just made.
-	 */
-	for (i = 0; i < IDT_ENTRIES; i++)
-		default_idt_entries[i] += switcher_offset();
-
-	/*
-	 * Set up the Switcher's per-cpu areas.
-	 *
-	 * Each CPU gets two pages of its own within the high-mapped region
-	 * (aka. "struct lguest_pages").  Much of this can be initialized now,
-	 * but some depends on what Guest we are running (which is set up in
-	 * copy_in_guest_info()).
-	 */
-	for_each_possible_cpu(i) {
-		/* lguest_pages() returns this CPU's two pages. */
-		struct lguest_pages *pages = lguest_pages(i);
-		/* This is a convenience pointer to make the code neater. */
-		struct lguest_ro_state *state = &pages->state;
-
-		/*
-		 * The Global Descriptor Table: the Host has a different one
-		 * for each CPU.  We keep a descriptor for the GDT which says
-		 * where it is and how big it is (the size is actually the last
-		 * byte, not the size, hence the "-1").
-		 */
-		state->host_gdt_desc.size = GDT_SIZE-1;
-		state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);
-
-		/*
-		 * All CPUs on the Host use the same Interrupt Descriptor
-		 * Table, so we just use store_idt(), which gets this CPU's IDT
-		 * descriptor.
-		 */
-		store_idt(&state->host_idt_desc);
-
-		/*
-		 * The descriptors for the Guest's GDT and IDT can be filled
-		 * out now, too.  We copy the GDT & IDT into ->guest_gdt and
-		 * ->guest_idt before actually running the Guest.
-		 */
-		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
-		state->guest_idt_desc.address = (long)&state->guest_idt;
-		state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
-		state->guest_gdt_desc.address = (long)&state->guest_gdt;
-
-		/*
-		 * We know where we want the stack to be when the Guest enters
-		 * the Switcher: in pages->regs.  The stack grows upwards, so
-		 * we start it at the end of that structure.
-		 */
-		state->guest_tss.sp0 = (long)(&pages->regs + 1);
-		/*
-		 * And this is the GDT entry to use for the stack: we keep a
-		 * couple of special LGUEST entries.
-		 */
-		state->guest_tss.ss0 = LGUEST_DS;
-
-		/*
-		 * x86 can have a finegrained bitmap which indicates what I/O
-		 * ports the process can use.  We set it to the end of our
-		 * structure, meaning "none".
-		 */
-		state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
-
-		/*
-		 * Some GDT entries are the same across all Guests, so we can
-		 * set them up now.
-		 */
-		setup_default_gdt_entries(state);
-		/* Most IDT entries are the same for all Guests, too.*/
-		setup_default_idt_entries(state, default_idt_entries);
-
-		/*
-		 * The Host needs to be able to use the LGUEST segments on this
-		 * CPU, too, so put them in the Host GDT.
-		 */
-		get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
-		get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
-	}
-
-	/*
-	 * In the Switcher, we want the %cs segment register to use the
-	 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
-	 * it will be undisturbed when we switch.  To change %cs and jump we
-	 * need this structure to feed to Intel's "lcall" instruction.
-	 */
-	lguest_entry.offset = (long)switch_to_guest + switcher_offset();
-	lguest_entry.segment = LGUEST_CS;
-
-	/*
-	 * Finally, we need to turn off "Page Global Enable".  PGE is an
-	 * optimization where page table entries are specially marked to show
-	 * they never change.  The Host kernel marks all the kernel pages this
-	 * way because it's always present, even when userspace is running.
-	 *
-	 * Lguest breaks this: unbeknownst to the rest of the Host kernel, we
-	 * switch to the Guest kernel.  If you don't disable this on all CPUs,
-	 * you'll get really weird bugs that you'll chase for two days.
-	 *
-	 * I used to turn PGE off every time we switched to the Guest and back
-	 * on when we return, but that slowed the Switcher down noticibly.
-	 */
-
-	/*
-	 * We don't need the complexity of CPUs coming and going while we're
-	 * doing this.
-	 */
-	get_online_cpus();
-	if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */
-		/* Remember that this was originally set (for cleanup). */
-		cpu_had_pge = 1;
-		/*
-		 * adjust_pge is a helper function which sets or unsets the PGE
-		 * bit on its CPU, depending on the argument (0 == unset).
-		 */
-		on_each_cpu(adjust_pge, (void *)0, 1);
-		/* Turn off the feature in the global feature set. */
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
-	}
-	put_online_cpus();
-}
-/*:*/
-
-void __exit lguest_arch_host_fini(void)
-{
-	/* If we had PGE before we started, turn it back on now. */
-	get_online_cpus();
-	if (cpu_had_pge) {
-		set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
-		/* adjust_pge's argument "1" means set PGE. */
-		on_each_cpu(adjust_pge, (void *)1, 1);
-	}
-	put_online_cpus();
-}
-
-
-/*H:122 The i386-specific hypercalls simply farm out to the right functions. */
-int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
-{
-	switch (args->arg0) {
-	case LHCALL_LOAD_GDT_ENTRY:
-		load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3);
-		break;
-	case LHCALL_LOAD_IDT_ENTRY:
-		load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3);
-		break;
-	case LHCALL_LOAD_TLS:
-		guest_load_tls(cpu, args->arg1);
-		break;
-	default:
-		/* Bad Guest.  Bad! */
-		return -EIO;
-	}
-	return 0;
-}
-
-/*H:126 i386-specific hypercall initialization: */
-int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
-{
-	u32 tsc_speed;
-
-	/*
-	 * The pointer to the Guest's "struct lguest_data" is the only argument.
-	 * We check that address now.
-	 */
-	if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
-			       sizeof(*cpu->lg->lguest_data)))
-		return -EFAULT;
-
-	/*
-	 * Having checked it, we simply set lg->lguest_data to point straight
-	 * into the Launcher's memory at the right place and then use
-	 * copy_to_user/from_user from now on, instead of lgread/write.  I put
-	 * this in to show that I'm not immune to writing stupid
-	 * optimizations.
-	 */
-	cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
-
-	/*
-	 * We insist that the Time Stamp Counter exist and doesn't change with
-	 * cpu frequency.  Some devious chip manufacturers decided that TSC
-	 * changes could be handled in software.  I decided that time going
-	 * backwards might be good for benchmarks, but it's bad for users.
-	 *
-	 * We also insist that the TSC be stable: the kernel detects unreliable
-	 * TSCs for its own purposes, and we use that here.
-	 */
-	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
-		tsc_speed = tsc_khz;
-	else
-		tsc_speed = 0;
-	if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz))
-		return -EFAULT;
-
-	/* The interrupt code might not like the system call vector. */
-	if (!check_syscall_vector(cpu->lg))
-		kill_guest(cpu, "bad syscall vector");
-
-	return 0;
-}
-/*:*/
-
-/*L:030
- * Most of the Guest's registers are left alone: we used get_zeroed_page() to
- * allocate the structure, so they will be 0.
- */
-void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
-{
-	struct lguest_regs *regs = cpu->regs;
-
-	/*
-	 * There are four "segment" registers which the Guest needs to boot:
-	 * The "code segment" register (cs) refers to the kernel code segment
-	 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
-	 * refer to the kernel data segment __KERNEL_DS.
-	 *
-	 * The privilege level is packed into the lower bits.  The Guest runs
-	 * at privilege level 1 (GUEST_PL).
-	 */
-	regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
-	regs->cs = __KERNEL_CS|GUEST_PL;
-
-	/*
-	 * The "eflags" register contains miscellaneous flags.  Bit 1 (0x002)
-	 * is supposed to always be "1".  Bit 9 (0x200) controls whether
-	 * interrupts are enabled.  We always leave interrupts enabled while
-	 * running the Guest.
-	 */
-	regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
-
-	/*
-	 * The "Extended Instruction Pointer" register says where the Guest is
-	 * running.
-	 */
-	regs->eip = start;
-
-	/*
-	 * %esi points to our boot information, at physical address 0, so don't
-	 * touch it.
-	 */
-
-	/* There are a couple of GDT entries the Guest expects at boot. */
-	setup_guest_gdt(cpu);
-}
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S
deleted file mode 100644
index 40634b0db9f7..000000000000
--- a/drivers/lguest/x86/switcher_32.S
+++ /dev/null
@@ -1,388 +0,0 @@
-/*P:900
- * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
- * both the Host and Guest to do the low-level Guest<->Host switch.  It is as
- * simple as it can be made, but it's naturally very specific to x86.
- *
- * You have now completed Preparation.  If this has whet your appetite; if you
- * are feeling invigorated and refreshed then the next, more challenging stage
- * can be found in "make Guest".
- :*/
-
-/*M:012
- * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
- * gain at least 1% more performance.  Since neither LOC nor performance can be
- * measured beforehand, it generally means implementing a feature then deciding
- * if it's worth it.  And once it's implemented, who can say no?
- *
- * This is why I haven't implemented this idea myself.  I want to, but I
- * haven't.  You could, though.
- *
- * The main place where lguest performance sucks is Guest page faulting.  When
- * a Guest userspace process hits an unmapped page we switch back to the Host,
- * walk the page tables, find it's not mapped, switch back to the Guest page
- * fault handler, which calls a hypercall to set the page table entry, then
- * finally returns to userspace.  That's two round-trips.
- *
- * If we had a small walker in the Switcher, we could quickly check the Guest
- * page table and if the page isn't mapped, immediately reflect the fault back
- * into the Guest.  This means the Switcher would have to know the top of the
- * Guest page table and the page fault handler address.
- *
- * For simplicity, the Guest should only handle the case where the privilege
- * level of the fault is 3 and probably only not present or write faults.  It
- * should also detect recursive faults, and hand the original fault to the
- * Host (which is actually really easy).
- *
- * Two questions remain.  Would the performance gain outweigh the complexity?
- * And who would write the verse documenting it?
-:*/
-
-/*M:011
- * Lguest64 handles NMI.  This gave me NMI envy (until I looked at their
- * code).  It's worth doing though, since it would let us use oprofile in the
- * Host when a Guest is running.
-:*/
-
-/*S:100
- * Welcome to the Switcher itself!
- *
- * This file contains the low-level code which changes the CPU to run the Guest
- * code, and returns to the Host when something happens.  Understand this, and
- * you understand the heart of our journey.
- *
- * Because this is in assembler rather than C, our tale switches from prose to
- * verse.  First I tried limericks:
- *
- *	There once was an eax reg,
- *	To which our pointer was fed,
- *	It needed an add,
- *	Which asm-offsets.h had
- *	But this limerick is hurting my head.
- *
- * Next I tried haikus, but fitting the required reference to the seasons in
- * every stanza was quickly becoming tiresome:
- *
- *	The %eax reg
- *	Holds "struct lguest_pages" now:
- *	Cherry blossoms fall.
- *
- * Then I started with Heroic Verse, but the rhyming requirement leeched away
- * the content density and led to some uniquely awful oblique rhymes:
- *
- *	These constants are coming from struct offsets
- *	For use within the asm switcher text.
- *
- * Finally, I settled for something between heroic hexameter, and normal prose
- * with inappropriate linebreaks.  Anyway, it aint no Shakespeare.
- */
-
-// Not all kernel headers work from assembler
-// But these ones are needed: the ENTRY() define
-// And constants extracted from struct offsets
-// To avoid magic numbers and breakage:
-// Should they change the compiler can't save us
-// Down here in the depths of assembler code.
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
-#include <asm/page.h>
-#include <asm/segment.h>
-#include <asm/lguest.h>
-
-// We mark the start of the code to copy
-// It's placed in .text tho it's never run here
-// You'll see the trick macro at the end
-// Which interleaves data and text to effect.
-.text
-ENTRY(start_switcher_text)
-
-// When we reach switch_to_guest we have just left
-// The safe and comforting shores of C code
-// %eax has the "struct lguest_pages" to use
-// Where we save state and still see it from the Guest
-// And %ebx holds the Guest shadow pagetable:
-// Once set we have truly left Host behind.
-ENTRY(switch_to_guest)
-	// We told gcc all its regs could fade,
-	// Clobbered by our journey into the Guest
-	// We could have saved them, if we tried
-	// But time is our master and cycles count.
-
-	// Segment registers must be saved for the Host
-	// We push them on the Host stack for later
-	pushl	%es
-	pushl	%ds
-	pushl	%gs
-	pushl	%fs
-	// But the compiler is fickle, and heeds
-	// No warning of %ebp clobbers
-	// When frame pointers are used.  That register
-	// Must be saved and restored or chaos strikes.
-	pushl	%ebp
-	// The Host's stack is done, now save it away
-	// In our "struct lguest_pages" at offset
-	// Distilled into asm-offsets.h
-	movl	%esp, LGUEST_PAGES_host_sp(%eax)
-
-	// All saved and there's now five steps before us:
-	// Stack, GDT, IDT, TSS
-	// Then last of all the page tables are flipped.
-
-	// Yet beware that our stack pointer must be
-	// Always valid lest an NMI hits
-	// %edx does the duty here as we juggle
-	// %eax is lguest_pages: our stack lies within.
-	movl	%eax, %edx
-	addl	$LGUEST_PAGES_regs, %edx
-	movl	%edx, %esp
-
-	// The Guest's GDT we so carefully
-	// Placed in the "struct lguest_pages" before
-	lgdt	LGUEST_PAGES_guest_gdt_desc(%eax)
-
-	// The Guest's IDT we did partially
-	// Copy to "struct lguest_pages" as well.
-	lidt	LGUEST_PAGES_guest_idt_desc(%eax)
-
-	// The TSS entry which controls traps
-	// Must be loaded up with "ltr" now:
-	// The GDT entry that TSS uses 
-	// Changes type when we load it: damn Intel!
-	// For after we switch over our page tables
-	// That entry will be read-only: we'd crash.
-	movl	$(GDT_ENTRY_TSS*8), %edx
-	ltr	%dx
-
-	// Look back now, before we take this last step!
-	// The Host's TSS entry was also marked used;
-	// Let's clear it again for our return.
-	// The GDT descriptor of the Host
-	// Points to the table after two "size" bytes
-	movl	(LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
-	// Clear "used" from type field (byte 5, bit 2)
-	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
-
-	// Once our page table's switched, the Guest is live!
-	// The Host fades as we run this final step.
-	// Our "struct lguest_pages" is now read-only.
-	movl	%ebx, %cr3
-
-	// The page table change did one tricky thing:
-	// The Guest's register page has been mapped
-	// Writable under our %esp (stack) --
-	// We can simply pop off all Guest regs.
-	popl	%eax
-	popl	%ebx
-	popl	%ecx
-	popl	%edx
-	popl	%esi
-	popl	%edi
-	popl	%ebp
-	popl	%gs
-	popl	%fs
-	popl	%ds
-	popl	%es
-
-	// Near the base of the stack lurk two strange fields
-	// Which we fill as we exit the Guest
-	// These are the trap number and its error
-	// We can simply step past them on our way.
-	addl	$8, %esp
-
-	// The last five stack slots hold return address
-	// And everything needed to switch privilege
-	// From Switcher's level 0 to Guest's 1,
-	// And the stack where the Guest had last left it.
-	// Interrupts are turned back on: we are Guest.
-	iret
-
-// We tread two paths to switch back to the Host
-// Yet both must save Guest state and restore Host
-// So we put the routine in a macro.
-#define SWITCH_TO_HOST							\
-	/* We save the Guest state: all registers first			\
-	 * Laid out just as "struct lguest_regs" defines */		\
-	pushl	%es;							\
-	pushl	%ds;							\
-	pushl	%fs;							\
-	pushl	%gs;							\
-	pushl	%ebp;							\
-	pushl	%edi;							\
-	pushl	%esi;							\
-	pushl	%edx;							\
-	pushl	%ecx;							\
-	pushl	%ebx;							\
-	pushl	%eax;							\
-	/* Our stack and our code are using segments			\
-	 * Set in the TSS and IDT					\
-	 * Yet if we were to touch data we'd use			\
-	 * Whatever data segment the Guest had.				\
-	 * Load the lguest ds segment for now. */			\
-	movl	$(LGUEST_DS), %eax;					\
-	movl	%eax, %ds;						\
-	/* So where are we?  Which CPU, which struct?			\
-	 * The stack is our clue: our TSS starts			\
-	 * It at the end of "struct lguest_pages".			\
-	 * Or we may have stumbled while restoring			\
-	 * Our Guest segment regs while in switch_to_guest,		\
-	 * The fault pushed atop that part-unwound stack.		\
-	 * If we round the stack down to the page start			\
-	 * We're at the start of "struct lguest_pages". */		\
-	movl	%esp, %eax;						\
-	andl	$(~(1 << PAGE_SHIFT - 1)), %eax;			\
-	/* Save our trap number: the switch will obscure it		\
-	 * (In the Host the Guest regs are not mapped here)		\
-	 * %ebx holds it safe for deliver_to_host */			\
-	movl	LGUEST_PAGES_regs_trapnum(%eax), %ebx;			\
-	/* The Host GDT, IDT and stack!					\
-	 * All these lie safely hidden from the Guest:			\
-	 * We must return to the Host page tables			\
-	 * (Hence that was saved in struct lguest_pages) */		\
-	movl	LGUEST_PAGES_host_cr3(%eax), %edx;			\
-	movl	%edx, %cr3;						\
-	/* As before, when we looked back at the Host			\
-	 * As we left and marked TSS unused				\
-	 * So must we now for the Guest left behind. */			\
-	andb	$0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
-	/* Switch to Host's GDT, IDT. */				\
-	lgdt	LGUEST_PAGES_host_gdt_desc(%eax);			\
-	lidt	LGUEST_PAGES_host_idt_desc(%eax);			\
-	/* Restore the Host's stack where its saved regs lie */		\
-	movl	LGUEST_PAGES_host_sp(%eax), %esp;			\
-	/* Last the TSS: our Host is returned */			\
-	movl	$(GDT_ENTRY_TSS*8), %edx;				\
-	ltr	%dx;							\
-	/* Restore now the regs saved right at the first. */		\
-	popl	%ebp;							\
-	popl	%fs;							\
-	popl	%gs;							\
-	popl	%ds;							\
-	popl	%es
-
-// The first path is trod when the Guest has trapped:
-// (Which trap it was has been pushed on the stack).
-// We need only switch back, and the Host will decode
-// Why we came home, and what needs to be done.
-return_to_host:
-	SWITCH_TO_HOST
-	iret
-
-// We are lead to the second path like so:
-// An interrupt, with some cause external
-// Has ajerked us rudely from the Guest's code
-// Again we must return home to the Host
-deliver_to_host:
-	SWITCH_TO_HOST
-	// But now we must go home via that place
-	// Where that interrupt was supposed to go
-	// Had we not been ensconced, running the Guest.
-	// Here we see the trickness of run_guest_once():
-	// The Host stack is formed like an interrupt
-	// With EIP, CS and EFLAGS layered.
-	// Interrupt handlers end with "iret"
-	// And that will take us home at long long last.
-
-	// But first we must find the handler to call!
-	// The IDT descriptor for the Host
-	// Has two bytes for size, and four for address:
-	// %edx will hold it for us for now.
-	movl	(LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
-	// We now know the table address we need,
-	// And saved the trap's number inside %ebx.
-	// Yet the pointer to the handler is smeared
-	// Across the bits of the table entry.
-	// What oracle can tell us how to extract
-	// From such a convoluted encoding?
-	// I consulted gcc, and it gave
-	// These instructions, which I gladly credit:
-	leal	(%edx,%ebx,8), %eax
-	movzwl	(%eax),%edx
-	movl	4(%eax), %eax
-	xorw	%ax, %ax
-	orl	%eax, %edx
-	// Now the address of the handler's in %edx
-	// We call it now: its "iret" drops us home.
-	jmp	*%edx
-
-// Every interrupt can come to us here
-// But we must truly tell each apart.
-// They number two hundred and fifty six
-// And each must land in a different spot,
-// Push its number on stack, and join the stream.
-
-// And worse, a mere six of the traps stand apart
-// And push on their stack an addition:
-// An error number, thirty two bits long
-// So we punish the other two fifty
-// And make them push a zero so they match.
-
-// Yet two fifty six entries is long
-// And all will look most the same as the last
-// So we create a macro which can make
-// As many entries as we need to fill.
-
-// Note the change to .data then .text:
-// We plant the address of each entry
-// Into a (data) table for the Host
-// To know where each Guest interrupt should go.
-.macro IRQ_STUB N TARGET
-	.data; .long 1f; .text; 1:
- // Trap eight, ten through fourteen and seventeen
- // Supply an error number.  Else zero.
- .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
-	pushl	$0
- .endif
-	pushl	$\N
-	jmp	\TARGET
-	ALIGN
-.endm
-
-// This macro creates numerous entries
-// Using GAS macros which out-power C's.
-.macro IRQ_STUBS FIRST LAST TARGET
- irq=\FIRST
- .rept \LAST-\FIRST+1
-	IRQ_STUB irq \TARGET
-  irq=irq+1
- .endr
-.endm
-
-// Here's the marker for our pointer table
-// Laid in the data section just before
-// Each macro places the address of code
-// Forming an array: each one points to text
-// Which handles interrupt in its turn.
-.data
-.global default_idt_entries
-default_idt_entries:
-.text
-	// The first two traps go straight back to the Host
-	IRQ_STUBS 0 1 return_to_host
-	// We'll say nothing, yet, about NMI
-	IRQ_STUB 2 handle_nmi
-	// Other traps also return to the Host
-	IRQ_STUBS 3 31 return_to_host
-	// All interrupts go via their handlers
-	IRQ_STUBS 32 127 deliver_to_host
-	// 'Cept system calls coming from userspace
-	// Are to go to the Guest, never the Host.
-	IRQ_STUB 128 return_to_host
-	IRQ_STUBS 129 255 deliver_to_host
-
-// The NMI, what a fabulous beast
-// Which swoops in and stops us no matter that
-// We're suspended between heaven and hell,
-// (Or more likely between the Host and Guest)
-// When in it comes!  We are dazed and confused
-// So we do the simplest thing which one can.
-// Though we've pushed the trap number and zero
-// We discard them, return, and hope we live.
-handle_nmi:
-	addl	$8, %esp
-	iret
-
-// We are done; all that's left is Mastery
-// And "make Mastery" is a journey long
-// Designed to make your fingers itch to code.
-
-// Here ends the text, the file and poem.
-ENTRY(end_switcher_text)
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 83a1616903f8..aba0d652095b 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -333,7 +333,7 @@ config VIRTIO_NET
 	depends on VIRTIO
 	---help---
 	  This is the virtual network driver for virtio.  It can be used with
-	  lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
+	  QEMU based VMMs (like KVM or Xen).  Say Y or M.
 
 config NLMON
 	tristate "Virtual netlink monitoring device"
diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig
index b8d5ea0ae26b..fec457edad14 100644
--- a/drivers/tty/hvc/Kconfig
+++ b/drivers/tty/hvc/Kconfig
@@ -4,7 +4,7 @@ config HVC_DRIVER
 	bool
 	help
 	  Generic "hypervisor virtual console" infrastructure for various
-	  hypervisors (pSeries, iSeries, Xen, lguest).
+	  hypervisors (pSeries, iSeries, Xen).
 	  It will automatically be selected if one of the back-end console drivers
 	  is selected.
 
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 623f72334fa5..cff773f15b7e 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -2,8 +2,8 @@ config VIRTIO
 	tristate
 	---help---
 	  This option is selected by any driver which implements the virtio
-	  bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST,
-	  CONFIG_RPMSG or CONFIG_S390_GUEST.
+	  bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG
+	  or CONFIG_S390_GUEST.
 
 menu "Virtio drivers"
 
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 719c2e943ea1..98fd8f6df851 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1408,12 +1408,13 @@ static const struct file_operations proc_fail_nth_operations = {
 static int sched_show(struct seq_file *m, void *v)
 {
 	struct inode *inode = m->private;
+	struct pid_namespace *ns = inode->i_sb->s_fs_info;
 	struct task_struct *p;
 
 	p = get_proc_task(inode);
 	if (!p)
 		return -ESRCH;
-	proc_sched_show_task(p, m);
+	proc_sched_show_task(p, ns, m);
 
 	put_task_struct(p);
 
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index 9f0681bf1e87..66260777d644 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -22,17 +22,6 @@
 #include <asm-generic/qspinlock_types.h>
 
 /**
- * queued_spin_unlock_wait - wait until the _current_ lock holder releases the lock
- * @lock : Pointer to queued spinlock structure
- *
- * There is a very slight possibility of live-lock if the lockers keep coming
- * and the waiter is just unfortunate enough to not see any unlock state.
- */
-#ifndef queued_spin_unlock_wait
-extern void queued_spin_unlock_wait(struct qspinlock *lock);
-#endif
-
-/**
  * queued_spin_is_locked - is the spinlock locked?
  * @lock: Pointer to queued spinlock structure
  * Return: 1 if it is locked, 0 otherwise
@@ -41,8 +30,6 @@ extern void queued_spin_unlock_wait(struct qspinlock *lock);
 static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 {
 	/*
-	 * See queued_spin_unlock_wait().
-	 *
 	 * Any !0 state indicates it is locked, even if _Q_LOCKED_VAL
 	 * isn't immediately observable.
 	 */
@@ -135,6 +122,5 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock)
 #define arch_spin_trylock(l)		queued_spin_trylock(l)
 #define arch_spin_unlock(l)		queued_spin_unlock(l)
 #define arch_spin_lock_flags(l, f)	queued_spin_lock(l)
-#define arch_spin_unlock_wait(l)	queued_spin_unlock_wait(l)
 
 #endif /* __ASM_GENERIC_QSPINLOCK_H */
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 532372c6cf15..e5da44eddd2f 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -27,6 +27,8 @@
  *	__kprobes_text_start, __kprobes_text_end
  *	__entry_text_start, __entry_text_end
  *	__ctors_start, __ctors_end
+ *	__irqentry_text_start, __irqentry_text_end
+ *	__softirqentry_text_start, __softirqentry_text_end
  */
 extern char _text[], _stext[], _etext[];
 extern char _data[], _sdata[], _edata[];
@@ -39,6 +41,8 @@ extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
 extern char __entry_text_start[], __entry_text_end[];
 extern char __start_rodata[], __end_rodata[];
+extern char __irqentry_text_start[], __irqentry_text_end[];
+extern char __softirqentry_text_start[], __softirqentry_text_end[];
 
 /* Start and end of .ctors section - used for constructor calls. */
 extern char __ctors_start[], __ctors_end[];
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 9623d78f8494..9fdb54a95976 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -497,25 +497,17 @@
 		*(.entry.text)						\
 		VMLINUX_SYMBOL(__entry_text_end) = .;
 
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
 #define IRQENTRY_TEXT							\
 		ALIGN_FUNCTION();					\
 		VMLINUX_SYMBOL(__irqentry_text_start) = .;		\
 		*(.irqentry.text)					\
 		VMLINUX_SYMBOL(__irqentry_text_end) = .;
-#else
-#define IRQENTRY_TEXT
-#endif
 
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
 #define SOFTIRQENTRY_TEXT						\
 		ALIGN_FUNCTION();					\
 		VMLINUX_SYMBOL(__softirqentry_text_start) = .;		\
 		*(.softirqentry.text)					\
 		VMLINUX_SYMBOL(__softirqentry_text_end) = .;
-#else
-#define SOFTIRQENTRY_TEXT
-#endif
 
 /* Section used for early init (in .S files) */
 #define HEAD_TEXT  *(.head.text)
@@ -694,6 +686,31 @@
 #define BUG_TABLE
 #endif
 
+#ifdef CONFIG_ORC_UNWINDER
+#define ORC_UNWIND_TABLE						\
+	. = ALIGN(4);							\
+	.orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) {	\
+		VMLINUX_SYMBOL(__start_orc_unwind_ip) = .;		\
+		KEEP(*(.orc_unwind_ip))					\
+		VMLINUX_SYMBOL(__stop_orc_unwind_ip) = .;		\
+	}								\
+	. = ALIGN(2);	/* must be a power of two */			\
+	.orc_unwind : AT(ADDR(.orc_unwind) - LOAD_OFFSET) {		\
+		VMLINUX_SYMBOL(__start_orc_unwind) = .;			\
+		KEEP(*(.orc_unwind))					\
+		VMLINUX_SYMBOL(__stop_orc_unwind) = .;			\
+	}								\
+	. = ALIGN(4);							\
+	.orc_lookup : AT(ADDR(.orc_lookup) - LOAD_OFFSET) {		\
+		VMLINUX_SYMBOL(orc_lookup) = .;				\
+		. += (((SIZEOF(.text) + LOOKUP_BLOCK_SIZE - 1) /	\
+			LOOKUP_BLOCK_SIZE) + 1) * 4;			\
+		VMLINUX_SYMBOL(orc_lookup_end) = .;			\
+	}
+#else
+#define ORC_UNWIND_TABLE
+#endif
+
 #ifdef CONFIG_PM_TRACE
 #define TRACEDATA							\
 	. = ALIGN(4);							\
@@ -880,7 +897,7 @@
 		DATA_DATA						\
 		CONSTRUCTORS						\
 	}								\
-	BUG_TABLE
+	BUG_TABLE							\
 
 #define INIT_TEXT_SECTION(inittext_align)				\
 	. = ALIGN(inittext_align);					\
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 10825052b03f..310f51d42550 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -201,22 +201,6 @@
 #endif
 #endif
 
-#ifdef CONFIG_STACK_VALIDATION
-#define annotate_unreachable() ({					\
-	asm("%c0:\n\t"							\
-	    ".pushsection .discard.unreachable\n\t"			\
-	    ".long %c0b - .\n\t"					\
-	    ".popsection\n\t" : : "i" (__LINE__));			\
-})
-#define ASM_UNREACHABLE							\
-	"999:\n\t"							\
-	".pushsection .discard.unreachable\n\t"				\
-	".long 999b - .\n\t"						\
-	".popsection\n\t"
-#else
-#define annotate_unreachable()
-#endif
-
 /*
  * Mark a position in code as unreachable.  This can be used to
  * suppress control flow warnings after asm blocks that transfer
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 535504312fc3..e786337cf5a7 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -185,11 +185,34 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 #endif
 
 /* Unreachable code */
+#ifdef CONFIG_STACK_VALIDATION
+#define annotate_reachable() ({						\
+	asm("%c0:\n\t"							\
+	    ".pushsection .discard.reachable\n\t"			\
+	    ".long %c0b - .\n\t"					\
+	    ".popsection\n\t" : : "i" (__LINE__));			\
+})
+#define annotate_unreachable() ({					\
+	asm("%c0:\n\t"							\
+	    ".pushsection .discard.unreachable\n\t"			\
+	    ".long %c0b - .\n\t"					\
+	    ".popsection\n\t" : : "i" (__LINE__));			\
+})
+#define ASM_UNREACHABLE							\
+	"999:\n\t"							\
+	".pushsection .discard.unreachable\n\t"				\
+	".long 999b - .\n\t"						\
+	".popsection\n\t"
+#else /* !CONFIG_STACK_VALIDATION */
+#define annotate_reachable()
+#define annotate_unreachable()
+#endif /* CONFIG_STACK_VALIDATION */
+
 #ifndef ASM_UNREACHABLE
 # define ASM_UNREACHABLE
 #endif
 #ifndef unreachable
-# define unreachable() do { } while (1)
+# define unreachable() do { annotate_reachable(); do { } while (1); } while (0)
 #endif
 
 /*
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 8269bcb8ccf7..a686ca9a7e5c 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1020,6 +1020,28 @@ extern int efi_memattr_init(void);
 extern int efi_memattr_apply_permissions(struct mm_struct *mm,
 					 efi_memattr_perm_setter fn);
 
+/*
+ * efi_early_memdesc_ptr - get the n-th EFI memmap descriptor
+ * @map: the start of efi memmap
+ * @desc_size: the size of space for each EFI memmap descriptor
+ * @n: the index of efi memmap descriptor
+ *
+ * EFI boot service provides the GetMemoryMap() function to get a copy of the
+ * current memory map which is an array of memory descriptors, each of
+ * which describes a contiguous block of memory. It also gets the size of the
+ * map, and the size of each descriptor, etc.
+ *
+ * Note that per section 6.2 of UEFI Spec 2.6 Errata A, the returned size of
+ * each descriptor might not be equal to sizeof(efi_memory_desc_t),
+ * since efi_memory_desc_t may be extended in the future. Thus the OS
+ * MUST use the returned size of the descriptor to find the start of each
+ * efi_memory_desc_t in the memory map array. This should only be used
+ * during bootup since for_each_efi_memory_desc_xxx() is available after the
+ * kernel initializes the EFI subsystem to set up struct efi_memory_map.
+ */
+#define efi_early_memdesc_ptr(map, desc_size, n)			\
+	((efi_memory_desc_t *)((void *)(map) + ((n) * (desc_size))))
+
 /* Iterate through an efi_memory_map */
 #define for_each_efi_memory_desc_in_map(m, md)				   \
 	for ((md) = (m)->map;						   \
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index a2f6707e9fc0..0e849715e5be 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -126,17 +126,11 @@ extern struct group_info init_groups;
 #endif
 
 #ifdef CONFIG_PREEMPT_RCU
-#define INIT_TASK_RCU_TREE_PREEMPT()					\
-	.rcu_blocked_node = NULL,
-#else
-#define INIT_TASK_RCU_TREE_PREEMPT(tsk)
-#endif
-#ifdef CONFIG_PREEMPT_RCU
 #define INIT_TASK_RCU_PREEMPT(tsk)					\
 	.rcu_read_lock_nesting = 0,					\
 	.rcu_read_unlock_special.s = 0,					\
 	.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),		\
-	INIT_TASK_RCU_TREE_PREEMPT()
+	.rcu_blocked_node = NULL,
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
 #endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a2fddddb0d60..59ba11661b6e 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -18,6 +18,7 @@
 #include <linux/atomic.h>
 #include <asm/ptrace.h>
 #include <asm/irq.h>
+#include <asm/sections.h>
 
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
@@ -726,7 +727,6 @@ extern int early_irq_init(void);
 extern int arch_probe_nr_irqs(void);
 extern int arch_early_irq_init(void);
 
-#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
 /*
  * We want to know which function is an entrypoint of a hardirq or a softirq.
  */
@@ -734,16 +734,4 @@ extern int arch_early_irq_init(void);
 #define __softirq_entry  \
 	__attribute__((__section__(".softirqentry.text")))
 
-/* Limits of hardirq entrypoints */
-extern char __irqentry_text_start[];
-extern char __irqentry_text_end[];
-/* Limits of softirq entrypoints */
-extern char __softirqentry_text_start[];
-extern char __softirqentry_text_end[];
-
-#else
-#define __irq_entry
-#define __softirq_entry
-#endif
-
 #endif
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
deleted file mode 100644
index 6db19f35f7c5..000000000000
--- a/include/linux/lguest.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Things the lguest guest needs to know.  Note: like all lguest interfaces,
- * this is subject to wild and random change between versions.
- */
-#ifndef _LINUX_LGUEST_H
-#define _LINUX_LGUEST_H
-
-#ifndef __ASSEMBLY__
-#include <linux/time.h>
-#include <asm/irq.h>
-#include <asm/lguest_hcall.h>
-
-#define LG_CLOCK_MIN_DELTA	100UL
-#define LG_CLOCK_MAX_DELTA	ULONG_MAX
-
-/*G:031
- * The second method of communicating with the Host is to via "struct
- * lguest_data".  Once the Guest's initialization hypercall tells the Host where
- * this is, the Guest and Host both publish information in it.
-:*/
-struct lguest_data {
-	/*
-	 * 512 == enabled (same as eflags in normal hardware).  The Guest
-	 * changes interrupts so often that a hypercall is too slow.
-	 */
-	unsigned int irq_enabled;
-	/* Fine-grained interrupt disabling by the Guest */
-	DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS);
-
-	/*
-	 * The Host writes the virtual address of the last page fault here,
-	 * which saves the Guest a hypercall.  CR2 is the native register where
-	 * this address would normally be found.
-	 */
-	unsigned long cr2;
-
-	/* Wallclock time set by the Host. */
-	struct timespec time;
-
-	/*
-	 * Interrupt pending set by the Host.  The Guest should do a hypercall
-	 * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF).
-	 */
-	int irq_pending;
-
-	/*
-	 * Async hypercall ring.  Instead of directly making hypercalls, we can
-	 * place them in here for processing the next time the Host wants.
-	 * This batching can be quite efficient.
-	 */
-
-	/* 0xFF == done (set by Host), 0 == pending (set by Guest). */
-	u8 hcall_status[LHCALL_RING_SIZE];
-	/* The actual registers for the hypercalls. */
-	struct hcall_args hcalls[LHCALL_RING_SIZE];
-
-/* Fields initialized by the Host at boot: */
-	/* Memory not to try to access */
-	unsigned long reserve_mem;
-	/* KHz for the TSC clock. */
-	u32 tsc_khz;
-
-/* Fields initialized by the Guest at boot: */
-	/* Instruction to suppress interrupts even if enabled */
-	unsigned long noirq_iret;
-	/* Address above which page tables are all identical. */
-	unsigned long kernel_address;
-	/* The vector to try to use for system calls (0x40 or 0x80). */
-	unsigned int syscall_vec;
-};
-extern struct lguest_data lguest_data;
-#endif /* __ASSEMBLY__ */
-#endif	/* _LINUX_LGUEST_H */
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
deleted file mode 100644
index acd5b12565cc..000000000000
--- a/include/linux/lguest_launcher.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#ifndef _LINUX_LGUEST_LAUNCHER
-#define _LINUX_LGUEST_LAUNCHER
-/* Everything the "lguest" userspace program needs to know. */
-#include <linux/types.h>
-
-/*D:010
- * Drivers
- *
- * The Guest needs devices to do anything useful.  Since we don't let it touch
- * real devices (think of the damage it could do!) we provide virtual devices.
- * We emulate a PCI bus with virtio devices on it; we used to have our own
- * lguest bus which was far simpler, but this tests the virtio 1.0 standard.
- *
- * Virtio devices are also used by kvm, so we can simply reuse their optimized
- * device drivers.  And one day when everyone uses virtio, my plan will be
- * complete.  Bwahahahah!
- */
-
-/* Write command first word is a request. */
-enum lguest_req
-{
-	LHREQ_INITIALIZE, /* + base, pfnlimit, start */
-	LHREQ_GETDMA, /* No longer used */
-	LHREQ_IRQ, /* + irq */
-	LHREQ_BREAK, /* No longer used */
-	LHREQ_EVENTFD, /* No longer used. */
-	LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */
-	LHREQ_SETREG, /* + offset within struct pt_regs, value. */
-	LHREQ_TRAP, /* + trap number to deliver to guest. */
-};
-
-/*
- * This is what read() of the lguest fd populates.  trap ==
- * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the
- * argument), 14 for a page fault in the MMIO region (addr is
- * the trap address, insn is the instruction), or 13 for a GPF
- * (insn is the instruction).
- */
-struct lguest_pending {
-	__u8 trap;
-	__u8 insn[7];
-	__u32 addr;
-};
-#endif /* _LINUX_LGUEST_LAUNCHER */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c00cd4b02f32..718ba163c1b9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -147,9 +147,6 @@ struct hw_perf_event {
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
 		};
-		struct { /* itrace */
-			int			itrace_started;
-		};
 		struct { /* amd_power */
 			u64	pwr_acc;
 			u64	ptsc;
@@ -541,6 +538,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_GROUP	0x02
 #define PERF_ATTACH_TASK	0x04
 #define PERF_ATTACH_TASK_DATA	0x08
+#define PERF_ATTACH_ITRACE	0x10
 
 struct perf_cgroup;
 struct ring_buffer;
@@ -864,6 +862,7 @@ extern int perf_aux_output_skip(struct perf_output_handle *handle,
 				unsigned long size);
 extern void *perf_get_aux(struct perf_output_handle *handle);
 extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
+extern void perf_event_itrace_started(struct perf_event *event);
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
@@ -944,6 +943,8 @@ struct perf_sample_data {
 
 	struct perf_regs		regs_intr;
 	u64				stack_user_size;
+
+	u64				phys_addr;
 } ____cacheline_aligned;
 
 /* default value for data source */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f816fc72b51e..96f1baf62ab8 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -58,8 +58,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
 void call_rcu_bh(struct rcu_head *head, rcu_callback_t func);
 void call_rcu_sched(struct rcu_head *head, rcu_callback_t func);
 void synchronize_sched(void);
-void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
-void synchronize_rcu_tasks(void);
 void rcu_barrier_tasks(void);
 
 #ifdef CONFIG_PREEMPT_RCU
@@ -105,11 +103,13 @@ static inline int rcu_preempt_depth(void)
 
 /* Internal to kernel */
 void rcu_init(void);
+extern int rcu_scheduler_active __read_mostly;
 void rcu_sched_qs(void);
 void rcu_bh_qs(void);
 void rcu_check_callbacks(int user);
 void rcu_report_dead(unsigned int cpu);
 void rcu_cpu_starting(unsigned int cpu);
+void rcutree_migrate_callbacks(int cpu);
 
 #ifdef CONFIG_RCU_STALL_COMMON
 void rcu_sysrq_start(void);
@@ -164,8 +164,6 @@ static inline void rcu_init_nohz(void) { }
  * macro rather than an inline function to avoid #include hell.
  */
 #ifdef CONFIG_TASKS_RCU
-#define TASKS_RCU(x) x
-extern struct srcu_struct tasks_rcu_exit_srcu;
 #define rcu_note_voluntary_context_switch_lite(t) \
 	do { \
 		if (READ_ONCE((t)->rcu_tasks_holdout)) \
@@ -176,10 +174,17 @@ extern struct srcu_struct tasks_rcu_exit_srcu;
 		rcu_all_qs(); \
 		rcu_note_voluntary_context_switch_lite(t); \
 	} while (0)
+void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
+void synchronize_rcu_tasks(void);
+void exit_tasks_rcu_start(void);
+void exit_tasks_rcu_finish(void);
 #else /* #ifdef CONFIG_TASKS_RCU */
-#define TASKS_RCU(x) do { } while (0)
 #define rcu_note_voluntary_context_switch_lite(t)	do { } while (0)
 #define rcu_note_voluntary_context_switch(t)		rcu_all_qs()
+#define call_rcu_tasks call_rcu_sched
+#define synchronize_rcu_tasks synchronize_sched
+static inline void exit_tasks_rcu_start(void) { }
+static inline void exit_tasks_rcu_finish(void) { }
 #endif /* #else #ifdef CONFIG_TASKS_RCU */
 
 /**
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 5becbbccb998..b3dbf9502fd0 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -116,13 +116,11 @@ static inline void rcu_irq_exit_irqson(void) { }
 static inline void rcu_irq_enter_irqson(void) { }
 static inline void rcu_irq_exit(void) { }
 static inline void exit_rcu(void) { }
-
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
-extern int rcu_scheduler_active __read_mostly;
+#ifdef CONFIG_SRCU
 void rcu_scheduler_starting(void);
-#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
+#else /* #ifdef CONFIG_SRCU */
 static inline void rcu_scheduler_starting(void) { }
-#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
+#endif /* #else #ifdef CONFIG_SRCU */
 static inline void rcu_end_inkernel_boot(void) { }
 static inline bool rcu_is_watching(void) { return true; }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 93be319e0cbf..9ba42c663fba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -589,9 +589,10 @@ struct task_struct {
 
 #ifdef CONFIG_TASKS_RCU
 	unsigned long			rcu_tasks_nvcsw;
-	bool				rcu_tasks_holdout;
-	struct list_head		rcu_tasks_holdout_list;
+	u8				rcu_tasks_holdout;
+	u8				rcu_tasks_idx;
 	int				rcu_tasks_idle_cpu;
+	struct list_head		rcu_tasks_holdout_list;
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
 	struct sched_info		sched_info;
@@ -1242,6 +1243,19 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 	return task_pgrp_nr_ns(tsk, &init_pid_ns);
 }
 
+static inline char task_state_to_char(struct task_struct *task)
+{
+	/* One letter per state bit, preceded by the letter for state 0. */
+	const char letters[] = TASK_STATE_TO_CHAR_STR;
+	unsigned long bits = task->state;
+	unsigned long idx = bits ? __ffs(bits) + 1 : 0;
+
+	/* Catch a letter string that drifts from the number of task states: */
+	BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
+
+	return idx >= sizeof(letters) - 1 ? '?' : letters[idx];
+}
+
 /**
  * is_global_init - check if a task structure is init. Since init
  * is free to have sub-threads we need to check tgid.
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
index e0eaee54c5a4..5d58d49e9f87 100644
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -6,6 +6,7 @@
  */
 
 struct task_struct;
+struct pid_namespace;
 
 extern void dump_cpu_task(int cpu);
 
@@ -34,7 +35,8 @@ extern void sched_show_task(struct task_struct *p);
 
 #ifdef CONFIG_SCHED_DEBUG
 struct seq_file;
-extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
+extern void proc_sched_show_task(struct task_struct *p,
+				 struct pid_namespace *ns, struct seq_file *m);
 extern void proc_sched_set_task(struct task_struct *p);
 #endif
 
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index c97e5f096927..79a2a744648d 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -30,7 +30,6 @@ extern int lockdep_tasklist_lock_is_held(void);
 
 extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
-extern void init_idle_bootup_task(struct task_struct *idle);
 
 extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
 extern void sched_dead(struct task_struct *p);
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7d065abc7a47..d7b6dab956ec 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,6 +71,14 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+
+	/*
+	 * Some variables from the most recent sd_lb_stats for this domain,
+	 * used by wake_affine().
+	 */
+	unsigned long	nr_running;
+	unsigned long	load;
+	unsigned long	capacity;
 };
 
 struct sched_domain {
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 4e8cce19b507..69e079c5ff98 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -153,12 +153,6 @@ do {								\
 #define smp_mb__after_spinlock()	do { } while (0)
 #endif
 
-/**
- * raw_spin_unlock_wait - wait until the spinlock gets unlocked
- * @lock: the spinlock in question.
- */
-#define raw_spin_unlock_wait(lock)	arch_spin_unlock_wait(&(lock)->raw_lock)
-
 #ifdef CONFIG_DEBUG_SPINLOCK
  extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
 #define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock)
@@ -392,31 +386,6 @@ static __always_inline int spin_trylock_irq(spinlock_t *lock)
 	raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
 })
 
-/**
- * spin_unlock_wait - Interpose between successive critical sections
- * @lock: the spinlock whose critical sections are to be interposed.
- *
- * Semantically this is equivalent to a spin_lock() immediately
- * followed by a spin_unlock().  However, most architectures have
- * more efficient implementations in which the spin_unlock_wait()
- * cannot block concurrent lock acquisition, and in some cases
- * where spin_unlock_wait() does not write to the lock variable.
- * Nevertheless, spin_unlock_wait() can have high overhead, so if
- * you feel the need to use it, please check to see if there is
- * a better way to get your job done.
- *
- * The ordering guarantees provided by spin_unlock_wait() are:
- *
- * 1.  All accesses preceding the spin_unlock_wait() happen before
- *     any accesses in later critical sections for this same lock.
- * 2.  All accesses following the spin_unlock_wait() happen after
- *     any accesses in earlier critical sections for this same lock.
- */
-static __always_inline void spin_unlock_wait(spinlock_t *lock)
-{
-	raw_spin_unlock_wait(&lock->rlock);
-}
-
 static __always_inline int spin_is_locked(spinlock_t *lock)
 {
 	return raw_spin_is_locked(&lock->rlock);
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index 0d9848de677d..612fb530af41 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -26,11 +26,6 @@
 #ifdef CONFIG_DEBUG_SPINLOCK
 #define arch_spin_is_locked(x)		((x)->slock == 0)
 
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->slock, VAL);
-}
-
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	lock->slock = 0;
@@ -73,7 +68,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 
 #else /* DEBUG_SPINLOCK */
 #define arch_spin_is_locked(lock)	((void)(lock), 0)
-#define arch_spin_unlock_wait(lock)	do { barrier(); (void)(lock); } while (0)
 /* for sched/core.c and kernel_lock.c: */
 # define arch_spin_lock(lock)		do { barrier(); (void)(lock); } while (0)
 # define arch_spin_lock_flags(lock, flags)	do { barrier(); (void)(lock); } while (0)
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index cfbfc540cafc..261471f407a5 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -87,4 +87,17 @@ static inline void srcu_barrier(struct srcu_struct *sp)
 	synchronize_srcu(sp);
 }
 
+/* Defined here to avoid size increase for non-torture kernels. */
+static inline void srcu_torture_stats_print(struct srcu_struct *sp,
+					    char *tt, char *tf)
+{
+	int idx;
+
+	idx = READ_ONCE(sp->srcu_idx) & 0x1;
+	pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
+		 tt, tf, idx,
+		 READ_ONCE(sp->srcu_lock_nesting[!idx]),
+		 READ_ONCE(sp->srcu_lock_nesting[idx]));
+}
+
 #endif
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 42973f787e7e..a949f4f9e4d7 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -104,8 +104,6 @@ struct srcu_struct {
 #define SRCU_STATE_SCAN1	1
 #define SRCU_STATE_SCAN2	2
 
-void process_srcu(struct work_struct *work);
-
 #define __SRCU_STRUCT_INIT(name)					\
 	{								\
 		.sda = &name##_srcu_data,				\
@@ -141,5 +139,6 @@ void process_srcu(struct work_struct *work);
 
 void synchronize_srcu_expedited(struct srcu_struct *sp);
 void srcu_barrier(struct srcu_struct *sp);
+void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf);
 
 #endif
diff --git a/include/linux/swait.h b/include/linux/swait.h
index c1f9c62a8a50..4a4e180d0a35 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -169,4 +169,59 @@ do {									\
 	__ret;								\
 })
 
+#define __swait_event_idle(wq, condition)				\
+	(void)___swait_event(wq, condition, TASK_IDLE, 0, schedule())
+
+/**
+ * swait_event_idle - wait without system load contribution
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ *
+ * The process is put to sleep (TASK_IDLE) until the @condition evaluates to
+ * true. The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * This function is mostly used when a kthread or workqueue waits for some
+ * condition and doesn't want to contribute to system load. Signals are
+ * ignored.
+ */
+#define swait_event_idle(wq, condition)					\
+do {									\
+	if (condition)							\
+		break;							\
+	__swait_event_idle(wq, condition);				\
+} while (0)
+
+#define __swait_event_idle_timeout(wq, condition, timeout)		\
+	___swait_event(wq, ___wait_cond_timeout(condition),		\
+		       TASK_IDLE, timeout,				\
+		       __ret = schedule_timeout(__ret))
+
+/**
+ * swait_event_idle_timeout - wait up to timeout without load contribution
+ * @wq: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout at which we'll give up in jiffies
+ *
+ * The process is put to sleep (TASK_IDLE) until the @condition evaluates to
+ * true. The @condition is checked each time the waitqueue @wq is woken up.
+ *
+ * This function is mostly used when a kthread or workqueue waits for some
+ * condition and doesn't want to contribute to system load. Signals are
+ * ignored.
+ *
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * or the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed.
+ */
+#define swait_event_idle_timeout(wq, condition, timeout)		\
+({									\
+	long __ret = timeout;						\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __swait_event_idle_timeout(wq,			\
+						   condition, timeout);	\
+	__ret;								\
+})
+
 #endif /* _LINUX_SWAIT_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 138c94535864..d4dfac878fab 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -207,6 +207,22 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	}								\
 	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
 
+#ifdef TIF_FSCHECK
+/*
+ * Called before returning to user-mode. Returning to user-mode with an
+ * address limit other than USER_DS can allow kernel memory to be overwritten.
+ */
+static inline void addr_limit_user_check(void)
+{
+
+	if (!test_thread_flag(TIF_FSCHECK))
+		return;
+
+	BUG_ON(!segment_eq(get_fs(), USER_DS));
+	clear_thread_flag(TIF_FSCHECK);
+}
+#endif
+
 asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
 			       qid_t id, void __user *addr);
 asmlinkage long sys_time(time_t __user *tloc);
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 91dc089d65b7..e91ae1f2290d 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -703,6 +703,7 @@ TRACE_EVENT(rcu_batch_end,
  * at the beginning and end of the read, respectively.  Note that the
  * callback address can be NULL.
  */
+#define RCUTORTURENAME_LEN 8
 TRACE_EVENT(rcu_torture_read,
 
 	TP_PROTO(const char *rcutorturename, struct rcu_head *rhp,
@@ -711,7 +712,7 @@ TRACE_EVENT(rcu_torture_read,
 	TP_ARGS(rcutorturename, rhp, secs, c_old, c),
 
 	TP_STRUCT__entry(
-		__field(const char *, rcutorturename)
+		__field(char, rcutorturename[RCUTORTURENAME_LEN])
 		__field(struct rcu_head *, rhp)
 		__field(unsigned long, secs)
 		__field(unsigned long, c_old)
@@ -719,7 +720,9 @@ TRACE_EVENT(rcu_torture_read,
 	),
 
 	TP_fast_assign(
-		__entry->rcutorturename = rcutorturename;
+		strncpy(__entry->rcutorturename, rcutorturename,
+			RCUTORTURENAME_LEN);
+		__entry->rcutorturename[RCUTORTURENAME_LEN - 1] = 0;
 		__entry->rhp = rhp;
 		__entry->secs = secs;
 		__entry->c_old = c_old;
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index e0b108bd2624..6d47b3249d8a 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -40,14 +40,33 @@
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ *                          Execute a memory barrier on each running
+ *                          thread belonging to the same process as the current
+ *                          thread. Upon return from system call, the
+ *                          caller thread is ensured that all its running
+ *                          thread siblings have passed through a state
+ *                          where all memory accesses to user-space
+ *                          addresses match program order between entry
+ *                          to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This only covers threads from the
+ *                          same process as the caller thread. This
+ *                          command returns 0. The "expedited" commands
+ *                          complete faster than the non-expedited ones;
+ *                          they never block, but have the downside of
+ *                          causing extra overhead.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
  * the value 0.
  */
 enum membarrier_cmd {
-	MEMBARRIER_CMD_QUERY = 0,
-	MEMBARRIER_CMD_SHARED = (1 << 0),
+	MEMBARRIER_CMD_QUERY			= 0,
+	MEMBARRIER_CMD_SHARED			= (1 << 0),
+	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
+	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED	= (1 << 3),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index b1c0b187acfe..140ae638cfd6 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
+	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
 
-	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
 };
 
 /*
@@ -174,6 +175,8 @@ enum perf_branch_sample_type_shift {
 	PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT	= 14, /* no flags */
 	PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT	= 15, /* no cycles */
 
+	PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT	= 16, /* save branch type */
+
 	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
 };
 
@@ -198,9 +201,30 @@ enum perf_branch_sample_type {
 	PERF_SAMPLE_BRANCH_NO_FLAGS	= 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT,
 	PERF_SAMPLE_BRANCH_NO_CYCLES	= 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT,
 
+	PERF_SAMPLE_BRANCH_TYPE_SAVE	=
+		1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT,
+
 	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
+/*
+ * Common flow change classification
+ */
+enum {
+	PERF_BR_UNKNOWN		= 0,	/* unknown */
+	PERF_BR_COND		= 1,	/* conditional */
+	PERF_BR_UNCOND		= 2,	/* unconditional  */
+	PERF_BR_IND		= 3,	/* indirect */
+	PERF_BR_CALL		= 4,	/* function call */
+	PERF_BR_IND_CALL	= 5,	/* indirect function call */
+	PERF_BR_RET		= 6,	/* function return */
+	PERF_BR_SYSCALL		= 7,	/* syscall */
+	PERF_BR_SYSRET		= 8,	/* syscall return */
+	PERF_BR_COND_CALL	= 9,	/* conditional function call */
+	PERF_BR_COND_RET	= 10,	/* conditional function return */
+	PERF_BR_MAX,
+};
+
 #define PERF_SAMPLE_BRANCH_PLM_ALL \
 	(PERF_SAMPLE_BRANCH_USER|\
 	 PERF_SAMPLE_BRANCH_KERNEL|\
@@ -791,6 +815,7 @@ enum perf_event_type {
 	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
@@ -931,14 +956,20 @@ union perf_mem_data_src {
 			mem_snoop:5,	/* snoop mode */
 			mem_lock:2,	/* lock instr */
 			mem_dtlb:7,	/* tlb access */
-			mem_rsvd:31;
+			mem_lvl_num:4,	/* memory hierarchy level number */
+			mem_remote:1,   /* remote */
+			mem_snoopx:2,	/* snoop mode, ext */
+			mem_rsvd:24;
 	};
 };
 #elif defined(__BIG_ENDIAN_BITFIELD)
 union perf_mem_data_src {
 	__u64 val;
 	struct {
-		__u64	mem_rsvd:31,
+		__u64	mem_rsvd:24,
+			mem_snoopx:2,	/* snoop mode, ext */
+			mem_remote:1,   /* remote */
+			mem_lvl_num:4,	/* memory hierarchy level number */
 			mem_dtlb:7,	/* tlb access */
 			mem_lock:2,	/* lock instr */
 			mem_snoop:5,	/* snoop mode */
@@ -975,6 +1006,22 @@ union perf_mem_data_src {
 #define PERF_MEM_LVL_UNC	0x2000 /* Uncached memory */
 #define PERF_MEM_LVL_SHIFT	5
 
+#define PERF_MEM_REMOTE_REMOTE	0x01  /* Remote */
+#define PERF_MEM_REMOTE_SHIFT	37
+
+#define PERF_MEM_LVLNUM_L1	0x01 /* L1 */
+#define PERF_MEM_LVLNUM_L2	0x02 /* L2 */
+#define PERF_MEM_LVLNUM_L3	0x03 /* L3 */
+#define PERF_MEM_LVLNUM_L4	0x04 /* L4 */
+/* 5-0xa available */
+#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */
+#define PERF_MEM_LVLNUM_LFB	0x0c /* LFB */
+#define PERF_MEM_LVLNUM_RAM	0x0d /* RAM */
+#define PERF_MEM_LVLNUM_PMEM	0x0e /* PMEM */
+#define PERF_MEM_LVLNUM_NA	0x0f /* N/A */
+
+#define PERF_MEM_LVLNUM_SHIFT	33
+
 /* snoop mode */
 #define PERF_MEM_SNOOP_NA	0x01 /* not available */
 #define PERF_MEM_SNOOP_NONE	0x02 /* no snoop */
@@ -983,6 +1030,10 @@ union perf_mem_data_src {
 #define PERF_MEM_SNOOP_HITM	0x10 /* snoop hit modified */
 #define PERF_MEM_SNOOP_SHIFT	19
 
+#define PERF_MEM_SNOOPX_FWD	0x01 /* forward */
+/* 1 free */
+#define PERF_MEM_SNOOPX_SHIFT	38 /* mem_snoopx follows mem_remote (bit 37) */
+
 /* locked instruction */
 #define PERF_MEM_LOCK_NA	0x01 /* not available */
 #define PERF_MEM_LOCK_LOCKED	0x02 /* locked transaction */
@@ -1015,6 +1066,7 @@ union perf_mem_data_src {
  *     in_tx: running in a hardware transaction
  *     abort: aborting a hardware transaction
  *    cycles: cycles from last branch (or 0 if not supported)
+ *      type: branch type
  */
 struct perf_branch_entry {
 	__u64	from;
@@ -1024,7 +1076,8 @@ struct perf_branch_entry {
 		in_tx:1,    /* in transaction */
 		abort:1,    /* transaction abort */
 		cycles:16,  /* cycle count to last branch */
-		reserved:44;
+		type:4,     /* branch type */
+		reserved:40;
 };
 
 #endif /* _UAPI_LINUX_PERF_EVENT_H */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index c07295969b7e..6d5d5faa989b 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -1,7 +1,7 @@
 #ifndef _UAPI_LINUX_VIRTIO_RING_H
 #define _UAPI_LINUX_VIRTIO_RING_H
-/* An interface for efficient virtio implementation, currently for use by KVM
- * and lguest, but hopefully others soon.  Do NOT change this since it will
+/* An interface for efficient virtio implementation, currently for use by KVM,
+ * but hopefully others soon.  Do NOT change this since it will
  * break existing servers and clients.
  *
  * This header is BSD licensed so anyone can use the definitions to implement
diff --git a/init/main.c b/init/main.c
index 052481fbe363..b78f63c30b17 100644
--- a/init/main.c
+++ b/init/main.c
@@ -430,7 +430,6 @@ static noinline void __ref rest_init(void)
 	 * The boot idle thread must execute schedule()
 	 * at least once to get things moving:
 	 */
-	init_idle_bootup_task(current);
 	schedule_preempt_disabled();
 	/* Call into cpu_idle with preempt disabled */
 	cpu_startup_entry(CPUHP_ONLINE);
@@ -651,8 +650,8 @@ asmlinkage __visible void __init start_kernel(void)
 	}
 #endif
 	page_ext_init();
-	debug_objects_mem_init();
 	kmemleak_init();
+	debug_objects_mem_init();
 	setup_per_cpu_pageset();
 	numa_policy_init();
 	if (late_time_init)
diff --git a/ipc/sem.c b/ipc/sem.c
index 38371e93bfa5..c6c50370504c 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -2091,7 +2091,8 @@ void exit_sem(struct task_struct *tsk)
 			 * possibility where we exit while freeary() didn't
 			 * finish unlocking sem_undo_list.
 			 */
-			spin_unlock_wait(&ulp->lock);
+			spin_lock(&ulp->lock);
+			spin_unlock(&ulp->lock);
 			rcu_read_unlock();
 			break;
 		}
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb8e8b23c6e..9c323a6daa46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
 
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 39ca7e976fbb..2f4039bafebb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2351,13 +2351,7 @@ void cpuset_update_active_cpus(void)
 	 * We're inside cpu hotplug critical region which usually nests
 	 * inside cgroup synchronization.  Bounce actual hotplug processing
 	 * to a work item to avoid reverse locking order.
-	 *
-	 * We still need to do partition_sched_domains() synchronously;
-	 * otherwise, the scheduler will get confused and put tasks to the
-	 * dead CPU.  Fall back to the default single domain.
-	 * cpuset_hotplug_workfn() will rebuild it as necessary.
 	 */
-	partition_sched_domains(1, NULL, NULL);
 	schedule_work(&cpuset_hotplug_work);
 }
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index eee033134262..bfbd649ccdc8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -650,6 +650,7 @@ static int takedown_cpu(unsigned int cpu)
 	__cpu_die(cpu);
 
 	tick_cleanup_dead_cpu(cpu);
+	rcutree_migrate_callbacks(cpu);
 	return 0;
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 03ac9c8b02fb..ce64f3fed5c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx)
 	return parent_ctx;
 }
 
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
+				enum pid_type type)
 {
+	u32 nr;
 	/*
 	 * only top level events have the pid namespace they were created in
 	 */
 	if (event->parent)
 		event = event->parent;
 
-	return task_tgid_nr_ns(p, event->ns);
+	nr = __task_pid_nr_ns(p, type, event->ns);
+	/* -1 only for a dead task; 0 may be the idle thread or another ns */
+	if (!nr && !pid_alive(p))
+		nr = -1;
+	return nr;
 }
 
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
 {
-	/*
-	 * only top level events have the pid namespace they were created in
-	 */
-	if (event->parent)
-		event = event->parent;
+	return perf_event_pid_type(event, p, __PIDTYPE_TGID);
+}
 
-	return task_pid_nr_ns(p, event->ns);
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+	return perf_event_pid_type(event, p, PIDTYPE_PID);
 }
 
 /*
@@ -1570,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 	if (sample_type & PERF_SAMPLE_TRANSACTION)
 		size += sizeof(data->txn);
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		size += sizeof(data->phys_addr);
+
 	event->header_size = size;
 }
 
@@ -3211,6 +3219,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 		return;
 
 	perf_ctx_lock(cpuctx, ctx);
+	/*
+	 * We must check ctx->nr_events while holding ctx->lock, such
+	 * that we serialize against perf_install_in_context().
+	 */
+	if (!ctx->nr_events)
+		goto unlock;
+
 	perf_pmu_disable(ctx->pmu);
 	/*
 	 * We want to keep the following priority order:
@@ -3224,6 +3239,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);
 	perf_pmu_enable(ctx->pmu);
+
+unlock:
 	perf_ctx_unlock(cpuctx, ctx);
 }
 
@@ -6003,6 +6020,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		perf_output_put(handle, data->phys_addr);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6018,6 +6038,38 @@ void perf_output_sample(struct perf_output_handle *handle,
 	}
 }
 
+static u64 perf_virt_to_phys(u64 virt)
+{
+	u64 phys_addr = 0;
+	struct page *p = NULL;
+
+	if (!virt)
+		return 0;
+
+	if (virt >= TASK_SIZE) {
+		/* If it's vmalloc()d memory, leave phys_addr as 0 */
+		if (virt_addr_valid((void *)(uintptr_t)virt) &&
+		    !(virt >= VMALLOC_START && virt < VMALLOC_END))
+			phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+	} else {
+		/*
+		 * Walk the page tables for a user address.
+		 * Interrupts are disabled, which prevents any teardown
+		 * of the page tables.
+		 * Try the IRQ-safe __get_user_pages_fast().
+		 * If that fails, leave phys_addr as 0.
+		 */
+		if ((current->mm != NULL) &&
+		    (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+			phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+		if (p)
+			put_page(p);
+	}
+
+	return phys_addr;
+}
+
 void perf_prepare_sample(struct perf_event_header *header,
 			 struct perf_sample_data *data,
 			 struct perf_event *event,
@@ -6136,6 +6188,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 		header->size += size;
 	}
+
+	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+		data->phys_addr = perf_virt_to_phys(data->addr);
 }
 
 static void __always_inline
@@ -7287,6 +7342,11 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_output_end(&handle);
 }
 
+void perf_event_itrace_started(struct perf_event *event)
+{
+	event->attach_state |= PERF_ATTACH_ITRACE;
+}
+
 static void perf_log_itrace_start(struct perf_event *event)
 {
 	struct perf_output_handle handle;
@@ -7302,7 +7362,7 @@ static void perf_log_itrace_start(struct perf_event *event)
 		event = event->parent;
 
 	if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
-	    event->hw.itrace_started)
+	    event->attach_state & PERF_ATTACH_ITRACE)
 		return;
 
 	rec.header.type	= PERF_RECORD_ITRACE_START;
@@ -9890,6 +9950,11 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/* Only privileged users can get physical addresses */
+	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
 	if (!attr.sample_max_stack)
 		attr.sample_max_stack = sysctl_perf_event_max_stack;
 
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 486fd78eb8d5..843e97047335 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -38,9 +38,9 @@ struct ring_buffer {
 	struct user_struct		*mmap_user;
 
 	/* AUX area */
-	local_t				aux_head;
+	long				aux_head;
 	local_t				aux_nest;
-	local_t				aux_wakeup;
+	long				aux_wakeup;	/* last aux_watermark boundary crossed by aux_head */
 	unsigned long			aux_pgoff;
 	int				aux_nr_pages;
 	int				aux_overwrite;
@@ -208,7 +208,7 @@ static inline int get_recursion_context(int *recursion)
 {
 	int rctx;
 
-	if (in_nmi())
+	if (unlikely(in_nmi()))
 		rctx = 3;
 	else if (in_irq())
 		rctx = 2;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ee97196bb151..af71a84e12ee 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
 	if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
 		goto err_put;
 
-	aux_head = local_read(&rb->aux_head);
+	aux_head = rb->aux_head;
 
 	handle->rb = rb;
 	handle->event = event;
@@ -382,7 +382,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
 	 */
 	if (!rb->aux_overwrite) {
 		aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
-		handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
 		if (aux_head - aux_tail < perf_aux_size(rb))
 			handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
 
@@ -433,12 +433,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 		handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
 
 		aux_head = handle->head;
-		local_set(&rb->aux_head, aux_head);
+		rb->aux_head = aux_head;
 	} else {
 		handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
 
-		aux_head = local_read(&rb->aux_head);
-		local_add(size, &rb->aux_head);
+		aux_head = rb->aux_head;
+		rb->aux_head += size;
 	}
 
 	if (size || handle->aux_flags) {
@@ -450,11 +450,10 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 		                     handle->aux_flags);
 	}
 
-	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
-
-	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+	rb->user_page->aux_head = rb->aux_head;
+	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
 		wakeup = true;
-		local_add(rb->aux_watermark, &rb->aux_wakeup);
+		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
 	}
 
 	if (wakeup) {
@@ -478,22 +477,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
 {
 	struct ring_buffer *rb = handle->rb;
-	unsigned long aux_head;
 
 	if (size > handle->size)
 		return -ENOSPC;
 
-	local_add(size, &rb->aux_head);
+	rb->aux_head += size;
 
-	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
-	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+	rb->user_page->aux_head = rb->aux_head;
+	if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
 		perf_output_wakeup(handle);
-		local_add(rb->aux_watermark, &rb->aux_wakeup);
-		handle->wakeup = local_read(&rb->aux_wakeup) +
-				 rb->aux_watermark;
+		rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
 	}
 
-	handle->head = aux_head;
+	handle->head = rb->aux_head;
 	handle->size -= size;
 
 	return 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index fa72d57db747..a35d8a17e01f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -764,7 +764,6 @@ void __noreturn do_exit(long code)
 {
 	struct task_struct *tsk = current;
 	int group_dead;
-	TASKS_RCU(int tasks_rcu_i);
 
 	profile_task_exit(tsk);
 	kcov_task_exit(tsk);
@@ -819,7 +818,8 @@ void __noreturn do_exit(long code)
 	 * Ensure that we must observe the pi_state in exit_mm() ->
 	 * mm_release() -> exit_pi_state_list().
 	 */
-	raw_spin_unlock_wait(&tsk->pi_lock);
+	raw_spin_lock_irq(&tsk->pi_lock);
+	raw_spin_unlock_irq(&tsk->pi_lock);
 
 	if (unlikely(in_atomic())) {
 		pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -881,9 +881,7 @@ void __noreturn do_exit(long code)
 	 */
 	flush_ptrace_hw_breakpoint(tsk);
 
-	TASKS_RCU(preempt_disable());
-	TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
-	TASKS_RCU(preempt_enable());
+	exit_tasks_rcu_start();
 	exit_notify(tsk, group_dead);
 	proc_exit_connector(tsk);
 	mpol_put_task_policy(tsk);
@@ -918,7 +916,7 @@ void __noreturn do_exit(long code)
 	if (tsk->nr_dirtied)
 		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 	exit_rcu();
-	TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
+	exit_tasks_rcu_finish();
 
 	lockdep_free_task(tsk);
 	do_task_dead();
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index fd24153e8a48..294294c71ba4 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -268,123 +268,6 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
 #endif
 
-/*
- * Various notes on spin_is_locked() and spin_unlock_wait(), which are
- * 'interesting' functions:
- *
- * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
- * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
- * PPC). Also qspinlock has a similar issue per construction, the setting of
- * the locked byte can be unordered acquiring the lock proper.
- *
- * This gets to be 'interesting' in the following cases, where the /should/s
- * end up false because of this issue.
- *
- *
- * CASE 1:
- *
- * So the spin_is_locked() correctness issue comes from something like:
- *
- *   CPU0				CPU1
- *
- *   global_lock();			local_lock(i)
- *     spin_lock(&G)			  spin_lock(&L[i])
- *     for (i)				  if (!spin_is_locked(&G)) {
- *       spin_unlock_wait(&L[i]);	    smp_acquire__after_ctrl_dep();
- *					    return;
- *					  }
- *					  // deal with fail
- *
- * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
- * that there is exclusion between the two critical sections.
- *
- * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
- * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
- * /should/ be constrained by the ACQUIRE from spin_lock(&G).
- *
- * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
- *
- *
- * CASE 2:
- *
- * For spin_unlock_wait() there is a second correctness issue, namely:
- *
- *   CPU0				CPU1
- *
- *   flag = set;
- *   smp_mb();				spin_lock(&l)
- *   spin_unlock_wait(&l);		if (!flag)
- *					  // add to lockless list
- *					spin_unlock(&l);
- *   // iterate lockless list
- *
- * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
- * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
- * semantics etc..)
- *
- * Where flag /should/ be ordered against the locked store of l.
- */
-
-/*
- * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
- * issuing an _unordered_ store to set _Q_LOCKED_VAL.
- *
- * This means that the store can be delayed, but no later than the
- * store-release from the unlock. This means that simply observing
- * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
- *
- * There are two paths that can issue the unordered store:
- *
- *  (1) clear_pending_set_locked():	*,1,0 -> *,0,1
- *
- *  (2) set_locked():			t,0,0 -> t,0,1 ; t != 0
- *      atomic_cmpxchg_relaxed():	t,0,0 -> 0,0,1
- *
- * However, in both cases we have other !0 state we've set before to queue
- * ourseves:
- *
- * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
- * load is constrained by that ACQUIRE to not pass before that, and thus must
- * observe the store.
- *
- * For (2) we have a more intersting scenario. We enqueue ourselves using
- * xchg_tail(), which ends up being a RELEASE. This in itself is not
- * sufficient, however that is followed by an smp_cond_acquire() on the same
- * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
- * guarantees we must observe that store.
- *
- * Therefore both cases have other !0 state that is observable before the
- * unordered locked byte store comes through. This means we can use that to
- * wait for the lock store, and then wait for an unlock.
- */
-#ifndef queued_spin_unlock_wait
-void queued_spin_unlock_wait(struct qspinlock *lock)
-{
-	u32 val;
-
-	for (;;) {
-		val = atomic_read(&lock->val);
-
-		if (!val) /* not locked, we're done */
-			goto done;
-
-		if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
-			break;
-
-		/* not locked, but pending, wait until we observe the lock */
-		cpu_relax();
-	}
-
-	/* any unlock is good */
-	while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
-		cpu_relax();
-
-done:
-	smp_acquire__after_ctrl_dep();
-}
-EXPORT_SYMBOL(queued_spin_unlock_wait);
-#endif
-
 #endif /* _GEN_PV_LOCK_SLOWPATH */
 
 /**
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
deleted file mode 100644
index 9f9284f37f8d..000000000000
--- a/kernel/membarrier.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
- *
- * membarrier system call
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/syscalls.h>
-#include <linux/membarrier.h>
-#include <linux/tick.h>
-
-/*
- * Bitmask made from a "or" of all commands within enum membarrier_cmd,
- * except MEMBARRIER_CMD_QUERY.
- */
-#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
-
-/**
- * sys_membarrier - issue memory barriers on a set of threads
- * @cmd:   Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
- *
- * If this system call is not implemented, -ENOSYS is returned. If the
- * command specified does not exist, or if the command argument is invalid,
- * this system call returns -EINVAL. For a given command, with flags argument
- * set to 0, this system call is guaranteed to always return the same value
- * until reboot.
- *
- * All memory accesses performed in program order from each targeted thread
- * is guaranteed to be ordered with respect to sys_membarrier(). If we use
- * the semantic "barrier()" to represent a compiler barrier forcing memory
- * accesses to be performed in program order across the barrier, and
- * smp_mb() to represent explicit memory barriers forcing full memory
- * ordering across the barrier, we have the following ordering table for
- * each pair of barrier(), sys_membarrier() and smp_mb():
- *
- * The pair ordering is detailed as (O: ordered, X: not ordered):
- *
- *                        barrier()   smp_mb() sys_membarrier()
- *        barrier()          X           X            O
- *        smp_mb()           X           O            O
- *        sys_membarrier()   O           O            O
- */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
-{
-	/* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
-	if (tick_nohz_full_enabled())
-		return -ENOSYS;
-	if (unlikely(flags))
-		return -EINVAL;
-	switch (cmd) {
-	case MEMBARRIER_CMD_QUERY:
-		return MEMBARRIER_CMD_BITMASK;
-	case MEMBARRIER_CMD_SHARED:
-		if (num_online_cpus() > 1)
-			synchronize_sched();
-		return 0;
-	default:
-		return -EINVAL;
-	}
-}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index be90c945063f..9210379c0353 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -69,8 +69,7 @@ config TREE_SRCU
 	  This option selects the full-fledged version of SRCU.
 
 config TASKS_RCU
-	bool
-	default n
+	def_bool PREEMPT
 	select SRCU
 	help
 	  This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 808b8c85f626..e4b43fef89f5 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -356,22 +356,10 @@ do {									\
 
 #ifdef CONFIG_TINY_RCU
 /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
-static inline bool rcu_gp_is_normal(void)  /* Internal RCU use. */
-{
-	return true;
-}
-static inline bool rcu_gp_is_expedited(void)  /* Internal RCU use. */
-{
-	return false;
-}
-
-static inline void rcu_expedite_gp(void)
-{
-}
-
-static inline void rcu_unexpedite_gp(void)
-{
-}
+static inline bool rcu_gp_is_normal(void) { return true; }
+static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline void rcu_expedite_gp(void) { }
+static inline void rcu_unexpedite_gp(void) { }
 #else /* #ifdef CONFIG_TINY_RCU */
 bool rcu_gp_is_normal(void);     /* Internal RCU use. */
 bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
@@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
 	*gpnum = 0;
 	*completed = 0;
 }
-static inline void rcutorture_record_test_transition(void)
-{
-}
-static inline void rcutorture_record_progress(unsigned long vernum)
-{
-}
+static inline void rcutorture_record_test_transition(void) { }
+static inline void rcutorture_record_progress(unsigned long vernum) { }
 #ifdef CONFIG_RCU_TRACE
 void do_trace_rcu_torture_read(const char *rcutorturename,
 			       struct rcu_head *rhp,
@@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
 #endif
 
 #ifdef CONFIG_TINY_RCU
-
-/*
- * Return the number of grace periods started.
- */
-static inline unsigned long rcu_batches_started(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods started.
- */
-static inline unsigned long rcu_batches_started_bh(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of sched grace periods started.
- */
-static inline unsigned long rcu_batches_started_sched(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of grace periods completed.
- */
-static inline unsigned long rcu_batches_completed(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_bh(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of sched grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_sched(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of expedited grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of expedited sched grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed_sched(void)
-{
-	return 0;
-}
-
-static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
-	return 0;
-}
-
-static inline void rcu_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_bh_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_sched_force_quiescent_state(void)
-{
-}
-
-static inline void show_rcu_gp_kthreads(void)
-{
-}
-
+static inline unsigned long rcu_batches_started(void) { return 0; }
+static inline unsigned long rcu_batches_started_bh(void) { return 0; }
+static inline unsigned long rcu_batches_started_sched(void) { return 0; }
+static inline unsigned long rcu_batches_completed(void) { return 0; }
+static inline unsigned long rcu_batches_completed_bh(void) { return 0; }
+static inline unsigned long rcu_batches_completed_sched(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; }
+static inline unsigned long
+srcu_batches_completed(struct srcu_struct *sp) { return 0; }
+static inline void rcu_force_quiescent_state(void) { }
+static inline void rcu_bh_force_quiescent_state(void) { }
+static inline void rcu_sched_force_quiescent_state(void) { }
+static inline void show_rcu_gp_kthreads(void) { }
 #else /* #ifdef CONFIG_TINY_RCU */
 extern unsigned long rcutorture_testseq;
 extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 2b62a38b080f..7649fcd2c4c7 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -36,24 +36,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
 }
 
 /*
- * Debug function to actually count the number of callbacks.
- * If the number exceeds the limit specified, return -1.
- */
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
-{
-	int cnt = 0;
-	struct rcu_head **rhpp = &rclp->head;
-
-	for (;;) {
-		if (!*rhpp)
-			return cnt;
-		if (++cnt > lim)
-			return -1;
-		rhpp = &(*rhpp)->next;
-	}
-}
-
-/*
  * Dequeue the oldest rcu_head structure from the specified callback
  * list.  This function assumes that the callback is non-lazy, but
  * the caller can later invoke rcu_cblist_dequeued_lazy() if it
@@ -103,17 +85,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
 }
 
 /*
- * Is the specified segment of the specified rcu_segcblist structure
- * empty of callbacks?
- */
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
-{
-	if (seg == RCU_DONE_TAIL)
-		return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
-	return rsclp->tails[seg - 1] == rsclp->tails[seg];
-}
-
-/*
  * Does the specified rcu_segcblist structure contain callbacks that
  * are ready to be invoked?
  */
@@ -134,50 +105,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
 }
 
 /*
- * Dequeue and return the first ready-to-invoke callback.  If there
- * are no ready-to-invoke callbacks, return NULL.  Disables interrupts
- * to avoid interference.  Does not protect from interference from other
- * CPUs or tasks.
- */
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
-{
-	unsigned long flags;
-	int i;
-	struct rcu_head *rhp;
-
-	local_irq_save(flags);
-	if (!rcu_segcblist_ready_cbs(rsclp)) {
-		local_irq_restore(flags);
-		return NULL;
-	}
-	rhp = rsclp->head;
-	BUG_ON(!rhp);
-	rsclp->head = rhp->next;
-	for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
-		if (rsclp->tails[i] != &rhp->next)
-			break;
-		rsclp->tails[i] = &rsclp->head;
-	}
-	smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
-	WRITE_ONCE(rsclp->len, rsclp->len - 1);
-	local_irq_restore(flags);
-	return rhp;
-}
-
-/*
- * Account for the fact that a previously dequeued callback turned out
- * to be marked as lazy.
- */
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	rsclp->len_lazy--;
-	local_irq_restore(flags);
-}
-
-/*
  * Return a pointer to the first callback in the specified rcu_segcblist
  * structure.  This is useful for diagnostics.
  */
@@ -203,17 +130,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
 }
 
 /*
- * Does the specified rcu_segcblist structure contain callbacks that
- * have not yet been processed beyond having been posted, that is,
- * does it contain callbacks in its last segment?
- */
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
-{
-	return rcu_segcblist_is_enabled(rsclp) &&
-	       !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
-}
-
-/*
  * Enqueue the specified callback onto the specified rcu_segcblist
  * structure, updating accounting as needed.  Note that the ->len
  * field may be accessed locklessly, hence the WRITE_ONCE().
@@ -503,3 +419,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
 			return true;
 	return false;
 }
+
+/*
+ * Merge the source rcu_segcblist structure into the destination
+ * rcu_segcblist structure, then initialize the source.  Any pending
+ * callbacks from the source get to start over.  It is best to
+ * advance and accelerate both the destination and the source
+ * before merging.
+ */
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+			 struct rcu_segcblist *src_rsclp)
+{
+	struct rcu_cblist donecbs;	/* Source's counts and done callbacks. */
+	struct rcu_cblist pendcbs;	/* Source's pending callbacks. */
+
+	rcu_cblist_init(&donecbs);
+	rcu_cblist_init(&pendcbs);
+	rcu_segcblist_extract_count(src_rsclp, &donecbs);
+	rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
+	rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+	rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+	rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
+	rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+	rcu_segcblist_init(src_rsclp);
+}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 6e36e36478cd..581c12b63544 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
 	rclp->len_lazy--;
 }
 
-/*
- * Interim function to return rcu_cblist head pointer.  Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
-{
-	return rclp->head;
-}
-
-/*
- * Interim function to return rcu_cblist head pointer.  Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
-{
-	WARN_ON_ONCE(!rclp->head);
-	return rclp->tail;
-}
-
 void rcu_cblist_init(struct rcu_cblist *rclp);
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
 struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
 
 /*
@@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
 
 void rcu_segcblist_init(struct rcu_segcblist *rsclp);
 void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
 bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
 bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
 struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
 struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
 void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
 			   struct rcu_head *rhp, bool lazy);
 bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
@@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
 bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
 				    unsigned long seq);
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+			 struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cc18110b612..1f87a02c3399 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = {
 	.name		= "sched"
 };
 
-#ifdef CONFIG_TASKS_RCU
-
 /*
  * Definitions for RCU-tasks perf testing.
  */
@@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = {
 	.name		= "tasks"
 };
 
-#define RCUPERF_TASKS_OPS &tasks_ops,
-
 static bool __maybe_unused torturing_tasks(void)
 {
 	return cur_ops == &tasks_ops;
 }
 
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUPERF_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
-	return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
 /*
  * If performance tests complete, wait for shutdown to commence.
  */
@@ -658,7 +643,7 @@ rcu_perf_init(void)
 	int firsterr = 0;
 	static struct rcu_perf_ops *perf_ops[] = {
 		&rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
-		RCUPERF_TASKS_OPS
+		&tasks_ops,
 	};
 
 	if (!torture_init_begin(perf_type, verbose, &perf_runnable))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b8f7f8ce8575..45f2ffbc1e78 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
 static u64 notrace rcu_trace_clock_local(void)
 {
 	u64 ts = trace_clock_local();
-	unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+
+	(void)do_div(ts, NSEC_PER_USEC);
 	return ts;
 }
 #else /* #ifdef CONFIG_RCU_TRACE */
@@ -496,7 +497,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
 	.fqs		= NULL,
 	.stats		= NULL,
 	.irq_capable	= 1,
-	.name		= "rcu_busted"
+	.name		= "busted"
 };
 
 /*
@@ -522,7 +523,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
 
 	delay = torture_random(rrsp) %
 		(nrealreaders * 2 * longdelay * uspertick);
-	if (!delay)
+	if (!delay && in_task())
 		schedule_timeout_interruptible(longdelay);
 	else
 		rcu_read_delay(rrsp);
@@ -561,44 +562,7 @@ static void srcu_torture_barrier(void)
 
 static void srcu_torture_stats(void)
 {
-	int __maybe_unused cpu;
-	int idx;
-
-#ifdef CONFIG_TREE_SRCU
-	idx = srcu_ctlp->srcu_idx & 0x1;
-	pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
-		 torture_type, TORTURE_FLAG, idx);
-	for_each_possible_cpu(cpu) {
-		unsigned long l0, l1;
-		unsigned long u0, u1;
-		long c0, c1;
-		struct srcu_data *counts;
-
-		counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
-		u0 = counts->srcu_unlock_count[!idx];
-		u1 = counts->srcu_unlock_count[idx];
-
-		/*
-		 * Make sure that a lock is always counted if the corresponding
-		 * unlock is counted.
-		 */
-		smp_rmb();
-
-		l0 = counts->srcu_lock_count[!idx];
-		l1 = counts->srcu_lock_count[idx];
-
-		c0 = l0 - u0;
-		c1 = l1 - u1;
-		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
-	}
-	pr_cont("\n");
-#elif defined(CONFIG_TINY_SRCU)
-	idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
-	pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
-		 torture_type, TORTURE_FLAG, idx,
-		 READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
-		 READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
-#endif
+	srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG);
 }
 
 static void srcu_torture_synchronize_expedited(void)
@@ -620,6 +584,7 @@ static struct rcu_torture_ops srcu_ops = {
 	.call		= srcu_torture_call,
 	.cb_barrier	= srcu_torture_barrier,
 	.stats		= srcu_torture_stats,
+	.irq_capable	= 1,
 	.name		= "srcu"
 };
 
@@ -652,6 +617,7 @@ static struct rcu_torture_ops srcud_ops = {
 	.call		= srcu_torture_call,
 	.cb_barrier	= srcu_torture_barrier,
 	.stats		= srcu_torture_stats,
+	.irq_capable	= 1,
 	.name		= "srcud"
 };
 
@@ -696,8 +662,6 @@ static struct rcu_torture_ops sched_ops = {
 	.name		= "sched"
 };
 
-#ifdef CONFIG_TASKS_RCU
-
 /*
  * Definitions for RCU-tasks torture testing.
  */
@@ -735,24 +699,11 @@ static struct rcu_torture_ops tasks_ops = {
 	.name		= "tasks"
 };
 
-#define RCUTORTURE_TASKS_OPS &tasks_ops,
-
 static bool __maybe_unused torturing_tasks(void)
 {
 	return cur_ops == &tasks_ops;
 }
 
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUTORTURE_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
-	return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
 /*
  * RCU torture priority-boost testing.  Runs one real-time thread per
  * CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -1114,6 +1065,11 @@ rcu_torture_fakewriter(void *arg)
 	return 0;
 }
 
+static void rcu_torture_timer_cb(struct rcu_head *rhp)
+{
+	kfree(rhp);	/* Bare rcu_head kmalloc()ed by rcu_torture_timer(). */
+}
+
 /*
  * RCU torture reader from timer handler.  Dereferences rcu_torture_current,
  * incrementing the corresponding element of the pipeline array.  The
@@ -1176,6 +1132,14 @@ static void rcu_torture_timer(unsigned long unused)
 	__this_cpu_inc(rcu_torture_batch[completed]);
 	preempt_enable();
 	cur_ops->readunlock(idx);
+
+	/* Test call_rcu() invocation from interrupt handler. */
+	if (cur_ops->call) {
+		struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT);
+
+		if (rhp)
+			cur_ops->call(rhp, rcu_torture_timer_cb);
+	}
 }
 
 /*
@@ -1354,11 +1318,12 @@ rcu_torture_stats_print(void)
 		srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
 					&flags, &gpnum, &completed);
 		wtp = READ_ONCE(writer_task);
-		pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
+		pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n",
 			 rcu_torture_writer_state_getname(),
 			 rcu_torture_writer_state,
 			 gpnum, completed, flags,
-			 wtp == NULL ? ~0UL : wtp->state);
+			 wtp == NULL ? ~0UL : wtp->state,
+			 wtp == NULL ? -1 : (int)task_cpu(wtp));
 		show_rcu_gp_kthreads();
 		rcu_ftrace_dump(DUMP_ALL);
 	}
@@ -1749,7 +1714,7 @@ rcu_torture_init(void)
 	int firsterr = 0;
 	static struct rcu_torture_ops *torture_ops[] = {
 		&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
-		&sched_ops, RCUTORTURE_TASKS_OPS
+		&sched_ops, &tasks_ops,
 	};
 
 	if (!torture_init_begin(torture_type, verbose, &torture_runnable))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 1a1c1047d2ed..76ac5f50b2c7 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -33,6 +33,8 @@
 #include "rcu_segcblist.h"
 #include "rcu.h"
 
+int rcu_scheduler_active __read_mostly;
+
 static int init_srcu_struct_fields(struct srcu_struct *sp)
 {
 	sp->srcu_lock_nesting[0] = 0;
@@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp)
 	destroy_rcu_head_on_stack(&rs.head);
 }
 EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/* Lockdep diagnostics: go straight to RUNNING, with no INIT stage.  */
+void __init rcu_scheduler_starting(void)
+{
+	rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d0ca524bf042..729a8706751d 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444);
 
 static void srcu_invoke_callbacks(struct work_struct *work);
 static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
+static void process_srcu(struct work_struct *work);
 
 /*
  * Initialize SRCU combining tree.  Note that statically allocated
@@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
 	__call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
 	wait_for_completion(&rcu.completion);
 	destroy_rcu_head_on_stack(&rcu.head);
+
+	/*
+	 * Make sure that later code is ordered after the SRCU grace
+	 * period.  This pairs with the raw_spin_lock_irq_rcu_node()
+	 * in srcu_invoke_callbacks().  Unlike Tree RCU, this is needed
+	 * because the current CPU might have been totally uninvolved with
+	 * (and thus unordered against) that grace period.
+	 */
+	smp_mb();
 }
 
 /**
@@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
 /*
  * This is the work-queue function that handles SRCU grace periods.
  */
-void process_srcu(struct work_struct *work)
+static void process_srcu(struct work_struct *work)
 {
 	struct srcu_struct *sp;
 
@@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work)
 	srcu_advance_state(sp);
 	srcu_reschedule(sp, srcu_get_delay(sp));
 }
-EXPORT_SYMBOL_GPL(process_srcu);
 
 void srcutorture_get_gp_data(enum rcutorture_type test_type,
 			     struct srcu_struct *sp, int *flags,
@@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
 }
 EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
 
+/*
+ * Print, for each possible CPU, the lock-minus-unlock counts of @sp for
+ * both index values, then their totals as T(...).  @tt and @tf prefix it.
+ */
+void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf)
+{
+	int cpu;
+	int idx = sp->srcu_idx & 0x1;
+	unsigned long s0 = 0, s1 = 0;	/* Wrap-safe sums, shown signed. */
+
+	pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx);
+	for_each_possible_cpu(cpu) {
+		unsigned long l0, l1, u0, u1;
+		long c0, c1;
+		struct srcu_data *counts = per_cpu_ptr(sp->sda, cpu);
+
+		u0 = counts->srcu_unlock_count[!idx];
+		u1 = counts->srcu_unlock_count[idx];
+
+		/*
+		 * Make sure that a lock is always counted if the corresponding
+		 * unlock is counted.
+		 */
+		smp_rmb();
+
+		l0 = counts->srcu_lock_count[!idx];
+		l1 = counts->srcu_lock_count[idx];
+		c0 = l0 - u0;
+		c1 = l1 - u1;
+		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
+		s0 += c0;
+		s1 += c1;
+	}
+	pr_cont(" T(%ld,%ld)\n", (long)s0, (long)s1);
+}
+EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
+
 static int __init srcu_bootup_announce(void)
 {
 	pr_info("Hierarchical SRCU implementation.\n");
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f8488965250f..a64eee0db39e 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.curtail	= &rcu_bh_ctrlblk.rcucblist,
 };
 
-#include "tiny_plugin.h"
-
 void rcu_barrier_bh(void)
 {
 	wait_rcu_gp(call_rcu_bh);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
deleted file mode 100644
index f0a01b2a3062..000000000000
--- a/kernel/rcu/tiny_plugin.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
- * Internal non-public definitions that provide either classic
- * or preemptible semantics.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (c) 2010 Linaro
- *
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
- */
-
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
-#include <linux/kernel_stat.h>
-
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-
-/*
- * During boot, we forgive RCU lockdep issues.  After this function is
- * invoked, we start taking RCU lockdep issues seriously.  Note that unlike
- * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
- * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
- * The reason for this is that Tiny RCU does not need kthreads, so does
- * not have to care about the fact that the scheduler is half-initialized
- * at a certain phase of the boot process.  Unless SRCU is in the mix.
- */
-void __init rcu_scheduler_starting(void)
-{
-	WARN_ON(nr_context_switches() > 0);
-	rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
-		? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
-}
-
-#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 51d4c3acf32d..84fe96641b2e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \
 	.gp_state = RCU_GP_IDLE, \
 	.gpnum = 0UL - 300UL, \
 	.completed = 0UL - 300UL, \
-	.orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
-	.orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
-	.orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
 	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
 	.name = RCU_STATE_NAME(sname), \
 	.abbr = sabbr, \
@@ -843,13 +840,9 @@ static void rcu_eqs_enter(bool user)
  */
 void rcu_idle_enter(void)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");
 	rcu_eqs_enter(false);
-	local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
 
 #ifdef CONFIG_NO_HZ_FULL
 /**
@@ -862,7 +855,8 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
  */
 void rcu_user_enter(void)
 {
-	rcu_eqs_enter(1);
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!");
+	rcu_eqs_enter(true);
 }
 #endif /* CONFIG_NO_HZ_FULL */
 
@@ -955,8 +949,10 @@ static void rcu_eqs_exit(bool user)
 	if (oldval & DYNTICK_TASK_NEST_MASK) {
 		rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
 	} else {
+		__this_cpu_inc(disable_rcu_irq_enter);
 		rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
 		rcu_eqs_exit_common(oldval, user);
+		__this_cpu_dec(disable_rcu_irq_enter);
 	}
 }
 
@@ -979,7 +975,6 @@ void rcu_idle_exit(void)
 	rcu_eqs_exit(false);
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
 
 #ifdef CONFIG_NO_HZ_FULL
 /**
@@ -1358,12 +1353,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
 	j = jiffies;
 	gpa = READ_ONCE(rsp->gp_activity);
 	if (j - gpa > 2 * HZ) {
-		pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n",
+		pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
 		       rsp->name, j - gpa,
 		       rsp->gpnum, rsp->completed,
 		       rsp->gp_flags,
 		       gp_state_getname(rsp->gp_state), rsp->gp_state,
-		       rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
+		       rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
+		       rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
 		if (rsp->gp_kthread) {
 			sched_show_task(rsp->gp_kthread);
 			wake_up_process(rsp->gp_kthread);
@@ -2067,8 +2063,8 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 }
 
 /*
- * Helper function for wait_event_interruptible_timeout() wakeup
- * at force-quiescent-state time.
+ * Helper function for swait_event_idle() wakeup at force-quiescent-state
+ * time.
  */
 static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
 {
@@ -2206,9 +2202,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
 					       READ_ONCE(rsp->gpnum),
 					       TPS("reqwait"));
 			rsp->gp_state = RCU_GP_WAIT_GPS;
-			swait_event_interruptible(rsp->gp_wq,
-						 READ_ONCE(rsp->gp_flags) &
-						 RCU_GP_FLAG_INIT);
+			swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+						     RCU_GP_FLAG_INIT);
 			rsp->gp_state = RCU_GP_DONE_GPS;
 			/* Locking provides needed memory barrier. */
 			if (rcu_gp_init(rsp))
@@ -2239,7 +2234,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 					       READ_ONCE(rsp->gpnum),
 					       TPS("fqswait"));
 			rsp->gp_state = RCU_GP_WAIT_FQS;
-			ret = swait_event_interruptible_timeout(rsp->gp_wq,
+			ret = swait_event_idle_timeout(rsp->gp_wq,
 					rcu_gp_fqs_check_wake(rsp, &gf), j);
 			rsp->gp_state = RCU_GP_DOING_FQS;
 			/* Locking provides needed memory barriers. */
@@ -2409,6 +2404,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 			return;
 		}
 		WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
+		WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
+			     rcu_preempt_blocked_readers_cgp(rnp));
 		rnp->qsmask &= ~mask;
 		trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
 						 mask, rnp->qsmask, rnp->level,
@@ -2563,85 +2560,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 }
 
 /*
- * Send the specified CPU's RCU callbacks to the orphanage.  The
- * specified CPU must be offline, and the caller must hold the
- * ->orphan_lock.
- */
-static void
-rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
-			  struct rcu_node *rnp, struct rcu_data *rdp)
-{
-	lockdep_assert_held(&rsp->orphan_lock);
-
-	/* No-CBs CPUs do not have orphanable callbacks. */
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
-		return;
-
-	/*
-	 * Orphan the callbacks.  First adjust the counts.  This is safe
-	 * because _rcu_barrier() excludes CPU-hotplug operations, so it
-	 * cannot be running now.  Thus no memory barrier is required.
-	 */
-	rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
-	rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
-
-	/*
-	 * Next, move those callbacks still needing a grace period to
-	 * the orphanage, where some other CPU will pick them up.
-	 * Some of the callbacks might have gone partway through a grace
-	 * period, but that is too bad.  They get to start over because we
-	 * cannot assume that grace periods are synchronized across CPUs.
-	 */
-	rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
-
-	/*
-	 * Then move the ready-to-invoke callbacks to the orphanage,
-	 * where some other CPU will pick them up.  These will not be
-	 * required to pass though another grace period: They are done.
-	 */
-	rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
-
-	/* Finally, disallow further callbacks on this CPU.  */
-	rcu_segcblist_disable(&rdp->cblist);
-}
-
-/*
- * Adopt the RCU callbacks from the specified rcu_state structure's
- * orphanage.  The caller must hold the ->orphan_lock.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
-{
-	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
-
-	lockdep_assert_held(&rsp->orphan_lock);
-
-	/* No-CBs CPUs are handled specially. */
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
-	    rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
-		return;
-
-	/* Do the accounting first. */
-	rdp->n_cbs_adopted += rsp->orphan_done.len;
-	if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
-		rcu_idle_count_callbacks_posted();
-	rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
-
-	/*
-	 * We do not need a memory barrier here because the only way we
-	 * can get here if there is an rcu_barrier() in flight is if
-	 * we are the task doing the rcu_barrier().
-	 */
-
-	/* First adopt the ready-to-invoke callbacks, then the done ones. */
-	rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
-	WARN_ON_ONCE(rsp->orphan_done.head);
-	rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
-	WARN_ON_ONCE(rsp->orphan_pend.head);
-	WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
-		     !rcu_segcblist_n_cbs(&rdp->cblist));
-}
-
-/*
  * Trace the fact that this CPU is going offline.
  */
 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
@@ -2704,14 +2622,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 
 /*
  * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context.  Do the remainder of the cleanup,
- * including orphaning the outgoing CPU's RCU callbacks, and also
- * adopting them.  There can only be one CPU hotplug operation at a time,
- * so no other CPU can be attempting to update rcu_cpu_kthread_task.
+ * this fact from process context.  Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
  */
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
-	unsigned long flags;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
@@ -2720,18 +2636,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 
 	/* Adjust any no-longer-needed kthreads. */
 	rcu_boost_kthread_setaffinity(rnp, -1);
-
-	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
-	raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
-	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
-	rcu_adopt_orphan_cbs(rsp, flags);
-	raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
-
-	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
-		  !rcu_segcblist_empty(&rdp->cblist),
-		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
-		  cpu, rcu_segcblist_n_cbs(&rdp->cblist),
-		  rcu_segcblist_first_cb(&rdp->cblist));
 }
 
 /*
@@ -3569,10 +3473,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
 	struct rcu_state *rsp = rdp->rsp;
 
 	if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
-		_rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
+		_rcu_barrier_trace(rsp, TPS("LastCB"), -1,
+				   rsp->barrier_sequence);
 		complete(&rsp->barrier_completion);
 	} else {
-		_rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
+		_rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence);
 	}
 }
 
@@ -3584,14 +3489,15 @@ static void rcu_barrier_func(void *type)
 	struct rcu_state *rsp = type;
 	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
 
-	_rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
+	_rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence);
 	rdp->barrier_head.func = rcu_barrier_callback;
 	debug_rcu_head_queue(&rdp->barrier_head);
 	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
 		atomic_inc(&rsp->barrier_cpu_count);
 	} else {
 		debug_rcu_head_unqueue(&rdp->barrier_head);
-		_rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence);
+		_rcu_barrier_trace(rsp, TPS("IRQNQ"), -1,
+				   rsp->barrier_sequence);
 	}
 }
 
@@ -3605,14 +3511,15 @@ static void _rcu_barrier(struct rcu_state *rsp)
 	struct rcu_data *rdp;
 	unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
 
-	_rcu_barrier_trace(rsp, "Begin", -1, s);
+	_rcu_barrier_trace(rsp, TPS("Begin"), -1, s);
 
 	/* Take mutex to serialize concurrent rcu_barrier() requests. */
 	mutex_lock(&rsp->barrier_mutex);
 
 	/* Did someone else do our work for us? */
 	if (rcu_seq_done(&rsp->barrier_sequence, s)) {
-		_rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
+		_rcu_barrier_trace(rsp, TPS("EarlyExit"), -1,
+				   rsp->barrier_sequence);
 		smp_mb(); /* caller's subsequent code after above check. */
 		mutex_unlock(&rsp->barrier_mutex);
 		return;
@@ -3620,7 +3527,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 
 	/* Mark the start of the barrier operation. */
 	rcu_seq_start(&rsp->barrier_sequence);
-	_rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
+	_rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence);
 
 	/*
 	 * Initialize the count to one rather than to zero in order to
@@ -3643,10 +3550,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
 		rdp = per_cpu_ptr(rsp->rda, cpu);
 		if (rcu_is_nocb_cpu(cpu)) {
 			if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
-				_rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
+				_rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu,
 						   rsp->barrier_sequence);
 			} else {
-				_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
+				_rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu,
 						   rsp->barrier_sequence);
 				smp_mb__before_atomic();
 				atomic_inc(&rsp->barrier_cpu_count);
@@ -3654,11 +3561,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
 					   rcu_barrier_callback, rsp, cpu, 0);
 			}
 		} else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
-			_rcu_barrier_trace(rsp, "OnlineQ", cpu,
+			_rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu,
 					   rsp->barrier_sequence);
 			smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
 		} else {
-			_rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+			_rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu,
 					   rsp->barrier_sequence);
 		}
 	}
@@ -3675,7 +3582,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 	wait_for_completion(&rsp->barrier_completion);
 
 	/* Mark the end of the barrier operation. */
-	_rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
+	_rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence);
 	rcu_seq_end(&rsp->barrier_sequence);
 
 	/* Other rcu_barrier() invocations can now safely proceed. */
@@ -3777,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	 */
 	rnp = rdp->mynode;
 	raw_spin_lock_rcu_node(rnp);		/* irqs already disabled. */
-	if (!rdp->beenonline)
-		WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
 	rdp->beenonline = true;	 /* We have now been online. */
 	rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
 	rdp->completed = rnp->completed;
@@ -3882,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu)
 {
 	unsigned long flags;
 	unsigned long mask;
+	int nbits;
+	unsigned long oldmask;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 	struct rcu_state *rsp;
@@ -3892,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu)
 		mask = rdp->grpmask;
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		rnp->qsmaskinitnext |= mask;
+		oldmask = rnp->expmaskinitnext;
 		rnp->expmaskinitnext |= mask;
+		oldmask ^= rnp->expmaskinitnext;
+		nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
+		/* Allow lockless access for expedited grace periods. */
+		smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	}
+	smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -3937,6 +3850,50 @@ void rcu_report_dead(unsigned int cpu)
 	for_each_rcu_flavor(rsp)
 		rcu_cleanup_dying_idle_cpu(cpu, rsp);
 }
+
+/* Migrate the dead CPU's callbacks to the current CPU. */
+static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_data *my_rdp;
+	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+	if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
+		return;  /* No callbacks to migrate. */
+
+	local_irq_save(flags);
+	my_rdp = this_cpu_ptr(rsp->rda); /* This CPU adopts the callbacks. */
+	if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
+		local_irq_restore(flags); /* No-CBs path took them all. */
+		return;
+	}
+	raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+	rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
+	rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
+	rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
+		     !rcu_segcblist_n_cbs(&my_rdp->cblist));
+	raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
+		  !rcu_segcblist_empty(&rdp->cblist),
+		  "rcu_migrate_callbacks: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
+		  cpu, rcu_segcblist_n_cbs(&rdp->cblist),
+		  rcu_segcblist_first_cb(&rdp->cblist));
+}
+
+/*
+ * The outgoing CPU has just passed through the dying-idle state,
+ * and we are being invoked from the CPU that was IPIed to continue the
+ * offline operation.  Migrate @cpu's callbacks for each RCU flavor.
+ */
+void rcutree_migrate_callbacks(int cpu)
+{
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp) /* Each flavor has its own callback list. */
+		rcu_migrate_callbacks(cpu, rsp);
+}
 #endif
 
 /*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9af0f31d6847..8e1f285f0a70 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -219,8 +219,6 @@ struct rcu_data {
 					/* qlen at last check for QS forcing */
 	unsigned long	n_cbs_invoked;	/* count of RCU cbs invoked. */
 	unsigned long	n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
-	unsigned long   n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
-	unsigned long   n_cbs_adopted;  /* RCU cbs adopted from dying CPU */
 	unsigned long	n_force_qs_snap;
 					/* did other CPU force QS recently? */
 	long		blimit;		/* Upper limit on a processed batch */
@@ -268,7 +266,9 @@ struct rcu_data {
 	struct rcu_head **nocb_follower_tail;
 	struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
 	struct task_struct *nocb_kthread;
+	raw_spinlock_t nocb_lock;	/* Guard following pair of fields. */
 	int nocb_defer_wakeup;		/* Defer wakeup of nocb_kthread. */
+	struct timer_list nocb_timer;	/* Enforce finite deferral. */
 
 	/* The following fields are used by the leader, hence own cacheline. */
 	struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -350,15 +350,6 @@ struct rcu_state {
 
 	/* End of fields guarded by root rcu_node's lock. */
 
-	raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
-						/* Protect following fields. */
-	struct rcu_cblist orphan_pend;		/* Orphaned callbacks that */
-						/*  need a grace period. */
-	struct rcu_cblist orphan_done;		/* Orphaned callbacks that */
-						/*  are ready to invoke. */
-						/* (Contains counts.) */
-	/* End of fields guarded by orphan_lock. */
-
 	struct mutex barrier_mutex;		/* Guards barrier fields. */
 	atomic_t barrier_cpu_count;		/* # CPUs waiting on. */
 	struct completion barrier_completion;	/* Wake at barrier end. */
@@ -495,7 +486,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
 			    bool lazy, unsigned long flags);
-static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
 				      struct rcu_data *rdp,
 				      unsigned long flags);
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index dd21ca47e4b4..46d61b597731 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
 	unsigned long flags;
 	unsigned long mask;
 	unsigned long oldmask;
-	int ncpus = READ_ONCE(rsp->ncpus);
+	int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */
 	struct rcu_node *rnp;
 	struct rcu_node *rnp_up;
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 908b309d60d7..55bde94b9572 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
 	struct task_struct *t = current;
 
 	lockdep_assert_held(&rnp->lock);
+	WARN_ON_ONCE(rdp->mynode != rnp);
+	WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
 
 	/*
 	 * Decide where to queue the newly blocked task.  In theory,
@@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
 		rnp->gp_tasks = &t->rcu_node_entry;
 	if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
 		rnp->exp_tasks = &t->rcu_node_entry;
+	WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
+		     !(rnp->qsmask & rdp->grpmask));
+	WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
+		     !(rnp->expmask & rdp->grpmask));
 	raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
 
 	/*
@@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 		rnp = t->rcu_blocked_node;
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
 		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
+		WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
 		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
 		empty_exp = sync_rcu_preempt_exp_done(rnp);
 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t)
 		if (&t->rcu_node_entry == rnp->exp_tasks)
 			rnp->exp_tasks = np;
 		if (IS_ENABLED(CONFIG_RCU_BOOST)) {
-			if (&t->rcu_node_entry == rnp->boost_tasks)
-				rnp->boost_tasks = np;
 			/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
 			drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+			if (&t->rcu_node_entry == rnp->boost_tasks)
+				rnp->boost_tasks = np;
 		}
 
 		/*
@@ -636,10 +643,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
  */
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 {
+	struct task_struct *t;
+
 	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
 	WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
-	if (rcu_preempt_has_tasks(rnp))
+	if (rcu_preempt_has_tasks(rnp)) {
 		rnp->gp_tasks = rnp->blkd_tasks.next;
+		t = container_of(rnp->gp_tasks, struct task_struct,
+				 rcu_node_entry);
+		trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
+						rnp->gpnum, t->pid);
+	}
 	WARN_ON_ONCE(rnp->qsmask);
 }
 
@@ -1788,23 +1802,62 @@ bool rcu_is_nocb_cpu(int cpu)
 }
 
 /*
- * Kick the leader kthread for this NOCB group.
+ * Kick the leader kthread for this NOCB group.  Caller holds ->nocb_lock
+ * and this function releases it.
  */
-static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
+			       unsigned long flags)
+	__releases(rdp->nocb_lock)
 {
 	struct rcu_data *rdp_leader = rdp->nocb_leader;
 
-	if (!READ_ONCE(rdp_leader->nocb_kthread))
+	lockdep_assert_held(&rdp->nocb_lock);
+	if (!READ_ONCE(rdp_leader->nocb_kthread)) {
+		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 		return;
-	if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+	}
+	if (rdp_leader->nocb_leader_sleep || force) {
 		/* Prior smp_mb__after_atomic() orders against prior enqueue. */
 		WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
+		del_timer(&rdp->nocb_timer);
+		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 		smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
 		swake_up(&rdp_leader->nocb_wq);
+	} else {
+		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 	}
 }
 
 /*
+ * Kick the leader kthread for this NOCB group, but caller has not
+ * acquired locks.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+	__wake_nocb_leader(rdp, force, flags); /* Releases ->nocb_lock. */
+}
+
+/*
+ * Arrange to wake the leader kthread for this NOCB group at some
+ * future time when it is safe to do so.  Records @waketype in
+ * ->nocb_defer_wakeup and traces @reason, all under ->nocb_lock.
+ */
+static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
+				   const char *reason)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+	if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) /* None pending? */
+		mod_timer(&rdp->nocb_timer, jiffies + 1); /* Arm the timer. */
+	WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
+	trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason);
+	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+}
+
+/*
  * Does the specified CPU need an RCU callback for the specified flavor
  * of rcu_barrier()?
  */
@@ -1891,11 +1944,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 					    TPS("WakeEmpty"));
 		} else {
-			WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);
-			/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
-			smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
-			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
-					    TPS("WakeEmptyIsDeferred"));
+			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+					       TPS("WakeEmptyIsDeferred"));
 		}
 		rdp->qlen_last_fqs_check = 0;
 	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
@@ -1905,11 +1955,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 					    TPS("WakeOvf"));
 		} else {
-			WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE);
-			/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
-			smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
-			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
-					    TPS("WakeOvfIsDeferred"));
+			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE,
+					       TPS("WakeOvfIsDeferred"));
 		}
 		rdp->qlen_last_fqs_check = LONG_MAX / 2;
 	} else {
@@ -1961,30 +2008,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
  * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
  * not a no-CBs CPU.
  */
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
 						     struct rcu_data *rdp,
 						     unsigned long flags)
 {
-	long ql = rsp->orphan_done.len;
-	long qll = rsp->orphan_done.len_lazy;
-
-	/* If this is not a no-CBs CPU, tell the caller to do it the old way. */
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");
 	if (!rcu_is_nocb_cpu(smp_processor_id()))
-		return false;
-
-	/* First, enqueue the donelist, if any.  This preserves CB ordering. */
-	if (rsp->orphan_done.head) {
-		__call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
-					rcu_cblist_tail(&rsp->orphan_done),
-					ql, qll, flags);
-	}
-	if (rsp->orphan_pend.head) {
-		__call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
-					rcu_cblist_tail(&rsp->orphan_pend),
-					ql, qll, flags);
-	}
-	rcu_cblist_init(&rsp->orphan_done);
-	rcu_cblist_init(&rsp->orphan_pend);
+		return false; /* Not NOCBs CPU, caller must migrate CBs. */
+	__call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
+				rcu_segcblist_tail(&rdp->cblist),
+				rcu_segcblist_n_cbs(&rdp->cblist),
+				rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
+	rcu_segcblist_init(&rdp->cblist);
+	rcu_segcblist_disable(&rdp->cblist);
 	return true;
 }
 
@@ -2031,6 +2067,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 static void nocb_leader_wait(struct rcu_data *my_rdp)
 {
 	bool firsttime = true;
+	unsigned long flags;
 	bool gotcbs;
 	struct rcu_data *rdp;
 	struct rcu_head **tail;
@@ -2039,13 +2076,17 @@ wait_again:
 
 	/* Wait for callbacks to appear. */
 	if (!rcu_nocb_poll) {
-		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
+		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
 		swait_event_interruptible(my_rdp->nocb_wq,
 				!READ_ONCE(my_rdp->nocb_leader_sleep));
-		/* Memory barrier handled by smp_mb() calls below and repoll. */
+		raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
+		my_rdp->nocb_leader_sleep = true;
+		WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+		del_timer(&my_rdp->nocb_timer);
+		raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
 	} else if (firsttime) {
 		firsttime = false; /* Don't drown trace log with "Poll"! */
-		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
+		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll"));
 	}
 
 	/*
@@ -2054,7 +2095,7 @@ wait_again:
 	 * nocb_gp_head, where they await a grace period.
 	 */
 	gotcbs = false;
-	smp_mb(); /* wakeup before ->nocb_head reads. */
+	smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
 		rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
 		if (!rdp->nocb_gp_head)
@@ -2066,56 +2107,41 @@ wait_again:
 		gotcbs = true;
 	}
 
-	/*
-	 * If there were no callbacks, sleep a bit, rescan after a
-	 * memory barrier, and go retry.
-	 */
+	/* No callbacks?  Sleep a bit if polling, and go retry.  */
 	if (unlikely(!gotcbs)) {
-		if (!rcu_nocb_poll)
-			trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
-					    "WokeEmpty");
 		WARN_ON(signal_pending(current));
-		schedule_timeout_interruptible(1);
-
-		/* Rescan in case we were a victim of memory ordering. */
-		my_rdp->nocb_leader_sleep = true;
-		smp_mb();  /* Ensure _sleep true before scan. */
-		for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
-			if (READ_ONCE(rdp->nocb_head)) {
-				/* Found CB, so short-circuit next wait. */
-				my_rdp->nocb_leader_sleep = false;
-				break;
-			}
+		if (rcu_nocb_poll) {
+			schedule_timeout_interruptible(1);
+		} else {
+			trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+					    TPS("WokeEmpty"));
+		}
 		goto wait_again;
 	}
 
 	/* Wait for one grace period. */
 	rcu_nocb_wait_gp(my_rdp);
 
-	/*
-	 * We left ->nocb_leader_sleep unset to reduce cache thrashing.
-	 * We set it now, but recheck for new callbacks while
-	 * traversing our follower list.
-	 */
-	my_rdp->nocb_leader_sleep = true;
-	smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
-
 	/* Each pass through the following loop wakes a follower, if needed. */
 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
-		if (READ_ONCE(rdp->nocb_head))
+		if (!rcu_nocb_poll &&
+		    READ_ONCE(rdp->nocb_head) &&
+		    READ_ONCE(my_rdp->nocb_leader_sleep)) {
+			raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
 			my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
+			raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
+		}
 		if (!rdp->nocb_gp_head)
 			continue; /* No CBs, so no need to wake follower. */
 
 		/* Append callbacks to follower's "done" list. */
-		tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+		raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+		tail = rdp->nocb_follower_tail;
+		rdp->nocb_follower_tail = rdp->nocb_gp_tail;
 		*tail = rdp->nocb_gp_head;
-		smp_mb__after_atomic(); /* Store *tail before wakeup. */
+		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 		if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
-			/*
-			 * List was empty, wake up the follower.
-			 * Memory barriers supplied by atomic_long_add().
-			 */
+			/* List was empty, so wake up the follower.  */
 			swake_up(&rdp->nocb_wq);
 		}
 	}
@@ -2131,28 +2157,16 @@ wait_again:
  */
 static void nocb_follower_wait(struct rcu_data *rdp)
 {
-	bool firsttime = true;
-
 	for (;;) {
-		if (!rcu_nocb_poll) {
-			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
-					    "FollowerSleep");
-			swait_event_interruptible(rdp->nocb_wq,
-						 READ_ONCE(rdp->nocb_follower_head));
-		} else if (firsttime) {
-			/* Don't drown trace log with "Poll"! */
-			firsttime = false;
-			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
-		}
+		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
+		swait_event_interruptible(rdp->nocb_wq,
+					 READ_ONCE(rdp->nocb_follower_head));
 		if (smp_load_acquire(&rdp->nocb_follower_head)) {
 			/* ^^^ Ensure CB invocation follows _head test. */
 			return;
 		}
-		if (!rcu_nocb_poll)
-			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
-					    "WokeEmpty");
 		WARN_ON(signal_pending(current));
-		schedule_timeout_interruptible(1);
+		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty"));
 	}
 }
 
@@ -2165,6 +2179,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
 static int rcu_nocb_kthread(void *arg)
 {
 	int c, cl;
+	unsigned long flags;
 	struct rcu_head *list;
 	struct rcu_head *next;
 	struct rcu_head **tail;
@@ -2179,11 +2194,14 @@ static int rcu_nocb_kthread(void *arg)
 			nocb_follower_wait(rdp);
 
 		/* Pull the ready-to-invoke callbacks onto local list. */
-		list = READ_ONCE(rdp->nocb_follower_head);
+		raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+		list = rdp->nocb_follower_head;
+		rdp->nocb_follower_head = NULL;
+		tail = rdp->nocb_follower_tail;
+		rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 		BUG_ON(!list);
-		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
-		WRITE_ONCE(rdp->nocb_follower_head, NULL);
-		tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
+		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty"));
 
 		/* Each pass through the following loop invokes a callback. */
 		trace_rcu_batch_start(rdp->rsp->name,
@@ -2226,18 +2244,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
 }
 
 /* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
 {
+	unsigned long flags;
 	int ndw;
 
-	if (!rcu_nocb_need_deferred_wakeup(rdp))
+	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+	if (!rcu_nocb_need_deferred_wakeup(rdp)) {
+		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 		return;
+	}
 	ndw = READ_ONCE(rdp->nocb_defer_wakeup);
 	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
-	wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);
+	__wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
 	trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
 }
 
+/* Timer-handler deferred wakeup of rcu_nocb_kthread(); @x is the rcu_data. */
+static void do_nocb_deferred_wakeup_timer(unsigned long x)
+{
+	do_nocb_deferred_wakeup_common((struct rcu_data *)x);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check.  Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+	if (rcu_nocb_need_deferred_wakeup(rdp)) /* Check without lock. */
+		do_nocb_deferred_wakeup_common(rdp); /* Rechecks locked. */
+}
+
 void __init rcu_init_nohz(void)
 {
 	int cpu;
@@ -2287,6 +2326,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 	rdp->nocb_tail = &rdp->nocb_head;
 	init_swait_queue_head(&rdp->nocb_wq);
 	rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+	raw_spin_lock_init(&rdp->nocb_lock);
+	setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer,
+		    (unsigned long)rdp);
 }
 
 /*
@@ -2459,7 +2501,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
 	return false;
 }
 
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
 						     struct rcu_data *rdp,
 						     unsigned long flags)
 {
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 00e77c470017..5033b66d2753 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
 static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
 
 /* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_SRCU(tasks_rcu_exit_srcu);
+DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
 
 /* Control stall timeouts.  Disable with <= 0, otherwise jiffies till stall. */
 #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
@@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void)
 	mutex_unlock(&rcu_tasks_kthread_mutex);
 }
 
+/* Do the srcu_read_lock() for the above synchronize_srcu(), saving index. */
+void exit_tasks_rcu_start(void)
+{
+	preempt_disable();
+	current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
+	preempt_enable();
+}
+
+/* Do the srcu_read_unlock() pairing with exit_tasks_rcu_start()'s lock. */
+void exit_tasks_rcu_finish(void)
+{
+	preempt_disable();
+	__srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
+	preempt_enable();
+}
+
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
 #ifndef CONFIG_TINY_RCU
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 53f0164ed362..78f54932ea1d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index da39489d2d80..de6d7f4dfcb5 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void)
 		goto out_fail;
 
 	tg = sched_create_group(&root_task_group);
-
 	if (IS_ERR(tg))
 		goto out_free;
 
@@ -101,7 +100,7 @@ out_free:
 out_fail:
 	if (printk_ratelimit()) {
 		printk(KERN_WARNING "autogroup_create: %s failure.\n",
-			ag ? "sched_create_group()" : "kmalloc()");
+			ag ? "sched_create_group()" : "kzalloc()");
 	}
 
 	return autogroup_kref_get(&autogroup_default);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 566b6ec7b6fe..cc873075c3bd 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -53,6 +53,13 @@ EXPORT_SYMBOL(complete);
  *
  * It may be assumed that this function implies a write memory barrier before
  * changing the task state if and only if any tasks are woken up.
+ *
+ * Since complete_all() sets the completion of @x permanently to done
+ * to allow multiple waiters to finish, a call to reinit_completion()
+ * must be used on @x if @x is to be used again. The code must make
+ * sure that all waiters have woken and finished before reinitializing
+ * @x. Also note that the function completion_done() can not be used
+ * to know if there are still waiters after complete_all() has been called.
  */
 void complete_all(struct completion *x)
 {
@@ -308,9 +315,12 @@ EXPORT_SYMBOL(try_wait_for_completion);
  *	Return: 0 if there are waiters (wait_for_completion() in progress)
  *		 1 if there are no waiters.
  *
+ *	Note, this will always return true if complete_all() was called on @x.
  */
 bool completion_done(struct completion *x)
 {
+	unsigned long flags;
+
 	if (!READ_ONCE(x->done))
 		return false;
 
@@ -318,14 +328,9 @@ bool completion_done(struct completion *x)
 	 * If ->done, we need to wait for complete() to release ->wait.lock
 	 * otherwise we can end up freeing the completion before complete()
 	 * is done referencing it.
-	 *
-	 * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
-	 * the loads of ->done and ->wait.lock such that we cannot observe
-	 * the lock before complete() acquires it while observing the ->done
-	 * after it's acquired the lock.
 	 */
-	smp_rmb();
-	spin_unlock_wait(&x->wait.lock);
+	spin_lock_irqsave(&x->wait.lock, flags);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
 	return true;
 }
 EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9fece583a1f0..6d2c7ff9ba98 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -951,8 +951,13 @@ struct migration_arg {
 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
 				 struct task_struct *p, int dest_cpu)
 {
-	if (unlikely(!cpu_active(dest_cpu)))
-		return rq;
+	if (p->flags & PF_KTHREAD) {
+		if (unlikely(!cpu_online(dest_cpu)))
+			return rq;
+	} else {
+		if (unlikely(!cpu_active(dest_cpu)))
+			return rq;
+	}
 
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
@@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	prev_state = prev->state;
 	vtime_task_switch(prev);
 	perf_event_task_sched_in(prev, current);
+	/*
+	 * The membarrier system call requires a full memory barrier
+	 * after storing to rq->curr, before going back to user-space.
+	 *
+	 * TODO: This smp_mb__after_unlock_lock can go away if PPC ends
+	 * up adding a full barrier to switch_mm(), or we should figure
+	 * out if a smp_mb__after_unlock_lock is really the proper API
+	 * to use.
+	 */
+	smp_mb__after_unlock_lock();
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
 
@@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt)
 	if (likely(prev != next)) {
 		rq->nr_switches++;
 		rq->curr = next;
+		/*
+		 * The membarrier system call requires each architecture
+		 * to have a full memory barrier after updating
+		 * rq->curr, before returning to user-space. For TSO
+		 * (e.g. x86), the architecture must provide its own
+		 * barrier in switch_mm(). For weakly ordered machines
+		 * for which spin_unlock() acts as a full memory
+		 * barrier, finish_lock_switch() in common code takes
+		 * care of this barrier. For weakly ordered machines for
+		 * which spin_unlock() acts as a RELEASE barrier (only
+		 * arm64 and PowerPC), arm64 has a full barrier in
+		 * switch_to(), and PowerPC has
+		 * smp_mb__after_unlock_lock() before
+		 * finish_lock_switch().
+		 */
 		++*switch_count;
 
 		trace_sched_switch(preempt, prev, next);
@@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void)
 	 * To avoid it, we have to wait for releasing tsk->pi_lock which
 	 * is held by try_to_wake_up()
 	 */
-	smp_mb();
-	raw_spin_unlock_wait(&current->pi_lock);
+	raw_spin_lock_irq(&current->pi_lock);
+	raw_spin_unlock_irq(&current->pi_lock);
 
 	/* Causes final put_task_struct in finish_task_switch(): */
 	__set_current_state(TASK_DEAD);
@@ -5103,24 +5133,17 @@ out_unlock:
 	return retval;
 }
 
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
 void sched_show_task(struct task_struct *p)
 {
 	unsigned long free = 0;
 	int ppid;
-	unsigned long state = p->state;
-
-	/* Make sure the string lines up properly with the number of task states: */
-	BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
 
 	if (!try_get_task_stack(p))
 		return;
-	if (state)
-		state = __ffs(state) + 1;
-	printk(KERN_INFO "%-15.15s %c", p->comm,
-		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-	if (state == TASK_RUNNING)
+
+	printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+
+	if (p->state == TASK_RUNNING)
 		printk(KERN_CONT "  running task    ");
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
@@ -5177,11 +5200,6 @@ void show_state_filter(unsigned long state_filter)
 		debug_show_all_locks();
 }
 
-void init_idle_bootup_task(struct task_struct *idle)
-{
-	idle->sched_class = &idle_sched_class;
-}
-
 /**
  * init_idle - set up an idle thread for a given CPU
  * @idle: task in question
@@ -5438,7 +5456,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
 		 */
 		next = pick_next_task(rq, &fake_task, rf);
 		BUG_ON(!next);
-		next->sched_class->put_prev_task(rq, next);
+		put_prev_task(rq, next);
 
 		/*
 		 * Rules for changing task_struct::cpus_allowed are holding
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index fba235c7d026..8d9562d890d3 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp)
  * @p: the task
  * @later_mask: a mask to fill in with the selected CPUs (or NULL)
  *
- * Returns: int - best CPU (heap maximum if suitable)
+ * Returns: int - 1 if CPUs were found, 0 otherwise
  */
 int cpudl_find(struct cpudl *cp, struct task_struct *p,
 	       struct cpumask *later_mask)
 {
-	int best_cpu = -1;
 	const struct sched_dl_entity *dl_se = &p->dl;
 
 	if (later_mask &&
 	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
-		best_cpu = cpumask_any(later_mask);
-		goto out;
-	} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
-			dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
-		best_cpu = cpudl_maximum(cp);
-		if (later_mask)
-			cpumask_set_cpu(best_cpu, later_mask);
-	}
+		return 1;
+	} else {
+		int best_cpu = cpudl_maximum(cp);
+		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
 
-out:
-	WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+		if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+		    dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+			if (later_mask)
+				cpumask_set_cpu(best_cpu, later_mask);
 
-	return best_cpu;
+			return 1;
+		}
+	}
+	return 0;
 }
 
 /*
@@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp)
 {
 	int i;
 
-	memset(cp, 0, sizeof(*cp));
 	raw_spin_lock_init(&cp->lock);
 	cp->size = 0;
 
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..2511aba36b89 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp)
 {
 	int i;
 
-	memset(cp, 0, sizeof(*cp));
-
 	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
 		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 755bd3f1a1a9..d05bd9457a40 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1594,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	 * let's hope p can move out.
 	 */
 	if (rq->curr->nr_cpus_allowed == 1 ||
-	    cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+	    !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
 		return;
 
 	/*
@@ -1602,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	 * see if it is pushed or pulled somewhere else.
 	 */
 	if (p->nr_cpus_allowed != 1 &&
-	    cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
+	    cpudl_find(&rq->rd->cpudl, p, NULL))
 		return;
 
 	resched_curr(rq);
@@ -1655,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
 	return rb_entry(left, struct sched_dl_entity, rb_node);
 }
 
-struct task_struct *
+static struct task_struct *
 pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
 	struct sched_dl_entity *dl_se;
@@ -1798,7 +1798,7 @@ static int find_later_rq(struct task_struct *task)
 	struct sched_domain *sd;
 	struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
 	int this_cpu = smp_processor_id();
-	int best_cpu, cpu = task_cpu(task);
+	int cpu = task_cpu(task);
 
 	/* Make sure the mask is initialized first */
 	if (unlikely(!later_mask))
@@ -1811,17 +1811,14 @@ static int find_later_rq(struct task_struct *task)
 	 * We have to consider system topology and task affinity
 	 * first, then we can look for a suitable cpu.
 	 */
-	best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
-			task, later_mask);
-	if (best_cpu == -1)
+	if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
 		return -1;
 
 	/*
-	 * If we are here, some target has been found,
-	 * the most suitable of which is cached in best_cpu.
-	 * This is, among the runqueues where the current tasks
-	 * have later deadlines than the task's one, the rq
-	 * with the latest possible one.
+	 * If we are here, some targets have been found, including
+	 * the most suitable which is, among the runqueues where the
+	 * current tasks have later deadlines than the task's one, the
+	 * rq with the latest possible one.
 	 *
 	 * Now we check how well this matches with task's
 	 * affinity and system topology.
@@ -1841,6 +1838,7 @@ static int find_later_rq(struct task_struct *task)
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		if (sd->flags & SD_WAKE_AFFINE) {
+			int best_cpu;
 
 			/*
 			 * If possible, preempting this_cpu is
@@ -1852,12 +1850,15 @@ static int find_later_rq(struct task_struct *task)
 				return this_cpu;
 			}
 
+			best_cpu = cpumask_first_and(later_mask,
+							sched_domain_span(sd));
 			/*
-			 * Last chance: if best_cpu is valid and is
-			 * in the mask, that becomes our choice.
+			 * Last chance: if some CPU is in both later_mask and
+			 * the current sd span, that CPU becomes our choice.
+			 * Of course, the latest possible CPU is already
+			 * under consideration through later_mask.
 			 */
-			if (best_cpu < nr_cpu_ids &&
-			    cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+			if (best_cpu < nr_cpu_ids) {
 				rcu_read_unlock();
 				return best_cpu;
 			}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4fa66de52bd6..4a23bbc3111b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 	return table;
 }
 
+static cpumask_var_t sd_sysctl_cpus;
 static struct ctl_table_header *sd_sysctl_header;
+
 void register_sched_domain_sysctl(void)
 {
-	int i, cpu_num = num_possible_cpus();
-	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+	static struct ctl_table *cpu_entries;
+	static struct ctl_table **cpu_idx;
 	char buf[32];
+	int i;
 
-	WARN_ON(sd_ctl_dir[0].child);
-	sd_ctl_dir[0].child = entry;
+	if (!cpu_entries) {
+		cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
+		if (!cpu_entries)
+			return;
 
-	if (entry == NULL)
-		return;
+		WARN_ON(sd_ctl_dir[0].child);
+		sd_ctl_dir[0].child = cpu_entries;
+	}
 
-	for_each_possible_cpu(i) {
-		snprintf(buf, 32, "cpu%d", i);
-		entry->procname = kstrdup(buf, GFP_KERNEL);
-		entry->mode = 0555;
-		entry->child = sd_alloc_ctl_cpu_table(i);
-		entry++;
+	if (!cpu_idx) {
+		struct ctl_table *e = cpu_entries;
+
+		cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
+		if (!cpu_idx)
+			return;
+
+		/* deal with sparse possible map */
+		for_each_possible_cpu(i) {
+			cpu_idx[i] = e;
+			e++;
+		}
+	}
+
+	if (!cpumask_available(sd_sysctl_cpus)) {
+		if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
+			return;
+
+		/* init to possible to not have holes in @cpu_entries */
+		cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+	}
+
+	for_each_cpu(i, sd_sysctl_cpus) {
+		struct ctl_table *e = cpu_idx[i];
+
+		if (e->child)
+			sd_free_ctl_entry(&e->child);
+
+		if (!e->procname) {
+			snprintf(buf, 32, "cpu%d", i);
+			e->procname = kstrdup(buf, GFP_KERNEL);
+		}
+		e->mode = 0555;
+		e->child = sd_alloc_ctl_cpu_table(i);
+
+		__cpumask_clear_cpu(i, sd_sysctl_cpus);
 	}
 
 	WARN_ON(sd_sysctl_header);
 	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
 }
 
+void dirty_sched_domain_sysctl(int cpu)
+{
+	if (cpumask_available(sd_sysctl_cpus))
+		__cpumask_set_cpu(cpu, sd_sysctl_cpus);
+}
+
 /* may be called multiple times per register */
 void unregister_sched_domain_sysctl(void)
 {
 	unregister_sysctl_table(sd_sysctl_header);
 	sd_sysctl_header = NULL;
-	if (sd_ctl_dir[0].child)
-		sd_free_ctl_entry(&sd_ctl_dir[0].child);
 }
 #endif /* CONFIG_SYSCTL */
 #endif /* CONFIG_SMP */
@@ -421,13 +461,15 @@ static char *task_group_path(struct task_group *tg)
 }
 #endif
 
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
 	if (rq->curr == p)
-		SEQ_printf(m, "R");
+		SEQ_printf(m, ">R");
 	else
-		SEQ_printf(m, " ");
+		SEQ_printf(m, " %c", task_state_to_char(p));
 
 	SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
 		p->comm, task_pid_nr(p),
@@ -456,9 +498,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
 	SEQ_printf(m,
 	"\nrunnable tasks:\n"
-	"            task   PID         tree-key  switches  prio"
+	" S           task   PID         tree-key  switches  prio"
 	"     wait-time             sum-exec        sum-sleep\n"
-	"------------------------------------------------------"
+	"-------------------------------------------------------"
 	"----------------------------------------------------\n");
 
 	rcu_read_lock();
@@ -872,11 +914,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
 #endif
 }
 
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+						  struct seq_file *m)
 {
 	unsigned long nr_switches;
 
-	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
+	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
 						get_nr_threads(p));
 	SEQ_printf(m,
 		"---------------------------------------------------------"
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c95880e216f6..8d5868771cb3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 			/*
 			 * For !fair tasks do:
 			 *
-			update_cfs_rq_load_avg(now, cfs_rq, false);
+			update_cfs_rq_load_avg(now, cfs_rq);
 			attach_entity_load_avg(cfs_rq, se);
 			switched_from_fair(rq, p);
 			 *
@@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
+struct numa_group {
+	atomic_t refcount;
+
+	spinlock_t lock; /* nr_tasks, tasks */
+	int nr_tasks;
+	pid_t gid;
+	int active_nodes;
+
+	struct rcu_head rcu;
+	unsigned long total_faults;
+	unsigned long max_faults_cpu;
+	/*
+	 * Faults_cpu is used to decide whether memory should move
+	 * towards the CPU. As a consequence, these stats are weighted
+	 * more by CPU use than by memory faults.
+	 */
+	unsigned long *faults_cpu;
+	unsigned long faults[0];
+};
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
 	unsigned long rss = 0;
@@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p)
 	return max_t(unsigned int, floor, scan);
 }
 
+static unsigned int task_scan_start(struct task_struct *p)
+{
+	unsigned long smin = task_scan_min(p);
+	unsigned long period = smin;
+
+	/* Scale the starting scan period with the amount of shared memory. */
+	if (p->numa_group) {
+		struct numa_group *ng = p->numa_group;
+		unsigned long shared = group_faults_shared(ng);
+		unsigned long private = group_faults_priv(ng);
+
+		period *= atomic_read(&ng->refcount);
+		period *= shared + 1;
+		period /= private + shared + 1;
+	}
+
+	return max(smin, period);
+}
+
 static unsigned int task_scan_max(struct task_struct *p)
 {
-	unsigned int smin = task_scan_min(p);
-	unsigned int smax;
+	unsigned long smin = task_scan_min(p);
+	unsigned long smax;
 
 	/* Watch for min being lower than max due to floor calculations */
 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+	/* Scale the maximum scan period with the amount of shared memory. */
+	if (p->numa_group) {
+		struct numa_group *ng = p->numa_group;
+		unsigned long shared = group_faults_shared(ng);
+		unsigned long private = group_faults_priv(ng);
+		unsigned long period = smax;
+
+		period *= atomic_read(&ng->refcount);
+		period *= shared + 1;
+		period /= private + shared + 1;
+
+		smax = max(smax, period);
+	}
+
 	return max(smin, smax);
 }
 
@@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
 }
 
-struct numa_group {
-	atomic_t refcount;
-
-	spinlock_t lock; /* nr_tasks, tasks */
-	int nr_tasks;
-	pid_t gid;
-	int active_nodes;
-
-	struct rcu_head rcu;
-	unsigned long total_faults;
-	unsigned long max_faults_cpu;
-	/*
-	 * Faults_cpu is used to decide whether memory should move
-	 * towards the CPU. As a consequence, these stats are weighted
-	 * more by CPU use than by memory faults.
-	 */
-	unsigned long *faults_cpu;
-	unsigned long faults[0];
-};
-
 /* Shared or private faults. */
 #define NR_NUMA_HINT_FAULT_TYPES 2
 
@@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+	unsigned long faults = 0;
+	int node;
+
+	for_each_online_node(node) {
+		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+	}
+
+	return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+	unsigned long faults = 0;
+	int node;
+
+	for_each_online_node(node) {
+		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
+	}
+
+	return faults;
+}
+
 /*
  * A node triggering more than 1/3 as many NUMA faults as the maximum is
  * considered part of a numa group's pseudo-interleaving set. Migrations
@@ -1378,7 +1439,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static unsigned long weighted_cpuload(const int cpu);
+static unsigned long weighted_cpuload(struct rq *rq);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long capacity_of(int cpu);
@@ -1409,7 +1470,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
 		struct rq *rq = cpu_rq(cpu);
 
 		ns->nr_running += rq->nr_running;
-		ns->load += weighted_cpuload(cpu);
+		ns->load += weighted_cpuload(rq);
 		ns->compute_capacity += capacity_of(cpu);
 
 		cpus++;
@@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 * Reset the scan period if the task is being rescheduled on an
 	 * alternative node to recheck if the tasks is now properly placed.
 	 */
-	p->numa_scan_period = task_scan_min(p);
+	p->numa_scan_period = task_scan_start(p);
 
 	if (env.best_task == NULL) {
 		ret = migrate_task_to(p, env.best_cpu);
@@ -1892,7 +1953,7 @@ static void update_task_scan_period(struct task_struct *p,
 			unsigned long shared, unsigned long private)
 {
 	unsigned int period_slot;
-	int ratio;
+	int lr_ratio, ps_ratio;
 	int diff;
 
 	unsigned long remote = p->numa_faults_locality[0];
@@ -1922,25 +1983,36 @@ static void update_task_scan_period(struct task_struct *p,
 	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
 	 */
 	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
-	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
-	if (ratio >= NUMA_PERIOD_THRESHOLD) {
-		int slot = ratio - NUMA_PERIOD_THRESHOLD;
+	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+		/*
+		 * Most memory accesses are local. There is no need to
+		 * do fast NUMA scanning, since memory is already local.
+		 */
+		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+		if (!slot)
+			slot = 1;
+		diff = slot * period_slot;
+	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+		/*
+		 * Most memory accesses are shared with other tasks.
+		 * There is no point in continuing fast NUMA scanning,
+		 * since other tasks may just move the memory elsewhere.
+		 */
+		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
 		if (!slot)
 			slot = 1;
 		diff = slot * period_slot;
 	} else {
-		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
 		/*
-		 * Scale scan rate increases based on sharing. There is an
-		 * inverse relationship between the degree of sharing and
-		 * the adjustment made to the scanning period. Broadly
-		 * speaking the intent is that there is little point
-		 * scanning faster if shared accesses dominate as it may
-		 * simply bounce migrations uselessly
+		 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+		 * yet they are not on the local NUMA node. Speed up
+		 * NUMA scanning to get the memory moved over.
 		 */
-		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
-		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+		int ratio = max(lr_ratio, ps_ratio);
+		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
 	}
 
 	p->numa_scan_period = clamp(p->numa_scan_period + diff,
@@ -2448,7 +2520,7 @@ void task_numa_work(struct callback_head *work)
 
 	if (p->numa_scan_period == 0) {
 		p->numa_scan_period_max = task_scan_max(p);
-		p->numa_scan_period = task_scan_min(p);
+		p->numa_scan_period = task_scan_start(p);
 	}
 
 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
@@ -2576,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 
 	if (now > curr->node_stamp + period) {
 		if (!curr->node_stamp)
-			curr->numa_scan_period = task_scan_min(curr);
+			curr->numa_scan_period = task_scan_start(curr);
 		curr->node_stamp += period;
 
 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
@@ -2586,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	}
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
-				    struct task_struct *p, int this_cpu,
-				    int prev_cpu, int sync)
-{
-	struct numa_stats prev_load, this_load;
-	s64 this_eff_load, prev_eff_load;
-
-	update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
-	update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current CPU:
-	 */
-	if (sync) {
-		unsigned long current_load = task_h_load(current);
-
-		if (this_load.load > current_load)
-			this_load.load -= current_load;
-		else
-			this_load.load = 0;
-	}
-
-	/*
-	 * In low-load situations, where this_cpu's node is idle due to the
-	 * sync cause above having dropped this_load.load to 0, move the task.
-	 * Moving to an idle socket will not create a bad imbalance.
-	 *
-	 * Otherwise check if the nodes are near enough in load to allow this
-	 * task to be woken on this_cpu's node.
-	 */
-	if (this_load.load > 0) {
-		unsigned long task_load = task_h_load(p);
-
-		this_eff_load = 100;
-		this_eff_load *= prev_load.compute_capacity;
-
-		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-		prev_eff_load *= this_load.compute_capacity;
-
-		this_eff_load *= this_load.load + task_load;
-		prev_eff_load *= prev_load.load - task_load;
-
-		return this_eff_load <= prev_eff_load;
-	}
-
-	return true;
-}
 #else
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
@@ -2652,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
 
-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
-				    struct task_struct *p, int this_cpu,
-				    int prev_cpu, int sync)
-{
-	return true;
-}
-#endif /* !SMP */
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -2790,6 +2801,29 @@ static inline void update_cfs_shares(struct sched_entity *se)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+	if (&this_rq()->cfs == cfs_rq) {
+		/*
+		 * There are a few boundary cases this might miss but it should
+		 * get called often enough that that should (hopefully) not be
+		 * a real problem -- added to that it only calls on the local
+		 * CPU, so if we enqueue remotely we'll miss an update, but
+		 * the next tick/schedule should update.
+		 *
+		 * It will not get called when we go idle, because the idle
+		 * thread is a different class (!fair), nor will the utilization
+		 * number include things like RT tasks.
+		 *
+		 * As is, the util number is not freq-invariant (we'd have to
+		 * implement arch_scale_freq_capacity() for that).
+		 *
+		 * See cpu_util().
+		 */
+		cpufreq_update_util(rq_of(cfs_rq), 0);
+	}
+}
+
 #ifdef CONFIG_SMP
 /*
  * Approximate:
@@ -2968,6 +3002,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 	sa->last_update_time += delta << 10;
 
 	/*
+	 * running is a subset of runnable (weight), so running can't be set if
+	 * runnable is clear. But there are some corner cases where the current
+	 * se has already been dequeued while cfs_rq->curr still points to it.
+	 * In that case weight is 0 while running is still set, both for the
+	 * sched_entity and for its cfs_rq if the latter becomes idle. As an
+	 * example, this happens during idle_balance(), which calls
+	 * update_blocked_averages().
+	 */
+	if (!weight)
+		running = 0;
+
+	/*
 	 * Now we know we crossed measurement unit boundaries. The *_avg
 	 * accrues by two steps:
 	 *
@@ -3276,29 +3322,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
-	if (&this_rq()->cfs == cfs_rq) {
-		/*
-		 * There are a few boundary cases this might miss but it should
-		 * get called often enough that that should (hopefully) not be
-		 * a real problem -- added to that it only calls on the local
-		 * CPU, so if we enqueue remotely we'll miss an update, but
-		 * the next tick/schedule should update.
-		 *
-		 * It will not get called when we go idle, because the idle
-		 * thread is a different class (!fair), nor will the utilization
-		 * number include things like RT tasks.
-		 *
-		 * As is, the util number is not freq-invariant (we'd have to
-		 * implement arch_scale_freq_capacity() for that).
-		 *
-		 * See cpu_util().
-		 */
-		cpufreq_update_util(rq_of(cfs_rq), 0);
-	}
-}
-
 /*
  * Unsigned subtract and clamp on underflow.
  *
@@ -3320,7 +3343,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
  * @now: current time, as per cfs_rq_clock_task()
  * @cfs_rq: cfs_rq to update
- * @update_freq: should we call cfs_rq_util_change() or will the call do so
  *
  * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
  * avg. The immediate corollary is that all (fair) tasks must be attached, see
@@ -3334,7 +3356,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
  * call update_tg_load_avg() when this function returns true.
  */
 static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
 	struct sched_avg *sa = &cfs_rq->avg;
 	int decayed, removed_load = 0, removed_util = 0;
@@ -3362,7 +3384,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
-	if (update_freq && (decayed || removed_util))
+	if (decayed || removed_util)
 		cfs_rq_util_change(cfs_rq);
 
 	return decayed || removed_load;
@@ -3390,7 +3412,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
 	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
 		__update_load_avg_se(now, cpu, cfs_rq, se);
 
-	decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
+	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
 	decayed |= propagate_entity_load_avg(se);
 
 	if (decayed && (flags & UPDATE_TG))
@@ -3534,7 +3556,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
 #else /* CONFIG_SMP */
 
 static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
 	return 0;
 }
@@ -3544,7 +3566,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
 static inline void update_load_avg(struct sched_entity *se, int not_used1)
 {
-	cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+	cfs_rq_util_change(cfs_rq_of(se));
 }
 
 static inline void
@@ -5125,9 +5147,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
 }
 
 /* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(struct rq *rq)
 {
-	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+	return cfs_rq_runnable_load_avg(&rq->cfs);
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -5172,7 +5194,7 @@ static void cpu_load_update_idle(struct rq *this_rq)
 	/*
 	 * bail if there's load or we're actually up-to-date.
 	 */
-	if (weighted_cpuload(cpu_of(this_rq)))
+	if (weighted_cpuload(this_rq))
 		return;
 
 	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -5193,7 +5215,7 @@ void cpu_load_update_nohz_start(void)
 	 * concurrently we'll exit nohz. And cpu_load write can race with
 	 * cpu_load_update_idle() but both updater would be writing the same.
 	 */
-	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+	this_rq->cpu_load[0] = weighted_cpuload(this_rq);
 }
 
 /*
@@ -5209,7 +5231,7 @@ void cpu_load_update_nohz_stop(void)
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
-	load = weighted_cpuload(cpu_of(this_rq));
+	load = weighted_cpuload(this_rq);
 	rq_lock(this_rq, &rf);
 	update_rq_clock(this_rq);
 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -5235,7 +5257,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
  */
 void cpu_load_update_active(struct rq *this_rq)
 {
-	unsigned long load = weighted_cpuload(cpu_of(this_rq));
+	unsigned long load = weighted_cpuload(this_rq);
 
 	if (tick_nohz_tick_stopped())
 		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -5253,7 +5275,7 @@ void cpu_load_update_active(struct rq *this_rq)
 static unsigned long source_load(int cpu, int type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = weighted_cpuload(rq);
 
 	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
@@ -5268,7 +5290,7 @@ static unsigned long source_load(int cpu, int type)
 static unsigned long target_load(int cpu, int type)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = weighted_cpuload(rq);
 
 	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
@@ -5290,7 +5312,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-	unsigned long load_avg = weighted_cpuload(cpu);
+	unsigned long load_avg = weighted_cpuload(rq);
 
 	if (nr_running)
 		return load_avg / nr_running;
@@ -5345,20 +5367,115 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
+struct llc_stats {
+	unsigned long	nr_running;
+	unsigned long	load;
+	unsigned long	capacity;
+	int		has_capacity;
+};
+
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+	if (!sds)
+		return false;
+
+	stats->nr_running	= READ_ONCE(sds->nr_running);
+	stats->load		= READ_ONCE(sds->load);
+	stats->capacity		= READ_ONCE(sds->capacity);
+	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+
+	return true;
+}
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ *
+ * Since we're running on 'stale' values, we might in fact create an imbalance,
+ * but recomputing these values is expensive, as that'd mean iterating over two
+ * cache domains' worth of CPUs.
+ */
+static bool
+wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+		int this_cpu, int prev_cpu, int sync)
+{
+	struct llc_stats prev_stats, this_stats;
+	s64 this_eff_load, prev_eff_load;
+	unsigned long task_load;
+
+	if (!get_llc_stats(&prev_stats, prev_cpu) ||
+	    !get_llc_stats(&this_stats, this_cpu))
+		return false;
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current LLC.
+	 */
+	if (sync) {
+		unsigned long current_load = task_h_load(current);
+
+		/* in this case load hits 0 and this LLC is considered 'idle' */
+		if (current_load > this_stats.load)
+			return true;
+
+		this_stats.load -= current_load;
+	}
+
+	/*
+	 * The has_capacity stuff is not SMT aware, but by trying to balance
+	 * the nr_running on both ends we try and fill the domain at equal
+	 * rates, thereby first consuming cores before siblings.
+	 */
+
+	/* if the old cache has capacity, stay there */
+	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+		return false;
+
+	/* if this cache has capacity, come here */
+	if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+		return true;
+
+	/*
+	 * Check to see if we can move the load without causing too much
+	 * imbalance.
+	 */
+	task_load = task_h_load(p);
+
+	this_eff_load = 100;
+	this_eff_load *= prev_stats.capacity;
+
+	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= this_stats.capacity;
+
+	this_eff_load *= this_stats.load + task_load;
+	prev_eff_load *= prev_stats.load - task_load;
+
+	return this_eff_load <= prev_eff_load;
+}
+
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine = false;
+	bool affine;
 
 	/*
-	 * Common case: CPUs are in the same socket, and select_idle_sibling()
-	 * will do its thing regardless of what we return:
+	 * Default to no affine wakeups; wake_affine() should not effect a task
+	 * placement the load-balancer feels inclined to undo. The conservative
+	 * option is therefore to not move tasks when they wake up.
 	 */
-	if (cpus_share_cache(prev_cpu, this_cpu))
-		affine = true;
-	else
-		affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+	affine = false;
+
+	/*
+	 * If the wakeup is across cache domains, try to evaluate if movement
+	 * makes sense, otherwise rely on select_idle_sibling() to do
+	 * placement inside the cache domain.
+	 */
+	if (!cpus_share_cache(prev_cpu, this_cpu))
+		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
@@ -5550,7 +5667,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 				shallowest_idle_cpu = i;
 			}
 		} else if (shallowest_idle_cpu == -1) {
-			load = weighted_cpuload(i);
+			load = weighted_cpuload(cpu_rq(i));
 			if (load < min_load || (load == min_load && i == this_cpu)) {
 				min_load = load;
 				least_loaded_cpu = i;
@@ -6187,10 +6304,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 	int new_tasks;
 
 again:
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	if (!cfs_rq->nr_running)
 		goto idle;
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	if (prev->sched_class != &fair_sched_class)
 		goto simple;
 
@@ -6220,11 +6337,17 @@ again:
 			/*
 			 * This call to check_cfs_rq_runtime() will do the
 			 * throttle and dequeue its entity in the parent(s).
-			 * Therefore the 'simple' nr_running test will indeed
+			 * Therefore the nr_running test will indeed
 			 * be correct.
 			 */
-			if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+			if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+				cfs_rq = &rq->cfs;
+
+				if (!cfs_rq->nr_running)
+					goto idle;
+
 				goto simple;
+			}
 		}
 
 		se = pick_next_entity(cfs_rq, curr);
@@ -6264,12 +6387,8 @@ again:
 
 	return p;
 simple:
-	cfs_rq = &rq->cfs;
 #endif
 
-	if (!cfs_rq->nr_running)
-		goto idle;
-
 	put_prev_task(rq, prev);
 
 	do {
@@ -6917,7 +7036,7 @@ static void update_blocked_averages(int cpu)
 		if (throttled_hierarchy(cfs_rq))
 			continue;
 
-		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
+		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
 			update_tg_load_avg(cfs_rq, 0);
 
 		/* Propagate pending load changes to the parent, if any: */
@@ -6990,7 +7109,7 @@ static inline void update_blocked_averages(int cpu)
 
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
-	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
+	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
 	rq_unlock_irqrestore(rq, &rf);
 }
 
@@ -7036,6 +7155,7 @@ struct sg_lb_stats {
 struct sd_lb_stats {
 	struct sched_group *busiest;	/* Busiest group in this sd */
 	struct sched_group *local;	/* Local group in this sd */
+	unsigned long total_running;
 	unsigned long total_load;	/* Total load of all groups in sd */
 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
 	unsigned long avg_load;	/* Average load across all groups in sd */
@@ -7055,6 +7175,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 	*sds = (struct sd_lb_stats){
 		.busiest = NULL,
 		.local = NULL,
+		.total_running = 0UL,
 		.total_load = 0UL,
 		.total_capacity = 0UL,
 		.busiest_stat = {
@@ -7363,7 +7484,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
-		sgs->sum_weighted_load += weighted_cpuload(i);
+		sgs->sum_weighted_load += weighted_cpuload(rq);
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
@@ -7490,6 +7611,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
+	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7546,6 +7668,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 
 next_group:
 		/* Now, start updating sd_lb_stats */
+		sds->total_running += sgs->sum_nr_running;
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
 
@@ -7561,6 +7684,21 @@ next_group:
 			env->dst_rq->rd->overload = overload;
 	}
 
+	if (!shared)
+		return;
+
+	/*
+	 * Since these are sums over groups they can contain some CPUs
+	 * multiple times for the NUMA domains.
+	 *
+	 * Currently only wake_affine_llc() and find_busiest_group()
+	 * use these numbers; only the latter is affected by this problem.
+	 *
+	 * XXX fix that.
+	 */
+	WRITE_ONCE(shared->nr_running,	sds->total_running);
+	WRITE_ONCE(shared->load,	sds->total_load);
+	WRITE_ONCE(shared->capacity,	sds->total_capacity);
 }
 
 /**
@@ -7790,6 +7928,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	if (!sds.busiest || busiest->sum_nr_running == 0)
 		goto out_balanced;
 
+	/* XXX broken for overlapping NUMA groups */
 	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
 						/ sds.total_capacity;
 
@@ -7892,7 +8031,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 
 		capacity = capacity_of(i);
 
-		wl = weighted_cpuload(i);
+		wl = weighted_cpuload(rq);
 
 		/*
 		 * When comparing with imbalance, use weighted_cpuload()
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 000000000000..a92fddc22747
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/cpumask.h>
+
+#include "sched.h"	/* for cpu_rq(). */
+
+/*
+ * Bitmask made from an "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK	\
+	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+
+static void ipi_mb(void *info)
+{
+	smp_mb();	/* IPIs should be serializing but paranoid. */
+}
+
+static void membarrier_private_expedited(void)
+{
+	int cpu;
+	bool fallback = false;
+	cpumask_var_t tmpmask;
+
+	if (num_online_cpus() == 1)
+		return;
+
+	/*
+	 * Matches memory barriers around rq->curr modification in
+	 * scheduler.
+	 */
+	smp_mb();	/* system call entry is not a mb. */
+
+	/*
+	 * Expedited membarrier commands guarantee that they won't
+	 * block, hence the GFP_NOWAIT allocation flag and fallback
+	 * implementation.
+	 */
+	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+		/* Fallback for OOM. */
+		fallback = true;
+	}
+
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		struct task_struct *p;
+
+		/*
+		 * Skipping the current CPU is OK even though we can be
+		 * migrated at any point. The current CPU, at the point
+		 * where we read raw_smp_processor_id(), is ensured to
+		 * be in program order with respect to the caller
+		 * thread. Therefore, we can skip this CPU from the
+		 * iteration.
+		 */
+		if (cpu == raw_smp_processor_id())
+			continue;
+		rcu_read_lock();
+		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+		if (p && p->mm == current->mm) {
+			if (!fallback)
+				__cpumask_set_cpu(cpu, tmpmask);
+			else
+				smp_call_function_single(cpu, ipi_mb, NULL, 1);
+		}
+		rcu_read_unlock();
+	}
+	if (!fallback) {
+		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+		free_cpumask_var(tmpmask);
+	}
+	cpus_read_unlock();
+
+	/*
+	 * Memory barrier on the caller thread _after_ we finished
+	 * waiting for the last IPI. Matches memory barriers around
+	 * rq->curr modification in scheduler.
+	 */
+	smp_mb();	/* exit from system call is not a mb */
+}
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd:   Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, is not available on the running
+ * kernel, or if the command argument is invalid, this system call
+ * returns -EINVAL. For a given command, with flags argument set to 0,
+ * this system call is guaranteed to always return the same value until
+ * reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * are guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ *                        barrier()   smp_mb() sys_membarrier()
+ *        barrier()          X           X            O
+ *        smp_mb()           X           O            O
+ *        sys_membarrier()   O           O            O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+	if (unlikely(flags))
+		return -EINVAL;
+	switch (cmd) {
+	case MEMBARRIER_CMD_QUERY:
+	{
+		int cmd_mask = MEMBARRIER_CMD_BITMASK;
+
+		if (tick_nohz_full_enabled())
+			cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+		return cmd_mask;
+	}
+	case MEMBARRIER_CMD_SHARED:
+		/* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+		if (tick_nohz_full_enabled())
+			return -EINVAL;
+		if (num_online_cpus() > 1)
+			synchronize_sched();
+		return 0;
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+		membarrier_private_expedited();
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f29a7d2b57e1..ab1c7f5409a0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sched_group *sg);
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
 void register_sched_domain_sysctl(void);
+void dirty_sched_domain_sysctl(int cpu);
 void unregister_sched_domain_sysctl(void);
 #else
 static inline void register_sched_domain_sysctl(void)
 {
 }
+static inline void dirty_sched_domain_sysctl(int cpu)
+{
+}
 static inline void unregister_sched_domain_sysctl(void)
 {
 }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79895aec281e..6f7b43982f73 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
 
 static int init_rootdomain(struct root_domain *rd)
 {
-	memset(rd, 0, sizeof(*rd));
-
 	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
 		goto out;
 	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
@@ -311,7 +309,7 @@ static struct root_domain *alloc_rootdomain(void)
 {
 	struct root_domain *rd;
 
-	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
 	if (!rd)
 		return NULL;
 
@@ -337,7 +335,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
 		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
 			kfree(sg->sgc);
 
-		kfree(sg);
+		if (atomic_dec_and_test(&sg->ref))
+			kfree(sg);
 		sg = tmp;
 	} while (sg != first);
 }
@@ -345,15 +344,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
 static void destroy_sched_domain(struct sched_domain *sd)
 {
 	/*
-	 * If its an overlapping domain it has private groups, iterate and
-	 * nuke them all.
+	 * A normal sched domain may have multiple group references, an
+	 * overlapping domain, having private groups, only one.  Iterate,
+	 * dropping group/capacity references, freeing where none remain.
 	 */
-	if (sd->flags & SD_OVERLAP) {
-		free_sched_groups(sd->groups, 1);
-	} else if (atomic_dec_and_test(&sd->groups->ref)) {
-		kfree(sd->groups->sgc);
-		kfree(sd->groups);
-	}
+	free_sched_groups(sd->groups, 1);
+
 	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
 		kfree(sd->shared);
 	kfree(sd);
@@ -463,6 +459,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 	rq_attach_root(rq, rd);
 	tmp = rq->sd;
 	rcu_assign_pointer(rq->sd, sd);
+	dirty_sched_domain_sysctl(cpu);
 	destroy_sched_domains(tmp);
 
 	update_top_cache_domain(cpu);
@@ -670,6 +667,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
 	else
 		cpumask_copy(sg_span, sched_domain_span(sd));
 
+	atomic_inc(&sg->ref);
 	return sg;
 }
 
@@ -1595,7 +1593,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
 	}
 }
 
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
 {
@@ -1854,7 +1852,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 	/* Let the architecture update CPU core mappings: */
 	new_topology = arch_update_cpu_topology();
 
-	n = doms_new ? ndoms_new : 0;
+	if (!doms_new) {
+		WARN_ON_ONCE(dattr_new);
+		n = 0;
+		doms_new = alloc_sched_domains(1);
+		if (doms_new) {
+			n = 1;
+			cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+		}
+	} else {
+		n = ndoms_new;
+	}
 
 	/* Destroy deleted domains: */
 	for (i = 0; i < ndoms_cur; i++) {
@@ -1870,11 +1878,10 @@ match1:
 	}
 
 	n = ndoms_cur;
-	if (doms_new == NULL) {
+	if (!doms_new) {
 		n = 0;
 		doms_new = &fallback_doms;
 		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
-		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains: */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d513051fcca2..836a72a66fba 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -96,20 +96,16 @@ void task_work_run(void)
 		 * work->func() can do task_work_add(), do not set
 		 * work_exited unless the list is empty.
 		 */
+		raw_spin_lock_irq(&task->pi_lock);
 		do {
 			work = READ_ONCE(task->task_works);
 			head = !work && (task->flags & PF_EXITING) ?
 				&work_exited : NULL;
 		} while (cmpxchg(&task->task_works, work, head) != work);
+		raw_spin_unlock_irq(&task->pi_lock);
 
 		if (!work)
 			break;
-		/*
-		 * Synchronize with task_work_cancel(). It can't remove
-		 * the first entry == work, cmpxchg(task_works) should
-		 * fail, but it can play with *work and other entries.
-		 */
-		raw_spin_unlock_wait(&task->pi_lock);
 
 		do {
 			next = work->next;
diff --git a/kernel/torture.c b/kernel/torture.c
index 55de96529287..637e172835d8 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
 				 torture_type, cpu);
 		(*n_offl_successes)++;
 		delta = jiffies - starttime;
-		sum_offl += delta;
+		*sum_offl += delta;
 		if (*min_offl < 0) {
 			*min_offl = delta;
 			*max_offl = delta;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e718df3cbd46..7396f5044397 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -374,6 +374,9 @@ config STACK_VALIDATION
 	  pointers (if CONFIG_FRAME_POINTER is enabled).  This helps ensure
 	  that runtime stack traces are more reliable.
 
+	  This is also a prerequisite for generation of ORC unwind data, which
+	  is needed for CONFIG_ORC_UNWINDER.
+
 	  For more information, see
 	  tools/objtool/Documentation/stack-validation.txt.
 
@@ -1130,7 +1133,7 @@ config LOCKDEP
 	bool
 	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
 	select STACKTRACE
-	select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE
+	select FRAME_POINTER if !MIPS && !PPC && !ARM_UNWIND && !S390 && !MICROBLAZE && !ARC && !SCORE && !X86
 	select KALLSYMS
 	select KALLSYMS_ALL
 
@@ -1565,7 +1568,7 @@ config FAULT_INJECTION_STACKTRACE_FILTER
 	depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
 	depends on !X86_64
 	select STACKTRACE
-	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE
+	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE && !X86
 	help
 	  Provide stacktrace filter for fault-injection capabilities
 
@@ -1574,7 +1577,7 @@ config LATENCYTOP
 	depends on DEBUG_KERNEL
 	depends on STACKTRACE_SUPPORT
 	depends on PROC_FS
-	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC
+	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !X86
 	select KALLSYMS
 	select KALLSYMS_ALL
 	select STACKTRACE
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 17afb0430161..2f5349c6e81a 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -18,6 +18,7 @@
 #include <linux/debugfs.h>
 #include <linux/slab.h>
 #include <linux/hash.h>
+#include <linux/kmemleak.h>
 
 #define ODEBUG_HASH_BITS	14
 #define ODEBUG_HASH_SIZE	(1 << ODEBUG_HASH_BITS)
@@ -110,6 +111,7 @@ static void fill_pool(void)
 		if (!new)
 			return;
 
+		kmemleak_ignore(new);
 		raw_spin_lock_irqsave(&pool_lock, flags);
 		hlist_add_head(&new->node, &obj_pool);
 		debug_objects_allocated++;
@@ -1080,6 +1082,7 @@ static int __init debug_objects_replace_static_objects(void)
 		obj = kmem_cache_zalloc(obj_cache, GFP_KERNEL);
 		if (!obj)
 			goto free;
+		kmemleak_ignore(obj);
 		hlist_add_head(&obj->node, &objects);
 	}
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 9979f46c81dc..51390febd5e3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -96,19 +96,26 @@ static struct conntrack_gc_work conntrack_gc_work;
 
 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
 {
+	/* 1) Acquire the lock */
 	spin_lock(lock);
-	while (unlikely(nf_conntrack_locks_all)) {
-		spin_unlock(lock);
 
-		/*
-		 * Order the 'nf_conntrack_locks_all' load vs. the
-		 * spin_unlock_wait() loads below, to ensure
-		 * that 'nf_conntrack_locks_all_lock' is indeed held:
-		 */
-		smp_rmb(); /* spin_lock(&nf_conntrack_locks_all_lock) */
-		spin_unlock_wait(&nf_conntrack_locks_all_lock);
-		spin_lock(lock);
-	}
+	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
+	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
+	 */
+	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
+		return;
+
+	/* fast path failed, unlock */
+	spin_unlock(lock);
+
+	/* Slow path 1) get global lock */
+	spin_lock(&nf_conntrack_locks_all_lock);
+
+	/* Slow path 2) get the lock we want */
+	spin_lock(lock);
+
+	/* Slow path 3) release the global lock */
+	spin_unlock(&nf_conntrack_locks_all_lock);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
 
@@ -149,28 +156,27 @@ static void nf_conntrack_all_lock(void)
 	int i;
 
 	spin_lock(&nf_conntrack_locks_all_lock);
-	nf_conntrack_locks_all = true;
 
-	/*
-	 * Order the above store of 'nf_conntrack_locks_all' against
-	 * the spin_unlock_wait() loads below, such that if
-	 * nf_conntrack_lock() observes 'nf_conntrack_locks_all'
-	 * we must observe nf_conntrack_locks[] held:
-	 */
-	smp_mb(); /* spin_lock(&nf_conntrack_locks_all_lock) */
+	nf_conntrack_locks_all = true;
 
 	for (i = 0; i < CONNTRACK_LOCKS; i++) {
-		spin_unlock_wait(&nf_conntrack_locks[i]);
+		spin_lock(&nf_conntrack_locks[i]);
+
+		/* This spin_unlock provides the "release" to ensure that
+		 * nf_conntrack_locks_all==true is visible to everyone that
+		 * acquired spin_lock(&nf_conntrack_locks[]).
+		 */
+		spin_unlock(&nf_conntrack_locks[i]);
 	}
 }
 
 static void nf_conntrack_all_unlock(void)
 {
-	/*
-	 * All prior stores must be complete before we clear
+	/* All prior stores must be complete before we clear
 	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
 	 * might observe the false value but not the entire
-	 * critical section:
+	 * critical section.
+	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
 	 */
 	smp_store_release(&nf_conntrack_locks_all, false);
 	spin_unlock(&nf_conntrack_locks_all_lock);
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index a18cb4496e1e..2e3a10e79ca9 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -258,7 +258,8 @@ ifneq ($(SKIP_STACK_VALIDATION),1)
 
 __objtool_obj := $(objtree)/tools/objtool/objtool
 
-objtool_args = check
+objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check)
+
 ifndef CONFIG_FRAME_POINTER
 objtool_args += --no-fp
 endif
@@ -279,6 +280,11 @@ objtool_obj = $(if $(patsubst y%,, \
 endif # SKIP_STACK_VALIDATION
 endif # CONFIG_STACK_VALIDATION
 
+# Rebuild all objects when objtool changes, or is enabled/disabled.
+objtool_dep = $(objtool_obj)					\
+	      $(wildcard include/config/orc/unwinder.h		\
+			 include/config/stack/validation.h)
+
 define rule_cc_o_c
 	$(call echo-cmd,checksrc) $(cmd_checksrc)			  \
 	$(call cmd_and_fixdep,cc_o_c)					  \
@@ -301,13 +307,13 @@ cmd_undef_syms = echo
 endif
 
 # Built-in and composite module parts
-$(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE
+$(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE
 	$(call cmd,force_checksrc)
 	$(call if_changed_rule,cc_o_c)
 
 # Single-part modules are special since we need to mark them in $(MODVERDIR)
 
-$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE
+$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE
 	$(call cmd,force_checksrc)
 	$(call if_changed_rule,cc_o_c)
 	@{ echo $(@:.o=.ko); echo $@; \
@@ -402,7 +408,7 @@ cmd_modversions_S =								\
 endif
 endif
 
-$(obj)/%.o: $(src)/%.S $(objtool_obj) FORCE
+$(obj)/%.o: $(src)/%.S $(objtool_dep) FORCE
 	$(call if_changed_rule,as_o_S)
 
 targets += $(real-objs-y) $(real-objs-m) $(lib-y)
diff --git a/tools/Makefile b/tools/Makefile
index 221e1ce78b06..a19b176b914b 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -18,7 +18,6 @@ help:
 	@echo '  iio                    - IIO tools'
 	@echo '  kvm_stat               - top-like utility for displaying kvm statistics'
 	@echo '  leds                   - LEDs  tools'
-	@echo '  lguest                 - a minimal 32-bit x86 hypervisor'
 	@echo '  liblockdep             - user-space wrapper for kernel locking-validator'
 	@echo '  net                    - misc networking tools'
 	@echo '  perf                   - Linux performance measurement and analysis tool'
@@ -90,7 +89,7 @@ freefall: FORCE
 kvm_stat: FORCE
 	$(call descend,kvm/$@)
 
-all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \
+all: acpi cgroup cpupower gpio hv firewire liblockdep \
 		perf selftests turbostat usb \
 		virtio vm net x86_energy_perf_policy \
 		tmon freefall objtool kvm_stat
@@ -101,7 +100,7 @@ acpi_install:
 cpupower_install:
 	$(call descend,power/$(@:_install=),install)
 
-cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install:
+cgroup_install firewire_install gpio_install hv_install perf_install usb_install virtio_install vm_install net_install objtool_install:
 	$(call descend,$(@:_install=),install)
 
 liblockdep_install:
@@ -123,7 +122,7 @@ kvm_stat_install:
 	$(call descend,kvm/$(@:_install=),install)
 
 install: acpi_install cgroup_install cpupower_install gpio_install \
-		hv_install firewire_install lguest_install liblockdep_install \
+		hv_install firewire_install liblockdep_install \
 		perf_install selftests_install turbostat_install usb_install \
 		virtio_install vm_install net_install x86_energy_perf_policy_install \
 		tmon_install freefall_install objtool_install kvm_stat_install
@@ -134,7 +133,7 @@ acpi_clean:
 cpupower_clean:
 	$(call descend,power/cpupower,clean)
 
-cgroup_clean hv_clean firewire_clean lguest_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
+cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean:
 	$(call descend,$(@:_clean=),clean)
 
 liblockdep_clean:
@@ -168,7 +167,7 @@ freefall_clean:
 build_clean:
 	$(call descend,build,clean)
 
-clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean lguest_clean \
+clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \
 		perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \
 		vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \
 		freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \
diff --git a/tools/arch/arm/include/uapi/asm/kvm.h b/tools/arch/arm/include/uapi/asm/kvm.h
index 5e3c673fa3f4..5db2d4c6a55f 100644
--- a/tools/arch/arm/include/uapi/asm/kvm.h
+++ b/tools/arch/arm/include/uapi/asm/kvm.h
@@ -203,6 +203,14 @@ struct kvm_arch_memory_slot {
 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
 #define VGIC_LEVEL_INFO_LINE_LEVEL	0
 
+/* Device Control API on vcpu fd */
+#define KVM_ARM_VCPU_PMU_V3_CTRL	0
+#define   KVM_ARM_VCPU_PMU_V3_IRQ	0
+#define   KVM_ARM_VCPU_PMU_V3_INIT	1
+#define KVM_ARM_VCPU_TIMER_CTRL		1
+#define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER		0
+#define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER		1
+
 #define   KVM_DEV_ARM_VGIC_CTRL_INIT		0
 #define   KVM_DEV_ARM_ITS_SAVE_TABLES		1
 #define   KVM_DEV_ARM_ITS_RESTORE_TABLES	2
diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h
index 70eea2ecc663..9f3ca24bbcc6 100644
--- a/tools/arch/arm64/include/uapi/asm/kvm.h
+++ b/tools/arch/arm64/include/uapi/asm/kvm.h
@@ -232,6 +232,9 @@ struct kvm_arch_memory_slot {
 #define KVM_ARM_VCPU_PMU_V3_CTRL	0
 #define   KVM_ARM_VCPU_PMU_V3_IRQ	0
 #define   KVM_ARM_VCPU_PMU_V3_INIT	1
+#define KVM_ARM_VCPU_TIMER_CTRL		1
+#define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER		0
+#define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER		1
 
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT		24
diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h
index 07fbeb927834..8cf8f0c96906 100644
--- a/tools/arch/powerpc/include/uapi/asm/kvm.h
+++ b/tools/arch/powerpc/include/uapi/asm/kvm.h
@@ -60,6 +60,12 @@ struct kvm_regs {
 
 #define KVM_SREGS_E_FSL_PIDn	(1 << 0) /* PID1/PID2 */
 
+/* flags for kvm_run.flags */
+#define KVM_RUN_PPC_NMI_DISP_MASK		(3 << 0)
+#define   KVM_RUN_PPC_NMI_DISP_FULLY_RECOV	(1 << 0)
+#define   KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV	(2 << 0)
+#define   KVM_RUN_PPC_NMI_DISP_NOT_RECOV	(3 << 0)
+
 /*
  * Feature bits indicate which sections of the sregs struct are valid,
  * both in KVM_GET_SREGS and KVM_SET_SREGS.  On KVM_SET_SREGS, registers
diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h
index 3dd2a1d308dd..69d09c39bbcd 100644
--- a/tools/arch/s390/include/uapi/asm/kvm.h
+++ b/tools/arch/s390/include/uapi/asm/kvm.h
@@ -28,6 +28,7 @@
 #define KVM_DEV_FLIC_CLEAR_IO_IRQ	8
 #define KVM_DEV_FLIC_AISM		9
 #define KVM_DEV_FLIC_AIRQ_INJECT	10
+#define KVM_DEV_FLIC_AISM_ALL		11
 /*
  * We can have up to 4*64k pending subchannels + 8 adapter interrupts,
  * as well as up  to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts.
@@ -53,6 +54,11 @@ struct kvm_s390_ais_req {
 	__u16 mode;
 };
 
+struct kvm_s390_ais_all {
+	__u8 simm;
+	__u8 nimm;
+};
+
 #define KVM_S390_IO_ADAPTER_MASK 1
 #define KVM_S390_IO_ADAPTER_MAP 2
 #define KVM_S390_IO_ADAPTER_UNMAP 3
@@ -70,6 +76,7 @@ struct kvm_s390_io_adapter_req {
 #define KVM_S390_VM_TOD			1
 #define KVM_S390_VM_CRYPTO		2
 #define KVM_S390_VM_CPU_MODEL		3
+#define KVM_S390_VM_MIGRATION		4
 
 /* kvm attributes for mem_ctrl */
 #define KVM_S390_VM_MEM_ENABLE_CMMA	0
@@ -151,6 +158,11 @@ struct kvm_s390_vm_cpu_subfunc {
 #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW	2
 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW	3
 
+/* kvm attributes for migration mode */
+#define KVM_S390_VM_MIGRATION_STOP	0
+#define KVM_S390_VM_MIGRATION_START	1
+#define KVM_S390_VM_MIGRATION_STATUS	2
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
 	/* general purpose regs for s390 */
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 2701e5f8145b..8ea315a11fe0 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -177,7 +177,7 @@
 #define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */
 #define X86_FEATURE_BPEXT	(6*32+26) /* data breakpoint extension */
 #define X86_FEATURE_PTSC	( 6*32+27) /* performance time-stamp counter */
-#define X86_FEATURE_PERFCTR_L2	( 6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_PERFCTR_LLC	( 6*32+28) /* Last Level Cache performance counter extensions */
 #define X86_FEATURE_MWAITX	( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
 
 /*
@@ -286,6 +286,7 @@
 #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
 #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
 #define X86_FEATURE_AVIC	(15*32+13) /* Virtual Interrupt Controller */
+#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
 #define X86_FEATURE_AVX512VBMI  (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
diff --git a/tools/arch/x86/include/asm/unistd_32.h b/tools/arch/x86/include/asm/unistd_32.h
index 88b3f8c8920c..0e4312ffc945 100644
--- a/tools/arch/x86/include/asm/unistd_32.h
+++ b/tools/arch/x86/include/asm/unistd_32.h
@@ -10,3 +10,6 @@
 #ifndef __NR_getcpu
 # define __NR_getcpu 318
 #endif
+#ifndef __NR_setns
+# define __NR_setns 346
+#endif
diff --git a/tools/arch/x86/include/asm/unistd_64.h b/tools/arch/x86/include/asm/unistd_64.h
index fbdb70ee8837..dd56bb36132a 100644
--- a/tools/arch/x86/include/asm/unistd_64.h
+++ b/tools/arch/x86/include/asm/unistd_64.h
@@ -10,3 +10,6 @@
 #ifndef __NR_getcpu
 # define __NR_getcpu 309
 #endif
+#ifndef __NR_setns
+#define __NR_setns 308
+#endif
diff --git a/tools/arch/x86/include/uapi/asm/unistd.h b/tools/arch/x86/include/uapi/asm/unistd.h
new file mode 100644
index 000000000000..a26df0d75cd0
--- /dev/null
+++ b/tools/arch/x86/include/uapi/asm/unistd.h
@@ -0,0 +1,17 @@
+#ifndef _UAPI_ASM_X86_UNISTD_H
+#define _UAPI_ASM_X86_UNISTD_H
+
+/* x32 syscall flag bit */
+#define __X32_SYSCALL_BIT	0x40000000
+
+#ifndef __KERNEL__
+# ifdef __i386__
+#  include <asm/unistd_32.h>
+# elif defined(__ILP32__)
+#  include <asm/unistd_x32.h>
+# else
+#  include <asm/unistd_64.h>
+# endif
+#endif
+
+#endif /* _UAPI_ASM_X86_UNISTD_H */
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 523911f316ce..c71a05b9c984 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -64,7 +64,8 @@ FEATURE_TESTS_BASIC :=                  \
         get_cpuid                       \
         bpf                             \
         sched_getcpu			\
-        sdt
+        sdt				\
+        setns
 
 # FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
 # of all feature tests
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index e35e4e5ad192..ee2546ddf028 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -49,7 +49,8 @@ FILES=                                          \
          test-sdt.bin                           \
          test-cxx.bin                           \
          test-jvmti.bin				\
-         test-sched_getcpu.bin
+         test-sched_getcpu.bin			\
+         test-setns.bin
 
 FILES := $(addprefix $(OUTPUT),$(FILES))
 
@@ -95,6 +96,9 @@ $(OUTPUT)test-glibc.bin:
 $(OUTPUT)test-sched_getcpu.bin:
 	$(BUILD)
 
+$(OUTPUT)test-setns.bin:
+	$(BUILD)
+
 DWARFLIBS := -ldw
 ifeq ($(findstring -static,${LDFLAGS}),-static)
 DWARFLIBS += -lelf -lebl -lz -llzma -lbz2
diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c
index cc6c7c01f4ca..b5cfc6445771 100644
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -153,6 +153,10 @@
 # include "test-sdt.c"
 #undef main
 
+#define main main_test_setns
+# include "test-setns.c"
+#undef main
+
 int main(int argc, char *argv[])
 {
 	main_test_libpython();
@@ -188,6 +192,7 @@ int main(int argc, char *argv[])
 	main_test_libcrypto();
 	main_test_sched_getcpu();
 	main_test_sdt();
+	main_test_setns();
 
 	return 0;
 }
diff --git a/tools/build/feature/test-setns.c b/tools/build/feature/test-setns.c
new file mode 100644
index 000000000000..1f714d2a658b
--- /dev/null
+++ b/tools/build/feature/test-setns.c
@@ -0,0 +1,7 @@
+#define _GNU_SOURCE
+#include <sched.h>
+
+int main(void)
+{
+	return setns(0, 0);
+}
diff --git a/tools/build/tests/ex/Makefile b/tools/build/tests/ex/Makefile
index c50d5782ad5a..027d6c8a58a7 100644
--- a/tools/build/tests/ex/Makefile
+++ b/tools/build/tests/ex/Makefile
@@ -8,7 +8,7 @@ ex:
 include $(srctree)/tools/build/Makefile.include
 
 ex: ex-in.o libex-in.o
-	gcc -o $@ $^
+	$(CC) -o $@ $^
 
 ex.%: fixdep FORCE
 	make -f $(srctree)/tools/build/Makefile.build dir=. $@
diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h
index d62b56cf8c12..a30fad536f52 100644
--- a/tools/include/linux/string.h
+++ b/tools/include/linux/string.h
@@ -1,8 +1,8 @@
 #ifndef _TOOLS_LINUX_STRING_H_
 #define _TOOLS_LINUX_STRING_H_
 
-
 #include <linux/types.h>	/* for size_t */
+#include <string.h>
 
 void *memdup(const void *src, size_t len);
 
@@ -18,6 +18,14 @@ extern size_t strlcpy(char *dest, const char *src, size_t size);
 
 char *str_error_r(int errnum, char *buf, size_t buflen);
 
-int prefixcmp(const char *str, const char *prefix);
+/**
+ * strstarts - does @str start with @prefix?
+ * @str: string to examine
+ * @prefix: prefix to look for.
+ */
+static inline bool strstarts(const char *str, const char *prefix)
+{
+	return strncmp(str, prefix, strlen(prefix)) == 0;
+}
 
 #endif /* _LINUX_STRING_H_ */
diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h
new file mode 100644
index 000000000000..ac190958c981
--- /dev/null
+++ b/tools/include/uapi/asm-generic/fcntl.h
@@ -0,0 +1,220 @@
+#ifndef _ASM_GENERIC_FCNTL_H
+#define _ASM_GENERIC_FCNTL_H
+
+#include <linux/types.h>
+
+/*
+ * FMODE_EXEC is 0x20
+ * FMODE_NONOTIFY is 0x4000000
+ * These cannot be used by userspace O_* until internal and external open
+ * flags are split.
+ * -Eric Paris
+ */
+
+/*
+ * When introducing new O_* bits, please check its uniqueness in fcntl_init().
+ */
+
+#define O_ACCMODE	00000003
+#define O_RDONLY	00000000
+#define O_WRONLY	00000001
+#define O_RDWR		00000002
+#ifndef O_CREAT
+#define O_CREAT		00000100	/* not fcntl */
+#endif
+#ifndef O_EXCL
+#define O_EXCL		00000200	/* not fcntl */
+#endif
+#ifndef O_NOCTTY
+#define O_NOCTTY	00000400	/* not fcntl */
+#endif
+#ifndef O_TRUNC
+#define O_TRUNC		00001000	/* not fcntl */
+#endif
+#ifndef O_APPEND
+#define O_APPEND	00002000
+#endif
+#ifndef O_NONBLOCK
+#define O_NONBLOCK	00004000
+#endif
+#ifndef O_DSYNC
+#define O_DSYNC		00010000	/* used to be O_SYNC, see below */
+#endif
+#ifndef FASYNC
+#define FASYNC		00020000	/* fcntl, for BSD compatibility */
+#endif
+#ifndef O_DIRECT
+#define O_DIRECT	00040000	/* direct disk access hint */
+#endif
+#ifndef O_LARGEFILE
+#define O_LARGEFILE	00100000
+#endif
+#ifndef O_DIRECTORY
+#define O_DIRECTORY	00200000	/* must be a directory */
+#endif
+#ifndef O_NOFOLLOW
+#define O_NOFOLLOW	00400000	/* don't follow links */
+#endif
+#ifndef O_NOATIME
+#define O_NOATIME	01000000
+#endif
+#ifndef O_CLOEXEC
+#define O_CLOEXEC	02000000	/* set close_on_exec */
+#endif
+
+/*
+ * Before Linux 2.6.33 only O_DSYNC semantics were implemented, but using
+ * the O_SYNC flag.  We continue to use the existing numerical value
+ * for O_DSYNC semantics now, but using the correct symbolic name for it.
+ * This new value is used to request true Posix O_SYNC semantics.  It is
+ * defined in this strange way to make sure applications compiled against
+ * new headers get at least O_DSYNC semantics on older kernels.
+ *
+ * This has the nice side-effect that we can simply test for O_DSYNC
+ * wherever we do not care if O_DSYNC or O_SYNC is used.
+ *
+ * Note: __O_SYNC must never be used directly.
+ */
+#ifndef O_SYNC
+#define __O_SYNC	04000000
+#define O_SYNC		(__O_SYNC|O_DSYNC)
+#endif
+
+#ifndef O_PATH
+#define O_PATH		010000000
+#endif
+
+#ifndef __O_TMPFILE
+#define __O_TMPFILE	020000000
+#endif
+
+/* a horrid kludge trying to make sure that this will fail on old kernels */
+#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
+#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)
+
+#ifndef O_NDELAY
+#define O_NDELAY	O_NONBLOCK
+#endif
+
+#define F_DUPFD		0	/* dup */
+#define F_GETFD		1	/* get close_on_exec */
+#define F_SETFD		2	/* set/clear close_on_exec */
+#define F_GETFL		3	/* get file->f_flags */
+#define F_SETFL		4	/* set file->f_flags */
+#ifndef F_GETLK
+#define F_GETLK		5
+#define F_SETLK		6
+#define F_SETLKW	7
+#endif
+#ifndef F_SETOWN
+#define F_SETOWN	8	/* for sockets. */
+#define F_GETOWN	9	/* for sockets. */
+#endif
+#ifndef F_SETSIG
+#define F_SETSIG	10	/* for sockets. */
+#define F_GETSIG	11	/* for sockets. */
+#endif
+
+#ifndef CONFIG_64BIT
+#ifndef F_GETLK64
+#define F_GETLK64	12	/*  using 'struct flock64' */
+#define F_SETLK64	13
+#define F_SETLKW64	14
+#endif
+#endif
+
+#ifndef F_SETOWN_EX
+#define F_SETOWN_EX	15
+#define F_GETOWN_EX	16
+#endif
+
+#ifndef F_GETOWNER_UIDS
+#define F_GETOWNER_UIDS	17
+#endif
+
+/*
+ * Open File Description Locks
+ *
+ * Usually record locks held by a process are released on *any* close and are
+ * not inherited across a fork().
+ *
+ * These cmd values will set locks that conflict with process-associated
+ * record locks, but are "owned" by the open file description, not the
+ * process. This means that they are inherited across fork() like BSD (flock)
+ * locks, and they are only released automatically when the last reference to
+ * the open file against which they were acquired is put.
+ */
+#define F_OFD_GETLK	36
+#define F_OFD_SETLK	37
+#define F_OFD_SETLKW	38
+
+#define F_OWNER_TID	0
+#define F_OWNER_PID	1
+#define F_OWNER_PGRP	2
+
+struct f_owner_ex {
+	int	type;
+	__kernel_pid_t	pid;
+};
+
+/* for F_[GET|SET]FD */
+#define FD_CLOEXEC	1	/* actually anything with low bit set goes */
+
+/* for posix fcntl() and lockf() */
+#ifndef F_RDLCK
+#define F_RDLCK		0
+#define F_WRLCK		1
+#define F_UNLCK		2
+#endif
+
+/* for old implementation of bsd flock () */
+#ifndef F_EXLCK
+#define F_EXLCK		4	/* or 3 */
+#define F_SHLCK		8	/* or 4 */
+#endif
+
+/* operations for bsd flock(), also used by the kernel implementation */
+#define LOCK_SH		1	/* shared lock */
+#define LOCK_EX		2	/* exclusive lock */
+#define LOCK_NB		4	/* or'd with one of the above to prevent
+				   blocking */
+#define LOCK_UN		8	/* remove lock */
+
+#define LOCK_MAND	32	/* This is a mandatory flock ... */
+#define LOCK_READ	64	/* which allows concurrent read operations */
+#define LOCK_WRITE	128	/* which allows concurrent write operations */
+#define LOCK_RW		192	/* which allows concurrent read & write ops */
+
+#define F_LINUX_SPECIFIC_BASE	1024
+
+#ifndef HAVE_ARCH_STRUCT_FLOCK
+#ifndef __ARCH_FLOCK_PAD
+#define __ARCH_FLOCK_PAD
+#endif
+
+struct flock {
+	short	l_type;
+	short	l_whence;
+	__kernel_off_t	l_start;
+	__kernel_off_t	l_len;
+	__kernel_pid_t	l_pid;
+	__ARCH_FLOCK_PAD
+};
+#endif
+
+#ifndef HAVE_ARCH_STRUCT_FLOCK64
+#ifndef __ARCH_FLOCK64_PAD
+#define __ARCH_FLOCK64_PAD
+#endif
+
+struct flock64 {
+	short  l_type;
+	short  l_whence;
+	__kernel_loff_t l_start;
+	__kernel_loff_t l_len;
+	__kernel_pid_t  l_pid;
+	__ARCH_FLOCK64_PAD
+};
+#endif
+
+#endif /* _ASM_GENERIC_FCNTL_H */
diff --git a/tools/include/uapi/asm-generic/ioctls.h b/tools/include/uapi/asm-generic/ioctls.h
new file mode 100644
index 000000000000..14baf9f23a14
--- /dev/null
+++ b/tools/include/uapi/asm-generic/ioctls.h
@@ -0,0 +1,118 @@
+#ifndef __ASM_GENERIC_IOCTLS_H
+#define __ASM_GENERIC_IOCTLS_H
+
+#include <linux/ioctl.h>
+
+/*
+ * These are the most common definitions for tty ioctl numbers.
+ * Most of them do not use the recommended _IOC(), but there is
+ * probably some source code out there hardcoding the number,
+ * so we might as well use them for all new platforms.
+ *
+ * The architectures that use different values here typically
+ * try to be compatible with some Unix variants for the same
+ * architecture.
+ */
+
+/* 0x54 is just a magic number to make these relatively unique ('T') */
+
+#define TCGETS		0x5401
+#define TCSETS		0x5402
+#define TCSETSW		0x5403
+#define TCSETSF		0x5404
+#define TCGETA		0x5405
+#define TCSETA		0x5406
+#define TCSETAW		0x5407
+#define TCSETAF		0x5408
+#define TCSBRK		0x5409
+#define TCXONC		0x540A
+#define TCFLSH		0x540B
+#define TIOCEXCL	0x540C
+#define TIOCNXCL	0x540D
+#define TIOCSCTTY	0x540E
+#define TIOCGPGRP	0x540F
+#define TIOCSPGRP	0x5410
+#define TIOCOUTQ	0x5411
+#define TIOCSTI		0x5412
+#define TIOCGWINSZ	0x5413
+#define TIOCSWINSZ	0x5414
+#define TIOCMGET	0x5415
+#define TIOCMBIS	0x5416
+#define TIOCMBIC	0x5417
+#define TIOCMSET	0x5418
+#define TIOCGSOFTCAR	0x5419
+#define TIOCSSOFTCAR	0x541A
+#define FIONREAD	0x541B
+#define TIOCINQ		FIONREAD
+#define TIOCLINUX	0x541C
+#define TIOCCONS	0x541D
+#define TIOCGSERIAL	0x541E
+#define TIOCSSERIAL	0x541F
+#define TIOCPKT		0x5420
+#define FIONBIO		0x5421
+#define TIOCNOTTY	0x5422
+#define TIOCSETD	0x5423
+#define TIOCGETD	0x5424
+#define TCSBRKP		0x5425	/* Needed for POSIX tcsendbreak() */
+#define TIOCSBRK	0x5427  /* BSD compatibility */
+#define TIOCCBRK	0x5428  /* BSD compatibility */
+#define TIOCGSID	0x5429  /* Return the session ID of FD */
+#define TCGETS2		_IOR('T', 0x2A, struct termios2)
+#define TCSETS2		_IOW('T', 0x2B, struct termios2)
+#define TCSETSW2	_IOW('T', 0x2C, struct termios2)
+#define TCSETSF2	_IOW('T', 0x2D, struct termios2)
+#define TIOCGRS485	0x542E
+#ifndef TIOCSRS485
+#define TIOCSRS485	0x542F
+#endif
+#define TIOCGPTN	_IOR('T', 0x30, unsigned int) /* Get Pty Number (of pty-mux device) */
+#define TIOCSPTLCK	_IOW('T', 0x31, int)  /* Lock/unlock Pty */
+#define TIOCGDEV	_IOR('T', 0x32, unsigned int) /* Get primary device node of /dev/console */
+#define TCGETX		0x5432 /* SYS5 TCGETX compatibility */
+#define TCSETX		0x5433
+#define TCSETXF		0x5434
+#define TCSETXW		0x5435
+#define TIOCSIG		_IOW('T', 0x36, int)  /* pty: generate signal */
+#define TIOCVHANGUP	0x5437
+#define TIOCGPKT	_IOR('T', 0x38, int) /* Get packet mode state */
+#define TIOCGPTLCK	_IOR('T', 0x39, int) /* Get Pty lock state */
+#define TIOCGEXCL	_IOR('T', 0x40, int) /* Get exclusive mode state */
+#define TIOCGPTPEER	_IO('T', 0x41) /* Safely open the slave */
+
+#define FIONCLEX	0x5450
+#define FIOCLEX		0x5451
+#define FIOASYNC	0x5452
+#define TIOCSERCONFIG	0x5453
+#define TIOCSERGWILD	0x5454
+#define TIOCSERSWILD	0x5455
+#define TIOCGLCKTRMIOS	0x5456
+#define TIOCSLCKTRMIOS	0x5457
+#define TIOCSERGSTRUCT	0x5458 /* For debugging only */
+#define TIOCSERGETLSR   0x5459 /* Get line status register */
+#define TIOCSERGETMULTI 0x545A /* Get multiport config  */
+#define TIOCSERSETMULTI 0x545B /* Set multiport config */
+
+#define TIOCMIWAIT	0x545C	/* wait for a change on serial input line(s) */
+#define TIOCGICOUNT	0x545D	/* read serial port inline interrupt counts */
+
+/*
+ * Some arches already define FIOQSIZE due to a historical
+ * conflict with a Hayes modem-specific ioctl value.
+ */
+#ifndef FIOQSIZE
+# define FIOQSIZE	0x5460
+#endif
+
+/* Used for packet mode */
+#define TIOCPKT_DATA		 0
+#define TIOCPKT_FLUSHREAD	 1
+#define TIOCPKT_FLUSHWRITE	 2
+#define TIOCPKT_STOP		 4
+#define TIOCPKT_START		 8
+#define TIOCPKT_NOSTOP		16
+#define TIOCPKT_DOSTOP		32
+#define TIOCPKT_IOCTL		64
+
+#define TIOCSER_TEMT	0x01	/* Transmitter physically empty */
+
+#endif /* __ASM_GENERIC_IOCTLS_H */
diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h
new file mode 100644
index 000000000000..101593ab10ac
--- /dev/null
+++ b/tools/include/uapi/drm/drm.h
@@ -0,0 +1,933 @@
+/**
+ * \file drm.h
+ * Header for the Direct Rendering Manager
+ *
+ * \author Rickard E. (Rik) Faith <faith@valinux.com>
+ *
+ * \par Acknowledgments:
+ * Dec 1999, Richard Henderson <rth@twiddle.net>, move to generic \c cmpxchg.
+ */
+
+/*
+ * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas.
+ * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DRM_H_
+#define _DRM_H_
+
+#if defined(__KERNEL__)
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+typedef unsigned int drm_handle_t;
+
+#elif defined(__linux__)
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+typedef unsigned int drm_handle_t;
+
+#else /* One of the BSDs */
+
+#include <sys/ioccom.h>
+#include <sys/types.h>
+typedef int8_t   __s8;
+typedef uint8_t  __u8;
+typedef int16_t  __s16;
+typedef uint16_t __u16;
+typedef int32_t  __s32;
+typedef uint32_t __u32;
+typedef int64_t  __s64;
+typedef uint64_t __u64;
+typedef size_t   __kernel_size_t;
+typedef unsigned long drm_handle_t;
+
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_NAME	"drm"	  /**< Name in kernel, /dev, and /proc */
+#define DRM_MIN_ORDER	5	  /**< At least 2^5 bytes = 32 bytes */
+#define DRM_MAX_ORDER	22	  /**< Up to 2^22 bytes = 4MB */
+#define DRM_RAM_PERCENT 10	  /**< How much system ram can we lock? */
+
+#define _DRM_LOCK_HELD	0x80000000U /**< Hardware lock is held */
+#define _DRM_LOCK_CONT	0x40000000U /**< Hardware lock is contended */
+#define _DRM_LOCK_IS_HELD(lock)	   ((lock) & _DRM_LOCK_HELD)
+#define _DRM_LOCK_IS_CONT(lock)	   ((lock) & _DRM_LOCK_CONT)
+#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT))
+
+typedef unsigned int drm_context_t;
+typedef unsigned int drm_drawable_t;
+typedef unsigned int drm_magic_t;
+
+/**
+ * Cliprect.
+ *
+ * \warning: If you change this structure, make sure you change
+ * XF86DRIClipRectRec in the server as well
+ *
+ * \note KW: Actually it's illegal to change either for
+ * backwards-compatibility reasons.
+ */
+struct drm_clip_rect {
+	unsigned short x1;
+	unsigned short y1;
+	unsigned short x2;
+	unsigned short y2;
+};
+
+/**
+ * Drawable information.
+ */
+struct drm_drawable_info {
+	unsigned int num_rects;
+	struct drm_clip_rect *rects;
+};
+
+/**
+ * Texture region.
+ */
+struct drm_tex_region {
+	unsigned char next;
+	unsigned char prev;
+	unsigned char in_use;
+	unsigned char padding;
+	unsigned int age;
+};
+
+/**
+ * Hardware lock.
+ *
+ * The lock structure is a simple cache-line aligned integer.  To avoid
+ * processor bus contention on a multiprocessor system, there should not be any
+ * other data stored in the same cache line.
+ */
+struct drm_hw_lock {
+	__volatile__ unsigned int lock;		/**< lock variable */
+	char padding[60];			/**< Pad to cache line */
+};
+
+/**
+ * DRM_IOCTL_VERSION ioctl argument type.
+ *
+ * \sa drmGetVersion().
+ */
+struct drm_version {
+	int version_major;	  /**< Major version */
+	int version_minor;	  /**< Minor version */
+	int version_patchlevel;	  /**< Patch level */
+	__kernel_size_t name_len;	  /**< Length of name buffer */
+	char __user *name;	  /**< Name of driver */
+	__kernel_size_t date_len;	  /**< Length of date buffer */
+	char __user *date;	  /**< User-space buffer to hold date */
+	__kernel_size_t desc_len;	  /**< Length of desc buffer */
+	char __user *desc;	  /**< User-space buffer to hold desc */
+};
+
+/**
+ * DRM_IOCTL_GET_UNIQUE ioctl argument type.
+ *
+ * \sa drmGetBusid() and drmSetBusId().
+ */
+struct drm_unique {
+	__kernel_size_t unique_len;	  /**< Length of unique */
+	char __user *unique;	  /**< Unique name for driver instantiation */
+};
+
+struct drm_list {
+	int count;		  /**< Length of user-space structures */
+	struct drm_version __user *version;
+};
+
+struct drm_block {
+	int unused;
+};
+
+/**
+ * DRM_IOCTL_CONTROL ioctl argument type.
+ *
+ * \sa drmCtlInstHandler() and drmCtlUninstHandler().
+ */
+struct drm_control {
+	enum {
+		DRM_ADD_COMMAND,
+		DRM_RM_COMMAND,
+		DRM_INST_HANDLER,
+		DRM_UNINST_HANDLER
+	} func;
+	int irq;
+};
+
+/**
+ * Type of memory to map.
+ */
+enum drm_map_type {
+	_DRM_FRAME_BUFFER = 0,	  /**< WC (no caching), no core dump */
+	_DRM_REGISTERS = 1,	  /**< no caching, no core dump */
+	_DRM_SHM = 2,		  /**< shared, cached */
+	_DRM_AGP = 3,		  /**< AGP/GART */
+	_DRM_SCATTER_GATHER = 4,  /**< Scatter/gather memory for PCI DMA */
+	_DRM_CONSISTENT = 5	  /**< Consistent memory for PCI DMA */
+};
+
+/**
+ * Memory mapping flags.
+ */
+enum drm_map_flags {
+	_DRM_RESTRICTED = 0x01,	     /**< Cannot be mapped to user-virtual */
+	_DRM_READ_ONLY = 0x02,
+	_DRM_LOCKED = 0x04,	     /**< shared, cached, locked */
+	_DRM_KERNEL = 0x08,	     /**< kernel requires access */
+	_DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */
+	_DRM_CONTAINS_LOCK = 0x20,   /**< SHM page that contains lock */
+	_DRM_REMOVABLE = 0x40,	     /**< Removable mapping */
+	_DRM_DRIVER = 0x80	     /**< Managed by driver */
+};
+
+struct drm_ctx_priv_map {
+	unsigned int ctx_id;	 /**< Context requesting private mapping */
+	void *handle;		 /**< Handle of map */
+};
+
+/**
+ * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls
+ * argument type.
+ *
+ * \sa drmAddMap().
+ */
+struct drm_map {
+	unsigned long offset;	 /**< Requested physical address (0 for SAREA)*/
+	unsigned long size;	 /**< Requested physical size (bytes) */
+	enum drm_map_type type;	 /**< Type of memory to map */
+	enum drm_map_flags flags;	 /**< Flags */
+	void *handle;		 /**< User-space: "Handle" to pass to mmap() */
+				 /**< Kernel-space: kernel-virtual address */
+	int mtrr;		 /**< MTRR slot used */
+	/*   Private data */
+};
+
+/**
+ * DRM_IOCTL_GET_CLIENT ioctl argument type.
+ */
+struct drm_client {
+	int idx;		/**< Which client desired? */
+	int auth;		/**< Is client authenticated? */
+	unsigned long pid;	/**< Process ID */
+	unsigned long uid;	/**< User ID */
+	unsigned long magic;	/**< Magic */
+	unsigned long iocs;	/**< Ioctl count */
+};
+
+enum drm_stat_type {
+	_DRM_STAT_LOCK,
+	_DRM_STAT_OPENS,
+	_DRM_STAT_CLOSES,
+	_DRM_STAT_IOCTLS,
+	_DRM_STAT_LOCKS,
+	_DRM_STAT_UNLOCKS,
+	_DRM_STAT_VALUE,	/**< Generic value */
+	_DRM_STAT_BYTE,		/**< Generic byte counter (1024bytes/K) */
+	_DRM_STAT_COUNT,	/**< Generic non-byte counter (1000/k) */
+
+	_DRM_STAT_IRQ,		/**< IRQ */
+	_DRM_STAT_PRIMARY,	/**< Primary DMA bytes */
+	_DRM_STAT_SECONDARY,	/**< Secondary DMA bytes */
+	_DRM_STAT_DMA,		/**< DMA */
+	_DRM_STAT_SPECIAL,	/**< Special DMA (e.g., priority or polled) */
+	_DRM_STAT_MISSED	/**< Missed DMA opportunity */
+	    /* Add to the *END* of the list */
+};
+
+/**
+ * DRM_IOCTL_GET_STATS ioctl argument type.
+ */
+struct drm_stats {
+	unsigned long count;
+	struct {
+		unsigned long value;
+		enum drm_stat_type type;
+	} data[15];
+};
+
+/**
+ * Hardware locking flags.
+ */
+enum drm_lock_flags {
+	_DRM_LOCK_READY = 0x01,	     /**< Wait until hardware is ready for DMA */
+	_DRM_LOCK_QUIESCENT = 0x02,  /**< Wait until hardware quiescent */
+	_DRM_LOCK_FLUSH = 0x04,	     /**< Flush this context's DMA queue first */
+	_DRM_LOCK_FLUSH_ALL = 0x08,  /**< Flush all DMA queues first */
+	/* These *HALT* flags aren't supported yet
+	   -- they will be used to support the
+	   full-screen DGA-like mode. */
+	_DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */
+	_DRM_HALT_CUR_QUEUES = 0x20  /**< Halt all current queues */
+};
+
+/**
+ * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type.
+ *
+ * \sa drmGetLock() and drmUnlock().
+ */
+struct drm_lock {
+	int context;
+	enum drm_lock_flags flags;
+};
+
+/**
+ * DMA flags
+ *
+ * \warning
+ * These values \e must match xf86drm.h.
+ *
+ * \sa drm_dma.
+ */
+enum drm_dma_flags {
+	/* Flags for DMA buffer dispatch */
+	_DRM_DMA_BLOCK = 0x01,	      /**<
+				       * Block until buffer dispatched.
+				       *
+				       * \note The buffer may not yet have
+				       * been processed by the hardware --
+				       * getting a hardware lock with the
+				       * hardware quiescent will ensure
+				       * that the buffer has been
+				       * processed.
+				       */
+	_DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */
+	_DRM_DMA_PRIORITY = 0x04,     /**< High priority dispatch */
+
+	/* Flags for DMA buffer request */
+	_DRM_DMA_WAIT = 0x10,	      /**< Wait for free buffers */
+	_DRM_DMA_SMALLER_OK = 0x20,   /**< Smaller-than-requested buffers OK */
+	_DRM_DMA_LARGER_OK = 0x40     /**< Larger-than-requested buffers OK */
+};
+
+/**
+ * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type.
+ *
+ * \sa drmAddBufs().
+ */
+struct drm_buf_desc {
+	int count;		 /**< Number of buffers of this size */
+	int size;		 /**< Size in bytes */
+	int low_mark;		 /**< Low water mark */
+	int high_mark;		 /**< High water mark */
+	enum {
+		_DRM_PAGE_ALIGN = 0x01,	/**< Align on page boundaries for DMA */
+		_DRM_AGP_BUFFER = 0x02,	/**< Buffer is in AGP space */
+		_DRM_SG_BUFFER = 0x04,	/**< Scatter/gather memory buffer */
+		_DRM_FB_BUFFER = 0x08,	/**< Buffer is in frame buffer */
+		_DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */
+	} flags;
+	unsigned long agp_start; /**<
+				  * Start address of where the AGP buffers are
+				  * in the AGP aperture
+				  */
+};
+
+/**
+ * DRM_IOCTL_INFO_BUFS ioctl argument type.
+ */
+struct drm_buf_info {
+	int count;		/**< Entries in list */
+	struct drm_buf_desc __user *list;
+};
+
+/**
+ * DRM_IOCTL_FREE_BUFS ioctl argument type.
+ */
+struct drm_buf_free {
+	int count;
+	int __user *list;
+};
+
+/**
+ * Buffer information
+ *
+ * \sa drm_buf_map.
+ */
+struct drm_buf_pub {
+	int idx;		       /**< Index into the master buffer list */
+	int total;		       /**< Buffer size */
+	int used;		       /**< Amount of buffer in use (for DMA) */
+	void __user *address;	       /**< Address of buffer */
+};
+
+/**
+ * DRM_IOCTL_MAP_BUFS ioctl argument type.
+ */
+struct drm_buf_map {
+	int count;		/**< Length of the buffer list */
+#ifdef __cplusplus
+	void __user *virt;
+#else
+	void __user *virtual;		/**< Mmap'd area in user-virtual */
+#endif
+	struct drm_buf_pub __user *list;	/**< Buffer information */
+};
+
+/**
+ * DRM_IOCTL_DMA ioctl argument type.
+ *
+ * Indices here refer to the offset into the buffer list in drm_buf_get.
+ *
+ * \sa drmDMA().
+ */
+struct drm_dma {
+	int context;			  /**< Context handle */
+	int send_count;			  /**< Number of buffers to send */
+	int __user *send_indices;	  /**< List of handles to buffers */
+	int __user *send_sizes;		  /**< Lengths of data to send */
+	enum drm_dma_flags flags;	  /**< Flags */
+	int request_count;		  /**< Number of buffers requested */
+	int request_size;		  /**< Desired size for buffers */
+	int __user *request_indices;	  /**< Buffer information */
+	int __user *request_sizes;
+	int granted_count;		  /**< Number of buffers granted */
+};
+
+enum drm_ctx_flags {
+	_DRM_CONTEXT_PRESERVED = 0x01,
+	_DRM_CONTEXT_2DONLY = 0x02
+};
+
+/**
+ * DRM_IOCTL_ADD_CTX ioctl argument type.
+ *
+ * \sa drmCreateContext() and drmDestroyContext().
+ */
+struct drm_ctx {
+	drm_context_t handle;
+	enum drm_ctx_flags flags;
+};
+
+/**
+ * DRM_IOCTL_RES_CTX ioctl argument type.
+ */
+struct drm_ctx_res {
+	int count;
+	struct drm_ctx __user *contexts;
+};
+
+/**
+ * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type.
+ */
+struct drm_draw {
+	drm_drawable_t handle;
+};
+
+/**
+ * DRM_IOCTL_UPDATE_DRAW ioctl argument type.
+ */
+typedef enum {
+	DRM_DRAWABLE_CLIPRECTS
+} drm_drawable_info_type_t;
+
+struct drm_update_draw {
+	drm_drawable_t handle;
+	unsigned int type;
+	unsigned int num;
+	unsigned long long data;
+};
+
+/**
+ * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type.
+ */
+struct drm_auth {
+	drm_magic_t magic;
+};
+
+/**
+ * DRM_IOCTL_IRQ_BUSID ioctl argument type.
+ *
+ * \sa drmGetInterruptFromBusID().
+ */
+struct drm_irq_busid {
+	int irq;	/**< IRQ number */
+	int busnum;	/**< bus number */
+	int devnum;	/**< device number */
+	int funcnum;	/**< function number */
+};
+
+enum drm_vblank_seq_type {
+	_DRM_VBLANK_ABSOLUTE = 0x0,	/**< Wait for specific vblank sequence number */
+	_DRM_VBLANK_RELATIVE = 0x1,	/**< Wait for given number of vblanks */
+	/* bits 1-5 are reserved for high crtcs */
+	_DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e,
+	_DRM_VBLANK_EVENT = 0x4000000,   /**< Send event instead of blocking */
+	_DRM_VBLANK_FLIP = 0x8000000,   /**< Scheduled buffer swap should flip */
+	_DRM_VBLANK_NEXTONMISS = 0x10000000,	/**< If missed, wait for next vblank */
+	_DRM_VBLANK_SECONDARY = 0x20000000,	/**< Secondary display controller */
+	_DRM_VBLANK_SIGNAL = 0x40000000	/**< Send signal instead of blocking, unsupported */
+};
+#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1
+
+#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE)
+#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \
+				_DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS)
+
+struct drm_wait_vblank_request {
+	enum drm_vblank_seq_type type;
+	unsigned int sequence;
+	unsigned long signal;
+};
+
+struct drm_wait_vblank_reply {
+	enum drm_vblank_seq_type type;
+	unsigned int sequence;
+	long tval_sec;
+	long tval_usec;
+};
+
+/**
+ * DRM_IOCTL_WAIT_VBLANK ioctl argument type.
+ *
+ * \sa drmWaitVBlank().
+ */
+union drm_wait_vblank {
+	struct drm_wait_vblank_request request;
+	struct drm_wait_vblank_reply reply;
+};
+
+#define _DRM_PRE_MODESET 1
+#define _DRM_POST_MODESET 2
+
+/**
+ * DRM_IOCTL_MODESET_CTL ioctl argument type
+ *
+ * \sa drmModesetCtl().
+ */
+struct drm_modeset_ctl {
+	__u32 crtc;
+	__u32 cmd;
+};
+
+/**
+ * DRM_IOCTL_AGP_ENABLE ioctl argument type.
+ *
+ * \sa drmAgpEnable().
+ */
+struct drm_agp_mode {
+	unsigned long mode;	/**< AGP mode */
+};
+
+/**
+ * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type.
+ *
+ * \sa drmAgpAlloc() and drmAgpFree().
+ */
+struct drm_agp_buffer {
+	unsigned long size;	/**< In bytes -- will round to page boundary */
+	unsigned long handle;	/**< Used for binding / unbinding */
+	unsigned long type;	/**< Type of memory to allocate */
+	unsigned long physical;	/**< Physical used by i810 */
+};
+
+/**
+ * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type.
+ *
+ * \sa drmAgpBind() and drmAgpUnbind().
+ */
+struct drm_agp_binding {
+	unsigned long handle;	/**< From drm_agp_buffer */
+	unsigned long offset;	/**< In bytes -- will round to page boundary */
+};
+
+/**
+ * DRM_IOCTL_AGP_INFO ioctl argument type.
+ *
+ * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(),
+ * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(),
+ * drmAgpVendorId() and drmAgpDeviceId().
+ */
+struct drm_agp_info {
+	int agp_version_major;
+	int agp_version_minor;
+	unsigned long mode;
+	unsigned long aperture_base;	/* physical address */
+	unsigned long aperture_size;	/* bytes */
+	unsigned long memory_allowed;	/* bytes */
+	unsigned long memory_used;
+
+	/* PCI information */
+	unsigned short id_vendor;
+	unsigned short id_device;
+};
+
+/**
+ * DRM_IOCTL_SG_ALLOC ioctl argument type.
+ */
+struct drm_scatter_gather {
+	unsigned long size;	/**< In bytes -- will round to page boundary */
+	unsigned long handle;	/**< Used for mapping / unmapping */
+};
+
+/**
+ * DRM_IOCTL_SET_VERSION ioctl argument type.
+ */
+struct drm_set_version {
+	int drm_di_major;
+	int drm_di_minor;
+	int drm_dd_major;
+	int drm_dd_minor;
+};
+
+/** DRM_IOCTL_GEM_CLOSE ioctl argument type */
+struct drm_gem_close {
+	/** Handle of the object to be closed. */
+	__u32 handle;
+	__u32 pad;
+};
+
+/** DRM_IOCTL_GEM_FLINK ioctl argument type */
+struct drm_gem_flink {
+	/** Handle for the object being named */
+	__u32 handle;
+
+	/** Returned global name */
+	__u32 name;
+};
+
+/** DRM_IOCTL_GEM_OPEN ioctl argument type */
+struct drm_gem_open {
+	/** Name of object being opened */
+	__u32 name;
+
+	/** Returned handle for the object */
+	__u32 handle;
+
+	/** Returned size of the object */
+	__u64 size;
+};
+
+#define DRM_CAP_DUMB_BUFFER		0x1
+#define DRM_CAP_VBLANK_HIGH_CRTC	0x2
+#define DRM_CAP_DUMB_PREFERRED_DEPTH	0x3
+#define DRM_CAP_DUMB_PREFER_SHADOW	0x4
+#define DRM_CAP_PRIME			0x5
+#define  DRM_PRIME_CAP_IMPORT		0x1
+#define  DRM_PRIME_CAP_EXPORT		0x2
+#define DRM_CAP_TIMESTAMP_MONOTONIC	0x6
+#define DRM_CAP_ASYNC_PAGE_FLIP		0x7
+/*
+ * The CURSOR_WIDTH and CURSOR_HEIGHT capabilities return a valid widthxheight
+ * combination for the hardware cursor. The intention is that a hardware
+ * agnostic userspace can query a cursor plane size to use.
+ *
+ * Note that the cross-driver contract is to merely return a valid size;
+ * drivers are free to attach another meaning on top, eg. i915 returns the
+ * maximum plane size.
+ */
+#define DRM_CAP_CURSOR_WIDTH		0x8
+#define DRM_CAP_CURSOR_HEIGHT		0x9
+#define DRM_CAP_ADDFB2_MODIFIERS	0x10
+#define DRM_CAP_PAGE_FLIP_TARGET	0x11
+#define DRM_CAP_CRTC_IN_VBLANK_EVENT	0x12
+#define DRM_CAP_SYNCOBJ		0x13
+
+/** DRM_IOCTL_GET_CAP ioctl argument type */
+struct drm_get_cap {
+	__u64 capability;
+	__u64 value;
+};
+
+/**
+ * DRM_CLIENT_CAP_STEREO_3D
+ *
+ * if set to 1, the DRM core will expose the stereo 3D capabilities of the
+ * monitor by advertising the supported 3D layouts in the flags of struct
+ * drm_mode_modeinfo.
+ */
+#define DRM_CLIENT_CAP_STEREO_3D	1
+
+/**
+ * DRM_CLIENT_CAP_UNIVERSAL_PLANES
+ *
+ * If set to 1, the DRM core will expose all planes (overlay, primary, and
+ * cursor) to userspace.
+ */
+#define DRM_CLIENT_CAP_UNIVERSAL_PLANES  2
+
+/**
+ * DRM_CLIENT_CAP_ATOMIC
+ *
+ * If set to 1, the DRM core will expose atomic properties to userspace
+ */
+#define DRM_CLIENT_CAP_ATOMIC	3
+
+/** DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */
+struct drm_set_client_cap {
+	__u64 capability;
+	__u64 value;
+};
+
+#define DRM_RDWR O_RDWR
+#define DRM_CLOEXEC O_CLOEXEC
+struct drm_prime_handle {
+	__u32 handle;
+
+	/** Flags.. only applicable for handle->fd */
+	__u32 flags;
+
+	/** Returned dmabuf file descriptor */
+	__s32 fd;
+};
+
+struct drm_syncobj_create {
+	__u32 handle;
+	__u32 flags;
+};
+
+struct drm_syncobj_destroy {
+	__u32 handle;
+	__u32 pad;
+};
+
+#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0)
+#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0)
+struct drm_syncobj_handle {
+	__u32 handle;
+	__u32 flags;
+
+	__s32 fd;
+	__u32 pad;
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#include "drm_mode.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#define DRM_IOCTL_BASE			'd'
+#define DRM_IO(nr)			_IO(DRM_IOCTL_BASE,nr)
+#define DRM_IOR(nr,type)		_IOR(DRM_IOCTL_BASE,nr,type)
+#define DRM_IOW(nr,type)		_IOW(DRM_IOCTL_BASE,nr,type)
+#define DRM_IOWR(nr,type)		_IOWR(DRM_IOCTL_BASE,nr,type)
+
+#define DRM_IOCTL_VERSION		DRM_IOWR(0x00, struct drm_version)
+#define DRM_IOCTL_GET_UNIQUE		DRM_IOWR(0x01, struct drm_unique)
+#define DRM_IOCTL_GET_MAGIC		DRM_IOR( 0x02, struct drm_auth)
+#define DRM_IOCTL_IRQ_BUSID		DRM_IOWR(0x03, struct drm_irq_busid)
+#define DRM_IOCTL_GET_MAP               DRM_IOWR(0x04, struct drm_map)
+#define DRM_IOCTL_GET_CLIENT            DRM_IOWR(0x05, struct drm_client)
+#define DRM_IOCTL_GET_STATS             DRM_IOR( 0x06, struct drm_stats)
+#define DRM_IOCTL_SET_VERSION		DRM_IOWR(0x07, struct drm_set_version)
+#define DRM_IOCTL_MODESET_CTL           DRM_IOW(0x08, struct drm_modeset_ctl)
+#define DRM_IOCTL_GEM_CLOSE		DRM_IOW (0x09, struct drm_gem_close)
+#define DRM_IOCTL_GEM_FLINK		DRM_IOWR(0x0a, struct drm_gem_flink)
+#define DRM_IOCTL_GEM_OPEN		DRM_IOWR(0x0b, struct drm_gem_open)
+#define DRM_IOCTL_GET_CAP		DRM_IOWR(0x0c, struct drm_get_cap)
+#define DRM_IOCTL_SET_CLIENT_CAP	DRM_IOW( 0x0d, struct drm_set_client_cap)
+
+#define DRM_IOCTL_SET_UNIQUE		DRM_IOW( 0x10, struct drm_unique)
+#define DRM_IOCTL_AUTH_MAGIC		DRM_IOW( 0x11, struct drm_auth)
+#define DRM_IOCTL_BLOCK			DRM_IOWR(0x12, struct drm_block)
+#define DRM_IOCTL_UNBLOCK		DRM_IOWR(0x13, struct drm_block)
+#define DRM_IOCTL_CONTROL		DRM_IOW( 0x14, struct drm_control)
+#define DRM_IOCTL_ADD_MAP		DRM_IOWR(0x15, struct drm_map)
+#define DRM_IOCTL_ADD_BUFS		DRM_IOWR(0x16, struct drm_buf_desc)
+#define DRM_IOCTL_MARK_BUFS		DRM_IOW( 0x17, struct drm_buf_desc)
+#define DRM_IOCTL_INFO_BUFS		DRM_IOWR(0x18, struct drm_buf_info)
+#define DRM_IOCTL_MAP_BUFS		DRM_IOWR(0x19, struct drm_buf_map)
+#define DRM_IOCTL_FREE_BUFS		DRM_IOW( 0x1a, struct drm_buf_free)
+
+#define DRM_IOCTL_RM_MAP		DRM_IOW( 0x1b, struct drm_map)
+
+#define DRM_IOCTL_SET_SAREA_CTX		DRM_IOW( 0x1c, struct drm_ctx_priv_map)
+#define DRM_IOCTL_GET_SAREA_CTX 	DRM_IOWR(0x1d, struct drm_ctx_priv_map)
+
+#define DRM_IOCTL_SET_MASTER            DRM_IO(0x1e)
+#define DRM_IOCTL_DROP_MASTER           DRM_IO(0x1f)
+
+#define DRM_IOCTL_ADD_CTX		DRM_IOWR(0x20, struct drm_ctx)
+#define DRM_IOCTL_RM_CTX		DRM_IOWR(0x21, struct drm_ctx)
+#define DRM_IOCTL_MOD_CTX		DRM_IOW( 0x22, struct drm_ctx)
+#define DRM_IOCTL_GET_CTX		DRM_IOWR(0x23, struct drm_ctx)
+#define DRM_IOCTL_SWITCH_CTX		DRM_IOW( 0x24, struct drm_ctx)
+#define DRM_IOCTL_NEW_CTX		DRM_IOW( 0x25, struct drm_ctx)
+#define DRM_IOCTL_RES_CTX		DRM_IOWR(0x26, struct drm_ctx_res)
+#define DRM_IOCTL_ADD_DRAW		DRM_IOWR(0x27, struct drm_draw)
+#define DRM_IOCTL_RM_DRAW		DRM_IOWR(0x28, struct drm_draw)
+#define DRM_IOCTL_DMA			DRM_IOWR(0x29, struct drm_dma)
+#define DRM_IOCTL_LOCK			DRM_IOW( 0x2a, struct drm_lock)
+#define DRM_IOCTL_UNLOCK		DRM_IOW( 0x2b, struct drm_lock)
+#define DRM_IOCTL_FINISH		DRM_IOW( 0x2c, struct drm_lock)
+
+#define DRM_IOCTL_PRIME_HANDLE_TO_FD    DRM_IOWR(0x2d, struct drm_prime_handle)
+#define DRM_IOCTL_PRIME_FD_TO_HANDLE    DRM_IOWR(0x2e, struct drm_prime_handle)
+
+#define DRM_IOCTL_AGP_ACQUIRE		DRM_IO(  0x30)
+#define DRM_IOCTL_AGP_RELEASE		DRM_IO(  0x31)
+#define DRM_IOCTL_AGP_ENABLE		DRM_IOW( 0x32, struct drm_agp_mode)
+#define DRM_IOCTL_AGP_INFO		DRM_IOR( 0x33, struct drm_agp_info)
+#define DRM_IOCTL_AGP_ALLOC		DRM_IOWR(0x34, struct drm_agp_buffer)
+#define DRM_IOCTL_AGP_FREE		DRM_IOW( 0x35, struct drm_agp_buffer)
+#define DRM_IOCTL_AGP_BIND		DRM_IOW( 0x36, struct drm_agp_binding)
+#define DRM_IOCTL_AGP_UNBIND		DRM_IOW( 0x37, struct drm_agp_binding)
+
+#define DRM_IOCTL_SG_ALLOC		DRM_IOWR(0x38, struct drm_scatter_gather)
+#define DRM_IOCTL_SG_FREE		DRM_IOW( 0x39, struct drm_scatter_gather)
+
+#define DRM_IOCTL_WAIT_VBLANK		DRM_IOWR(0x3a, union drm_wait_vblank)
+
+#define DRM_IOCTL_UPDATE_DRAW		DRM_IOW(0x3f, struct drm_update_draw)
+
+#define DRM_IOCTL_MODE_GETRESOURCES	DRM_IOWR(0xA0, struct drm_mode_card_res)
+#define DRM_IOCTL_MODE_GETCRTC		DRM_IOWR(0xA1, struct drm_mode_crtc)
+#define DRM_IOCTL_MODE_SETCRTC		DRM_IOWR(0xA2, struct drm_mode_crtc)
+#define DRM_IOCTL_MODE_CURSOR		DRM_IOWR(0xA3, struct drm_mode_cursor)
+#define DRM_IOCTL_MODE_GETGAMMA		DRM_IOWR(0xA4, struct drm_mode_crtc_lut)
+#define DRM_IOCTL_MODE_SETGAMMA		DRM_IOWR(0xA5, struct drm_mode_crtc_lut)
+#define DRM_IOCTL_MODE_GETENCODER	DRM_IOWR(0xA6, struct drm_mode_get_encoder)
+#define DRM_IOCTL_MODE_GETCONNECTOR	DRM_IOWR(0xA7, struct drm_mode_get_connector)
+#define DRM_IOCTL_MODE_ATTACHMODE	DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */
+#define DRM_IOCTL_MODE_DETACHMODE	DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */
+
+#define DRM_IOCTL_MODE_GETPROPERTY	DRM_IOWR(0xAA, struct drm_mode_get_property)
+#define DRM_IOCTL_MODE_SETPROPERTY	DRM_IOWR(0xAB, struct drm_mode_connector_set_property)
+#define DRM_IOCTL_MODE_GETPROPBLOB	DRM_IOWR(0xAC, struct drm_mode_get_blob)
+#define DRM_IOCTL_MODE_GETFB		DRM_IOWR(0xAD, struct drm_mode_fb_cmd)
+#define DRM_IOCTL_MODE_ADDFB		DRM_IOWR(0xAE, struct drm_mode_fb_cmd)
+#define DRM_IOCTL_MODE_RMFB		DRM_IOWR(0xAF, unsigned int)
+#define DRM_IOCTL_MODE_PAGE_FLIP	DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip)
+#define DRM_IOCTL_MODE_DIRTYFB		DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd)
+
+#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb)
+#define DRM_IOCTL_MODE_MAP_DUMB    DRM_IOWR(0xB3, struct drm_mode_map_dumb)
+#define DRM_IOCTL_MODE_DESTROY_DUMB    DRM_IOWR(0xB4, struct drm_mode_destroy_dumb)
+#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res)
+#define DRM_IOCTL_MODE_GETPLANE	DRM_IOWR(0xB6, struct drm_mode_get_plane)
+#define DRM_IOCTL_MODE_SETPLANE	DRM_IOWR(0xB7, struct drm_mode_set_plane)
+#define DRM_IOCTL_MODE_ADDFB2		DRM_IOWR(0xB8, struct drm_mode_fb_cmd2)
+#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES	DRM_IOWR(0xB9, struct drm_mode_obj_get_properties)
+#define DRM_IOCTL_MODE_OBJ_SETPROPERTY	DRM_IOWR(0xBA, struct drm_mode_obj_set_property)
+#define DRM_IOCTL_MODE_CURSOR2		DRM_IOWR(0xBB, struct drm_mode_cursor2)
+#define DRM_IOCTL_MODE_ATOMIC		DRM_IOWR(0xBC, struct drm_mode_atomic)
+#define DRM_IOCTL_MODE_CREATEPROPBLOB	DRM_IOWR(0xBD, struct drm_mode_create_blob)
+#define DRM_IOCTL_MODE_DESTROYPROPBLOB	DRM_IOWR(0xBE, struct drm_mode_destroy_blob)
+
+#define DRM_IOCTL_SYNCOBJ_CREATE	DRM_IOWR(0xBF, struct drm_syncobj_create)
+#define DRM_IOCTL_SYNCOBJ_DESTROY	DRM_IOWR(0xC0, struct drm_syncobj_destroy)
+#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD	DRM_IOWR(0xC1, struct drm_syncobj_handle)
+#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE	DRM_IOWR(0xC2, struct drm_syncobj_handle)
+
+/**
+ * Device specific ioctls should only be in their respective headers
+ * The device specific ioctl range is from 0x40 to 0x9f.
+ * Generic IOCTLS restart at 0xA0.
+ *
+ * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and
+ * drmCommandReadWrite().
+ */
+#define DRM_COMMAND_BASE                0x40
+#define DRM_COMMAND_END			0xA0
+
+/**
+ * Header for events written back to userspace on the drm fd.  The
+ * type defines the type of event, the length specifies the total
+ * length of the event (including the header), and user_data is
+ * typically a 64 bit value passed with the ioctl that triggered the
+ * event.  A read on the drm fd will always only return complete
+ * events, that is, if for example the read buffer is 100 bytes, and
+ * there are two 64 byte events pending, only one will be returned.
+ *
+ * Event types 0 - 0x7fffffff are generic drm events, 0x80000000 and
+ * up are chipset specific.
+ */
+struct drm_event {
+	__u32 type;
+	__u32 length;
+};
+
+#define DRM_EVENT_VBLANK 0x01
+#define DRM_EVENT_FLIP_COMPLETE 0x02
+
+struct drm_event_vblank {
+	struct drm_event base;
+	__u64 user_data;
+	__u32 tv_sec;
+	__u32 tv_usec;
+	__u32 sequence;
+	__u32 crtc_id; /* 0 on older kernels that do not support this */
+};
+
+/* typedef area */
+#ifndef __KERNEL__
+typedef struct drm_clip_rect drm_clip_rect_t;
+typedef struct drm_drawable_info drm_drawable_info_t;
+typedef struct drm_tex_region drm_tex_region_t;
+typedef struct drm_hw_lock drm_hw_lock_t;
+typedef struct drm_version drm_version_t;
+typedef struct drm_unique drm_unique_t;
+typedef struct drm_list drm_list_t;
+typedef struct drm_block drm_block_t;
+typedef struct drm_control drm_control_t;
+typedef enum drm_map_type drm_map_type_t;
+typedef enum drm_map_flags drm_map_flags_t;
+typedef struct drm_ctx_priv_map drm_ctx_priv_map_t;
+typedef struct drm_map drm_map_t;
+typedef struct drm_client drm_client_t;
+typedef enum drm_stat_type drm_stat_type_t;
+typedef struct drm_stats drm_stats_t;
+typedef enum drm_lock_flags drm_lock_flags_t;
+typedef struct drm_lock drm_lock_t;
+typedef enum drm_dma_flags drm_dma_flags_t;
+typedef struct drm_buf_desc drm_buf_desc_t;
+typedef struct drm_buf_info drm_buf_info_t;
+typedef struct drm_buf_free drm_buf_free_t;
+typedef struct drm_buf_pub drm_buf_pub_t;
+typedef struct drm_buf_map drm_buf_map_t;
+typedef struct drm_dma drm_dma_t;
+typedef union drm_wait_vblank drm_wait_vblank_t;
+typedef struct drm_agp_mode drm_agp_mode_t;
+typedef enum drm_ctx_flags drm_ctx_flags_t;
+typedef struct drm_ctx drm_ctx_t;
+typedef struct drm_ctx_res drm_ctx_res_t;
+typedef struct drm_draw drm_draw_t;
+typedef struct drm_update_draw drm_update_draw_t;
+typedef struct drm_auth drm_auth_t;
+typedef struct drm_irq_busid drm_irq_busid_t;
+typedef enum drm_vblank_seq_type drm_vblank_seq_type_t;
+
+typedef struct drm_agp_buffer drm_agp_buffer_t;
+typedef struct drm_agp_binding drm_agp_binding_t;
+typedef struct drm_agp_info drm_agp_info_t;
+typedef struct drm_scatter_gather drm_scatter_gather_t;
+typedef struct drm_set_version drm_set_version_t;
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h
new file mode 100644
index 000000000000..7ccbd6a2bbe0
--- /dev/null
+++ b/tools/include/uapi/drm/i915_drm.h
@@ -0,0 +1,1474 @@
+/*
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef _UAPI_I915_DRM_H_
+#define _UAPI_I915_DRM_H_
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Please note that modifications to all structs defined here are
+ * subject to backwards-compatibility constraints.
+ */
+
+/**
+ * DOC: uevents generated by i915 on its device node
+ *
+ * I915_L3_PARITY_UEVENT - Generated when the driver receives a parity mismatch
+ *	event from the gpu l3 cache. Additional information supplied is ROW,
+ *	BANK, SUBBANK, SLICE of the affected cacheline. Userspace should keep
+ *	track of these events and if a specific cache-line seems to have a
+ *	persistent error remap it with the l3 remapping tool supplied in
+ *	intel-gpu-tools.  The value supplied with the event is always 1.
+ *
+ * I915_ERROR_UEVENT - Generated upon error detection, currently only via
+ *	hangcheck. The error detection event is a good indicator of when things
+ *	began to go badly. The value supplied with the event is a 1 upon error
+ *	detection, and a 0 upon reset completion, signifying no more error
+ *	exists. NOTE: Disabling hangcheck or reset via module parameter will
+ *	cause the related events to not be seen.
+ *
+ * I915_RESET_UEVENT - Event is generated just before an attempt to reset
+ *	the GPU. The value supplied with the event is always 1. NOTE: Disabling
+ *	reset via module parameter will cause this event to not be seen.
+ */
+#define I915_L3_PARITY_UEVENT		"L3_PARITY_ERROR"
+#define I915_ERROR_UEVENT		"ERROR"
+#define I915_RESET_UEVENT		"RESET"
+
+/*
+ * MOCS indexes used for GPU surfaces, defining the cacheability of the
+ * surface data and the coherency for this data wrt. CPU vs. GPU accesses.
+ */
+enum i915_mocs_table_index {
+	/*
+	 * Not cached anywhere, coherency between CPU and GPU accesses is
+	 * guaranteed.
+	 */
+	I915_MOCS_UNCACHED,
+	/*
+	 * Cacheability and coherency controlled by the kernel automatically
+	 * based on the DRM_I915_GEM_SET_CACHING IOCTL setting and the current
+	 * usage of the surface (used for display scanout or not).
+	 */
+	I915_MOCS_PTE,
+	/*
+	 * Cached in all GPU caches available on the platform.
+	 * Coherency between CPU and GPU accesses to the surface is not
+	 * guaranteed without extra synchronization.
+	 */
+	I915_MOCS_CACHED,
+};
+
+/* Each region is a minimum of 16k, and there are at most 255 of them.
+ */
+#define I915_NR_TEX_REGIONS 255	/* table size 2k - maximum due to use
+				 * of chars for next/prev indices */
+#define I915_LOG_MIN_TEX_REGION_SIZE 14
+
+typedef struct _drm_i915_init {
+	enum {
+		I915_INIT_DMA = 0x01,
+		I915_CLEANUP_DMA = 0x02,
+		I915_RESUME_DMA = 0x03
+	} func;
+	unsigned int mmio_offset;
+	int sarea_priv_offset;
+	unsigned int ring_start;
+	unsigned int ring_end;
+	unsigned int ring_size;
+	unsigned int front_offset;
+	unsigned int back_offset;
+	unsigned int depth_offset;
+	unsigned int w;
+	unsigned int h;
+	unsigned int pitch;
+	unsigned int pitch_bits;
+	unsigned int back_pitch;
+	unsigned int depth_pitch;
+	unsigned int cpp;
+	unsigned int chipset;
+} drm_i915_init_t;
+
+typedef struct _drm_i915_sarea {
+	struct drm_tex_region texList[I915_NR_TEX_REGIONS + 1];
+	int last_upload;	/* last time texture was uploaded */
+	int last_enqueue;	/* last time a buffer was enqueued */
+	int last_dispatch;	/* age of the most recently dispatched buffer */
+	int ctxOwner;		/* last context to upload state */
+	int texAge;
+	int pf_enabled;		/* is pageflipping allowed? */
+	int pf_active;
+	int pf_current_page;	/* which buffer is being displayed? */
+	int perf_boxes;		/* performance boxes to be displayed */
+	int width, height;      /* screen size in pixels */
+
+	drm_handle_t front_handle;
+	int front_offset;
+	int front_size;
+
+	drm_handle_t back_handle;
+	int back_offset;
+	int back_size;
+
+	drm_handle_t depth_handle;
+	int depth_offset;
+	int depth_size;
+
+	drm_handle_t tex_handle;
+	int tex_offset;
+	int tex_size;
+	int log_tex_granularity;
+	int pitch;
+	int rotation;           /* 0, 90, 180 or 270 */
+	int rotated_offset;
+	int rotated_size;
+	int rotated_pitch;
+	int virtualX, virtualY;
+
+	unsigned int front_tiled;
+	unsigned int back_tiled;
+	unsigned int depth_tiled;
+	unsigned int rotated_tiled;
+	unsigned int rotated2_tiled;
+
+	int pipeA_x;
+	int pipeA_y;
+	int pipeA_w;
+	int pipeA_h;
+	int pipeB_x;
+	int pipeB_y;
+	int pipeB_w;
+	int pipeB_h;
+
+	/* fill out some space for old userspace triple buffer */
+	drm_handle_t unused_handle;
+	__u32 unused1, unused2, unused3;
+
+	/* buffer object handles for static buffers. May change
+	 * over the lifetime of the client.
+	 */
+	__u32 front_bo_handle;
+	__u32 back_bo_handle;
+	__u32 unused_bo_handle;
+	__u32 depth_bo_handle;
+
+} drm_i915_sarea_t;
+
+/* due to userspace building against these headers we need some compat here */
+#define planeA_x pipeA_x
+#define planeA_y pipeA_y
+#define planeA_w pipeA_w
+#define planeA_h pipeA_h
+#define planeB_x pipeB_x
+#define planeB_y pipeB_y
+#define planeB_w pipeB_w
+#define planeB_h pipeB_h
+
+/* Flags for perf_boxes
+ */
+#define I915_BOX_RING_EMPTY    0x1
+#define I915_BOX_FLIP          0x2
+#define I915_BOX_WAIT          0x4
+#define I915_BOX_TEXTURE_LOAD  0x8
+#define I915_BOX_LOST_CONTEXT  0x10
+
+/*
+ * i915 specific ioctls.
+ *
+ * The device specific ioctl range is [DRM_COMMAND_BASE, DRM_COMMAND_END) ie
+ * [0x40, 0xa0) (a0 is excluded). The numbers below are defined as offset
+ * against DRM_COMMAND_BASE and should be between [0x0, 0x60).
+ */
+#define DRM_I915_INIT		0x00
+#define DRM_I915_FLUSH		0x01
+#define DRM_I915_FLIP		0x02
+#define DRM_I915_BATCHBUFFER	0x03
+#define DRM_I915_IRQ_EMIT	0x04
+#define DRM_I915_IRQ_WAIT	0x05
+#define DRM_I915_GETPARAM	0x06
+#define DRM_I915_SETPARAM	0x07
+#define DRM_I915_ALLOC		0x08
+#define DRM_I915_FREE		0x09
+#define DRM_I915_INIT_HEAP	0x0a
+#define DRM_I915_CMDBUFFER	0x0b
+#define DRM_I915_DESTROY_HEAP	0x0c
+#define DRM_I915_SET_VBLANK_PIPE	0x0d
+#define DRM_I915_GET_VBLANK_PIPE	0x0e
+#define DRM_I915_VBLANK_SWAP	0x0f
+#define DRM_I915_HWS_ADDR	0x11
+#define DRM_I915_GEM_INIT	0x13
+#define DRM_I915_GEM_EXECBUFFER	0x14
+#define DRM_I915_GEM_PIN	0x15
+#define DRM_I915_GEM_UNPIN	0x16
+#define DRM_I915_GEM_BUSY	0x17
+#define DRM_I915_GEM_THROTTLE	0x18
+#define DRM_I915_GEM_ENTERVT	0x19
+#define DRM_I915_GEM_LEAVEVT	0x1a
+#define DRM_I915_GEM_CREATE	0x1b
+#define DRM_I915_GEM_PREAD	0x1c
+#define DRM_I915_GEM_PWRITE	0x1d
+#define DRM_I915_GEM_MMAP	0x1e
+#define DRM_I915_GEM_SET_DOMAIN	0x1f
+#define DRM_I915_GEM_SW_FINISH	0x20
+#define DRM_I915_GEM_SET_TILING	0x21
+#define DRM_I915_GEM_GET_TILING	0x22
+#define DRM_I915_GEM_GET_APERTURE 0x23
+#define DRM_I915_GEM_MMAP_GTT	0x24
+#define DRM_I915_GET_PIPE_FROM_CRTC_ID	0x25
+#define DRM_I915_GEM_MADVISE	0x26
+#define DRM_I915_OVERLAY_PUT_IMAGE	0x27
+#define DRM_I915_OVERLAY_ATTRS	0x28
+#define DRM_I915_GEM_EXECBUFFER2	0x29
+#define DRM_I915_GEM_EXECBUFFER2_WR	DRM_I915_GEM_EXECBUFFER2
+#define DRM_I915_GET_SPRITE_COLORKEY	0x2a
+#define DRM_I915_SET_SPRITE_COLORKEY	0x2b
+#define DRM_I915_GEM_WAIT	0x2c
+#define DRM_I915_GEM_CONTEXT_CREATE	0x2d
+#define DRM_I915_GEM_CONTEXT_DESTROY	0x2e
+#define DRM_I915_GEM_SET_CACHING	0x2f
+#define DRM_I915_GEM_GET_CACHING	0x30
+#define DRM_I915_REG_READ		0x31
+#define DRM_I915_GET_RESET_STATS	0x32
+#define DRM_I915_GEM_USERPTR		0x33
+#define DRM_I915_GEM_CONTEXT_GETPARAM	0x34
+#define DRM_I915_GEM_CONTEXT_SETPARAM	0x35
+#define DRM_I915_PERF_OPEN		0x36
+
+#define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
+#define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
+#define DRM_IOCTL_I915_FLIP		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLIP)
+#define DRM_IOCTL_I915_BATCHBUFFER	DRM_IOW( DRM_COMMAND_BASE + DRM_I915_BATCHBUFFER, drm_i915_batchbuffer_t)
+#define DRM_IOCTL_I915_IRQ_EMIT         DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_IRQ_EMIT, drm_i915_irq_emit_t)
+#define DRM_IOCTL_I915_IRQ_WAIT         DRM_IOW( DRM_COMMAND_BASE + DRM_I915_IRQ_WAIT, drm_i915_irq_wait_t)
+#define DRM_IOCTL_I915_GETPARAM         DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GETPARAM, drm_i915_getparam_t)
+#define DRM_IOCTL_I915_SETPARAM         DRM_IOW( DRM_COMMAND_BASE + DRM_I915_SETPARAM, drm_i915_setparam_t)
+#define DRM_IOCTL_I915_ALLOC            DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_ALLOC, drm_i915_mem_alloc_t)
+#define DRM_IOCTL_I915_FREE             DRM_IOW( DRM_COMMAND_BASE + DRM_I915_FREE, drm_i915_mem_free_t)
+#define DRM_IOCTL_I915_INIT_HEAP        DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT_HEAP, drm_i915_mem_init_heap_t)
+#define DRM_IOCTL_I915_CMDBUFFER	DRM_IOW( DRM_COMMAND_BASE + DRM_I915_CMDBUFFER, drm_i915_cmdbuffer_t)
+#define DRM_IOCTL_I915_DESTROY_HEAP	DRM_IOW( DRM_COMMAND_BASE + DRM_I915_DESTROY_HEAP, drm_i915_mem_destroy_heap_t)
+#define DRM_IOCTL_I915_SET_VBLANK_PIPE	DRM_IOW( DRM_COMMAND_BASE + DRM_I915_SET_VBLANK_PIPE, drm_i915_vblank_pipe_t)
+#define DRM_IOCTL_I915_GET_VBLANK_PIPE	DRM_IOR( DRM_COMMAND_BASE + DRM_I915_GET_VBLANK_PIPE, drm_i915_vblank_pipe_t)
+#define DRM_IOCTL_I915_VBLANK_SWAP	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_VBLANK_SWAP, drm_i915_vblank_swap_t)
+#define DRM_IOCTL_I915_HWS_ADDR		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_HWS_ADDR, struct drm_i915_gem_init)
+#define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2_WR	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2_WR, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
+#define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
+#define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
+#define DRM_IOCTL_I915_GEM_SET_CACHING		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_SET_CACHING, struct drm_i915_gem_caching)
+#define DRM_IOCTL_I915_GEM_GET_CACHING		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_GET_CACHING, struct drm_i915_gem_caching)
+#define DRM_IOCTL_I915_GEM_THROTTLE	DRM_IO ( DRM_COMMAND_BASE + DRM_I915_GEM_THROTTLE)
+#define DRM_IOCTL_I915_GEM_ENTERVT	DRM_IO(DRM_COMMAND_BASE + DRM_I915_GEM_ENTERVT)
+#define DRM_IOCTL_I915_GEM_LEAVEVT	DRM_IO(DRM_COMMAND_BASE + DRM_I915_GEM_LEAVEVT)
+#define DRM_IOCTL_I915_GEM_CREATE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_CREATE, struct drm_i915_gem_create)
+#define DRM_IOCTL_I915_GEM_PREAD	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_PREAD, struct drm_i915_gem_pread)
+#define DRM_IOCTL_I915_GEM_PWRITE	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_PWRITE, struct drm_i915_gem_pwrite)
+#define DRM_IOCTL_I915_GEM_MMAP		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MMAP, struct drm_i915_gem_mmap)
+#define DRM_IOCTL_I915_GEM_MMAP_GTT	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MMAP_GTT, struct drm_i915_gem_mmap_gtt)
+#define DRM_IOCTL_I915_GEM_SET_DOMAIN	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_SET_DOMAIN, struct drm_i915_gem_set_domain)
+#define DRM_IOCTL_I915_GEM_SW_FINISH	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_SW_FINISH, struct drm_i915_gem_sw_finish)
+#define DRM_IOCTL_I915_GEM_SET_TILING	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_SET_TILING, struct drm_i915_gem_set_tiling)
+#define DRM_IOCTL_I915_GEM_GET_TILING	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_GET_TILING, struct drm_i915_gem_get_tiling)
+#define DRM_IOCTL_I915_GEM_GET_APERTURE	DRM_IOR  (DRM_COMMAND_BASE + DRM_I915_GEM_GET_APERTURE, struct drm_i915_gem_get_aperture)
+#define DRM_IOCTL_I915_GET_PIPE_FROM_CRTC_ID DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_PIPE_FROM_CRTC_ID, struct drm_i915_get_pipe_from_crtc_id)
+#define DRM_IOCTL_I915_GEM_MADVISE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MADVISE, struct drm_i915_gem_madvise)
+#define DRM_IOCTL_I915_OVERLAY_PUT_IMAGE	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_OVERLAY_PUT_IMAGE, struct drm_intel_overlay_put_image)
+#define DRM_IOCTL_I915_OVERLAY_ATTRS	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_OVERLAY_ATTRS, struct drm_intel_overlay_attrs)
+#define DRM_IOCTL_I915_SET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_SET_SPRITE_COLORKEY, struct drm_intel_sprite_colorkey)
+#define DRM_IOCTL_I915_GET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_SPRITE_COLORKEY, struct drm_intel_sprite_colorkey)
+#define DRM_IOCTL_I915_GEM_WAIT		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_WAIT, struct drm_i915_gem_wait)
+#define DRM_IOCTL_I915_GEM_CONTEXT_CREATE	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create)
+#define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy)
+#define DRM_IOCTL_I915_REG_READ			DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read)
+#define DRM_IOCTL_I915_GET_RESET_STATS		DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATS, struct drm_i915_reset_stats)
+#define DRM_IOCTL_I915_GEM_USERPTR			DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_USERPTR, struct drm_i915_gem_userptr)
+#define DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_GETPARAM, struct drm_i915_gem_context_param)
+#define DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_SETPARAM, struct drm_i915_gem_context_param)
+#define DRM_IOCTL_I915_PERF_OPEN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_OPEN, struct drm_i915_perf_open_param)
+
+/* Allow drivers to submit batchbuffers directly to hardware, relying
+ * on the security mechanisms provided by hardware.
+ */
+typedef struct drm_i915_batchbuffer {
+	int start;		/* agp offset */
+	int used;		/* nr bytes in use */
+	int DR1;		/* hw flags for GFX_OP_DRAWRECT_INFO */
+	int DR4;		/* window origin for GFX_OP_DRAWRECT_INFO */
+	int num_cliprects;	/* multipass with multiple cliprects? */
+	struct drm_clip_rect __user *cliprects;	/* pointer to userspace cliprects */
+} drm_i915_batchbuffer_t;
+
+/* As above, but pass a pointer to userspace buffer which can be
+ * validated by the kernel prior to sending to hardware.
+ */
+typedef struct _drm_i915_cmdbuffer {
+	char __user *buf;	/* pointer to userspace command buffer */
+	int sz;			/* nr bytes in buf */
+	int DR1;		/* hw flags for GFX_OP_DRAWRECT_INFO */
+	int DR4;		/* window origin for GFX_OP_DRAWRECT_INFO */
+	int num_cliprects;	/* multipass with multiple cliprects? */
+	struct drm_clip_rect __user *cliprects;	/* pointer to userspace cliprects */
+} drm_i915_cmdbuffer_t;
+
+/* Userspace can request & wait on IRQs:
+ */
+typedef struct drm_i915_irq_emit {
+	int __user *irq_seq;
+} drm_i915_irq_emit_t;
+
+typedef struct drm_i915_irq_wait {
+	int irq_seq;
+} drm_i915_irq_wait_t;
+
+/* Ioctl to query kernel params:
+ */
+#define I915_PARAM_IRQ_ACTIVE            1
+#define I915_PARAM_ALLOW_BATCHBUFFER     2
+#define I915_PARAM_LAST_DISPATCH         3
+#define I915_PARAM_CHIPSET_ID            4
+#define I915_PARAM_HAS_GEM               5
+#define I915_PARAM_NUM_FENCES_AVAIL      6
+#define I915_PARAM_HAS_OVERLAY           7
+#define I915_PARAM_HAS_PAGEFLIPPING	 8
+#define I915_PARAM_HAS_EXECBUF2          9
+#define I915_PARAM_HAS_BSD		 10
+#define I915_PARAM_HAS_BLT		 11
+#define I915_PARAM_HAS_RELAXED_FENCING	 12
+#define I915_PARAM_HAS_COHERENT_RINGS	 13
+#define I915_PARAM_HAS_EXEC_CONSTANTS	 14
+#define I915_PARAM_HAS_RELAXED_DELTA	 15
+#define I915_PARAM_HAS_GEN7_SOL_RESET	 16
+#define I915_PARAM_HAS_LLC     	 	 17
+#define I915_PARAM_HAS_ALIASING_PPGTT	 18
+#define I915_PARAM_HAS_WAIT_TIMEOUT	 19
+#define I915_PARAM_HAS_SEMAPHORES	 20
+#define I915_PARAM_HAS_PRIME_VMAP_FLUSH	 21
+#define I915_PARAM_HAS_VEBOX		 22
+#define I915_PARAM_HAS_SECURE_BATCHES	 23
+#define I915_PARAM_HAS_PINNED_BATCHES	 24
+#define I915_PARAM_HAS_EXEC_NO_RELOC	 25
+#define I915_PARAM_HAS_EXEC_HANDLE_LUT   26
+#define I915_PARAM_HAS_WT     	 	 27
+#define I915_PARAM_CMD_PARSER_VERSION	 28
+#define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
+#define I915_PARAM_MMAP_VERSION          30
+#define I915_PARAM_HAS_BSD2		 31
+#define I915_PARAM_REVISION              32
+#define I915_PARAM_SUBSLICE_TOTAL	 33
+#define I915_PARAM_EU_TOTAL		 34
+#define I915_PARAM_HAS_GPU_RESET	 35
+#define I915_PARAM_HAS_RESOURCE_STREAMER 36
+#define I915_PARAM_HAS_EXEC_SOFTPIN	 37
+#define I915_PARAM_HAS_POOLED_EU	 38
+#define I915_PARAM_MIN_EU_IN_POOL	 39
+#define I915_PARAM_MMAP_GTT_VERSION	 40
+
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
+ * priorities and the driver will attempt to execute batches in priority order.
+ */
+#define I915_PARAM_HAS_SCHEDULER	 41
+#define I915_PARAM_HUC_STATUS		 42
+
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to opt-out of
+ * synchronisation with implicit fencing on individual objects.
+ * See EXEC_OBJECT_ASYNC.
+ */
+#define I915_PARAM_HAS_EXEC_ASYNC	 43
+
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports explicit fence support -
+ * both being able to pass in a sync_file fd to wait upon before executing,
+ * and being able to return a new sync_file fd that is signaled when the
+ * current request is complete. See I915_EXEC_FENCE_IN and I915_EXEC_FENCE_OUT.
+ */
+#define I915_PARAM_HAS_EXEC_FENCE	 44
+
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to capture
+ * user specified buffers for post-mortem debugging of GPU hangs. See
+ * EXEC_OBJECT_CAPTURE.
+ */
+#define I915_PARAM_HAS_EXEC_CAPTURE	 45
+
+#define I915_PARAM_SLICE_MASK		 46
+
+/* Assuming it's uniform for each slice, this queries the mask of subslices
+ * per-slice for this system.
+ */
+#define I915_PARAM_SUBSLICE_MASK	 47
+
+/*
+ * Query whether DRM_I915_GEM_EXECBUFFER2 supports supplying the batch buffer
+ * as the first execobject as opposed to the last. See I915_EXEC_BATCH_FIRST.
+ */
+#define I915_PARAM_HAS_EXEC_BATCH_FIRST	 48
+
+typedef struct drm_i915_getparam {
+	__s32 param;
+	/*
+	 * WARNING: Using pointers instead of fixed-size u64 means we need to write
+	 * compat32 code. Don't repeat this mistake.
+	 */
+	int __user *value;
+} drm_i915_getparam_t;
+
+/* Ioctl to set kernel params:
+ */
+#define I915_SETPARAM_USE_MI_BATCHBUFFER_START            1
+#define I915_SETPARAM_TEX_LRU_LOG_GRANULARITY             2
+#define I915_SETPARAM_ALLOW_BATCHBUFFER                   3
+#define I915_SETPARAM_NUM_USED_FENCES                     4
+
+typedef struct drm_i915_setparam {
+	int param;
+	int value;
+} drm_i915_setparam_t;
+
+/* A memory manager for regions of shared memory:
+ */
+#define I915_MEM_REGION_AGP 1
+
+typedef struct drm_i915_mem_alloc {
+	int region;
+	int alignment;
+	int size;
+	int __user *region_offset;	/* offset from start of fb or agp */
+} drm_i915_mem_alloc_t;
+
+typedef struct drm_i915_mem_free {
+	int region;
+	int region_offset;
+} drm_i915_mem_free_t;
+
+typedef struct drm_i915_mem_init_heap {
+	int region;
+	int size;
+	int start;
+} drm_i915_mem_init_heap_t;
+
+/* Allow memory manager to be torn down and re-initialized (eg on
+ * rotate):
+ */
+typedef struct drm_i915_mem_destroy_heap {
+	int region;
+} drm_i915_mem_destroy_heap_t;
+
+/* Allow X server to configure which pipes to monitor for vblank signals
+ */
+#define	DRM_I915_VBLANK_PIPE_A	1
+#define	DRM_I915_VBLANK_PIPE_B	2
+
+typedef struct drm_i915_vblank_pipe {
+	int pipe;
+} drm_i915_vblank_pipe_t;
+
+/* Schedule buffer swap at given vertical blank:
+ */
+typedef struct drm_i915_vblank_swap {
+	drm_drawable_t drawable;
+	enum drm_vblank_seq_type seqtype;
+	unsigned int sequence;
+} drm_i915_vblank_swap_t;
+
+typedef struct drm_i915_hws_addr {	/* NOTE(review): DRM_IOCTL_I915_HWS_ADDR is declared with struct drm_i915_gem_init, not this type - confirm */
+	__u64 addr;
+} drm_i915_hws_addr_t;
+
+struct drm_i915_gem_init {
+	/**
+	 * Beginning offset in the GTT to be managed by the DRM memory
+	 * manager.
+	 */
+	__u64 gtt_start;
+	/**
+	 * Ending offset in the GTT to be managed by the DRM memory
+	 * manager.
+	 */
+	__u64 gtt_end;
+};
+
+struct drm_i915_gem_create {
+	/**
+	 * Requested size for the object.
+	 *
+	 * The (page-aligned) allocated size for the object will be returned.
+	 */
+	__u64 size;
+	/**
+	 * Returned handle for the object.
+	 *
+	 * Object handles are nonzero.
+	 */
+	__u32 handle;
+	__u32 pad;
+};
+
+struct drm_i915_gem_pread {
+	/** Handle for the object being read. */
+	__u32 handle;
+	__u32 pad;
+	/** Offset into the object to read from */
+	__u64 offset;
+	/** Length of data to read */
+	__u64 size;
+	/**
+	 * Pointer to write the data into.
+	 *
+	 * This is a fixed-size type for 32/64 compatibility.
+	 */
+	__u64 data_ptr;
+};
+
+struct drm_i915_gem_pwrite {
+	/** Handle for the object being written to. */
+	__u32 handle;
+	__u32 pad;
+	/** Offset into the object to write to */
+	__u64 offset;
+	/** Length of data to write */
+	__u64 size;
+	/**
+	 * Pointer to read the data from.
+	 *
+	 * This is a fixed-size type for 32/64 compatibility.
+	 */
+	__u64 data_ptr;
+};
+
+struct drm_i915_gem_mmap {
+	/** Handle for the object being mapped. */
+	__u32 handle;
+	__u32 pad;
+	/** Offset in the object to map. */
+	__u64 offset;
+	/**
+	 * Length of data to map.
+	 *
+	 * The value will be page-aligned.
+	 */
+	__u64 size;
+	/**
+	 * Returned pointer the data was mapped at.
+	 *
+	 * This is a fixed-size type for 32/64 compatibility.
+	 */
+	__u64 addr_ptr;
+
+	/**
+	 * Flags for extended behaviour.
+	 *
+	 * Added in version 2.
+	 */
+	__u64 flags;
+#define I915_MMAP_WC 0x1
+};
+
+struct drm_i915_gem_mmap_gtt {
+	/** Handle for the object being mapped. */
+	__u32 handle;
+	__u32 pad;
+	/**
+	 * Fake offset to use for subsequent mmap call
+	 *
+	 * This is a fixed-size type for 32/64 compatibility.
+	 */
+	__u64 offset;
+};
+
+struct drm_i915_gem_set_domain {
+	/** Handle for the object */
+	__u32 handle;
+
+	/** New read domains */
+	__u32 read_domains;
+
+	/** New write domain */
+	__u32 write_domain;
+};
+
+struct drm_i915_gem_sw_finish {
+	/** Handle for the object */
+	__u32 handle;
+};
+
+struct drm_i915_gem_relocation_entry {
+	/**
+	 * Handle of the buffer being pointed to by this relocation entry.
+	 *
+	 * It's appealing to make this be an index into the mm_validate_entry
+	 * list to refer to the buffer, but this allows the driver to create
+	 * a relocation list for state buffers and not re-write it per
+	 * exec using the buffer.
+	 */
+	__u32 target_handle;
+
+	/**
+	 * Value to be added to the offset of the target buffer to make up
+	 * the relocation entry.
+	 */
+	__u32 delta;
+
+	/** Offset in the buffer the relocation entry will be written into */
+	__u64 offset;
+
+	/**
+	 * Offset value of the target buffer that the relocation entry was last
+	 * written as.
+	 *
+	 * If the buffer has the same offset as last time, we can skip syncing
+	 * and writing the relocation.  This value is written back out by
+	 * the execbuffer ioctl when the relocation is written.
+	 */
+	__u64 presumed_offset;
+
+	/**
+	 * Target memory domains read by this operation.
+	 */
+	__u32 read_domains;
+
+	/**
+	 * Target memory domains written by this operation.
+	 *
+	 * Note that only one domain may be written by the whole
+	 * execbuffer operation, so that where there are conflicts,
+	 * the application will get -EINVAL back.
+	 */
+	__u32 write_domain;
+};
+
+/** @{
+ * Intel memory domains
+ *
+ * Most of these just align with the various caches in
+ * the system and are used to flush and invalidate as
+ * objects end up cached in different domains.
+ */
+/** CPU cache */
+#define I915_GEM_DOMAIN_CPU		0x00000001
+/** Render cache, used by 2D and 3D drawing */
+#define I915_GEM_DOMAIN_RENDER		0x00000002
+/** Sampler cache, used by texture engine */
+#define I915_GEM_DOMAIN_SAMPLER		0x00000004
+/** Command queue, used to load batch buffers */
+#define I915_GEM_DOMAIN_COMMAND		0x00000008
+/** Instruction cache, used by shader programs */
+#define I915_GEM_DOMAIN_INSTRUCTION	0x00000010
+/** Vertex address cache */
+#define I915_GEM_DOMAIN_VERTEX		0x00000020
+/** GTT domain - aperture and scanout */
+#define I915_GEM_DOMAIN_GTT		0x00000040
+/** WC domain - uncached access */
+#define I915_GEM_DOMAIN_WC		0x00000080
+/** @} */
+
+struct drm_i915_gem_exec_object {
+	/**
+	 * User's handle for a buffer to be bound into the GTT for this
+	 * operation.
+	 */
+	__u32 handle;
+
+	/** Number of relocations to be performed on this buffer */
+	__u32 relocation_count;
+	/**
+	 * Pointer to array of struct drm_i915_gem_relocation_entry containing
+	 * the relocations to be performed in this buffer.
+	 */
+	__u64 relocs_ptr;
+
+	/** Required alignment in graphics aperture */
+	__u64 alignment;
+
+	/**
+	 * Returned value of the updated offset of the object, for future
+	 * presumed_offset writes.
+	 */
+	__u64 offset;
+};
+
+struct drm_i915_gem_execbuffer {
+	/**
+	 * List of buffers to be validated with their relocations to be
+	 * performed on them.
+	 *
+	 * This is a pointer to an array of struct drm_i915_gem_validate_entry.
+	 *
+	 * These buffers must be listed in an order such that all relocations
+	 * a buffer is performing refer to buffers that have already appeared
+	 * in the validate list.
+	 */
+	__u64 buffers_ptr;
+	__u32 buffer_count;
+
+	/** Offset in the batchbuffer to start execution from. */
+	__u32 batch_start_offset;
+	/** Bytes used in batchbuffer from batch_start_offset */
+	__u32 batch_len;
+	__u32 DR1;
+	__u32 DR4;
+	__u32 num_cliprects;
+	/** This is a struct drm_clip_rect *cliprects */
+	__u64 cliprects_ptr;
+};
+
+struct drm_i915_gem_exec_object2 {
+	/**
+	 * User's handle for a buffer to be bound into the GTT for this
+	 * operation.
+	 */
+	__u32 handle;
+
+	/** Number of relocations to be performed on this buffer */
+	__u32 relocation_count;
+	/**
+	 * Pointer to array of struct drm_i915_gem_relocation_entry containing
+	 * the relocations to be performed in this buffer.
+	 */
+	__u64 relocs_ptr;
+
+	/** Required alignment in graphics aperture */
+	__u64 alignment;
+
+	/**
+	 * When the EXEC_OBJECT_PINNED flag is specified this is populated by
+	 * the user with the GTT offset at which this object will be pinned.
+	 * When the I915_EXEC_NO_RELOC flag is specified this must contain the
+	 * presumed_offset of the object.
+	 * During execbuffer2 the kernel populates it with the value of the
+	 * current GTT offset of the object, for future presumed_offset writes.
+	 */
+	__u64 offset;
+
+#define EXEC_OBJECT_NEEDS_FENCE		 (1<<0)
+#define EXEC_OBJECT_NEEDS_GTT		 (1<<1)
+#define EXEC_OBJECT_WRITE		 (1<<2)
+#define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
+#define EXEC_OBJECT_PINNED		 (1<<4)
+#define EXEC_OBJECT_PAD_TO_SIZE		 (1<<5)
+/* The kernel implicitly tracks GPU activity on all GEM objects, and
+ * synchronises operations with outstanding rendering. This includes
+ * rendering on other devices if exported via dma-buf. However, sometimes
+ * this tracking is too coarse and the user knows better. For example,
+ * if the object is split into non-overlapping ranges shared between different
+ * clients or engines (i.e. suballocating objects), the implicit tracking
+ * by kernel assumes that each operation affects the whole object rather
+ * than an individual range, causing needless synchronisation between clients.
+ * The kernel will also forgo any CPU cache flushes prior to rendering from
+ * the object as the client is expected to be also handling such domain
+ * tracking.
+ *
+ * The kernel maintains the implicit tracking in order to manage resources
+ * used by the GPU - this flag only disables the synchronisation prior to
+ * rendering with this object in this execbuf.
+ *
+ * Opting out of implicit synchronisation requires the user to do its own
+ * explicit tracking to avoid rendering corruption. See, for example,
+ * I915_PARAM_HAS_EXEC_FENCE to order execbufs and execute them asynchronously.
+ */
+#define EXEC_OBJECT_ASYNC		(1<<6)
+/* Request that the contents of this execobject be copied into the error
+ * state upon a GPU hang involving this batch for post-mortem debugging.
+ * These buffers are recorded in no particular order as "user" in
+ * /sys/class/drm/cardN/error. Query I915_PARAM_HAS_EXEC_CAPTURE to see
+ * if the kernel supports this flag.
+ */
+#define EXEC_OBJECT_CAPTURE		(1<<7)
+/* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_CAPTURE<<1)
+	__u64 flags;
+
+	union {
+		__u64 rsvd1;
+		__u64 pad_to_size;
+	};
+	__u64 rsvd2;
+};
+
+struct drm_i915_gem_execbuffer2 {
+	/**
+	 * List of gem_exec_object2 structs
+	 */
+	__u64 buffers_ptr;
+	__u32 buffer_count;
+
+	/** Offset in the batchbuffer to start execution from. */
+	__u32 batch_start_offset;
+	/** Bytes used in batchbuffer from batch_start_offset */
+	__u32 batch_len;
+	__u32 DR1;
+	__u32 DR4;
+	__u32 num_cliprects;
+	/** This is a struct drm_clip_rect *cliprects */
+	__u64 cliprects_ptr;
+#define I915_EXEC_RING_MASK              (7<<0)
+#define I915_EXEC_DEFAULT                (0<<0)
+#define I915_EXEC_RENDER                 (1<<0)
+#define I915_EXEC_BSD                    (2<<0)
+#define I915_EXEC_BLT                    (3<<0)
+#define I915_EXEC_VEBOX                  (4<<0)
+
+/* Used for switching the constants addressing mode on gen4+ RENDER ring.
+ * Gen6+ only supports relative addressing to dynamic state (default) and
+ * absolute addressing.
+ *
+ * These flags are ignored for the BSD and BLT rings.
+ */
+#define I915_EXEC_CONSTANTS_MASK 	(3<<6)
+#define I915_EXEC_CONSTANTS_REL_GENERAL (0<<6) /* default */
+#define I915_EXEC_CONSTANTS_ABSOLUTE 	(1<<6)
+#define I915_EXEC_CONSTANTS_REL_SURFACE (2<<6) /* gen4/5 only */
+	__u64 flags;
+	__u64 rsvd1; /* now used for context info */
+	__u64 rsvd2;
+};
+
+/** Resets the SO write offset registers for transform feedback on gen7. */
+#define I915_EXEC_GEN7_SOL_RESET	(1<<8)
+
+/** Request a privileged ("secure") batch buffer. Note only available for
+ * DRM_ROOT_ONLY | DRM_MASTER processes.
+ */
+#define I915_EXEC_SECURE		(1<<9)
+
+/** Inform the kernel that the batch is and will always be pinned. This
+ * negates the requirement for a workaround to be performed to avoid
+ * an incoherent CS (such as can be found on 830/845). If this flag is
+ * not passed, the kernel will endeavour to make sure the batch is
+ * coherent with the CS before execution. If this flag is passed,
+ * userspace assumes the responsibility for ensuring the same.
+ */
+#define I915_EXEC_IS_PINNED		(1<<10)
+
+/** Provide a hint to the kernel that the command stream and auxiliary
+ * state buffers already hold the correct presumed addresses and so the
+ * relocation process may be skipped if no buffers need to be moved in
+ * preparation for the execbuffer.
+ */
+#define I915_EXEC_NO_RELOC		(1<<11)
+
+/** Use the reloc.handle as an index into the exec object array rather
+ * than as the per-file handle.
+ */
+#define I915_EXEC_HANDLE_LUT		(1<<12)
+
+/** Used for switching BSD rings on the platforms with two BSD rings */
+#define I915_EXEC_BSD_SHIFT	 (13)
+#define I915_EXEC_BSD_MASK	 (3 << I915_EXEC_BSD_SHIFT)
+/* default ping-pong mode */
+#define I915_EXEC_BSD_DEFAULT	 (0 << I915_EXEC_BSD_SHIFT)
+#define I915_EXEC_BSD_RING1	 (1 << I915_EXEC_BSD_SHIFT)
+#define I915_EXEC_BSD_RING2	 (2 << I915_EXEC_BSD_SHIFT)
+
+/** Tell the kernel that the batchbuffer is processed by
+ *  the resource streamer.
+ */
+#define I915_EXEC_RESOURCE_STREAMER     (1<<15)
+
+/* Setting I915_EXEC_FENCE_IN implies that lower_32_bits(rsvd2) represent
+ * a sync_file fd to wait upon (in a nonblocking manner) prior to executing
+ * the batch.
+ *
+ * Returns -EINVAL if the sync_file fd cannot be found.
+ */
+#define I915_EXEC_FENCE_IN		(1<<16)
+
+/* Setting I915_EXEC_FENCE_OUT causes the ioctl to return a sync_file fd
+ * in the upper_32_bits(rsvd2) upon success. Ownership of the fd is given
+ * to the caller, and it should be close() after use. (The fd is a regular
+ * file descriptor and will be cleaned up on process termination. It holds
+ * a reference to the request, but nothing else.)
+ *
+ * The sync_file fd can be combined with other sync_file and passed either
+ * to execbuf using I915_EXEC_FENCE_IN, to atomic KMS ioctls (so that a flip
+ * will only occur after this request completes), or to other devices.
+ *
+ * Using I915_EXEC_FENCE_OUT requires use of
+ * DRM_IOCTL_I915_GEM_EXECBUFFER2_WR ioctl so that the result is written
+ * back to userspace. Failure to do so will cause the out-fence to always
+ * be reported as zero, and the real fence fd to be leaked.
+ */
+#define I915_EXEC_FENCE_OUT		(1<<17)
+
+/*
+ * Traditionally the execbuf ioctl has only considered the final element in
+ * the execobject[] to be the executable batch. Often though, the client
+ * will know the batch object prior to construction and being able to place
+ * it into the execobject[] array first can simplify the relocation tracking.
+ * Setting I915_EXEC_BATCH_FIRST tells execbuf to use element 0 of the
+ * execobject[] as the batch instead (the default is to use the last
+ * element).
+ */
+#define I915_EXEC_BATCH_FIRST		(1<<18)
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_BATCH_FIRST<<1))
+
+#define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
+#define i915_execbuffer2_set_context_id(eb2, context) \
+	(eb2).rsvd1 = context & I915_EXEC_CONTEXT_ID_MASK
+#define i915_execbuffer2_get_context_id(eb2) \
+	((eb2).rsvd1 & I915_EXEC_CONTEXT_ID_MASK)
+
+struct drm_i915_gem_pin {
+	/** Handle of the buffer to be pinned. */
+	__u32 handle;
+	__u32 pad;
+
+	/** alignment required within the aperture */
+	__u64 alignment;
+
+	/** Returned GTT offset of the buffer. */
+	__u64 offset;
+};
+
+struct drm_i915_gem_unpin {
+	/** Handle of the buffer to be unpinned. */
+	__u32 handle;
+	__u32 pad;
+};
+
+struct drm_i915_gem_busy {
+	/** Handle of the buffer to check for busy */
+	__u32 handle;
+
+	/** Return busy status
+	 *
+	 * A return of 0 implies that the object is idle (after
+	 * having flushed any pending activity), and a non-zero return that
+	 * the object is still in-flight on the GPU. (The GPU has not yet
+	 * signaled completion for all pending requests that reference the
+	 * object.) An object is guaranteed to become idle eventually (so
+	 * long as no new GPU commands are executed upon it). Due to the
+	 * asynchronous nature of the hardware, an object reported
+	 * as busy may become idle before the ioctl is completed.
+	 *
+	 * Furthermore, if the object is busy, which engine is busy is only
+	 * provided as a guide. There are race conditions which prevent the
+	 * report of which engines are busy from being always accurate.
+	 * However, the converse is not true. If the object is idle, the
+	 * result of the ioctl, that all engines are idle, is accurate.
+	 *
+	 * The returned dword is split into two fields to indicate both
+	 * the engines on which the object is being read, and the
+	 * engine on which it is currently being written (if any).
+	 *
+	 * The low word (bits 0:15) indicate if the object is being written
+	 * to by any engine (there can only be one, as the GEM implicit
+	 * synchronisation rules force writes to be serialised). Only the
+	 * engine for the last write is reported.
+	 *
+	 * The high word (bits 16:31) are a bitmask of which engines are
+	 * currently reading from the object. Multiple engines may be
+	 * reading from the object simultaneously.
+	 *
+	 * The value of each engine is the same as specified in the
+	 * EXECBUFFER2 ioctl, i.e. I915_EXEC_RENDER, I915_EXEC_BSD etc.
+	 * Note I915_EXEC_DEFAULT is a symbolic value and is mapped to
+	 * the I915_EXEC_RENDER engine for execution, and so it is never
+	 * reported as active itself. Some hardware may have parallel
+	 * execution engines, e.g. multiple media engines, which are
+	 * mapped to the same identifier in the EXECBUFFER2 ioctl and
+	 * so are not separately reported for busyness.
+	 *
+	 * Caveat emptor:
+	 * Only the boolean result of this query is reliable; that is whether
+	 * the object is idle or busy. The report of which engines are busy
+	 * should be only used as a heuristic.
+	 */
+	__u32 busy;
+};
+
+/**
+ * I915_CACHING_NONE
+ *
+ * GPU access is not coherent with cpu caches. Default for machines without an
+ * LLC.
+ */
+#define I915_CACHING_NONE		0
+/**
+ * I915_CACHING_CACHED
+ *
+ * GPU access is coherent with cpu caches and furthermore the data is cached in
+ * last-level caches shared between cpu cores and the gpu GT. Default on
+ * machines with HAS_LLC.
+ */
+#define I915_CACHING_CACHED		1
+/**
+ * I915_CACHING_DISPLAY
+ *
+ * Special GPU caching mode which is coherent with the scanout engines.
+ * Transparently falls back to I915_CACHING_NONE on platforms where no special
+ * cache mode (like write-through or gfdt flushing) is available. The kernel
+ * automatically sets this mode when using a buffer as a scanout target.
+ * Userspace can manually set this mode to avoid a costly stall and clflush in
+ * the hotpath of drawing the first frame.
+ */
+#define I915_CACHING_DISPLAY		2
+
+struct drm_i915_gem_caching {
+	/**
+	 * Handle of the buffer to set/get the caching level of. */
+	__u32 handle;
+
+	/**
+	 * Caching level to apply or return value
+	 *
+	 * bits0-15 are for generic caching control (i.e. the above defined
+	 * values). bits16-31 are reserved for platform-specific variations
+	 * (e.g. l3$ caching on gen7). */
+	__u32 caching;
+};
+
+#define I915_TILING_NONE	0
+#define I915_TILING_X		1
+#define I915_TILING_Y		2
+#define I915_TILING_LAST	I915_TILING_Y
+
+#define I915_BIT_6_SWIZZLE_NONE		0
+#define I915_BIT_6_SWIZZLE_9		1
+#define I915_BIT_6_SWIZZLE_9_10		2
+#define I915_BIT_6_SWIZZLE_9_11		3
+#define I915_BIT_6_SWIZZLE_9_10_11	4
+/* Not seen by userland */
+#define I915_BIT_6_SWIZZLE_UNKNOWN	5
+/* Seen by userland. */
+#define I915_BIT_6_SWIZZLE_9_17		6
+#define I915_BIT_6_SWIZZLE_9_10_17	7
+
+struct drm_i915_gem_set_tiling {
+	/** Handle of the buffer to have its tiling state updated */
+	__u32 handle;
+
+	/**
+	 * Tiling mode for the object (I915_TILING_NONE, I915_TILING_X,
+	 * I915_TILING_Y).
+	 *
+	 * This value is to be set on request, and will be updated by the
+	 * kernel on successful return with the actual chosen tiling layout.
+	 *
+	 * The tiling mode may be demoted to I915_TILING_NONE when the system
+	 * has bit 6 swizzling that can't be managed correctly by GEM.
+	 *
+	 * Buffer contents become undefined when changing tiling_mode.
+	 */
+	__u32 tiling_mode;
+
+	/**
+	 * Stride in bytes for the object when in I915_TILING_X or
+	 * I915_TILING_Y.
+	 */
+	__u32 stride;
+
+	/**
+	 * Returned address bit 6 swizzling required for CPU access through
+	 * mmap mapping.
+	 */
+	__u32 swizzle_mode;
+};
+
+struct drm_i915_gem_get_tiling {
+	/** Handle of the buffer to get tiling state for. */
+	__u32 handle;
+
+	/**
+	 * Current tiling mode for the object (I915_TILING_NONE, I915_TILING_X,
+	 * I915_TILING_Y).
+	 */
+	__u32 tiling_mode;
+
+	/**
+	 * Returned address bit 6 swizzling required for CPU access through
+	 * mmap mapping.
+	 */
+	__u32 swizzle_mode;
+
+	/**
+	 * Returned address bit 6 swizzling required for CPU access through
+	 * mmap mapping whilst bound.
+	 */
+	__u32 phys_swizzle_mode;
+};
+
+struct drm_i915_gem_get_aperture {
+	/** Total size of the aperture used by i915_gem_execbuffer, in bytes */
+	__u64 aper_size;
+
+	/**
+	 * Available space in the aperture used by i915_gem_execbuffer, in
+	 * bytes
+	 */
+	__u64 aper_available_size;
+};
+
+struct drm_i915_get_pipe_from_crtc_id {
+	/** ID of CRTC being requested **/
+	__u32 crtc_id;
+
+	/** pipe of requested CRTC **/
+	__u32 pipe;
+};
+
+#define I915_MADV_WILLNEED 0
+#define I915_MADV_DONTNEED 1
+#define __I915_MADV_PURGED 2 /* internal state */
+
+struct drm_i915_gem_madvise {
+	/** Handle of the buffer to change the backing store advice */
+	__u32 handle;
+
+	/* Advice: either the buffer will be needed again in the near future,
+	 *         or won't be and could be discarded under memory pressure.
+	 */
+	__u32 madv;
+
+	/** Whether the backing store still exists. */
+	__u32 retained;
+};
+
+/* flags */
+#define I915_OVERLAY_TYPE_MASK 		0xff
+#define I915_OVERLAY_YUV_PLANAR 	0x01
+#define I915_OVERLAY_YUV_PACKED 	0x02
+#define I915_OVERLAY_RGB		0x03
+
+#define I915_OVERLAY_DEPTH_MASK		0xff00
+#define I915_OVERLAY_RGB24		0x1000
+#define I915_OVERLAY_RGB16		0x2000
+#define I915_OVERLAY_RGB15		0x3000
+#define I915_OVERLAY_YUV422		0x0100
+#define I915_OVERLAY_YUV411		0x0200
+#define I915_OVERLAY_YUV420		0x0300
+#define I915_OVERLAY_YUV410		0x0400
+
+#define I915_OVERLAY_SWAP_MASK		0xff0000
+#define I915_OVERLAY_NO_SWAP		0x000000
+#define I915_OVERLAY_UV_SWAP		0x010000
+#define I915_OVERLAY_Y_SWAP		0x020000
+#define I915_OVERLAY_Y_AND_UV_SWAP	0x030000
+
+#define I915_OVERLAY_FLAGS_MASK		0xff000000
+#define I915_OVERLAY_ENABLE		0x01000000
+
+struct drm_intel_overlay_put_image {
+	/* various flags and src format description */
+	__u32 flags;
+	/* source picture description */
+	__u32 bo_handle;
+	/* stride values and offsets are in bytes, buffer relative */
+	__u16 stride_Y; /* stride for packed formats */
+	__u16 stride_UV;
+	__u32 offset_Y; /* offset for packed formats */
+	__u32 offset_U;
+	__u32 offset_V;
+	/* in pixels */
+	__u16 src_width;
+	__u16 src_height;
+	/* to compensate the scaling factors for partially covered surfaces */
+	__u16 src_scan_width;
+	__u16 src_scan_height;
+	/* output crtc description */
+	__u32 crtc_id;
+	__u16 dst_x;
+	__u16 dst_y;
+	__u16 dst_width;
+	__u16 dst_height;
+};
+
+/* flags */
+#define I915_OVERLAY_UPDATE_ATTRS	(1<<0)
+#define I915_OVERLAY_UPDATE_GAMMA	(1<<1)
+#define I915_OVERLAY_DISABLE_DEST_COLORKEY	(1<<2)
+struct drm_intel_overlay_attrs {
+	__u32 flags;
+	__u32 color_key;
+	__s32 brightness;
+	__u32 contrast;
+	__u32 saturation;
+	__u32 gamma0;
+	__u32 gamma1;
+	__u32 gamma2;
+	__u32 gamma3;
+	__u32 gamma4;
+	__u32 gamma5;
+};
+
+/*
+ * Intel sprite handling
+ *
+ * Color keying works with a min/mask/max tuple.  Both source and destination
+ * color keying is allowed.
+ *
+ * Source keying:
+ * Sprite pixels within the min & max values, masked against the color channels
+ * specified in the mask field, will be transparent.  All other pixels will
+ * be displayed on top of the primary plane.  For RGB surfaces, only the min
+ * and mask fields will be used; ranged compares are not allowed.
+ *
+ * Destination keying:
+ * Primary plane pixels that match the min value, masked against the color
+ * channels specified in the mask field, will be replaced by corresponding
+ * pixels from the sprite plane.
+ *
+ * Note that source & destination keying are exclusive; only one can be
+ * active on a given plane.
+ */
+
+#define I915_SET_COLORKEY_NONE		(1<<0) /* disable color key matching */
+#define I915_SET_COLORKEY_DESTINATION	(1<<1)
+#define I915_SET_COLORKEY_SOURCE	(1<<2)
+struct drm_intel_sprite_colorkey {
+	__u32 plane_id;
+	__u32 min_value;
+	__u32 channel_mask;
+	__u32 max_value;
+	__u32 flags;
+};
+
+struct drm_i915_gem_wait {
+	/** Handle of BO we shall wait on */
+	__u32 bo_handle;
+	__u32 flags;
+	/** Number of nanoseconds to wait, Returns time remaining. */
+	__s64 timeout_ns;
+};
+
+struct drm_i915_gem_context_create {
+	/*  output: id of new context*/
+	__u32 ctx_id;
+	__u32 pad;
+};
+
+struct drm_i915_gem_context_destroy {
+	__u32 ctx_id;
+	__u32 pad;
+};
+
+struct drm_i915_reg_read {
+	/*
+	 * Register offset.
+	 * For 64bit wide registers where the upper 32bits don't immediately
+	 * follow the lower 32bits, the offset of the lower 32bits must
+	 * be specified
+	 */
+	__u64 offset;
+	__u64 val; /* Return value */
+};
+/* Known registers:
+ *
+ * Render engine timestamp - 0x2358 + 64bit - gen7+
+ * - Note this register returns an invalid value if using the default
+ *   single instruction 8byte read; to work around that, use
+ *   offset (0x2358 | 1) instead.
+ *
+ */
+
+struct drm_i915_reset_stats {
+	__u32 ctx_id;
+	__u32 flags;
+
+	/* All resets since boot/module reload, for all contexts */
+	__u32 reset_count;
+
+	/* Number of batches lost when active in GPU, for this context */
+	__u32 batch_active;
+
+	/* Number of batches lost pending for execution, for this context */
+	__u32 batch_pending;
+
+	__u32 pad;
+};
+
+struct drm_i915_gem_userptr {
+	__u64 user_ptr;
+	__u64 user_size;
+	__u32 flags;
+#define I915_USERPTR_READ_ONLY 0x1
+#define I915_USERPTR_UNSYNCHRONIZED 0x80000000
+	/**
+	 * Returned handle for the object.
+	 *
+	 * Object handles are nonzero.
+	 */
+	__u32 handle;
+};
+
+struct drm_i915_gem_context_param {
+	__u32 ctx_id;
+	__u32 size;
+	__u64 param;
+#define I915_CONTEXT_PARAM_BAN_PERIOD	0x1
+#define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
+#define I915_CONTEXT_PARAM_GTT_SIZE	0x3
+#define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
+#define I915_CONTEXT_PARAM_BANNABLE	0x5
+	__u64 value;
+};
+
+enum drm_i915_oa_format {
+	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
+	I915_OA_FORMAT_A29,	    /* HSW only */
+	I915_OA_FORMAT_A13_B8_C8,   /* HSW only */
+	I915_OA_FORMAT_B4_C8,	    /* HSW only */
+	I915_OA_FORMAT_A45_B8_C8,   /* HSW only */
+	I915_OA_FORMAT_B4_C8_A16,   /* HSW only */
+	I915_OA_FORMAT_C4_B8,	    /* HSW+ */
+
+	/* Gen8+ */
+	I915_OA_FORMAT_A12,
+	I915_OA_FORMAT_A12_B8_C8,
+	I915_OA_FORMAT_A32u40_A4u32_B8_C8,
+
+	I915_OA_FORMAT_MAX	    /* non-ABI */
+};
+
+enum drm_i915_perf_property_id {
+	/**
+	 * Open the stream for a specific context handle (as used with
+	 * execbuffer2). A stream opened for a specific context this way
+	 * won't typically require root privileges.
+	 */
+	DRM_I915_PERF_PROP_CTX_HANDLE = 1,
+
+	/**
+	 * A value of 1 requests the inclusion of raw OA unit reports as
+	 * part of stream samples.
+	 */
+	DRM_I915_PERF_PROP_SAMPLE_OA,
+
+	/**
+	 * The value specifies which set of OA unit metrics should be
+	 * configured, defining the contents of any OA unit reports.
+	 */
+	DRM_I915_PERF_PROP_OA_METRICS_SET,
+
+	/**
+	 * The value specifies the size and layout of OA unit reports.
+	 */
+	DRM_I915_PERF_PROP_OA_FORMAT,
+
+	/**
+	 * Specifying this property implicitly requests periodic OA unit
+	 * sampling and (at least on Haswell) the sampling period is derived
+	 * from this exponent as follows:
+	 *
+	 *   80ns * 2^(period_exponent + 1)
+	 */
+	DRM_I915_PERF_PROP_OA_EXPONENT,
+
+	DRM_I915_PERF_PROP_MAX /* non-ABI */
+};
+
+struct drm_i915_perf_open_param {
+	__u32 flags;
+#define I915_PERF_FLAG_FD_CLOEXEC	(1<<0)
+#define I915_PERF_FLAG_FD_NONBLOCK	(1<<1)
+#define I915_PERF_FLAG_DISABLED		(1<<2)
+
+	/** The number of u64 (id, value) pairs */
+	__u32 num_properties;
+
+	/**
+	 * Pointer to array of u64 (id, value) pairs configuring the stream
+	 * to open.
+	 */
+	__u64 properties_ptr;
+};
+
+/**
+ * Enable data capture for a stream that was either opened in a disabled state
+ * via I915_PERF_FLAG_DISABLED or was later disabled via
+ * I915_PERF_IOCTL_DISABLE.
+ *
+ * It is intended to be cheaper to disable and enable a stream than it may be
+ * to close and re-open a stream with the same configuration.
+ *
+ * It's undefined whether any pending data for the stream will be lost.
+ */
+#define I915_PERF_IOCTL_ENABLE	_IO('i', 0x0)
+
+/**
+ * Disable data capture for a stream.
+ *
+ * It is an error to try and read a stream that is disabled.
+ */
+#define I915_PERF_IOCTL_DISABLE	_IO('i', 0x1)
+
+/**
+ * Common to all i915 perf records
+ */
+struct drm_i915_perf_record_header {
+	__u32 type;
+	__u16 pad;
+	__u16 size;
+};
+
+enum drm_i915_perf_record_type {
+
+	/**
+	 * Samples are the work horse record type whose contents are extensible
+	 * and defined when opening an i915 perf stream based on the given
+	 * properties.
+	 *
+	 * Boolean properties following the naming convention
+	 * DRM_I915_PERF_PROP_SAMPLE_xyz request the inclusion of 'xyz' data in
+	 * every sample.
+	 *
+	 * The order of these sample properties given by userspace has no
+	 * effect on the ordering of data within a sample. The order is
+	 * documented here.
+	 *
+	 * struct {
+	 *     struct drm_i915_perf_record_header header;
+	 *
+	 *     { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
+	 * };
+	 */
+	DRM_I915_PERF_RECORD_SAMPLE = 1,
+
+	/*
+	 * Indicates that one or more OA reports were not written by the
+	 * hardware. This can happen for example if an MI_REPORT_PERF_COUNT
+	 * command collides with periodic sampling - which would be more likely
+	 * at higher sampling frequencies.
+	 */
+	DRM_I915_PERF_RECORD_OA_REPORT_LOST = 2,
+
+	/**
+	 * An error occurred that resulted in all pending OA reports being lost.
+	 */
+	DRM_I915_PERF_RECORD_OA_BUFFER_LOST = 3,
+
+	DRM_I915_PERF_RECORD_MAX /* non-ABI */
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* _UAPI_I915_DRM_H_ */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ce2988be4f0e..e99e3e6f8b37 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -643,7 +643,7 @@ enum bpf_func_id {
 
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
-	BPF_ADJ_ROOM_NET_OPTS,
+	BPF_ADJ_ROOM_NET,
 };
 
 /* user accessible mirror of in-kernel sk_buff.
@@ -750,6 +750,8 @@ struct bpf_map_info {
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
  * and their replies.
+ * Some of these fields are in network (big-endian) byte order and may need
+ * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h).
  * New fields can only be added at the end of this structure
  */
 struct bpf_sock_ops {
@@ -759,12 +761,12 @@ struct bpf_sock_ops {
 		__u32 replylong[4];
 	};
 	__u32 family;
-	__u32 remote_ip4;
-	__u32 local_ip4;
-	__u32 remote_ip6[4];
-	__u32 local_ip6[4];
-	__u32 remote_port;
-	__u32 local_port;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index 813afd6eee71..ec69d55bcec7 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -43,6 +43,27 @@
 /* (1U << 31) is reserved for signed error codes */
 
 /*
+ * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the
+ * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on
+ * the specific file.
+ */
+#define F_GET_RW_HINT		(F_LINUX_SPECIFIC_BASE + 11)
+#define F_SET_RW_HINT		(F_LINUX_SPECIFIC_BASE + 12)
+#define F_GET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 13)
+#define F_SET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 14)
+
+/*
+ * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
+ * used to clear any hints previously set.
+ */
+#define RWF_WRITE_LIFE_NOT_SET	0
+#define RWH_WRITE_LIFE_NONE	1
+#define RWH_WRITE_LIFE_SHORT	2
+#define RWH_WRITE_LIFE_MEDIUM	3
+#define RWH_WRITE_LIFE_LONG	4
+#define RWH_WRITE_LIFE_EXTREME	5
+
+/*
  * Types of directory notifications that may be requested.
  */
 #define DN_ACCESS	0x00000001	/* File accessed */
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
new file mode 100644
index 000000000000..6cd63c18708a
--- /dev/null
+++ b/tools/include/uapi/linux/kvm.h
@@ -0,0 +1,1419 @@
+#ifndef __LINUX_KVM_H
+#define __LINUX_KVM_H
+
+/*
+ * Userspace interface for /dev/kvm - kernel based virtual machine
+ *
+ * Note: you must update KVM_API_VERSION if you change this interface.
+ */
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/ioctl.h>
+#include <asm/kvm.h>
+
+#define KVM_API_VERSION 12
+
+/* *** Deprecated interfaces *** */
+
+#define KVM_TRC_SHIFT           16
+
+#define KVM_TRC_ENTRYEXIT       (1 << KVM_TRC_SHIFT)
+#define KVM_TRC_HANDLER         (1 << (KVM_TRC_SHIFT + 1))
+
+#define KVM_TRC_VMENTRY         (KVM_TRC_ENTRYEXIT + 0x01)
+#define KVM_TRC_VMEXIT          (KVM_TRC_ENTRYEXIT + 0x02)
+#define KVM_TRC_PAGE_FAULT      (KVM_TRC_HANDLER + 0x01)
+
+#define KVM_TRC_HEAD_SIZE       12
+#define KVM_TRC_CYCLE_SIZE      8
+#define KVM_TRC_EXTRA_MAX       7
+
+#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
+#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
+#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
+#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
+#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
+#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
+#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
+#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
+#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
+#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
+#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
+#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
+#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
+#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
+#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
+#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
+#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
+#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
+#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
+#define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
+#define KVM_TRC_GTLB_WRITE       (KVM_TRC_HANDLER + 0x16)
+#define KVM_TRC_STLB_WRITE       (KVM_TRC_HANDLER + 0x17)
+#define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
+#define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
+
+struct kvm_user_trace_setup {
+	__u32 buf_size;
+	__u32 buf_nr;
+};
+
+#define __KVM_DEPRECATED_MAIN_W_0x06 \
+	_IOW(KVMIO, 0x06, struct kvm_user_trace_setup)
+#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07)
+#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08)
+
+#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq)
+
+struct kvm_breakpoint {
+	__u32 enabled;
+	__u32 padding;
+	__u64 address;
+};
+
+struct kvm_debug_guest {
+	__u32 enabled;
+	__u32 pad;
+	struct kvm_breakpoint breakpoints[4];
+	__u32 singlestep;
+};
+
+#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest)
+
+/* *** End of deprecated interfaces *** */
+
+
+/* for KVM_CREATE_MEMORY_REGION */
+struct kvm_memory_region {
+	__u32 slot;
+	__u32 flags;
+	__u64 guest_phys_addr;
+	__u64 memory_size; /* bytes */
+};
+
+/* for KVM_SET_USER_MEMORY_REGION */
+struct kvm_userspace_memory_region {
+	__u32 slot;
+	__u32 flags;
+	__u64 guest_phys_addr;
+	__u64 memory_size; /* bytes */
+	__u64 userspace_addr; /* start of the userspace allocated memory */
+};
+
+/*
+ * Bits 0-15 of kvm_memory_region::flags are visible to userspace; the
+ * other bits are reserved for KVM-internal use and are defined in
+ * include/linux/kvm_host.h.
+ */
+#define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
+#define KVM_MEM_READONLY	(1UL << 1)
+
+/* for KVM_IRQ_LINE */
+struct kvm_irq_level {
+	/*
+	 * ACPI gsi notion of irq.
+	 * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47..
+	 * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
+	 * For ARM: See Documentation/virtual/kvm/api.txt
+	 */
+	union {
+		__u32 irq;
+		__s32 status;
+	};
+	__u32 level;
+};
+
+
+struct kvm_irqchip {
+	__u32 chip_id;
+	__u32 pad;
+        union {
+		char dummy[512];  /* reserving space */
+#ifdef __KVM_HAVE_PIT
+		struct kvm_pic_state pic;
+#endif
+#ifdef __KVM_HAVE_IOAPIC
+		struct kvm_ioapic_state ioapic;
+#endif
+	} chip;
+};
+
+/* for KVM_CREATE_PIT2 */
+struct kvm_pit_config {
+	__u32 flags;
+	__u32 pad[15];
+};
+
+#define KVM_PIT_SPEAKER_DUMMY     1
+
+struct kvm_s390_skeys {
+	__u64 start_gfn;
+	__u64 count;
+	__u64 skeydata_addr;
+	__u32 flags;
+	__u32 reserved[9];
+};
+
+#define KVM_S390_CMMA_PEEK (1 << 0)
+
+/**
+ * kvm_s390_cmma_log - Used for CMMA migration.
+ *
+ * Used both for input and output.
+ *
+ * @start_gfn: Guest page number to start from.
+ * @count: Size of the result buffer.
+ * @flags: Control operation mode via KVM_S390_CMMA_* flags
+ * @remaining: Used with KVM_S390_GET_CMMA_BITS. Indicates how many dirty
+ *             pages are still remaining.
+ * @mask: Used with KVM_S390_SET_CMMA_BITS. Bitmap of bits to actually set
+ *        in the PGSTE.
+ * @values: Pointer to the values buffer.
+ *
+ * Used in KVM_S390_{G,S}ET_CMMA_BITS ioctls.
+ */
+struct kvm_s390_cmma_log {
+	__u64 start_gfn;
+	__u32 count;
+	__u32 flags;
+	union {
+		__u64 remaining;
+		__u64 mask;
+	};
+	__u64 values;
+};
+
+struct kvm_hyperv_exit {
+#define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
+	__u32 type;
+	union {
+		struct {
+			__u32 msr;
+			__u64 control;
+			__u64 evt_page;
+			__u64 msg_page;
+		} synic;
+		struct {
+			__u64 input;
+			__u64 result;
+			__u64 params[2];
+		} hcall;
+	} u;
+};
+
+#define KVM_S390_GET_SKEYS_NONE   1
+#define KVM_S390_SKEYS_MAX        1048576
+
+#define KVM_EXIT_UNKNOWN          0
+#define KVM_EXIT_EXCEPTION        1
+#define KVM_EXIT_IO               2
+#define KVM_EXIT_HYPERCALL        3
+#define KVM_EXIT_DEBUG            4
+#define KVM_EXIT_HLT              5
+#define KVM_EXIT_MMIO             6
+#define KVM_EXIT_IRQ_WINDOW_OPEN  7
+#define KVM_EXIT_SHUTDOWN         8
+#define KVM_EXIT_FAIL_ENTRY       9
+#define KVM_EXIT_INTR             10
+#define KVM_EXIT_SET_TPR          11
+#define KVM_EXIT_TPR_ACCESS       12
+#define KVM_EXIT_S390_SIEIC       13
+#define KVM_EXIT_S390_RESET       14
+#define KVM_EXIT_DCR              15 /* deprecated */
+#define KVM_EXIT_NMI              16
+#define KVM_EXIT_INTERNAL_ERROR   17
+#define KVM_EXIT_OSI              18
+#define KVM_EXIT_PAPR_HCALL	  19
+#define KVM_EXIT_S390_UCONTROL	  20
+#define KVM_EXIT_WATCHDOG         21
+#define KVM_EXIT_S390_TSCH        22
+#define KVM_EXIT_EPR              23
+#define KVM_EXIT_SYSTEM_EVENT     24
+#define KVM_EXIT_S390_STSI        25
+#define KVM_EXIT_IOAPIC_EOI       26
+#define KVM_EXIT_HYPERV           27
+
+/* For KVM_EXIT_INTERNAL_ERROR */
+/* Instruction emulation failed. */
+#define KVM_INTERNAL_ERROR_EMULATION	1
+/* Encountered unexpected simultaneous exceptions. */
+#define KVM_INTERNAL_ERROR_SIMUL_EX	2
+/* Encountered unexpected vm-exit due to delivery event. */
+#define KVM_INTERNAL_ERROR_DELIVERY_EV	3
+
+/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
+struct kvm_run {
+	/* in */
+	__u8 request_interrupt_window;
+	__u8 immediate_exit;
+	__u8 padding1[6];
+
+	/* out */
+	__u32 exit_reason;
+	__u8 ready_for_interrupt_injection;
+	__u8 if_flag;
+	__u16 flags;
+
+	/* in (pre_kvm_run), out (post_kvm_run) */
+	__u64 cr8;
+	__u64 apic_base;
+
+#ifdef __KVM_S390
+	/* the processor status word for s390 */
+	__u64 psw_mask; /* psw upper half */
+	__u64 psw_addr; /* psw lower half */
+#endif
+	union {
+		/* KVM_EXIT_UNKNOWN */
+		struct {
+			__u64 hardware_exit_reason;
+		} hw;
+		/* KVM_EXIT_FAIL_ENTRY */
+		struct {
+			__u64 hardware_entry_failure_reason;
+		} fail_entry;
+		/* KVM_EXIT_EXCEPTION */
+		struct {
+			__u32 exception;
+			__u32 error_code;
+		} ex;
+		/* KVM_EXIT_IO */
+		struct {
+#define KVM_EXIT_IO_IN  0
+#define KVM_EXIT_IO_OUT 1
+			__u8 direction;
+			__u8 size; /* bytes */
+			__u16 port;
+			__u32 count;
+			__u64 data_offset; /* relative to kvm_run start */
+		} io;
+		/* KVM_EXIT_DEBUG */
+		struct {
+			struct kvm_debug_exit_arch arch;
+		} debug;
+		/* KVM_EXIT_MMIO */
+		struct {
+			__u64 phys_addr;
+			__u8  data[8];
+			__u32 len;
+			__u8  is_write;
+		} mmio;
+		/* KVM_EXIT_HYPERCALL */
+		struct {
+			__u64 nr;
+			__u64 args[6];
+			__u64 ret;
+			__u32 longmode;
+			__u32 pad;
+		} hypercall;
+		/* KVM_EXIT_TPR_ACCESS */
+		struct {
+			__u64 rip;
+			__u32 is_write;
+			__u32 pad;
+		} tpr_access;
+		/* KVM_EXIT_S390_SIEIC */
+		struct {
+			__u8 icptcode;
+			__u16 ipa;
+			__u32 ipb;
+		} s390_sieic;
+		/* KVM_EXIT_S390_RESET */
+#define KVM_S390_RESET_POR       1
+#define KVM_S390_RESET_CLEAR     2
+#define KVM_S390_RESET_SUBSYSTEM 4
+#define KVM_S390_RESET_CPU_INIT  8
+#define KVM_S390_RESET_IPL       16
+		__u64 s390_reset_flags;
+		/* KVM_EXIT_S390_UCONTROL */
+		struct {
+			__u64 trans_exc_code;
+			__u32 pgm_code;
+		} s390_ucontrol;
+		/* KVM_EXIT_DCR (deprecated) */
+		struct {
+			__u32 dcrn;
+			__u32 data;
+			__u8  is_write;
+		} dcr;
+		/* KVM_EXIT_INTERNAL_ERROR */
+		struct {
+			__u32 suberror;
+			/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
+			__u32 ndata;
+			__u64 data[16];
+		} internal;
+		/* KVM_EXIT_OSI */
+		struct {
+			__u64 gprs[32];
+		} osi;
+		/* KVM_EXIT_PAPR_HCALL */
+		struct {
+			__u64 nr;
+			__u64 ret;
+			__u64 args[9];
+		} papr_hcall;
+		/* KVM_EXIT_S390_TSCH */
+		struct {
+			__u16 subchannel_id;
+			__u16 subchannel_nr;
+			__u32 io_int_parm;
+			__u32 io_int_word;
+			__u32 ipb;
+			__u8 dequeued;
+		} s390_tsch;
+		/* KVM_EXIT_EPR */
+		struct {
+			__u32 epr;
+		} epr;
+		/* KVM_EXIT_SYSTEM_EVENT */
+		struct {
+#define KVM_SYSTEM_EVENT_SHUTDOWN       1
+#define KVM_SYSTEM_EVENT_RESET          2
+#define KVM_SYSTEM_EVENT_CRASH          3
+			__u32 type;
+			__u64 flags;
+		} system_event;
+		/* KVM_EXIT_S390_STSI */
+		struct {
+			__u64 addr;
+			__u8 ar;
+			__u8 reserved;
+			__u8 fc;
+			__u8 sel1;
+			__u16 sel2;
+		} s390_stsi;
+		/* KVM_EXIT_IOAPIC_EOI */
+		struct {
+			__u8 vector;
+		} eoi;
+		/* KVM_EXIT_HYPERV */
+		struct kvm_hyperv_exit hyperv;
+		/* Fix the size of the union. */
+		char padding[256];
+	};
+
+	/*
+	 * shared registers between kvm and userspace.
+	 * kvm_valid_regs specifies the register classes set by the host
+	 * kvm_dirty_regs specifies the register classes dirtied by userspace
+	 * struct kvm_sync_regs is architecture specific, as well as the
+	 * bits for kvm_valid_regs and kvm_dirty_regs
+	 */
+	__u64 kvm_valid_regs;
+	__u64 kvm_dirty_regs;
+	union {
+		struct kvm_sync_regs regs;
+		char padding[2048];
+	} s;
+};
+
+/* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
+
+struct kvm_coalesced_mmio_zone {
+	__u64 addr;
+	__u32 size;
+	__u32 pad;
+};
+
+struct kvm_coalesced_mmio {
+	__u64 phys_addr;
+	__u32 len;
+	__u32 pad;
+	__u8  data[8];
+};
+
+struct kvm_coalesced_mmio_ring {
+	__u32 first, last;
+	struct kvm_coalesced_mmio coalesced_mmio[0];
+};
+
+#define KVM_COALESCED_MMIO_MAX \
+	((PAGE_SIZE - sizeof(struct kvm_coalesced_mmio_ring)) / \
+	 sizeof(struct kvm_coalesced_mmio))
+
+/* for KVM_TRANSLATE */
+struct kvm_translation {
+	/* in */
+	__u64 linear_address;
+
+	/* out */
+	__u64 physical_address;
+	__u8  valid;
+	__u8  writeable;
+	__u8  usermode;
+	__u8  pad[5];
+};
+
+/* for KVM_S390_MEM_OP */
+struct kvm_s390_mem_op {
+	/* in */
+	__u64 gaddr;		/* the guest address */
+	__u64 flags;		/* flags */
+	__u32 size;		/* amount of bytes */
+	__u32 op;		/* type of operation */
+	__u64 buf;		/* buffer in userspace */
+	__u8 ar;		/* the access register number */
+	__u8 reserved[31];	/* should be set to 0 */
+};
+/* types for kvm_s390_mem_op->op */
+#define KVM_S390_MEMOP_LOGICAL_READ	0
+#define KVM_S390_MEMOP_LOGICAL_WRITE	1
+/* flags for kvm_s390_mem_op->flags */
+#define KVM_S390_MEMOP_F_CHECK_ONLY		(1ULL << 0)
+#define KVM_S390_MEMOP_F_INJECT_EXCEPTION	(1ULL << 1)
+
+/* for KVM_INTERRUPT */
+struct kvm_interrupt {
+	/* in */
+	__u32 irq;
+};
+
+/* for KVM_GET_DIRTY_LOG */
+struct kvm_dirty_log {
+	__u32 slot;
+	__u32 padding1;
+	union {
+		void __user *dirty_bitmap; /* one bit per page */
+		__u64 padding2;
+	};
+};
+
+/* for KVM_SET_SIGNAL_MASK */
+struct kvm_signal_mask {
+	__u32 len;
+	__u8  sigset[0];
+};
+
+/* for KVM_TPR_ACCESS_REPORTING */
+struct kvm_tpr_access_ctl {
+	__u32 enabled;
+	__u32 flags;
+	__u32 reserved[8];
+};
+
+/* for KVM_SET_VAPIC_ADDR */
+struct kvm_vapic_addr {
+	__u64 vapic_addr;
+};
+
+/* for KVM_SET_MP_STATE */
+
+/* not all states are valid on all architectures */
+#define KVM_MP_STATE_RUNNABLE          0
+#define KVM_MP_STATE_UNINITIALIZED     1
+#define KVM_MP_STATE_INIT_RECEIVED     2
+#define KVM_MP_STATE_HALTED            3
+#define KVM_MP_STATE_SIPI_RECEIVED     4
+#define KVM_MP_STATE_STOPPED           5
+#define KVM_MP_STATE_CHECK_STOP        6
+#define KVM_MP_STATE_OPERATING         7
+#define KVM_MP_STATE_LOAD              8
+
+struct kvm_mp_state {
+	__u32 mp_state;
+};
+
+struct kvm_s390_psw {
+	__u64 mask;
+	__u64 addr;
+};
+
+/* valid values for type in kvm_s390_interrupt */
+#define KVM_S390_SIGP_STOP		0xfffe0000u
+#define KVM_S390_PROGRAM_INT		0xfffe0001u
+#define KVM_S390_SIGP_SET_PREFIX	0xfffe0002u
+#define KVM_S390_RESTART		0xfffe0003u
+#define KVM_S390_INT_PFAULT_INIT	0xfffe0004u
+#define KVM_S390_INT_PFAULT_DONE	0xfffe0005u
+#define KVM_S390_MCHK			0xfffe1000u
+#define KVM_S390_INT_CLOCK_COMP		0xffff1004u
+#define KVM_S390_INT_CPU_TIMER		0xffff1005u
+#define KVM_S390_INT_VIRTIO		0xffff2603u
+#define KVM_S390_INT_SERVICE		0xffff2401u
+#define KVM_S390_INT_EMERGENCY		0xffff1201u
+#define KVM_S390_INT_EXTERNAL_CALL	0xffff1202u
+/* Anything below 0xfffe0000u is taken by INT_IO */
+#define KVM_S390_INT_IO(ai,cssid,ssid,schid)   \
+	(((schid)) |			       \
+	 ((ssid) << 16) |		       \
+	 ((cssid) << 18) |		       \
+	 ((ai) << 26))
+#define KVM_S390_INT_IO_MIN		0x00000000u
+#define KVM_S390_INT_IO_MAX		0xfffdffffu
+#define KVM_S390_INT_IO_AI_MASK		0x04000000u
+
+
+struct kvm_s390_interrupt {
+	__u32 type;
+	__u32 parm;
+	__u64 parm64;
+};
+
+struct kvm_s390_io_info {
+	__u16 subchannel_id;
+	__u16 subchannel_nr;
+	__u32 io_int_parm;
+	__u32 io_int_word;
+};
+
+struct kvm_s390_ext_info {
+	__u32 ext_params;
+	__u32 pad;
+	__u64 ext_params2;
+};
+
+struct kvm_s390_pgm_info {
+	__u64 trans_exc_code;
+	__u64 mon_code;
+	__u64 per_address;
+	__u32 data_exc_code;
+	__u16 code;
+	__u16 mon_class_nr;
+	__u8 per_code;
+	__u8 per_atmid;
+	__u8 exc_access_id;
+	__u8 per_access_id;
+	__u8 op_access_id;
+#define KVM_S390_PGM_FLAGS_ILC_VALID	0x01
+#define KVM_S390_PGM_FLAGS_ILC_0	0x02
+#define KVM_S390_PGM_FLAGS_ILC_1	0x04
+#define KVM_S390_PGM_FLAGS_ILC_MASK	0x06
+#define KVM_S390_PGM_FLAGS_NO_REWIND	0x08
+	__u8 flags;
+	__u8 pad[2];
+};
+
+struct kvm_s390_prefix_info {
+	__u32 address;
+};
+
+struct kvm_s390_extcall_info {
+	__u16 code;
+};
+
+struct kvm_s390_emerg_info {
+	__u16 code;
+};
+
+#define KVM_S390_STOP_FLAG_STORE_STATUS	0x01
+struct kvm_s390_stop_info {
+	__u32 flags;
+};
+
+struct kvm_s390_mchk_info {
+	__u64 cr14;
+	__u64 mcic;
+	__u64 failing_storage_address;
+	__u32 ext_damage_code;
+	__u32 pad;
+	__u8 fixed_logout[16];
+};
+
+struct kvm_s390_irq {
+	__u64 type;
+	union {
+		struct kvm_s390_io_info io;
+		struct kvm_s390_ext_info ext;
+		struct kvm_s390_pgm_info pgm;
+		struct kvm_s390_emerg_info emerg;
+		struct kvm_s390_extcall_info extcall;
+		struct kvm_s390_prefix_info prefix;
+		struct kvm_s390_stop_info stop;
+		struct kvm_s390_mchk_info mchk;
+		char reserved[64];
+	} u;
+};
+
+struct kvm_s390_irq_state {
+	__u64 buf;
+	__u32 flags;
+	__u32 len;
+	__u32 reserved[4];
+};
+
+/* for KVM_SET_GUEST_DEBUG */
+
+#define KVM_GUESTDBG_ENABLE		0x00000001
+#define KVM_GUESTDBG_SINGLESTEP		0x00000002
+
+struct kvm_guest_debug {
+	__u32 control;
+	__u32 pad;
+	struct kvm_guest_debug_arch arch;
+};
+
+enum {
+	kvm_ioeventfd_flag_nr_datamatch,
+	kvm_ioeventfd_flag_nr_pio,
+	kvm_ioeventfd_flag_nr_deassign,
+	kvm_ioeventfd_flag_nr_virtio_ccw_notify,
+	kvm_ioeventfd_flag_nr_fast_mmio,
+	kvm_ioeventfd_flag_nr_max,
+};
+
+#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
+#define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
+#define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
+#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
+	(1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
+
+#define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1 << kvm_ioeventfd_flag_nr_max) - 1)
+
+struct kvm_ioeventfd {
+	__u64 datamatch;
+	__u64 addr;        /* legal pio/mmio address */
+	__u32 len;         /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
+	__s32 fd;
+	__u32 flags;
+	__u8  pad[36];
+};
+
+/* for KVM_ENABLE_CAP */
+struct kvm_enable_cap {
+	/* in */
+	__u32 cap;
+	__u32 flags;
+	__u64 args[4];
+	__u8  pad[64];
+};
+
+/* for KVM_PPC_GET_PVINFO */
+
+#define KVM_PPC_PVINFO_FLAGS_EV_IDLE   (1<<0)
+
+struct kvm_ppc_pvinfo {
+	/* out */
+	__u32 flags;
+	__u32 hcall[4];
+	__u8  pad[108];
+};
+
+/* for KVM_PPC_GET_SMMU_INFO */
+#define KVM_PPC_PAGE_SIZES_MAX_SZ	8
+
+struct kvm_ppc_one_page_size {
+	__u32 page_shift;	/* Page shift (or 0) */
+	__u32 pte_enc;		/* Encoding in the HPTE (>>12) */
+};
+
+struct kvm_ppc_one_seg_page_size {
+	__u32 page_shift;	/* Base page shift of segment (or 0) */
+	__u32 slb_enc;		/* SLB encoding for BookS */
+	struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ];
+};
+
+#define KVM_PPC_PAGE_SIZES_REAL		0x00000001
+#define KVM_PPC_1T_SEGMENTS		0x00000002
+
+struct kvm_ppc_smmu_info {
+	__u64 flags;
+	__u32 slb_size;
+	__u32 pad;
+	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
+};
+
+/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */
+struct kvm_ppc_resize_hpt {
+	__u64 flags;
+	__u32 shift;
+	__u32 pad;
+};
+
+#define KVMIO 0xAE
+
+/* machine type bits, to be used as argument to KVM_CREATE_VM */
+#define KVM_VM_S390_UCONTROL	1
+
+/* on ppc, 0 indicates default, 1 should force HV and 2 PR */
+#define KVM_VM_PPC_HV 1
+#define KVM_VM_PPC_PR 2
+
+/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */
+#define KVM_VM_MIPS_TE		0
+#define KVM_VM_MIPS_VZ		1
+
+#define KVM_S390_SIE_PAGE_OFFSET 1
+
+/*
+ * ioctls for /dev/kvm fds:
+ */
+#define KVM_GET_API_VERSION       _IO(KVMIO,   0x00)
+#define KVM_CREATE_VM             _IO(KVMIO,   0x01) /* returns a VM fd */
+#define KVM_GET_MSR_INDEX_LIST    _IOWR(KVMIO, 0x02, struct kvm_msr_list)
+
+#define KVM_S390_ENABLE_SIE       _IO(KVMIO,   0x06)
+/*
+ * Check if a kvm extension is available.  Argument is extension number,
+ * return is 1 (yes) or 0 (no, sorry).
+ */
+#define KVM_CHECK_EXTENSION       _IO(KVMIO,   0x03)
+/*
+ * Get size for mmap(vcpu_fd)
+ */
+#define KVM_GET_VCPU_MMAP_SIZE    _IO(KVMIO,   0x04) /* in bytes */
+#define KVM_GET_SUPPORTED_CPUID   _IOWR(KVMIO, 0x05, struct kvm_cpuid2)
+#define KVM_TRACE_ENABLE          __KVM_DEPRECATED_MAIN_W_0x06
+#define KVM_TRACE_PAUSE           __KVM_DEPRECATED_MAIN_0x07
+#define KVM_TRACE_DISABLE         __KVM_DEPRECATED_MAIN_0x08
+#define KVM_GET_EMULATED_CPUID	  _IOWR(KVMIO, 0x09, struct kvm_cpuid2)
+
+/*
+ * Extension capability list.
+ */
+#define KVM_CAP_IRQCHIP	  0
+#define KVM_CAP_HLT	  1
+#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
+#define KVM_CAP_USER_MEMORY 3
+#define KVM_CAP_SET_TSS_ADDR 4
+#define KVM_CAP_VAPIC 6
+#define KVM_CAP_EXT_CPUID 7
+#define KVM_CAP_CLOCKSOURCE 8
+#define KVM_CAP_NR_VCPUS 9       /* returns recommended max vcpus per vm */
+#define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
+#define KVM_CAP_PIT 11
+#define KVM_CAP_NOP_IO_DELAY 12
+#define KVM_CAP_PV_MMU 13
+#define KVM_CAP_MP_STATE 14
+#define KVM_CAP_COALESCED_MMIO 15
+#define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
+#define KVM_CAP_IOMMU 18
+/* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
+#define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
+#define KVM_CAP_USER_NMI 22
+#ifdef __KVM_HAVE_GUEST_DEBUG
+#define KVM_CAP_SET_GUEST_DEBUG 23
+#endif
+#ifdef __KVM_HAVE_PIT
+#define KVM_CAP_REINJECT_CONTROL 24
+#endif
+#define KVM_CAP_IRQ_ROUTING 25
+#define KVM_CAP_IRQ_INJECT_STATUS 26
+#define KVM_CAP_ASSIGN_DEV_IRQ 29
+/* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
+#define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
+#ifdef __KVM_HAVE_MCE
+#define KVM_CAP_MCE 31
+#endif
+#define KVM_CAP_IRQFD 32
+#ifdef __KVM_HAVE_PIT
+#define KVM_CAP_PIT2 33
+#endif
+#define KVM_CAP_SET_BOOT_CPU_ID 34
+#ifdef __KVM_HAVE_PIT_STATE2
+#define KVM_CAP_PIT_STATE2 35
+#endif
+#define KVM_CAP_IOEVENTFD 36
+#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37
+#ifdef __KVM_HAVE_XEN_HVM
+#define KVM_CAP_XEN_HVM 38
+#endif
+#define KVM_CAP_ADJUST_CLOCK 39
+#define KVM_CAP_INTERNAL_ERROR_DATA 40
+#ifdef __KVM_HAVE_VCPU_EVENTS
+#define KVM_CAP_VCPU_EVENTS 41
+#endif
+#define KVM_CAP_S390_PSW 42
+#define KVM_CAP_PPC_SEGSTATE 43
+#define KVM_CAP_HYPERV 44
+#define KVM_CAP_HYPERV_VAPIC 45
+#define KVM_CAP_HYPERV_SPIN 46
+#define KVM_CAP_PCI_SEGMENT 47
+#define KVM_CAP_PPC_PAIRED_SINGLES 48
+#define KVM_CAP_INTR_SHADOW 49
+#ifdef __KVM_HAVE_DEBUGREGS
+#define KVM_CAP_DEBUGREGS 50
+#endif
+#define KVM_CAP_X86_ROBUST_SINGLESTEP 51
+#define KVM_CAP_PPC_OSI 52
+#define KVM_CAP_PPC_UNSET_IRQ 53
+#define KVM_CAP_ENABLE_CAP 54
+#ifdef __KVM_HAVE_XSAVE
+#define KVM_CAP_XSAVE 55
+#endif
+#ifdef __KVM_HAVE_XCRS
+#define KVM_CAP_XCRS 56
+#endif
+#define KVM_CAP_PPC_GET_PVINFO 57
+#define KVM_CAP_PPC_IRQ_LEVEL 58
+#define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_TSC_CONTROL 60
+#define KVM_CAP_GET_TSC_KHZ 61
+#define KVM_CAP_PPC_BOOKE_SREGS 62
+#define KVM_CAP_SPAPR_TCE 63
+#define KVM_CAP_PPC_SMT 64
+#define KVM_CAP_PPC_RMA	65
+#define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
+#define KVM_CAP_PPC_HIOR 67
+#define KVM_CAP_PPC_PAPR 68
+#define KVM_CAP_SW_TLB 69
+#define KVM_CAP_ONE_REG 70
+#define KVM_CAP_S390_GMAP 71
+#define KVM_CAP_TSC_DEADLINE_TIMER 72
+#define KVM_CAP_S390_UCONTROL 73
+#define KVM_CAP_SYNC_REGS 74
+#define KVM_CAP_PCI_2_3 75
+#define KVM_CAP_KVMCLOCK_CTRL 76
+#define KVM_CAP_SIGNAL_MSI 77
+#define KVM_CAP_PPC_GET_SMMU_INFO 78
+#define KVM_CAP_S390_COW 79
+#define KVM_CAP_PPC_ALLOC_HTAB 80
+#define KVM_CAP_READONLY_MEM 81
+#define KVM_CAP_IRQFD_RESAMPLE 82
+#define KVM_CAP_PPC_BOOKE_WATCHDOG 83
+#define KVM_CAP_PPC_HTAB_FD 84
+#define KVM_CAP_S390_CSS_SUPPORT 85
+#define KVM_CAP_PPC_EPR 86
+#define KVM_CAP_ARM_PSCI 87
+#define KVM_CAP_ARM_SET_DEVICE_ADDR 88
+#define KVM_CAP_DEVICE_CTRL 89
+#define KVM_CAP_IRQ_MPIC 90
+#define KVM_CAP_PPC_RTAS 91
+#define KVM_CAP_IRQ_XICS 92
+#define KVM_CAP_ARM_EL1_32BIT 93
+#define KVM_CAP_SPAPR_MULTITCE 94
+#define KVM_CAP_EXT_EMUL_CPUID 95
+#define KVM_CAP_HYPERV_TIME 96
+#define KVM_CAP_IOAPIC_POLARITY_IGNORED 97
+#define KVM_CAP_ENABLE_CAP_VM 98
+#define KVM_CAP_S390_IRQCHIP 99
+#define KVM_CAP_IOEVENTFD_NO_LENGTH 100
+#define KVM_CAP_VM_ATTRIBUTES 101
+#define KVM_CAP_ARM_PSCI_0_2 102
+#define KVM_CAP_PPC_FIXUP_HCALL 103
+#define KVM_CAP_PPC_ENABLE_HCALL 104
+#define KVM_CAP_CHECK_EXTENSION_VM 105
+#define KVM_CAP_S390_USER_SIGP 106
+#define KVM_CAP_S390_VECTOR_REGISTERS 107
+#define KVM_CAP_S390_MEM_OP 108
+#define KVM_CAP_S390_USER_STSI 109
+#define KVM_CAP_S390_SKEYS 110
+#define KVM_CAP_MIPS_FPU 111
+#define KVM_CAP_MIPS_MSA 112
+#define KVM_CAP_S390_INJECT_IRQ 113
+#define KVM_CAP_S390_IRQ_STATE 114
+#define KVM_CAP_PPC_HWRNG 115
+#define KVM_CAP_DISABLE_QUIRKS 116
+#define KVM_CAP_X86_SMM 117
+#define KVM_CAP_MULTI_ADDRESS_SPACE 118
+#define KVM_CAP_GUEST_DEBUG_HW_BPS 119
+#define KVM_CAP_GUEST_DEBUG_HW_WPS 120
+#define KVM_CAP_SPLIT_IRQCHIP 121
+#define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
+#define KVM_CAP_HYPERV_SYNIC 123
+#define KVM_CAP_S390_RI 124
+#define KVM_CAP_SPAPR_TCE_64 125
+#define KVM_CAP_ARM_PMU_V3 126
+#define KVM_CAP_VCPU_ATTRIBUTES 127
+#define KVM_CAP_MAX_VCPU_ID 128
+#define KVM_CAP_X2APIC_API 129
+#define KVM_CAP_S390_USER_INSTR0 130
+#define KVM_CAP_MSI_DEVID 131
+#define KVM_CAP_PPC_HTM 132
+#define KVM_CAP_SPAPR_RESIZE_HPT 133
+#define KVM_CAP_PPC_MMU_RADIX 134
+#define KVM_CAP_PPC_MMU_HASH_V3 135
+#define KVM_CAP_IMMEDIATE_EXIT 136
+#define KVM_CAP_MIPS_VZ 137
+#define KVM_CAP_MIPS_TE 138
+#define KVM_CAP_MIPS_64BIT 139
+#define KVM_CAP_S390_GS 140
+#define KVM_CAP_S390_AIS 141
+#define KVM_CAP_SPAPR_TCE_VFIO 142
+#define KVM_CAP_X86_GUEST_MWAIT 143
+#define KVM_CAP_ARM_USER_IRQ 144
+#define KVM_CAP_S390_CMMA_MIGRATION 145
+#define KVM_CAP_PPC_FWNMI 146
+#define KVM_CAP_PPC_SMT_POSSIBLE 147
+#define KVM_CAP_HYPERV_SYNIC2 148
+#define KVM_CAP_HYPERV_VP_INDEX 149
+
+#ifdef KVM_CAP_IRQ_ROUTING
+
+struct kvm_irq_routing_irqchip {
+	__u32 irqchip;
+	__u32 pin;
+};
+
+struct kvm_irq_routing_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	union {
+		__u32 pad;
+		__u32 devid;
+	};
+};
+
+struct kvm_irq_routing_s390_adapter {
+	__u64 ind_addr;
+	__u64 summary_addr;
+	__u64 ind_offset;
+	__u32 summary_offset;
+	__u32 adapter_id;
+};
+
+struct kvm_irq_routing_hv_sint {
+	__u32 vcpu;
+	__u32 sint;
+};
+
+/* gsi routing entry types */
+#define KVM_IRQ_ROUTING_IRQCHIP 1
+#define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_S390_ADAPTER 3
+#define KVM_IRQ_ROUTING_HV_SINT 4
+
+struct kvm_irq_routing_entry {
+	__u32 gsi;
+	__u32 type;	/* gsi routing entry type: KVM_IRQ_ROUTING_* */
+	__u32 flags;
+	__u32 pad;
+	union {
+		struct kvm_irq_routing_irqchip irqchip;
+		struct kvm_irq_routing_msi msi;
+		struct kvm_irq_routing_s390_adapter adapter;
+		struct kvm_irq_routing_hv_sint hv_sint;
+		__u32 pad[8];
+	} u;
+};
+
+struct kvm_irq_routing {
+	__u32 nr;
+	__u32 flags;
+	struct kvm_irq_routing_entry entries[0];
+};
+
+#endif
+
+#ifdef KVM_CAP_MCE
+/* x86 MCE */
+struct kvm_x86_mce {
+	__u64 status;
+	__u64 addr;
+	__u64 misc;
+	__u64 mcg_status;
+	__u8 bank;
+	__u8 pad1[7];
+	__u64 pad2[3];
+};
+#endif
+
+#ifdef KVM_CAP_XEN_HVM
+struct kvm_xen_hvm_config {
+	__u32 flags;
+	__u32 msr;
+	__u64 blob_addr_32;
+	__u64 blob_addr_64;
+	__u8 blob_size_32;
+	__u8 blob_size_64;
+	__u8 pad2[30];
+};
+#endif
+
+#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
+/*
+ * Available with KVM_CAP_IRQFD_RESAMPLE
+ *
+ * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies
+ * the irqfd to operate in resampling mode for level triggered interrupt
+ * emulation.  See Documentation/virtual/kvm/api.txt.
+ */
+#define KVM_IRQFD_FLAG_RESAMPLE (1 << 1)
+
+struct kvm_irqfd {
+	__u32 fd;
+	__u32 gsi;
+	__u32 flags;		/* KVM_IRQFD_FLAG_* */
+	__u32 resamplefd;	/* valid when KVM_IRQFD_FLAG_RESAMPLE is set */
+	__u8  pad[16];
+};
+
+/* For KVM_CAP_ADJUST_CLOCK */
+
+/* Do not use 1, KVM_CHECK_EXTENSION returned it before we had flags.  */
+#define KVM_CLOCK_TSC_STABLE		2
+
+struct kvm_clock_data {
+	__u64 clock;
+	__u32 flags;
+	__u32 pad[9];
+};
+
+/* For KVM_CAP_SW_TLB */
+
+#define KVM_MMU_FSL_BOOKE_NOHV		0
+#define KVM_MMU_FSL_BOOKE_HV		1
+
+struct kvm_config_tlb {
+	__u64 params;
+	__u64 array;
+	__u32 mmu_type;
+	__u32 array_len;
+};
+
+struct kvm_dirty_tlb {
+	__u64 bitmap;
+	__u32 num_dirty;
+};
+
+/* Available with KVM_CAP_ONE_REG */
+
+#define KVM_REG_ARCH_MASK	0xff00000000000000ULL
+#define KVM_REG_GENERIC		0x0000000000000000ULL
+
+/*
+ * Architecture specific registers are to be defined in arch headers and
+ * ORed with the arch identifier.
+ */
+#define KVM_REG_PPC		0x1000000000000000ULL
+#define KVM_REG_X86		0x2000000000000000ULL
+#define KVM_REG_IA64		0x3000000000000000ULL
+#define KVM_REG_ARM		0x4000000000000000ULL
+#define KVM_REG_S390		0x5000000000000000ULL
+#define KVM_REG_ARM64		0x6000000000000000ULL
+#define KVM_REG_MIPS		0x7000000000000000ULL
+
+#define KVM_REG_SIZE_SHIFT	52
+#define KVM_REG_SIZE_MASK	0x00f0000000000000ULL
+#define KVM_REG_SIZE_U8		0x0000000000000000ULL
+#define KVM_REG_SIZE_U16	0x0010000000000000ULL
+#define KVM_REG_SIZE_U32	0x0020000000000000ULL
+#define KVM_REG_SIZE_U64	0x0030000000000000ULL
+#define KVM_REG_SIZE_U128	0x0040000000000000ULL
+#define KVM_REG_SIZE_U256	0x0050000000000000ULL
+#define KVM_REG_SIZE_U512	0x0060000000000000ULL
+#define KVM_REG_SIZE_U1024	0x0070000000000000ULL
+
+struct kvm_reg_list {
+	__u64 n; /* number of regs */
+	__u64 reg[0];
+};
+
+struct kvm_one_reg {
+	__u64 id;
+	__u64 addr;
+};
+
+#define KVM_MSI_VALID_DEVID	(1U << 0)
+struct kvm_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	__u32 flags;
+	__u32 devid;
+	__u8  pad[12];
+};
+
+struct kvm_arm_device_addr {
+	__u64 id;
+	__u64 addr;
+};
+
+/*
+ * Device control API, available with KVM_CAP_DEVICE_CTRL
+ */
+#define KVM_CREATE_DEVICE_TEST		1
+
+struct kvm_create_device {
+	__u32	type;	/* in: KVM_DEV_TYPE_xxx */
+	__u32	fd;	/* out: device handle */
+	__u32	flags;	/* in: KVM_CREATE_DEVICE_xxx */
+};
+
+struct kvm_device_attr {
+	__u32	flags;		/* no flags currently defined */
+	__u32	group;		/* device-defined */
+	__u64	attr;		/* group-defined */
+	__u64	addr;		/* userspace address of attr data */
+};
+
+#define  KVM_DEV_VFIO_GROUP			1
+#define   KVM_DEV_VFIO_GROUP_ADD			1
+#define   KVM_DEV_VFIO_GROUP_DEL			2
+#define   KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE		3
+
+enum kvm_device_type {
+	KVM_DEV_TYPE_FSL_MPIC_20	= 1,
+#define KVM_DEV_TYPE_FSL_MPIC_20	KVM_DEV_TYPE_FSL_MPIC_20
+	KVM_DEV_TYPE_FSL_MPIC_42,
+#define KVM_DEV_TYPE_FSL_MPIC_42	KVM_DEV_TYPE_FSL_MPIC_42
+	KVM_DEV_TYPE_XICS,
+#define KVM_DEV_TYPE_XICS		KVM_DEV_TYPE_XICS
+	KVM_DEV_TYPE_VFIO,
+#define KVM_DEV_TYPE_VFIO		KVM_DEV_TYPE_VFIO
+	KVM_DEV_TYPE_ARM_VGIC_V2,
+#define KVM_DEV_TYPE_ARM_VGIC_V2	KVM_DEV_TYPE_ARM_VGIC_V2
+	KVM_DEV_TYPE_FLIC,
+#define KVM_DEV_TYPE_FLIC		KVM_DEV_TYPE_FLIC
+	KVM_DEV_TYPE_ARM_VGIC_V3,
+#define KVM_DEV_TYPE_ARM_VGIC_V3	KVM_DEV_TYPE_ARM_VGIC_V3
+	KVM_DEV_TYPE_ARM_VGIC_ITS,
+#define KVM_DEV_TYPE_ARM_VGIC_ITS	KVM_DEV_TYPE_ARM_VGIC_ITS
+	KVM_DEV_TYPE_MAX,
+};
+
+struct kvm_vfio_spapr_tce {
+	__s32	groupfd;
+	__s32	tablefd;
+};
+
+/*
+ * ioctls for VM fds
+ */
+#define KVM_SET_MEMORY_REGION     _IOW(KVMIO,  0x40, struct kvm_memory_region)
+/*
+ * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
+ * a vcpu fd.
+ */
+#define KVM_CREATE_VCPU           _IO(KVMIO,   0x41)
+#define KVM_GET_DIRTY_LOG         _IOW(KVMIO,  0x42, struct kvm_dirty_log)
+/* KVM_SET_MEMORY_ALIAS is obsolete: */
+#define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO,  0x43, struct kvm_memory_alias)
+#define KVM_SET_NR_MMU_PAGES      _IO(KVMIO,   0x44)
+#define KVM_GET_NR_MMU_PAGES      _IO(KVMIO,   0x45)
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \
+					struct kvm_userspace_memory_region)
+#define KVM_SET_TSS_ADDR          _IO(KVMIO,   0x47)
+#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
+
+/* enable ucontrol for s390 */
+struct kvm_s390_ucas_mapping {
+	__u64 user_addr;
+	__u64 vcpu_addr;
+	__u64 length;
+};
+#define KVM_S390_UCAS_MAP        _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping)
+#define KVM_S390_UCAS_UNMAP      _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping)
+#define KVM_S390_VCPU_FAULT	 _IOW(KVMIO, 0x52, unsigned long)
+
+/* Device model IOC */
+#define KVM_CREATE_IRQCHIP        _IO(KVMIO,   0x60)
+#define KVM_IRQ_LINE              _IOW(KVMIO,  0x61, struct kvm_irq_level)
+#define KVM_GET_IRQCHIP           _IOWR(KVMIO, 0x62, struct kvm_irqchip)
+#define KVM_SET_IRQCHIP           _IOR(KVMIO,  0x63, struct kvm_irqchip)
+#define KVM_CREATE_PIT            _IO(KVMIO,   0x64)
+#define KVM_GET_PIT               _IOWR(KVMIO, 0x65, struct kvm_pit_state)
+#define KVM_SET_PIT               _IOR(KVMIO,  0x66, struct kvm_pit_state)
+#define KVM_IRQ_LINE_STATUS       _IOWR(KVMIO, 0x67, struct kvm_irq_level)
+#define KVM_REGISTER_COALESCED_MMIO \
+			_IOW(KVMIO,  0x67, struct kvm_coalesced_mmio_zone)
+#define KVM_UNREGISTER_COALESCED_MMIO \
+			_IOW(KVMIO,  0x68, struct kvm_coalesced_mmio_zone)
+#define KVM_ASSIGN_PCI_DEVICE     _IOR(KVMIO,  0x69, \
+				       struct kvm_assigned_pci_dev)
+#define KVM_SET_GSI_ROUTING       _IOW(KVMIO,  0x6a, struct kvm_irq_routing)
+/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */
+#define KVM_ASSIGN_IRQ            __KVM_DEPRECATED_VM_R_0x70
+#define KVM_ASSIGN_DEV_IRQ        _IOW(KVMIO,  0x70, struct kvm_assigned_irq)
+#define KVM_REINJECT_CONTROL      _IO(KVMIO,   0x71)
+#define KVM_DEASSIGN_PCI_DEVICE   _IOW(KVMIO,  0x72, \
+				       struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_SET_MSIX_NR    _IOW(KVMIO,  0x73, \
+				       struct kvm_assigned_msix_nr)
+#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO,  0x74, \
+				       struct kvm_assigned_msix_entry)
+#define KVM_DEASSIGN_DEV_IRQ      _IOW(KVMIO,  0x75, struct kvm_assigned_irq)
+#define KVM_IRQFD                 _IOW(KVMIO,  0x76, struct kvm_irqfd)
+#define KVM_CREATE_PIT2		  _IOW(KVMIO,  0x77, struct kvm_pit_config)
+#define KVM_SET_BOOT_CPU_ID       _IO(KVMIO,   0x78)
+#define KVM_IOEVENTFD             _IOW(KVMIO,  0x79, struct kvm_ioeventfd)
+#define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
+#define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
+#define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_PIT_STATE2 */
+#define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
+#define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
+/* Available with KVM_CAP_PPC_GET_PVINFO */
+#define KVM_PPC_GET_PVINFO	  _IOW(KVMIO,  0xa1, struct kvm_ppc_pvinfo)
+/* Available with KVM_CAP_TSC_CONTROL */
+#define KVM_SET_TSC_KHZ           _IO(KVMIO,  0xa2)
+#define KVM_GET_TSC_KHZ           _IO(KVMIO,  0xa3)
+/* Available with KVM_CAP_PCI_2_3 */
+#define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO,  0xa4, \
+				       struct kvm_assigned_pci_dev)
+/* Available with KVM_CAP_SIGNAL_MSI */
+#define KVM_SIGNAL_MSI            _IOW(KVMIO,  0xa5, struct kvm_msi)
+/* Available with KVM_CAP_PPC_GET_SMMU_INFO */
+#define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
+/* Available with KVM_CAP_PPC_ALLOC_HTAB */
+#define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
+#define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
+#define KVM_CREATE_SPAPR_TCE_64	  _IOW(KVMIO,  0xa8, \
+				       struct kvm_create_spapr_tce_64)
+/* Available with KVM_CAP_RMA */
+#define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
+/* Available with KVM_CAP_PPC_HTAB_FD */
+#define KVM_PPC_GET_HTAB_FD	  _IOW(KVMIO,  0xaa, struct kvm_get_htab_fd)
+/* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */
+#define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
+/* Available with KVM_CAP_PPC_RTAS */
+#define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xac, struct kvm_rtas_token_args)
+/* Available with KVM_CAP_SPAPR_RESIZE_HPT */
+#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt)
+#define KVM_PPC_RESIZE_HPT_COMMIT  _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt)
+/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */
+#define KVM_PPC_CONFIGURE_V3_MMU  _IOW(KVMIO,  0xaf, struct kvm_ppc_mmuv3_cfg)
+/* Available with KVM_CAP_PPC_RADIX_MMU */
+#define KVM_PPC_GET_RMMU_INFO	  _IOW(KVMIO,  0xb0, struct kvm_ppc_rmmu_info)
+
+/* ioctl for vm fd */
+#define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
+
+/* ioctls for fds returned by KVM_CREATE_DEVICE */
+#define KVM_SET_DEVICE_ATTR	  _IOW(KVMIO,  0xe1, struct kvm_device_attr)
+#define KVM_GET_DEVICE_ATTR	  _IOW(KVMIO,  0xe2, struct kvm_device_attr)
+#define KVM_HAS_DEVICE_ATTR	  _IOW(KVMIO,  0xe3, struct kvm_device_attr)
+
+/*
+ * ioctls for vcpu fds
+ */
+#define KVM_RUN                   _IO(KVMIO,   0x80)
+#define KVM_GET_REGS              _IOR(KVMIO,  0x81, struct kvm_regs)
+#define KVM_SET_REGS              _IOW(KVMIO,  0x82, struct kvm_regs)
+#define KVM_GET_SREGS             _IOR(KVMIO,  0x83, struct kvm_sregs)
+#define KVM_SET_SREGS             _IOW(KVMIO,  0x84, struct kvm_sregs)
+#define KVM_TRANSLATE             _IOWR(KVMIO, 0x85, struct kvm_translation)
+#define KVM_INTERRUPT             _IOW(KVMIO,  0x86, struct kvm_interrupt)
+/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */
+#define KVM_DEBUG_GUEST           __KVM_DEPRECATED_VCPU_W_0x87
+#define KVM_GET_MSRS              _IOWR(KVMIO, 0x88, struct kvm_msrs)
+#define KVM_SET_MSRS              _IOW(KVMIO,  0x89, struct kvm_msrs)
+#define KVM_SET_CPUID             _IOW(KVMIO,  0x8a, struct kvm_cpuid)
+#define KVM_SET_SIGNAL_MASK       _IOW(KVMIO,  0x8b, struct kvm_signal_mask)
+#define KVM_GET_FPU               _IOR(KVMIO,  0x8c, struct kvm_fpu)
+#define KVM_SET_FPU               _IOW(KVMIO,  0x8d, struct kvm_fpu)
+#define KVM_GET_LAPIC             _IOR(KVMIO,  0x8e, struct kvm_lapic_state)
+#define KVM_SET_LAPIC             _IOW(KVMIO,  0x8f, struct kvm_lapic_state)
+#define KVM_SET_CPUID2            _IOW(KVMIO,  0x90, struct kvm_cpuid2)
+#define KVM_GET_CPUID2            _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_TPR_ACCESS_REPORTING  _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
+/* Available with KVM_CAP_VAPIC */
+#define KVM_SET_VAPIC_ADDR        _IOW(KVMIO,  0x93, struct kvm_vapic_addr)
+/* valid for virtual machine (for floating interrupt) _and_ vcpu */
+#define KVM_S390_INTERRUPT        _IOW(KVMIO,  0x94, struct kvm_s390_interrupt)
+/* store status for s390 */
+#define KVM_S390_STORE_STATUS_NOADDR    (-1ul)
+#define KVM_S390_STORE_STATUS_PREFIXED  (-2ul)
+#define KVM_S390_STORE_STATUS	  _IOW(KVMIO,  0x95, unsigned long)
+/* initial ipl psw for s390 */
+#define KVM_S390_SET_INITIAL_PSW  _IOW(KVMIO,  0x96, struct kvm_s390_psw)
+/* initial reset for s390 */
+#define KVM_S390_INITIAL_RESET    _IO(KVMIO,   0x97)
+#define KVM_GET_MP_STATE          _IOR(KVMIO,  0x98, struct kvm_mp_state)
+#define KVM_SET_MP_STATE          _IOW(KVMIO,  0x99, struct kvm_mp_state)
+/* Available with KVM_CAP_USER_NMI */
+#define KVM_NMI                   _IO(KVMIO,   0x9a)
+/* Available with KVM_CAP_SET_GUEST_DEBUG */
+#define KVM_SET_GUEST_DEBUG       _IOW(KVMIO,  0x9b, struct kvm_guest_debug)
+/* MCE for x86 */
+#define KVM_X86_SETUP_MCE         _IOW(KVMIO,  0x9c, __u64)
+#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO,  0x9d, __u64)
+#define KVM_X86_SET_MCE           _IOW(KVMIO,  0x9e, struct kvm_x86_mce)
+/* Available with KVM_CAP_VCPU_EVENTS */
+#define KVM_GET_VCPU_EVENTS       _IOR(KVMIO,  0x9f, struct kvm_vcpu_events)
+#define KVM_SET_VCPU_EVENTS       _IOW(KVMIO,  0xa0, struct kvm_vcpu_events)
+/* Available with KVM_CAP_DEBUGREGS */
+#define KVM_GET_DEBUGREGS         _IOR(KVMIO,  0xa1, struct kvm_debugregs)
+#define KVM_SET_DEBUGREGS         _IOW(KVMIO,  0xa2, struct kvm_debugregs)
+/*
+ * vcpu version available with KVM_CAP_ENABLE_CAP
+ * vm version available with KVM_CAP_ENABLE_CAP_VM
+ */
+#define KVM_ENABLE_CAP            _IOW(KVMIO,  0xa3, struct kvm_enable_cap)
+/* Available with KVM_CAP_XSAVE */
+#define KVM_GET_XSAVE		  _IOR(KVMIO,  0xa4, struct kvm_xsave)
+#define KVM_SET_XSAVE		  _IOW(KVMIO,  0xa5, struct kvm_xsave)
+/* Available with KVM_CAP_XCRS */
+#define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
+#define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
+/* Available with KVM_CAP_SW_TLB */
+#define KVM_DIRTY_TLB		  _IOW(KVMIO,  0xaa, struct kvm_dirty_tlb)
+/* Available with KVM_CAP_ONE_REG */
+#define KVM_GET_ONE_REG		  _IOW(KVMIO,  0xab, struct kvm_one_reg)
+#define KVM_SET_ONE_REG		  _IOW(KVMIO,  0xac, struct kvm_one_reg)
+/* VM is being stopped by host */
+#define KVM_KVMCLOCK_CTRL	  _IO(KVMIO,   0xad)
+#define KVM_ARM_VCPU_INIT	  _IOW(KVMIO,  0xae, struct kvm_vcpu_init)
+#define KVM_ARM_PREFERRED_TARGET  _IOR(KVMIO,  0xaf, struct kvm_vcpu_init)
+#define KVM_GET_REG_LIST	  _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
+/* Available with KVM_CAP_S390_MEM_OP */
+#define KVM_S390_MEM_OP		  _IOW(KVMIO,  0xb1, struct kvm_s390_mem_op)
+/* Available with KVM_CAP_S390_SKEYS */
+#define KVM_S390_GET_SKEYS      _IOW(KVMIO, 0xb2, struct kvm_s390_skeys)
+#define KVM_S390_SET_SKEYS      _IOW(KVMIO, 0xb3, struct kvm_s390_skeys)
+/* Available with KVM_CAP_S390_INJECT_IRQ */
+#define KVM_S390_IRQ              _IOW(KVMIO,  0xb4, struct kvm_s390_irq)
+/* Available with KVM_CAP_S390_IRQ_STATE */
+#define KVM_S390_SET_IRQ_STATE	  _IOW(KVMIO, 0xb5, struct kvm_s390_irq_state)
+#define KVM_S390_GET_IRQ_STATE	  _IOW(KVMIO, 0xb6, struct kvm_s390_irq_state)
+/* Available with KVM_CAP_X86_SMM */
+#define KVM_SMI                   _IO(KVMIO,   0xb7)
+/* Available with KVM_CAP_S390_CMMA_MIGRATION */
+#define KVM_S390_GET_CMMA_BITS      _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log)
+#define KVM_S390_SET_CMMA_BITS      _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log)
+
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
+#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
+
+struct kvm_assigned_pci_dev {
+	__u32 assigned_dev_id;
+	__u32 busnr;
+	__u32 devfn;
+	__u32 flags;
+	__u32 segnr;
+	union {
+		__u32 reserved[11];
+	};
+};
+
+#define KVM_DEV_IRQ_HOST_INTX    (1 << 0)
+#define KVM_DEV_IRQ_HOST_MSI     (1 << 1)
+#define KVM_DEV_IRQ_HOST_MSIX    (1 << 2)
+
+#define KVM_DEV_IRQ_GUEST_INTX   (1 << 8)
+#define KVM_DEV_IRQ_GUEST_MSI    (1 << 9)
+#define KVM_DEV_IRQ_GUEST_MSIX   (1 << 10)
+
+#define KVM_DEV_IRQ_HOST_MASK	 0x00ff
+#define KVM_DEV_IRQ_GUEST_MASK   0xff00
+
+struct kvm_assigned_irq {
+	__u32 assigned_dev_id;
+	__u32 host_irq; /* ignored (legacy field) */
+	__u32 guest_irq;
+	__u32 flags;
+	union {
+		__u32 reserved[12];
+	};
+};
+
+struct kvm_assigned_msix_nr {
+	__u32 assigned_dev_id;
+	__u16 entry_nr;
+	__u16 padding;
+};
+
+#define KVM_MAX_MSIX_PER_DEV		256
+struct kvm_assigned_msix_entry {
+	__u32 assigned_dev_id;
+	__u32 gsi;
+	__u16 entry; /* The index of entry in the MSI-X table */
+	__u16 padding[3];
+};
+
+#define KVM_X2APIC_API_USE_32BIT_IDS            (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK  (1ULL << 1)
+
+/* Available with KVM_CAP_ARM_USER_IRQ */
+
+/* Bits for run->s.regs.device_irq_level */
+#define KVM_ARM_DEV_EL1_VTIMER		(1 << 0)
+#define KVM_ARM_DEV_EL1_PTIMER		(1 << 1)
+#define KVM_ARM_DEV_PMU			(1 << 2)
+
+#endif /* __LINUX_KVM_H */
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index b1c0b187acfe..2a37ae925d85 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift {
 	PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT	= 14, /* no flags */
 	PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT	= 15, /* no cycles */
 
+	PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT	= 16, /* save branch type */
+
 	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
 };
 
@@ -198,9 +200,30 @@ enum perf_branch_sample_type {
 	PERF_SAMPLE_BRANCH_NO_FLAGS	= 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT,
 	PERF_SAMPLE_BRANCH_NO_CYCLES	= 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT,
 
+	PERF_SAMPLE_BRANCH_TYPE_SAVE	=
+		1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT,
+
 	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
+/*
+ * Common flow change classification
+ */
+enum {
+	PERF_BR_UNKNOWN		= 0,	/* unknown */
+	PERF_BR_COND		= 1,	/* conditional */
+	PERF_BR_UNCOND		= 2,	/* unconditional  */
+	PERF_BR_IND		= 3,	/* indirect */
+	PERF_BR_CALL		= 4,	/* function call */
+	PERF_BR_IND_CALL	= 5,	/* indirect function call */
+	PERF_BR_RET		= 6,	/* function return */
+	PERF_BR_SYSCALL		= 7,	/* syscall */
+	PERF_BR_SYSRET		= 8,	/* syscall return */
+	PERF_BR_COND_CALL	= 9,	/* conditional function call */
+	PERF_BR_COND_RET	= 10,	/* conditional function return */
+	PERF_BR_MAX,
+};
+
 #define PERF_SAMPLE_BRANCH_PLM_ALL \
 	(PERF_SAMPLE_BRANCH_USER|\
 	 PERF_SAMPLE_BRANCH_KERNEL|\
@@ -931,14 +954,20 @@ union perf_mem_data_src {
 			mem_snoop:5,	/* snoop mode */
 			mem_lock:2,	/* lock instr */
 			mem_dtlb:7,	/* tlb access */
-			mem_rsvd:31;
+			mem_lvl_num:4,	/* memory hierarchy level number */
+			mem_remote:1,   /* remote */
+			mem_snoopx:2,	/* snoop mode, ext */
+			mem_rsvd:24;
 	};
 };
 #elif defined(__BIG_ENDIAN_BITFIELD)
 union perf_mem_data_src {
 	__u64 val;
 	struct {
-		__u64	mem_rsvd:31,
+		__u64	mem_rsvd:24,
+			mem_snoopx:2,	/* snoop mode, ext */
+			mem_remote:1,   /* remote */
+			mem_lvl_num:4,	/* memory hierarchy level number */
 			mem_dtlb:7,	/* tlb access */
 			mem_lock:2,	/* lock instr */
 			mem_snoop:5,	/* snoop mode */
@@ -975,6 +1004,22 @@ union perf_mem_data_src {
 #define PERF_MEM_LVL_UNC	0x2000 /* Uncached memory */
 #define PERF_MEM_LVL_SHIFT	5
 
+#define PERF_MEM_REMOTE_REMOTE	0x01  /* Remote */
+#define PERF_MEM_REMOTE_SHIFT	37
+
+#define PERF_MEM_LVLNUM_L1	0x01 /* L1 */
+#define PERF_MEM_LVLNUM_L2	0x02 /* L2 */
+#define PERF_MEM_LVLNUM_L3	0x03 /* L3 */
+#define PERF_MEM_LVLNUM_L4	0x04 /* L4 */
+/* 5-0xa available */
+#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */
+#define PERF_MEM_LVLNUM_LFB	0x0c /* LFB */
+#define PERF_MEM_LVLNUM_RAM	0x0d /* RAM */
+#define PERF_MEM_LVLNUM_PMEM	0x0e /* PMEM */
+#define PERF_MEM_LVLNUM_NA	0x0f /* N/A */
+
+#define PERF_MEM_LVLNUM_SHIFT	33
+
 /* snoop mode */
 #define PERF_MEM_SNOOP_NA	0x01 /* not available */
 #define PERF_MEM_SNOOP_NONE	0x02 /* no snoop */
@@ -983,6 +1028,10 @@ union perf_mem_data_src {
 #define PERF_MEM_SNOOP_HITM	0x10 /* snoop hit modified */
 #define PERF_MEM_SNOOP_SHIFT	19
 
+#define PERF_MEM_SNOOPX_FWD	0x01 /* forward */
+/* 1 free */
+#define PERF_MEM_SNOOPX_SHIFT	38 /* mem_snoopx follows mem_remote (bit 37) */
+
 /* locked instruction */
 #define PERF_MEM_LOCK_NA	0x01 /* not available */
 #define PERF_MEM_LOCK_LOCKED	0x02 /* locked transaction */
@@ -1015,6 +1064,7 @@ union perf_mem_data_src {
  *     in_tx: running in a hardware transaction
  *     abort: aborting a hardware transaction
  *    cycles: cycles from last branch (or 0 if not supported)
+ *      type: branch type
  */
 struct perf_branch_entry {
 	__u64	from;
@@ -1024,7 +1074,8 @@ struct perf_branch_entry {
 		in_tx:1,    /* in transaction */
 		abort:1,    /* transaction abort */
 		cycles:16,  /* cycle count to last branch */
-		reserved:44;
+		type:4,     /* branch type */
+		reserved:40;
 };
 
 #endif /* _UAPI_LINUX_PERF_EVENT_H */
diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h
new file mode 100644
index 000000000000..e2a6c7b3510b
--- /dev/null
+++ b/tools/include/uapi/linux/sched.h
@@ -0,0 +1,52 @@
+#ifndef _UAPI_LINUX_SCHED_H
+#define _UAPI_LINUX_SCHED_H
+
+/*
+ * cloning flags:
+ */
+#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
+#define CLONE_VM	0x00000100	/* set if VM shared between processes */
+#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
+#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
+#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
+#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
+#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
+#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
+#define CLONE_THREAD	0x00010000	/* Same thread group? */
+#define CLONE_NEWNS	0x00020000	/* New mount namespace group */
+#define CLONE_SYSVSEM	0x00040000	/* share system V SEM_UNDO semantics */
+#define CLONE_SETTLS	0x00080000	/* create a new TLS for the child */
+#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */
+#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */
+#define CLONE_DETACHED		0x00400000	/* Unused, ignored */
+#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
+#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
+#define CLONE_NEWCGROUP		0x02000000	/* New cgroup namespace */
+#define CLONE_NEWUTS		0x04000000	/* New utsname namespace */
+#define CLONE_NEWIPC		0x08000000	/* New ipc namespace */
+#define CLONE_NEWUSER		0x10000000	/* New user namespace */
+#define CLONE_NEWPID		0x20000000	/* New pid namespace */
+#define CLONE_NEWNET		0x40000000	/* New network namespace */
+#define CLONE_IO		0x80000000	/* Clone io context */
+
+/*
+ * Scheduling policies
+ */
+#define SCHED_NORMAL		0
+#define SCHED_FIFO		1
+#define SCHED_RR		2
+#define SCHED_BATCH		3
+/* SCHED_ISO: reserved but not implemented yet */
+#define SCHED_IDLE		5
+#define SCHED_DEADLINE		6
+
+/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
+#define SCHED_RESET_ON_FORK     0x40000000
+
+/*
+ * For the sched_{set,get}attr() calls
+ */
+#define SCHED_FLAG_RESET_ON_FORK	0x01
+#define SCHED_FLAG_RECLAIM		0x02
+
+#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/tools/include/uapi/linux/vhost.h b/tools/include/uapi/linux/vhost.h
new file mode 100644
index 000000000000..60180c0b5dc6
--- /dev/null
+++ b/tools/include/uapi/linux/vhost.h
@@ -0,0 +1,209 @@
+#ifndef _LINUX_VHOST_H
+#define _LINUX_VHOST_H
+/* Userspace interface for in-kernel virtio accelerators. */
+
+/* vhost is used to reduce the number of system calls involved in virtio.
+ *
+ * Existing virtio net code is used in the guest without modification.
+ *
+ * This header includes interface used by userspace hypervisor for
+ * device configuration.
+ */
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/ioctl.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+
+struct vhost_vring_state {
+	unsigned int index;
+	unsigned int num;
+};
+
+struct vhost_vring_file {
+	unsigned int index;
+	int fd; /* Pass -1 to unbind from file. */
+
+};
+
+struct vhost_vring_addr {
+	unsigned int index;
+	/* Option flags. */
+	unsigned int flags;
+	/* Flag values: */
+	/* Whether log address is valid. If set enables logging. */
+#define VHOST_VRING_F_LOG 0
+
+	/* Start of array of descriptors (virtually contiguous) */
+	__u64 desc_user_addr;
+	/* Used structure address. Must be 32 bit aligned */
+	__u64 used_user_addr;
+	/* Available structure address. Must be 16 bit aligned */
+	__u64 avail_user_addr;
+	/* Logging support. */
+	/* Log writes to used structure, at offset calculated from specified
+	 * address. Address must be 32 bit aligned. */
+	__u64 log_guest_addr;
+};
+
+/* no alignment requirement */
+struct vhost_iotlb_msg {
+	__u64 iova;
+	__u64 size;
+	__u64 uaddr;
+#define VHOST_ACCESS_RO      0x1
+#define VHOST_ACCESS_WO      0x2
+#define VHOST_ACCESS_RW      0x3
+	__u8 perm;
+#define VHOST_IOTLB_MISS           1
+#define VHOST_IOTLB_UPDATE         2
+#define VHOST_IOTLB_INVALIDATE     3
+#define VHOST_IOTLB_ACCESS_FAIL    4
+	__u8 type;
+};
+
+#define VHOST_IOTLB_MSG 0x1
+
+struct vhost_msg {
+	int type;
+	union {
+		struct vhost_iotlb_msg iotlb;
+		__u8 padding[64];
+	};
+};
+
+struct vhost_memory_region {
+	__u64 guest_phys_addr;
+	__u64 memory_size; /* bytes */
+	__u64 userspace_addr;
+	__u64 flags_padding; /* No flags are currently specified. */
+};
+
+/* All region addresses and sizes must be 4K aligned. */
+#define VHOST_PAGE_SIZE 0x1000
+
+struct vhost_memory {
+	__u32 nregions;
+	__u32 padding;
+	struct vhost_memory_region regions[0];
+};
+
+/* ioctls */
+
+#define VHOST_VIRTIO 0xAF
+
+/* Features bitmask for forward compatibility.  Transport bits are used for
+ * vhost specific features. */
+#define VHOST_GET_FEATURES	_IOR(VHOST_VIRTIO, 0x00, __u64)
+#define VHOST_SET_FEATURES	_IOW(VHOST_VIRTIO, 0x00, __u64)
+
+/* Set current process as the (exclusive) owner of this file descriptor.  This
+ * must be called before any other vhost command.  Further calls to
+ * VHOST_SET_OWNER fail until VHOST_RESET_OWNER is called. */
+#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
+/* Give up ownership, and reset the device to default values.
+ * Allows subsequent call to VHOST_SET_OWNER to succeed. */
+#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
+
+/* Set up/modify memory layout */
+#define VHOST_SET_MEM_TABLE	_IOW(VHOST_VIRTIO, 0x03, struct vhost_memory)
+
+/* Write logging setup. */
+/* Memory writes can optionally be logged by setting bit at an offset
+ * (calculated from the physical address) from specified log base.
+ * The bit is set using an atomic 32 bit operation. */
+/* Set base address for logging. */
+#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
+/* Specify an eventfd file descriptor to signal on log write. */
+#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
+
+/* Ring setup. */
+/* Set number of descriptors in ring. This parameter can not
+ * be modified while ring is running (bound to a device). */
+#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
+/* Set addresses for the ring. */
+#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
+/* Base value where queue looks for available descriptors */
+#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
+/* Get accessor: reads index, writes value in num */
+#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
+
+/* Set the vring byte order in num. Valid values are VHOST_VRING_LITTLE_ENDIAN
+ * or VHOST_VRING_BIG_ENDIAN (other values return -EINVAL).
+ * The byte order cannot be changed while the device is active: trying to do so
+ * returns -EBUSY.
+ * This is a legacy only API that is simply ignored when VIRTIO_F_VERSION_1 is
+ * set.
+ * Not all kernel configurations support this ioctl, but all configurations that
+ * support SET also support GET.
+ */
+#define VHOST_VRING_LITTLE_ENDIAN 0
+#define VHOST_VRING_BIG_ENDIAN 1
+#define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state)
+#define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state)
+
+/* The following ioctls use eventfd file descriptors to signal and poll
+ * for events. */
+
+/* Set eventfd to poll for added buffers */
+#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
+/* Set eventfd to signal when buffers have been used */
+#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
+/* Set eventfd to signal an error */
+#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
+/* Set busy loop timeout (in us) */
+#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23,	\
+					 struct vhost_vring_state)
+/* Get busy loop timeout (in us) */
+#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24,	\
+					 struct vhost_vring_state)
+
+/* VHOST_NET specific defines */
+
+/* Attach virtio net ring to a raw socket, or tap device.
+ * The socket must be already bound to an ethernet device, this device will be
+ * used for transmit.  Pass fd -1 to unbind from the socket and the transmit
+ * device.  This can be used to stop the ring (e.g. for migration). */
+#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
+
+/* Feature bits */
+/* Log all write descriptors. Can be changed while device is active. */
+#define VHOST_F_LOG_ALL 26
+/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */
+#define VHOST_NET_F_VIRTIO_NET_HDR 27
+
+/* VHOST_SCSI specific definitions */
+
+/*
+ * Used by QEMU userspace to ensure a consistent vhost-scsi ABI.
+ *
+ * ABI Rev 0: July 2012 version starting point for v3.6-rc merge candidate +
+ *            RFC-v2 vhost-scsi userspace.  Add GET_ABI_VERSION ioctl usage
+ * ABI Rev 1: January 2013. Ignore vhost_tpgt field in struct vhost_scsi_target.
+ *            All the targets under vhost_wwpn can be seen and used by guest.
+ */
+
+#define VHOST_SCSI_ABI_VERSION	1
+
+struct vhost_scsi_target {
+	int abi_version;
+	char vhost_wwpn[224]; /* TRANSPORT_IQN_LEN */
+	unsigned short vhost_tpgt;
+	unsigned short reserved;
+};
+
+#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target)
+#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target)
+/* Changing this breaks userspace. */
+#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int)
+/* Set and get the events missed flag */
+#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32)
+#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32)
+
+/* VHOST_VSOCK specific defines */
+
+#define VHOST_VSOCK_SET_GUEST_CID	_IOW(VHOST_VIRTIO, 0x60, __u64)
+#define VHOST_VSOCK_SET_RUNNING		_IOW(VHOST_VIRTIO, 0x61, int)
+
+#endif
diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h
new file mode 100644
index 000000000000..87bf30b182df
--- /dev/null
+++ b/tools/include/uapi/sound/asound.h
@@ -0,0 +1,1026 @@
+/*
+ *  Advanced Linux Sound Architecture - ALSA - Driver
+ *  Copyright (c) 1994-2003 by Jaroslav Kysela <perex@perex.cz>,
+ *                             Abramo Bagnara <abramo@alsa-project.org>
+ *
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ */
+
+#ifndef _UAPI__SOUND_ASOUND_H
+#define _UAPI__SOUND_ASOUND_H
+
+#if defined(__KERNEL__) || defined(__linux__)
+#include <linux/types.h>
+#else
+#include <sys/ioctl.h>
+#endif
+
+#ifndef __KERNEL__
+#include <stdlib.h>
+#endif
+
+/*
+ *  protocol version
+ */
+
+#define SNDRV_PROTOCOL_VERSION(major, minor, subminor) (((major)<<16)|((minor)<<8)|(subminor))
+#define SNDRV_PROTOCOL_MAJOR(version) (((version)>>16)&0xffff)
+#define SNDRV_PROTOCOL_MINOR(version) (((version)>>8)&0xff)
+#define SNDRV_PROTOCOL_MICRO(version) ((version)&0xff)
+#define SNDRV_PROTOCOL_INCOMPATIBLE(kversion, uversion) \
+	(SNDRV_PROTOCOL_MAJOR(kversion) != SNDRV_PROTOCOL_MAJOR(uversion) || \
+	 (SNDRV_PROTOCOL_MAJOR(kversion) == SNDRV_PROTOCOL_MAJOR(uversion) && \
+	   SNDRV_PROTOCOL_MINOR(kversion) != SNDRV_PROTOCOL_MINOR(uversion)))
+
+/****************************************************************************
+ *                                                                          *
+ *        Digital audio interface					    *
+ *                                                                          *
+ ****************************************************************************/
+
+struct snd_aes_iec958 {
+	unsigned char status[24];	/* AES/IEC958 channel status bits */
+	unsigned char subcode[147];	/* AES/IEC958 subcode bits */
+	unsigned char pad;		/* nothing */
+	unsigned char dig_subframe[4];	/* AES/IEC958 subframe bits */
+};
+
+/****************************************************************************
+ *                                                                          *
+ *        CEA-861 Audio InfoFrame. Used in HDMI and DisplayPort		    *
+ *                                                                          *
+ ****************************************************************************/
+
+struct snd_cea_861_aud_if {
+	unsigned char db1_ct_cc; /* coding type and channel count */
+	unsigned char db2_sf_ss; /* sample frequency and size */
+	unsigned char db3; /* not used, all zeros */
+	unsigned char db4_ca; /* channel allocation code */
+	unsigned char db5_dminh_lsv; /* downmix inhibit & level-shift values */
+};
+
+/****************************************************************************
+ *                                                                          *
+ *      Section for driver hardware dependent interface - /dev/snd/hw?      *
+ *                                                                          *
+ ****************************************************************************/
+
+#define SNDRV_HWDEP_VERSION		SNDRV_PROTOCOL_VERSION(1, 0, 1)
+
+enum {
+	SNDRV_HWDEP_IFACE_OPL2 = 0,
+	SNDRV_HWDEP_IFACE_OPL3,
+	SNDRV_HWDEP_IFACE_OPL4,
+	SNDRV_HWDEP_IFACE_SB16CSP,	/* Creative Signal Processor */
+	SNDRV_HWDEP_IFACE_EMU10K1,	/* FX8010 processor in EMU10K1 chip */
+	SNDRV_HWDEP_IFACE_YSS225,	/* Yamaha FX processor */
+	SNDRV_HWDEP_IFACE_ICS2115,	/* Wavetable synth */
+	SNDRV_HWDEP_IFACE_SSCAPE,	/* Ensoniq SoundScape ISA card (MC68EC000) */
+	SNDRV_HWDEP_IFACE_VX,		/* Digigram VX cards */
+	SNDRV_HWDEP_IFACE_MIXART,	/* Digigram miXart cards */
+	SNDRV_HWDEP_IFACE_USX2Y,	/* Tascam US122, US224 & US428 usb */
+	SNDRV_HWDEP_IFACE_EMUX_WAVETABLE, /* EmuX wavetable */
+	SNDRV_HWDEP_IFACE_BLUETOOTH,	/* Bluetooth audio */
+	SNDRV_HWDEP_IFACE_USX2Y_PCM,	/* Tascam US122, US224 & US428 rawusb pcm */
+	SNDRV_HWDEP_IFACE_PCXHR,	/* Digigram PCXHR */
+	SNDRV_HWDEP_IFACE_SB_RC,	/* SB Extigy/Audigy2NX remote control */
+	SNDRV_HWDEP_IFACE_HDA,		/* HD-audio */
+	SNDRV_HWDEP_IFACE_USB_STREAM,	/* direct access to usb stream */
+	SNDRV_HWDEP_IFACE_FW_DICE,	/* TC DICE FireWire device */
+	SNDRV_HWDEP_IFACE_FW_FIREWORKS,	/* Echo Audio Fireworks based device */
+	SNDRV_HWDEP_IFACE_FW_BEBOB,	/* BridgeCo BeBoB based device */
+	SNDRV_HWDEP_IFACE_FW_OXFW,	/* Oxford OXFW970/971 based device */
+	SNDRV_HWDEP_IFACE_FW_DIGI00X,	/* Digidesign Digi 002/003 family */
+	SNDRV_HWDEP_IFACE_FW_TASCAM,	/* TASCAM FireWire series */
+	SNDRV_HWDEP_IFACE_LINE6,	/* Line6 USB processors */
+	SNDRV_HWDEP_IFACE_FW_MOTU,	/* MOTU FireWire series */
+	SNDRV_HWDEP_IFACE_FW_FIREFACE,	/* RME Fireface series */
+
+	/* Don't forget to change the following: */
+	SNDRV_HWDEP_IFACE_LAST = SNDRV_HWDEP_IFACE_FW_FIREFACE
+};
+
+struct snd_hwdep_info {
+	unsigned int device;		/* WR: device number */
+	int card;			/* R: card number */
+	unsigned char id[64];		/* ID (user selectable) */
+	unsigned char name[80];		/* hwdep name */
+	int iface;			/* hwdep interface */
+	unsigned char reserved[64];	/* reserved for future */
+};
+
+/* generic DSP loader */
+struct snd_hwdep_dsp_status {
+	unsigned int version;		/* R: driver-specific version */
+	unsigned char id[32];		/* R: driver-specific ID string */
+	unsigned int num_dsps;		/* R: number of DSP images to transfer */
+	unsigned int dsp_loaded;	/* R: bit flags indicating the loaded DSPs */
+	unsigned int chip_ready;	/* R: 1 = initialization finished */
+	unsigned char reserved[16];	/* reserved for future use */
+};
+
+struct snd_hwdep_dsp_image {
+	unsigned int index;		/* W: DSP index */
+	unsigned char name[64];		/* W: ID (e.g. file name) */
+	unsigned char __user *image;	/* W: binary image */
+	size_t length;			/* W: size of image in bytes */
+	unsigned long driver_data;	/* W: driver-specific data */
+};
+
+#define SNDRV_HWDEP_IOCTL_PVERSION	_IOR ('H', 0x00, int)
+#define SNDRV_HWDEP_IOCTL_INFO		_IOR ('H', 0x01, struct snd_hwdep_info)
+#define SNDRV_HWDEP_IOCTL_DSP_STATUS	_IOR('H', 0x02, struct snd_hwdep_dsp_status)
+#define SNDRV_HWDEP_IOCTL_DSP_LOAD	_IOW('H', 0x03, struct snd_hwdep_dsp_image)
+
+/*****************************************************************************
+ *                                                                           *
+ *             Digital Audio (PCM) interface - /dev/snd/pcm??                *
+ *                                                                           *
+ *****************************************************************************/
+
+#define SNDRV_PCM_VERSION		SNDRV_PROTOCOL_VERSION(2, 0, 14)
+
+typedef unsigned long snd_pcm_uframes_t;
+typedef signed long snd_pcm_sframes_t;
+
+enum {
+	SNDRV_PCM_CLASS_GENERIC = 0,	/* standard mono or stereo device */
+	SNDRV_PCM_CLASS_MULTI,		/* multichannel device */
+	SNDRV_PCM_CLASS_MODEM,		/* software modem class */
+	SNDRV_PCM_CLASS_DIGITIZER,	/* digitizer class */
+	/* Don't forget to change the following: */
+	SNDRV_PCM_CLASS_LAST = SNDRV_PCM_CLASS_DIGITIZER,
+};
+
+enum {
+	SNDRV_PCM_SUBCLASS_GENERIC_MIX = 0, /* mono or stereo subdevices are mixed together */
+	SNDRV_PCM_SUBCLASS_MULTI_MIX,	/* multichannel subdevices are mixed together */
+	/* Don't forget to change the following: */
+	SNDRV_PCM_SUBCLASS_LAST = SNDRV_PCM_SUBCLASS_MULTI_MIX,
+};
+
+enum {
+	SNDRV_PCM_STREAM_PLAYBACK = 0,
+	SNDRV_PCM_STREAM_CAPTURE,
+	SNDRV_PCM_STREAM_LAST = SNDRV_PCM_STREAM_CAPTURE,
+};
+
+typedef int __bitwise snd_pcm_access_t;
+#define	SNDRV_PCM_ACCESS_MMAP_INTERLEAVED	((__force snd_pcm_access_t) 0) /* interleaved mmap */
+#define	SNDRV_PCM_ACCESS_MMAP_NONINTERLEAVED	((__force snd_pcm_access_t) 1) /* noninterleaved mmap */
+#define	SNDRV_PCM_ACCESS_MMAP_COMPLEX		((__force snd_pcm_access_t) 2) /* complex mmap */
+#define	SNDRV_PCM_ACCESS_RW_INTERLEAVED		((__force snd_pcm_access_t) 3) /* readi/writei */
+#define	SNDRV_PCM_ACCESS_RW_NONINTERLEAVED	((__force snd_pcm_access_t) 4) /* readn/writen */
+#define	SNDRV_PCM_ACCESS_LAST		SNDRV_PCM_ACCESS_RW_NONINTERLEAVED
+
+typedef int __bitwise snd_pcm_format_t;
+#define	SNDRV_PCM_FORMAT_S8	((__force snd_pcm_format_t) 0)
+#define	SNDRV_PCM_FORMAT_U8	((__force snd_pcm_format_t) 1)
+#define	SNDRV_PCM_FORMAT_S16_LE	((__force snd_pcm_format_t) 2)
+#define	SNDRV_PCM_FORMAT_S16_BE	((__force snd_pcm_format_t) 3)
+#define	SNDRV_PCM_FORMAT_U16_LE	((__force snd_pcm_format_t) 4)
+#define	SNDRV_PCM_FORMAT_U16_BE	((__force snd_pcm_format_t) 5)
+#define	SNDRV_PCM_FORMAT_S24_LE	((__force snd_pcm_format_t) 6) /* low three bytes */
+#define	SNDRV_PCM_FORMAT_S24_BE	((__force snd_pcm_format_t) 7) /* low three bytes */
+#define	SNDRV_PCM_FORMAT_U24_LE	((__force snd_pcm_format_t) 8) /* low three bytes */
+#define	SNDRV_PCM_FORMAT_U24_BE	((__force snd_pcm_format_t) 9) /* low three bytes */
+#define	SNDRV_PCM_FORMAT_S32_LE	((__force snd_pcm_format_t) 10)
+#define	SNDRV_PCM_FORMAT_S32_BE	((__force snd_pcm_format_t) 11)
+#define	SNDRV_PCM_FORMAT_U32_LE	((__force snd_pcm_format_t) 12)
+#define	SNDRV_PCM_FORMAT_U32_BE	((__force snd_pcm_format_t) 13)
+#define	SNDRV_PCM_FORMAT_FLOAT_LE	((__force snd_pcm_format_t) 14) /* 4-byte float, IEEE-754 32-bit, range -1.0 to 1.0 */
+#define	SNDRV_PCM_FORMAT_FLOAT_BE	((__force snd_pcm_format_t) 15) /* 4-byte float, IEEE-754 32-bit, range -1.0 to 1.0 */
+#define	SNDRV_PCM_FORMAT_FLOAT64_LE	((__force snd_pcm_format_t) 16) /* 8-byte float, IEEE-754 64-bit, range -1.0 to 1.0 */
+#define	SNDRV_PCM_FORMAT_FLOAT64_BE	((__force snd_pcm_format_t) 17) /* 8-byte float, IEEE-754 64-bit, range -1.0 to 1.0 */
+#define	SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE ((__force snd_pcm_format_t) 18) /* IEC-958 subframe, Little Endian */
+#define	SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE ((__force snd_pcm_format_t) 19) /* IEC-958 subframe, Big Endian */
+#define	SNDRV_PCM_FORMAT_MU_LAW		((__force snd_pcm_format_t) 20)
+#define	SNDRV_PCM_FORMAT_A_LAW		((__force snd_pcm_format_t) 21)
+#define	SNDRV_PCM_FORMAT_IMA_ADPCM	((__force snd_pcm_format_t) 22)
+#define	SNDRV_PCM_FORMAT_MPEG		((__force snd_pcm_format_t) 23)
+#define	SNDRV_PCM_FORMAT_GSM		((__force snd_pcm_format_t) 24)
+#define	SNDRV_PCM_FORMAT_SPECIAL	((__force snd_pcm_format_t) 31)
+#define	SNDRV_PCM_FORMAT_S24_3LE	((__force snd_pcm_format_t) 32)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_S24_3BE	((__force snd_pcm_format_t) 33)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_U24_3LE	((__force snd_pcm_format_t) 34)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_U24_3BE	((__force snd_pcm_format_t) 35)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_S20_3LE	((__force snd_pcm_format_t) 36)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_S20_3BE	((__force snd_pcm_format_t) 37)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_U20_3LE	((__force snd_pcm_format_t) 38)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_U20_3BE	((__force snd_pcm_format_t) 39)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_S18_3LE	((__force snd_pcm_format_t) 40)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_S18_3BE	((__force snd_pcm_format_t) 41)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_U18_3LE	((__force snd_pcm_format_t) 42)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_U18_3BE	((__force snd_pcm_format_t) 43)	/* in three bytes */
+#define	SNDRV_PCM_FORMAT_G723_24	((__force snd_pcm_format_t) 44) /* 8 samples in 3 bytes */
+#define	SNDRV_PCM_FORMAT_G723_24_1B	((__force snd_pcm_format_t) 45) /* 1 sample in 1 byte */
+#define	SNDRV_PCM_FORMAT_G723_40	((__force snd_pcm_format_t) 46) /* 8 Samples in 5 bytes */
+#define	SNDRV_PCM_FORMAT_G723_40_1B	((__force snd_pcm_format_t) 47) /* 1 sample in 1 byte */
+#define	SNDRV_PCM_FORMAT_DSD_U8		((__force snd_pcm_format_t) 48) /* DSD, 1-byte samples DSD (x8) */
+#define	SNDRV_PCM_FORMAT_DSD_U16_LE	((__force snd_pcm_format_t) 49) /* DSD, 2-byte samples DSD (x16), little endian */
+#define	SNDRV_PCM_FORMAT_DSD_U32_LE	((__force snd_pcm_format_t) 50) /* DSD, 4-byte samples DSD (x32), little endian */
+#define	SNDRV_PCM_FORMAT_DSD_U16_BE	((__force snd_pcm_format_t) 51) /* DSD, 2-byte samples DSD (x16), big endian */
+#define	SNDRV_PCM_FORMAT_DSD_U32_BE	((__force snd_pcm_format_t) 52) /* DSD, 4-byte samples DSD (x32), big endian */
+#define	SNDRV_PCM_FORMAT_LAST		SNDRV_PCM_FORMAT_DSD_U32_BE
+
+#ifdef SNDRV_LITTLE_ENDIAN
+#define	SNDRV_PCM_FORMAT_S16		SNDRV_PCM_FORMAT_S16_LE
+#define	SNDRV_PCM_FORMAT_U16		SNDRV_PCM_FORMAT_U16_LE
+#define	SNDRV_PCM_FORMAT_S24		SNDRV_PCM_FORMAT_S24_LE
+#define	SNDRV_PCM_FORMAT_U24		SNDRV_PCM_FORMAT_U24_LE
+#define	SNDRV_PCM_FORMAT_S32		SNDRV_PCM_FORMAT_S32_LE
+#define	SNDRV_PCM_FORMAT_U32		SNDRV_PCM_FORMAT_U32_LE
+#define	SNDRV_PCM_FORMAT_FLOAT		SNDRV_PCM_FORMAT_FLOAT_LE
+#define	SNDRV_PCM_FORMAT_FLOAT64	SNDRV_PCM_FORMAT_FLOAT64_LE
+#define	SNDRV_PCM_FORMAT_IEC958_SUBFRAME SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE
+#endif
+#ifdef SNDRV_BIG_ENDIAN
+#define	SNDRV_PCM_FORMAT_S16		SNDRV_PCM_FORMAT_S16_BE
+#define	SNDRV_PCM_FORMAT_U16		SNDRV_PCM_FORMAT_U16_BE
+#define	SNDRV_PCM_FORMAT_S24		SNDRV_PCM_FORMAT_S24_BE
+#define	SNDRV_PCM_FORMAT_U24		SNDRV_PCM_FORMAT_U24_BE
+#define	SNDRV_PCM_FORMAT_S32		SNDRV_PCM_FORMAT_S32_BE
+#define	SNDRV_PCM_FORMAT_U32		SNDRV_PCM_FORMAT_U32_BE
+#define	SNDRV_PCM_FORMAT_FLOAT		SNDRV_PCM_FORMAT_FLOAT_BE
+#define	SNDRV_PCM_FORMAT_FLOAT64	SNDRV_PCM_FORMAT_FLOAT64_BE
+#define	SNDRV_PCM_FORMAT_IEC958_SUBFRAME SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE
+#endif
+
+typedef int __bitwise snd_pcm_subformat_t;
+#define	SNDRV_PCM_SUBFORMAT_STD		((__force snd_pcm_subformat_t) 0)
+#define	SNDRV_PCM_SUBFORMAT_LAST	SNDRV_PCM_SUBFORMAT_STD
+
+#define SNDRV_PCM_INFO_MMAP		0x00000001	/* hardware supports mmap */
+#define SNDRV_PCM_INFO_MMAP_VALID	0x00000002	/* period data are valid during transfer */
+#define SNDRV_PCM_INFO_DOUBLE		0x00000004	/* Double buffering needed for PCM start/stop */
+#define SNDRV_PCM_INFO_BATCH		0x00000010	/* double buffering */
+#define SNDRV_PCM_INFO_SYNC_APPLPTR	0x00000020	/* need the explicit sync of appl_ptr update */
+#define SNDRV_PCM_INFO_INTERLEAVED	0x00000100	/* channels are interleaved */
+#define SNDRV_PCM_INFO_NONINTERLEAVED	0x00000200	/* channels are not interleaved */
+#define SNDRV_PCM_INFO_COMPLEX		0x00000400	/* complex frame organization (mmap only) */
+#define SNDRV_PCM_INFO_BLOCK_TRANSFER	0x00010000	/* hardware transfer block of samples */
+#define SNDRV_PCM_INFO_OVERRANGE	0x00020000	/* hardware supports ADC (capture) overrange detection */
+#define SNDRV_PCM_INFO_RESUME		0x00040000	/* hardware supports stream resume after suspend */
+#define SNDRV_PCM_INFO_PAUSE		0x00080000	/* pause ioctl is supported */
+#define SNDRV_PCM_INFO_HALF_DUPLEX	0x00100000	/* only half duplex */
+#define SNDRV_PCM_INFO_JOINT_DUPLEX	0x00200000	/* playback and capture stream are somewhat correlated */
+#define SNDRV_PCM_INFO_SYNC_START	0x00400000	/* pcm supports some kind of sync go */
+#define SNDRV_PCM_INFO_NO_PERIOD_WAKEUP	0x00800000	/* period wakeup can be disabled */
+#define SNDRV_PCM_INFO_HAS_WALL_CLOCK   0x01000000      /* (Deprecated) has audio wall clock for audio/system time sync */
+#define SNDRV_PCM_INFO_HAS_LINK_ATIME              0x01000000  /* report hardware link audio time, reset on startup */
+#define SNDRV_PCM_INFO_HAS_LINK_ABSOLUTE_ATIME     0x02000000  /* report absolute hardware link audio time, not reset on startup */
+#define SNDRV_PCM_INFO_HAS_LINK_ESTIMATED_ATIME    0x04000000  /* report estimated link audio time */
+#define SNDRV_PCM_INFO_HAS_LINK_SYNCHRONIZED_ATIME 0x08000000  /* report synchronized audio/system time */
+
+#define SNDRV_PCM_INFO_DRAIN_TRIGGER	0x40000000		/* internal kernel flag - trigger in drain */
+#define SNDRV_PCM_INFO_FIFO_IN_FRAMES	0x80000000	/* internal kernel flag - FIFO size is in frames */
+
+
+
+typedef int __bitwise snd_pcm_state_t;
+#define	SNDRV_PCM_STATE_OPEN		((__force snd_pcm_state_t) 0) /* stream is open */
+#define	SNDRV_PCM_STATE_SETUP		((__force snd_pcm_state_t) 1) /* stream has a setup */
+#define	SNDRV_PCM_STATE_PREPARED	((__force snd_pcm_state_t) 2) /* stream is ready to start */
+#define	SNDRV_PCM_STATE_RUNNING		((__force snd_pcm_state_t) 3) /* stream is running */
+#define	SNDRV_PCM_STATE_XRUN		((__force snd_pcm_state_t) 4) /* stream reached an xrun */
+#define	SNDRV_PCM_STATE_DRAINING	((__force snd_pcm_state_t) 5) /* stream is draining */
+#define	SNDRV_PCM_STATE_PAUSED		((__force snd_pcm_state_t) 6) /* stream is paused */
+#define	SNDRV_PCM_STATE_SUSPENDED	((__force snd_pcm_state_t) 7) /* hardware is suspended */
+#define	SNDRV_PCM_STATE_DISCONNECTED	((__force snd_pcm_state_t) 8) /* hardware is disconnected */
+#define	SNDRV_PCM_STATE_LAST		SNDRV_PCM_STATE_DISCONNECTED
+
+enum {
+	SNDRV_PCM_MMAP_OFFSET_DATA = 0x00000000,
+	SNDRV_PCM_MMAP_OFFSET_STATUS = 0x80000000,
+	SNDRV_PCM_MMAP_OFFSET_CONTROL = 0x81000000,
+};
+
+union snd_pcm_sync_id {
+	unsigned char id[16];
+	unsigned short id16[8];
+	unsigned int id32[4];
+};
+
+struct snd_pcm_info {
+	unsigned int device;		/* RO/WR (control): device number */
+	unsigned int subdevice;		/* RO/WR (control): subdevice number */
+	int stream;			/* RO/WR (control): stream direction */
+	int card;			/* R: card number */
+	unsigned char id[64];		/* ID (user selectable) */
+	unsigned char name[80];		/* name of this device */
+	unsigned char subname[32];	/* subdevice name */
+	int dev_class;			/* SNDRV_PCM_CLASS_* */
+	int dev_subclass;		/* SNDRV_PCM_SUBCLASS_* */
+	unsigned int subdevices_count;
+	unsigned int subdevices_avail;
+	union snd_pcm_sync_id sync;	/* hardware synchronization ID */
+	unsigned char reserved[64];	/* reserved for future... */
+};
+
+typedef int snd_pcm_hw_param_t;
+#define	SNDRV_PCM_HW_PARAM_ACCESS	0	/* Access type */
+#define	SNDRV_PCM_HW_PARAM_FORMAT	1	/* Format */
+#define	SNDRV_PCM_HW_PARAM_SUBFORMAT	2	/* Subformat */
+#define	SNDRV_PCM_HW_PARAM_FIRST_MASK	SNDRV_PCM_HW_PARAM_ACCESS
+#define	SNDRV_PCM_HW_PARAM_LAST_MASK	SNDRV_PCM_HW_PARAM_SUBFORMAT
+
+#define	SNDRV_PCM_HW_PARAM_SAMPLE_BITS	8	/* Bits per sample */
+#define	SNDRV_PCM_HW_PARAM_FRAME_BITS	9	/* Bits per frame */
+#define	SNDRV_PCM_HW_PARAM_CHANNELS	10	/* Channels */
+#define	SNDRV_PCM_HW_PARAM_RATE		11	/* Approx rate */
+#define	SNDRV_PCM_HW_PARAM_PERIOD_TIME	12	/* Approx distance between
+						 * interrupts in us
+						 */
+#define	SNDRV_PCM_HW_PARAM_PERIOD_SIZE	13	/* Approx frames between
+						 * interrupts
+						 */
+#define	SNDRV_PCM_HW_PARAM_PERIOD_BYTES	14	/* Approx bytes between
+						 * interrupts
+						 */
+#define	SNDRV_PCM_HW_PARAM_PERIODS	15	/* Approx interrupts per
+						 * buffer
+						 */
+#define	SNDRV_PCM_HW_PARAM_BUFFER_TIME	16	/* Approx duration of buffer
+						 * in us
+						 */
+#define	SNDRV_PCM_HW_PARAM_BUFFER_SIZE	17	/* Size of buffer in frames */
+#define	SNDRV_PCM_HW_PARAM_BUFFER_BYTES	18	/* Size of buffer in bytes */
+#define	SNDRV_PCM_HW_PARAM_TICK_TIME	19	/* Approx tick duration in us */
+#define	SNDRV_PCM_HW_PARAM_FIRST_INTERVAL	SNDRV_PCM_HW_PARAM_SAMPLE_BITS
+#define	SNDRV_PCM_HW_PARAM_LAST_INTERVAL	SNDRV_PCM_HW_PARAM_TICK_TIME
+
+#define SNDRV_PCM_HW_PARAMS_NORESAMPLE	(1<<0)	/* avoid rate resampling */
+#define SNDRV_PCM_HW_PARAMS_EXPORT_BUFFER	(1<<1)	/* export buffer */
+#define SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP	(1<<2)	/* disable period wakeups */
+
+struct snd_interval {
+	unsigned int min, max;
+	unsigned int openmin:1,
+		     openmax:1,
+		     integer:1,
+		     empty:1;
+};
+
+#define SNDRV_MASK_MAX	256
+
+struct snd_mask {
+	__u32 bits[(SNDRV_MASK_MAX+31)/32];
+};
+
+struct snd_pcm_hw_params {
+	unsigned int flags;
+	struct snd_mask masks[SNDRV_PCM_HW_PARAM_LAST_MASK -
+			       SNDRV_PCM_HW_PARAM_FIRST_MASK + 1];
+	struct snd_mask mres[5];	/* reserved masks */
+	struct snd_interval intervals[SNDRV_PCM_HW_PARAM_LAST_INTERVAL -
+				        SNDRV_PCM_HW_PARAM_FIRST_INTERVAL + 1];
+	struct snd_interval ires[9];	/* reserved intervals */
+	unsigned int rmask;		/* W: requested masks */
+	unsigned int cmask;		/* R: changed masks */
+	unsigned int info;		/* R: Info flags for returned setup */
+	unsigned int msbits;		/* R: used most significant bits */
+	unsigned int rate_num;		/* R: rate numerator */
+	unsigned int rate_den;		/* R: rate denominator */
+	snd_pcm_uframes_t fifo_size;	/* R: chip FIFO size in frames */
+	unsigned char reserved[64];	/* reserved for future */
+};
+
+enum {
+	SNDRV_PCM_TSTAMP_NONE = 0,
+	SNDRV_PCM_TSTAMP_ENABLE,
+	SNDRV_PCM_TSTAMP_LAST = SNDRV_PCM_TSTAMP_ENABLE,
+};
+
+struct snd_pcm_sw_params {
+	int tstamp_mode;			/* timestamp mode */
+	unsigned int period_step;
+	unsigned int sleep_min;			/* min ticks to sleep */
+	snd_pcm_uframes_t avail_min;		/* min avail frames for wakeup */
+	snd_pcm_uframes_t xfer_align;		/* obsolete: xfer size needs to be a multiple */
+	snd_pcm_uframes_t start_threshold;	/* min hw_avail frames for automatic start */
+	snd_pcm_uframes_t stop_threshold;	/* min avail frames for automatic stop */
+	snd_pcm_uframes_t silence_threshold;	/* min distance from noise for silence filling */
+	snd_pcm_uframes_t silence_size;		/* silence block size */
+	snd_pcm_uframes_t boundary;		/* pointers wrap point */
+	unsigned int proto;			/* protocol version */
+	unsigned int tstamp_type;		/* timestamp type (req. proto >= 2.0.12) */
+	unsigned char reserved[56];		/* reserved for future */
+};
+
+struct snd_pcm_channel_info {
+	unsigned int channel;
+	__kernel_off_t offset;		/* mmap offset */
+	unsigned int first;		/* offset to first sample in bits */
+	unsigned int step;		/* samples distance in bits */
+};
+
+enum {
+	/*
+	 *  first definition for backwards compatibility only,
+	 *  maps to wallclock/link time for HDAudio playback and DEFAULT/DMA time for everything else
+	 */
+	SNDRV_PCM_AUDIO_TSTAMP_TYPE_COMPAT = 0,
+
+	/* timestamp definitions */
+	SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT = 1,           /* DMA time, reported as per hw_ptr */
+	SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK = 2,	           /* link time reported by sample or wallclock counter, reset on startup */
+	SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ABSOLUTE = 3,	   /* link time reported by sample or wallclock counter, not reset on startup */
+	SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_ESTIMATED = 4,    /* link time estimated indirectly */
+	SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED = 5, /* link time synchronized with system time */
+	SNDRV_PCM_AUDIO_TSTAMP_TYPE_LAST = SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED
+};
+
+struct snd_pcm_status {
+	snd_pcm_state_t state;		/* stream state */
+	struct timespec trigger_tstamp;	/* time when stream was started/stopped/paused */
+	struct timespec tstamp;		/* reference timestamp */
+	snd_pcm_uframes_t appl_ptr;	/* appl ptr */
+	snd_pcm_uframes_t hw_ptr;	/* hw ptr */
+	snd_pcm_sframes_t delay;	/* current delay in frames */
+	snd_pcm_uframes_t avail;	/* number of frames available */
+	snd_pcm_uframes_t avail_max;	/* max frames available on hw since last status */
+	snd_pcm_uframes_t overrange;	/* count of ADC (capture) overrange detections from last status */
+	snd_pcm_state_t suspended_state; /* suspended stream state */
+	__u32 audio_tstamp_data;	 /* needed for 64-bit alignment, used for configs/report to/from userspace */
+	struct timespec audio_tstamp;	/* sample counter, wall clock, PHC or on-demand sync'ed */
+	struct timespec driver_tstamp;	/* useful in case reference system tstamp is reported with delay */
+	__u32 audio_tstamp_accuracy;	/* in ns units, only valid if indicated in audio_tstamp_data */
+	unsigned char reserved[52-2*sizeof(struct timespec)]; /* must be filled with zero */
+};
+
+struct snd_pcm_mmap_status {
+	snd_pcm_state_t state;		/* RO: state - SNDRV_PCM_STATE_XXXX */
+	int pad1;			/* Needed for 64 bit alignment */
+	snd_pcm_uframes_t hw_ptr;	/* RO: hw ptr (0...boundary-1) */
+	struct timespec tstamp;		/* Timestamp */
+	snd_pcm_state_t suspended_state; /* RO: suspended stream state */
+	struct timespec audio_tstamp;	/* from sample counter or wall clock */
+};
+
+struct snd_pcm_mmap_control {
+	snd_pcm_uframes_t appl_ptr;	/* RW: appl ptr (0...boundary-1) */
+	snd_pcm_uframes_t avail_min;	/* RW: min available frames for wakeup */
+};
+
+#define SNDRV_PCM_SYNC_PTR_HWSYNC	(1<<0)	/* execute hwsync */
+#define SNDRV_PCM_SYNC_PTR_APPL		(1<<1)	/* get appl_ptr from driver (r/w op) */
+#define SNDRV_PCM_SYNC_PTR_AVAIL_MIN	(1<<2)	/* get avail_min from driver */
+
+struct snd_pcm_sync_ptr {
+	unsigned int flags;
+	union {
+		struct snd_pcm_mmap_status status;
+		unsigned char reserved[64];
+	} s;
+	union {
+		struct snd_pcm_mmap_control control;
+		unsigned char reserved[64];
+	} c;
+};
+
+struct snd_xferi {
+	snd_pcm_sframes_t result;
+	void __user *buf;
+	snd_pcm_uframes_t frames;
+};
+
+struct snd_xfern {
+	snd_pcm_sframes_t result;
+	void __user * __user *bufs;
+	snd_pcm_uframes_t frames;
+};
+
+enum {
+	SNDRV_PCM_TSTAMP_TYPE_GETTIMEOFDAY = 0,	/* gettimeofday equivalent */
+	SNDRV_PCM_TSTAMP_TYPE_MONOTONIC,	/* posix_clock_monotonic equivalent */
+	SNDRV_PCM_TSTAMP_TYPE_MONOTONIC_RAW,    /* monotonic_raw (no NTP) */
+	SNDRV_PCM_TSTAMP_TYPE_LAST = SNDRV_PCM_TSTAMP_TYPE_MONOTONIC_RAW,
+};
+
+/* channel positions */
+enum {
+	SNDRV_CHMAP_UNKNOWN = 0,
+	SNDRV_CHMAP_NA,		/* N/A, silent */
+	SNDRV_CHMAP_MONO,	/* mono stream */
+	/* this follows the alsa-lib mixer channel value + 3 */
+	SNDRV_CHMAP_FL,		/* front left */
+	SNDRV_CHMAP_FR,		/* front right */
+	SNDRV_CHMAP_RL,		/* rear left */
+	SNDRV_CHMAP_RR,		/* rear right */
+	SNDRV_CHMAP_FC,		/* front center */
+	SNDRV_CHMAP_LFE,	/* LFE */
+	SNDRV_CHMAP_SL,		/* side left */
+	SNDRV_CHMAP_SR,		/* side right */
+	SNDRV_CHMAP_RC,		/* rear center */
+	/* new definitions */
+	SNDRV_CHMAP_FLC,	/* front left center */
+	SNDRV_CHMAP_FRC,	/* front right center */
+	SNDRV_CHMAP_RLC,	/* rear left center */
+	SNDRV_CHMAP_RRC,	/* rear right center */
+	SNDRV_CHMAP_FLW,	/* front left wide */
+	SNDRV_CHMAP_FRW,	/* front right wide */
+	SNDRV_CHMAP_FLH,	/* front left high */
+	SNDRV_CHMAP_FCH,	/* front center high */
+	SNDRV_CHMAP_FRH,	/* front right high */
+	SNDRV_CHMAP_TC,		/* top center */
+	SNDRV_CHMAP_TFL,	/* top front left */
+	SNDRV_CHMAP_TFR,	/* top front right */
+	SNDRV_CHMAP_TFC,	/* top front center */
+	SNDRV_CHMAP_TRL,	/* top rear left */
+	SNDRV_CHMAP_TRR,	/* top rear right */
+	SNDRV_CHMAP_TRC,	/* top rear center */
+	/* new definitions for UAC2 */
+	SNDRV_CHMAP_TFLC,	/* top front left center */
+	SNDRV_CHMAP_TFRC,	/* top front right center */
+	SNDRV_CHMAP_TSL,	/* top side left */
+	SNDRV_CHMAP_TSR,	/* top side right */
+	SNDRV_CHMAP_LLFE,	/* left LFE */
+	SNDRV_CHMAP_RLFE,	/* right LFE */
+	SNDRV_CHMAP_BC,		/* bottom center */
+	SNDRV_CHMAP_BLC,	/* bottom left center */
+	SNDRV_CHMAP_BRC,	/* bottom right center */
+	SNDRV_CHMAP_LAST = SNDRV_CHMAP_BRC,
+};
+
+#define SNDRV_CHMAP_POSITION_MASK	0xffff
+#define SNDRV_CHMAP_PHASE_INVERSE	(0x01 << 16)
+#define SNDRV_CHMAP_DRIVER_SPEC		(0x02 << 16)
+
+#define SNDRV_PCM_IOCTL_PVERSION	_IOR('A', 0x00, int)
+#define SNDRV_PCM_IOCTL_INFO		_IOR('A', 0x01, struct snd_pcm_info)
+#define SNDRV_PCM_IOCTL_TSTAMP		_IOW('A', 0x02, int)
+#define SNDRV_PCM_IOCTL_TTSTAMP		_IOW('A', 0x03, int)
+#define SNDRV_PCM_IOCTL_USER_PVERSION	_IOW('A', 0x04, int)
+#define SNDRV_PCM_IOCTL_HW_REFINE	_IOWR('A', 0x10, struct snd_pcm_hw_params)
+#define SNDRV_PCM_IOCTL_HW_PARAMS	_IOWR('A', 0x11, struct snd_pcm_hw_params)
+#define SNDRV_PCM_IOCTL_HW_FREE		_IO('A', 0x12)
+#define SNDRV_PCM_IOCTL_SW_PARAMS	_IOWR('A', 0x13, struct snd_pcm_sw_params)
+#define SNDRV_PCM_IOCTL_STATUS		_IOR('A', 0x20, struct snd_pcm_status)
+#define SNDRV_PCM_IOCTL_DELAY		_IOR('A', 0x21, snd_pcm_sframes_t)
+#define SNDRV_PCM_IOCTL_HWSYNC		_IO('A', 0x22)
+#define SNDRV_PCM_IOCTL_SYNC_PTR	_IOWR('A', 0x23, struct snd_pcm_sync_ptr)
+#define SNDRV_PCM_IOCTL_STATUS_EXT	_IOWR('A', 0x24, struct snd_pcm_status)
+#define SNDRV_PCM_IOCTL_CHANNEL_INFO	_IOR('A', 0x32, struct snd_pcm_channel_info)
+#define SNDRV_PCM_IOCTL_PREPARE		_IO('A', 0x40)
+#define SNDRV_PCM_IOCTL_RESET		_IO('A', 0x41)
+#define SNDRV_PCM_IOCTL_START		_IO('A', 0x42)
+#define SNDRV_PCM_IOCTL_DROP		_IO('A', 0x43)
+#define SNDRV_PCM_IOCTL_DRAIN		_IO('A', 0x44)
+#define SNDRV_PCM_IOCTL_PAUSE		_IOW('A', 0x45, int)
+#define SNDRV_PCM_IOCTL_REWIND		_IOW('A', 0x46, snd_pcm_uframes_t)
+#define SNDRV_PCM_IOCTL_RESUME		_IO('A', 0x47)
+#define SNDRV_PCM_IOCTL_XRUN		_IO('A', 0x48)
+#define SNDRV_PCM_IOCTL_FORWARD		_IOW('A', 0x49, snd_pcm_uframes_t)
+#define SNDRV_PCM_IOCTL_WRITEI_FRAMES	_IOW('A', 0x50, struct snd_xferi)
+#define SNDRV_PCM_IOCTL_READI_FRAMES	_IOR('A', 0x51, struct snd_xferi)
+#define SNDRV_PCM_IOCTL_WRITEN_FRAMES	_IOW('A', 0x52, struct snd_xfern)
+#define SNDRV_PCM_IOCTL_READN_FRAMES	_IOR('A', 0x53, struct snd_xfern)
+#define SNDRV_PCM_IOCTL_LINK		_IOW('A', 0x60, int)
+#define SNDRV_PCM_IOCTL_UNLINK		_IO('A', 0x61)
+
+/*****************************************************************************
+ *                                                                           *
+ *                            MIDI v1.0 interface                            *
+ *                                                                           *
+ *****************************************************************************/
+
+/*
+ *  Raw MIDI section - /dev/snd/midi??
+ */
+
+#define SNDRV_RAWMIDI_VERSION		SNDRV_PROTOCOL_VERSION(2, 0, 0)
+
+enum {
+	SNDRV_RAWMIDI_STREAM_OUTPUT = 0,
+	SNDRV_RAWMIDI_STREAM_INPUT,
+	SNDRV_RAWMIDI_STREAM_LAST = SNDRV_RAWMIDI_STREAM_INPUT,
+};
+
+#define SNDRV_RAWMIDI_INFO_OUTPUT		0x00000001
+#define SNDRV_RAWMIDI_INFO_INPUT		0x00000002
+#define SNDRV_RAWMIDI_INFO_DUPLEX		0x00000004
+
+struct snd_rawmidi_info {
+	unsigned int device;		/* RO/WR (control): device number */
+	unsigned int subdevice;		/* RO/WR (control): subdevice number */
+	int stream;			/* WR: stream */
+	int card;			/* R: card number */
+	unsigned int flags;		/* SNDRV_RAWMIDI_INFO_XXXX */
+	unsigned char id[64];		/* ID (user selectable) */
+	unsigned char name[80];		/* name of device */
+	unsigned char subname[32];	/* name of active or selected subdevice */
+	unsigned int subdevices_count;
+	unsigned int subdevices_avail;
+	unsigned char reserved[64];	/* reserved for future use */
+};
+
+struct snd_rawmidi_params {
+	int stream;
+	size_t buffer_size;		/* queue size in bytes */
+	size_t avail_min;		/* minimum avail bytes for wakeup */
+	unsigned int no_active_sensing: 1; /* do not send active sensing byte in close() */
+	unsigned char reserved[16];	/* reserved for future use */
+};
+
+struct snd_rawmidi_status {
+	int stream;
+	struct timespec tstamp;		/* Timestamp */
+	size_t avail;			/* available bytes */
+	size_t xruns;			/* count of overruns since last status (in bytes) */
+	unsigned char reserved[16];	/* reserved for future use */
+};
+
+#define SNDRV_RAWMIDI_IOCTL_PVERSION	_IOR('W', 0x00, int)
+#define SNDRV_RAWMIDI_IOCTL_INFO	_IOR('W', 0x01, struct snd_rawmidi_info)
+#define SNDRV_RAWMIDI_IOCTL_PARAMS	_IOWR('W', 0x10, struct snd_rawmidi_params)
+#define SNDRV_RAWMIDI_IOCTL_STATUS	_IOWR('W', 0x20, struct snd_rawmidi_status)
+#define SNDRV_RAWMIDI_IOCTL_DROP	_IOW('W', 0x30, int)
+#define SNDRV_RAWMIDI_IOCTL_DRAIN	_IOW('W', 0x31, int)
+
+/*
+ *  Timer section - /dev/snd/timer
+ */
+
+#define SNDRV_TIMER_VERSION		SNDRV_PROTOCOL_VERSION(2, 0, 6)
+
+enum {
+	SNDRV_TIMER_CLASS_NONE = -1,
+	SNDRV_TIMER_CLASS_SLAVE = 0,
+	SNDRV_TIMER_CLASS_GLOBAL,
+	SNDRV_TIMER_CLASS_CARD,
+	SNDRV_TIMER_CLASS_PCM,
+	SNDRV_TIMER_CLASS_LAST = SNDRV_TIMER_CLASS_PCM,
+};
+
+/* slave timer classes */
+enum {
+	SNDRV_TIMER_SCLASS_NONE = 0,
+	SNDRV_TIMER_SCLASS_APPLICATION,
+	SNDRV_TIMER_SCLASS_SEQUENCER,		/* alias */
+	SNDRV_TIMER_SCLASS_OSS_SEQUENCER,	/* alias */
+	SNDRV_TIMER_SCLASS_LAST = SNDRV_TIMER_SCLASS_OSS_SEQUENCER,
+};
+
+/* global timers (device member) */
+#define SNDRV_TIMER_GLOBAL_SYSTEM	0
+#define SNDRV_TIMER_GLOBAL_RTC		1	/* unused */
+#define SNDRV_TIMER_GLOBAL_HPET		2
+#define SNDRV_TIMER_GLOBAL_HRTIMER	3
+
+/* info flags */
+#define SNDRV_TIMER_FLG_SLAVE		(1<<0)	/* cannot be controlled */
+
+struct snd_timer_id {
+	int dev_class;
+	int dev_sclass;
+	int card;
+	int device;
+	int subdevice;
+};
+
+struct snd_timer_ginfo {
+	struct snd_timer_id tid;	/* requested timer ID */
+	unsigned int flags;		/* timer flags - SNDRV_TIMER_FLG_* */
+	int card;			/* card number */
+	unsigned char id[64];		/* timer identification */
+	unsigned char name[80];		/* timer name */
+	unsigned long reserved0;	/* reserved for future use */
+	unsigned long resolution;	/* average period resolution in ns */
+	unsigned long resolution_min;	/* minimal period resolution in ns */
+	unsigned long resolution_max;	/* maximal period resolution in ns */
+	unsigned int clients;		/* active timer clients */
+	unsigned char reserved[32];
+};
+
+struct snd_timer_gparams {
+	struct snd_timer_id tid;	/* requested timer ID */
+	unsigned long period_num;	/* requested precise period duration (in seconds) - numerator */
+	unsigned long period_den;	/* requested precise period duration (in seconds) - denominator */
+	unsigned char reserved[32];
+};
+
+struct snd_timer_gstatus {
+	struct snd_timer_id tid;	/* requested timer ID */
+	unsigned long resolution;	/* current period resolution in ns */
+	unsigned long resolution_num;	/* precise current period resolution (in seconds) - numerator */
+	unsigned long resolution_den;	/* precise current period resolution (in seconds) - denominator */
+	unsigned char reserved[32];
+};
+
+struct snd_timer_select {
+	struct snd_timer_id id;	/* bind to timer ID */
+	unsigned char reserved[32];	/* reserved */
+};
+
+struct snd_timer_info {
+	unsigned int flags;		/* timer flags - SNDRV_TIMER_FLG_* */
+	int card;			/* card number */
+	unsigned char id[64];		/* timer identificator */
+	unsigned char name[80];		/* timer name */
+	unsigned long reserved0;	/* reserved for future use */
+	unsigned long resolution;	/* average period resolution in ns */
+	unsigned char reserved[64];	/* reserved */
+};
+
+#define SNDRV_TIMER_PSFLG_AUTO		(1<<0)	/* auto start, otherwise one-shot */
+#define SNDRV_TIMER_PSFLG_EXCLUSIVE	(1<<1)	/* exclusive use, precise start/stop/pause/continue */
+#define SNDRV_TIMER_PSFLG_EARLY_EVENT	(1<<2)	/* write early event to the poll queue */
+
+struct snd_timer_params {
+	unsigned int flags;		/* flags - SNDRV_MIXER_PSFLG_* */
+	unsigned int ticks;		/* requested resolution in ticks */
+	unsigned int queue_size;	/* total size of queue (32-1024) */
+	unsigned int reserved0;		/* reserved, was: failure locations */
+	unsigned int filter;		/* event filter (bitmask of SNDRV_TIMER_EVENT_*) */
+	unsigned char reserved[60];	/* reserved */
+};
+
+struct snd_timer_status {
+	struct timespec tstamp;		/* Timestamp - last update */
+	unsigned int resolution;	/* current period resolution in ns */
+	unsigned int lost;		/* counter of master tick lost */
+	unsigned int overrun;		/* count of read queue overruns */
+	unsigned int queue;		/* used queue size */
+	unsigned char reserved[64];	/* reserved */
+};
+
+#define SNDRV_TIMER_IOCTL_PVERSION	_IOR('T', 0x00, int)
+#define SNDRV_TIMER_IOCTL_NEXT_DEVICE	_IOWR('T', 0x01, struct snd_timer_id)
+#define SNDRV_TIMER_IOCTL_TREAD		_IOW('T', 0x02, int)
+#define SNDRV_TIMER_IOCTL_GINFO		_IOWR('T', 0x03, struct snd_timer_ginfo)
+#define SNDRV_TIMER_IOCTL_GPARAMS	_IOW('T', 0x04, struct snd_timer_gparams)
+#define SNDRV_TIMER_IOCTL_GSTATUS	_IOWR('T', 0x05, struct snd_timer_gstatus)
+#define SNDRV_TIMER_IOCTL_SELECT	_IOW('T', 0x10, struct snd_timer_select)
+#define SNDRV_TIMER_IOCTL_INFO		_IOR('T', 0x11, struct snd_timer_info)
+#define SNDRV_TIMER_IOCTL_PARAMS	_IOW('T', 0x12, struct snd_timer_params)
+#define SNDRV_TIMER_IOCTL_STATUS	_IOR('T', 0x14, struct snd_timer_status)
+/* The following four ioctls are changed since 1.0.9 due to confliction */
+#define SNDRV_TIMER_IOCTL_START		_IO('T', 0xa0)
+#define SNDRV_TIMER_IOCTL_STOP		_IO('T', 0xa1)
+#define SNDRV_TIMER_IOCTL_CONTINUE	_IO('T', 0xa2)
+#define SNDRV_TIMER_IOCTL_PAUSE		_IO('T', 0xa3)
+
+struct snd_timer_read {
+	unsigned int resolution;
+	unsigned int ticks;
+};
+
+enum {
+	SNDRV_TIMER_EVENT_RESOLUTION = 0,	/* val = resolution in ns */
+	SNDRV_TIMER_EVENT_TICK,			/* val = ticks */
+	SNDRV_TIMER_EVENT_START,		/* val = resolution in ns */
+	SNDRV_TIMER_EVENT_STOP,			/* val = 0 */
+	SNDRV_TIMER_EVENT_CONTINUE,		/* val = resolution in ns */
+	SNDRV_TIMER_EVENT_PAUSE,		/* val = 0 */
+	SNDRV_TIMER_EVENT_EARLY,		/* val = 0, early event */
+	SNDRV_TIMER_EVENT_SUSPEND,		/* val = 0 */
+	SNDRV_TIMER_EVENT_RESUME,		/* val = resolution in ns */
+	/* master timer events for slave timer instances */
+	SNDRV_TIMER_EVENT_MSTART = SNDRV_TIMER_EVENT_START + 10,
+	SNDRV_TIMER_EVENT_MSTOP = SNDRV_TIMER_EVENT_STOP + 10,
+	SNDRV_TIMER_EVENT_MCONTINUE = SNDRV_TIMER_EVENT_CONTINUE + 10,
+	SNDRV_TIMER_EVENT_MPAUSE = SNDRV_TIMER_EVENT_PAUSE + 10,
+	SNDRV_TIMER_EVENT_MSUSPEND = SNDRV_TIMER_EVENT_SUSPEND + 10,
+	SNDRV_TIMER_EVENT_MRESUME = SNDRV_TIMER_EVENT_RESUME + 10,
+};
+
+struct snd_timer_tread {
+	int event;
+	struct timespec tstamp;
+	unsigned int val;
+};
+
+/****************************************************************************
+ *                                                                          *
+ *        Section for driver control interface - /dev/snd/control?          *
+ *                                                                          *
+ ****************************************************************************/
+
+#define SNDRV_CTL_VERSION		SNDRV_PROTOCOL_VERSION(2, 0, 7)
+
+struct snd_ctl_card_info {
+	int card;			/* card number */
+	int pad;			/* reserved for future (was type) */
+	unsigned char id[16];		/* ID of card (user selectable) */
+	unsigned char driver[16];	/* Driver name */
+	unsigned char name[32];		/* Short name of soundcard */
+	unsigned char longname[80];	/* name + info text about soundcard */
+	unsigned char reserved_[16];	/* reserved for future (was ID of mixer) */
+	unsigned char mixername[80];	/* visual mixer identification */
+	unsigned char components[128];	/* card components / fine identification, delimited with one space (AC97 etc..) */
+};
+
+typedef int __bitwise snd_ctl_elem_type_t;
+#define	SNDRV_CTL_ELEM_TYPE_NONE	((__force snd_ctl_elem_type_t) 0) /* invalid */
+#define	SNDRV_CTL_ELEM_TYPE_BOOLEAN	((__force snd_ctl_elem_type_t) 1) /* boolean type */
+#define	SNDRV_CTL_ELEM_TYPE_INTEGER	((__force snd_ctl_elem_type_t) 2) /* integer type */
+#define	SNDRV_CTL_ELEM_TYPE_ENUMERATED	((__force snd_ctl_elem_type_t) 3) /* enumerated type */
+#define	SNDRV_CTL_ELEM_TYPE_BYTES	((__force snd_ctl_elem_type_t) 4) /* byte array */
+#define	SNDRV_CTL_ELEM_TYPE_IEC958	((__force snd_ctl_elem_type_t) 5) /* IEC958 (S/PDIF) setup */
+#define	SNDRV_CTL_ELEM_TYPE_INTEGER64	((__force snd_ctl_elem_type_t) 6) /* 64-bit integer type */
+#define	SNDRV_CTL_ELEM_TYPE_LAST	SNDRV_CTL_ELEM_TYPE_INTEGER64
+
+typedef int __bitwise snd_ctl_elem_iface_t;
+#define	SNDRV_CTL_ELEM_IFACE_CARD	((__force snd_ctl_elem_iface_t) 0) /* global control */
+#define	SNDRV_CTL_ELEM_IFACE_HWDEP	((__force snd_ctl_elem_iface_t) 1) /* hardware dependent device */
+#define	SNDRV_CTL_ELEM_IFACE_MIXER	((__force snd_ctl_elem_iface_t) 2) /* virtual mixer device */
+#define	SNDRV_CTL_ELEM_IFACE_PCM	((__force snd_ctl_elem_iface_t) 3) /* PCM device */
+#define	SNDRV_CTL_ELEM_IFACE_RAWMIDI	((__force snd_ctl_elem_iface_t) 4) /* RawMidi device */
+#define	SNDRV_CTL_ELEM_IFACE_TIMER	((__force snd_ctl_elem_iface_t) 5) /* timer device */
+#define	SNDRV_CTL_ELEM_IFACE_SEQUENCER	((__force snd_ctl_elem_iface_t) 6) /* sequencer client */
+#define	SNDRV_CTL_ELEM_IFACE_LAST	SNDRV_CTL_ELEM_IFACE_SEQUENCER
+
+#define SNDRV_CTL_ELEM_ACCESS_READ		(1<<0)
+#define SNDRV_CTL_ELEM_ACCESS_WRITE		(1<<1)
+#define SNDRV_CTL_ELEM_ACCESS_READWRITE		(SNDRV_CTL_ELEM_ACCESS_READ|SNDRV_CTL_ELEM_ACCESS_WRITE)
+#define SNDRV_CTL_ELEM_ACCESS_VOLATILE		(1<<2)	/* control value may be changed without a notification */
+#define SNDRV_CTL_ELEM_ACCESS_TIMESTAMP		(1<<3)	/* when was control changed */
+#define SNDRV_CTL_ELEM_ACCESS_TLV_READ		(1<<4)	/* TLV read is possible */
+#define SNDRV_CTL_ELEM_ACCESS_TLV_WRITE		(1<<5)	/* TLV write is possible */
+#define SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE	(SNDRV_CTL_ELEM_ACCESS_TLV_READ|SNDRV_CTL_ELEM_ACCESS_TLV_WRITE)
+#define SNDRV_CTL_ELEM_ACCESS_TLV_COMMAND	(1<<6)	/* TLV command is possible */
+#define SNDRV_CTL_ELEM_ACCESS_INACTIVE		(1<<8)	/* control does actually nothing, but may be updated */
+#define SNDRV_CTL_ELEM_ACCESS_LOCK		(1<<9)	/* write lock */
+#define SNDRV_CTL_ELEM_ACCESS_OWNER		(1<<10)	/* write lock owner */
+#define SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK	(1<<28)	/* kernel use a TLV callback */
+#define SNDRV_CTL_ELEM_ACCESS_USER		(1<<29) /* user space element */
+/* bits 30 and 31 are obsoleted (for indirect access) */
+
+/* for further details see the ACPI and PCI power management specification */
+#define SNDRV_CTL_POWER_D0		0x0000	/* full On */
+#define SNDRV_CTL_POWER_D1		0x0100	/* partial On */
+#define SNDRV_CTL_POWER_D2		0x0200	/* partial On */
+#define SNDRV_CTL_POWER_D3		0x0300	/* Off */
+#define SNDRV_CTL_POWER_D3hot		(SNDRV_CTL_POWER_D3|0x0000)	/* Off, with power */
+#define SNDRV_CTL_POWER_D3cold		(SNDRV_CTL_POWER_D3|0x0001)	/* Off, without power */
+
+#define SNDRV_CTL_ELEM_ID_NAME_MAXLEN	44
+
+struct snd_ctl_elem_id {
+	unsigned int numid;		/* numeric identifier, zero = invalid */
+	snd_ctl_elem_iface_t iface;	/* interface identifier */
+	unsigned int device;		/* device/client number */
+	unsigned int subdevice;		/* subdevice (substream) number */
+	unsigned char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN];		/* ASCII name of item */
+	unsigned int index;		/* index of item */
+};
+
+struct snd_ctl_elem_list {
+	unsigned int offset;		/* W: first element ID to get */
+	unsigned int space;		/* W: count of element IDs to get */
+	unsigned int used;		/* R: count of element IDs set */
+	unsigned int count;		/* R: count of all elements */
+	struct snd_ctl_elem_id __user *pids; /* R: IDs */
+	unsigned char reserved[50];
+};
+
+struct snd_ctl_elem_info {
+	struct snd_ctl_elem_id id;	/* W: element ID */
+	snd_ctl_elem_type_t type;	/* R: value type - SNDRV_CTL_ELEM_TYPE_* */
+	unsigned int access;		/* R: value access (bitmask) - SNDRV_CTL_ELEM_ACCESS_* */
+	unsigned int count;		/* count of values */
+	__kernel_pid_t owner;		/* owner's PID of this control */
+	union {
+		struct {
+			long min;		/* R: minimum value */
+			long max;		/* R: maximum value */
+			long step;		/* R: step (0 variable) */
+		} integer;
+		struct {
+			long long min;		/* R: minimum value */
+			long long max;		/* R: maximum value */
+			long long step;		/* R: step (0 variable) */
+		} integer64;
+		struct {
+			unsigned int items;	/* R: number of items */
+			unsigned int item;	/* W: item number */
+			char name[64];		/* R: value name */
+			__u64 names_ptr;	/* W: names list (ELEM_ADD only) */
+			unsigned int names_length;
+		} enumerated;
+		unsigned char reserved[128];
+	} value;
+	union {
+		unsigned short d[4];		/* dimensions */
+		unsigned short *d_ptr;		/* indirect - obsoleted */
+	} dimen;
+	unsigned char reserved[64-4*sizeof(unsigned short)];
+};
+
+struct snd_ctl_elem_value {
+	struct snd_ctl_elem_id id;	/* W: element ID */
+	unsigned int indirect: 1;	/* W: indirect access - obsoleted */
+	union {
+		union {
+			long value[128];
+			long *value_ptr;	/* obsoleted */
+		} integer;
+		union {
+			long long value[64];
+			long long *value_ptr;	/* obsoleted */
+		} integer64;
+		union {
+			unsigned int item[128];
+			unsigned int *item_ptr;	/* obsoleted */
+		} enumerated;
+		union {
+			unsigned char data[512];
+			unsigned char *data_ptr;	/* obsoleted */
+		} bytes;
+		struct snd_aes_iec958 iec958;
+	} value;		/* RO */
+	struct timespec tstamp;
+	unsigned char reserved[128-sizeof(struct timespec)];
+};
+
+struct snd_ctl_tlv {
+	unsigned int numid;	/* control element numeric identification */
+	unsigned int length;	/* in bytes aligned to 4 */
+	unsigned int tlv[0];	/* first TLV */
+};
+
+#define SNDRV_CTL_IOCTL_PVERSION	_IOR('U', 0x00, int)
+#define SNDRV_CTL_IOCTL_CARD_INFO	_IOR('U', 0x01, struct snd_ctl_card_info)
+#define SNDRV_CTL_IOCTL_ELEM_LIST	_IOWR('U', 0x10, struct snd_ctl_elem_list)
+#define SNDRV_CTL_IOCTL_ELEM_INFO	_IOWR('U', 0x11, struct snd_ctl_elem_info)
+#define SNDRV_CTL_IOCTL_ELEM_READ	_IOWR('U', 0x12, struct snd_ctl_elem_value)
+#define SNDRV_CTL_IOCTL_ELEM_WRITE	_IOWR('U', 0x13, struct snd_ctl_elem_value)
+#define SNDRV_CTL_IOCTL_ELEM_LOCK	_IOW('U', 0x14, struct snd_ctl_elem_id)
+#define SNDRV_CTL_IOCTL_ELEM_UNLOCK	_IOW('U', 0x15, struct snd_ctl_elem_id)
+#define SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS _IOWR('U', 0x16, int)
+#define SNDRV_CTL_IOCTL_ELEM_ADD	_IOWR('U', 0x17, struct snd_ctl_elem_info)
+#define SNDRV_CTL_IOCTL_ELEM_REPLACE	_IOWR('U', 0x18, struct snd_ctl_elem_info)
+#define SNDRV_CTL_IOCTL_ELEM_REMOVE	_IOWR('U', 0x19, struct snd_ctl_elem_id)
+#define SNDRV_CTL_IOCTL_TLV_READ	_IOWR('U', 0x1a, struct snd_ctl_tlv)
+#define SNDRV_CTL_IOCTL_TLV_WRITE	_IOWR('U', 0x1b, struct snd_ctl_tlv)
+#define SNDRV_CTL_IOCTL_TLV_COMMAND	_IOWR('U', 0x1c, struct snd_ctl_tlv)
+#define SNDRV_CTL_IOCTL_HWDEP_NEXT_DEVICE _IOWR('U', 0x20, int)
+#define SNDRV_CTL_IOCTL_HWDEP_INFO	_IOR('U', 0x21, struct snd_hwdep_info)
+#define SNDRV_CTL_IOCTL_PCM_NEXT_DEVICE	_IOR('U', 0x30, int)
+#define SNDRV_CTL_IOCTL_PCM_INFO	_IOWR('U', 0x31, struct snd_pcm_info)
+#define SNDRV_CTL_IOCTL_PCM_PREFER_SUBDEVICE _IOW('U', 0x32, int)
+#define SNDRV_CTL_IOCTL_RAWMIDI_NEXT_DEVICE _IOWR('U', 0x40, int)
+#define SNDRV_CTL_IOCTL_RAWMIDI_INFO	_IOWR('U', 0x41, struct snd_rawmidi_info)
+#define SNDRV_CTL_IOCTL_RAWMIDI_PREFER_SUBDEVICE _IOW('U', 0x42, int)
+#define SNDRV_CTL_IOCTL_POWER		_IOWR('U', 0xd0, int)
+#define SNDRV_CTL_IOCTL_POWER_STATE	_IOR('U', 0xd1, int)
+
+/*
+ *  Read interface.
+ */
+
+enum sndrv_ctl_event_type {
+	SNDRV_CTL_EVENT_ELEM = 0,
+	SNDRV_CTL_EVENT_LAST = SNDRV_CTL_EVENT_ELEM,
+};
+
+#define SNDRV_CTL_EVENT_MASK_VALUE	(1<<0)	/* element value was changed */
+#define SNDRV_CTL_EVENT_MASK_INFO	(1<<1)	/* element info was changed */
+#define SNDRV_CTL_EVENT_MASK_ADD	(1<<2)	/* element was added */
+#define SNDRV_CTL_EVENT_MASK_TLV	(1<<3)	/* element TLV tree was changed */
+#define SNDRV_CTL_EVENT_MASK_REMOVE	(~0U)	/* element was removed */
+
+struct snd_ctl_event {
+	int type;	/* event type - SNDRV_CTL_EVENT_* */
+	union {
+		struct {
+			unsigned int mask;
+			struct snd_ctl_elem_id id;
+		} elem;
+		unsigned char data8[60];
+	} data;
+};
+
+/*
+ *  Control names
+ */
+
+#define SNDRV_CTL_NAME_NONE				""
+#define SNDRV_CTL_NAME_PLAYBACK				"Playback "
+#define SNDRV_CTL_NAME_CAPTURE				"Capture "
+
+#define SNDRV_CTL_NAME_IEC958_NONE			""
+#define SNDRV_CTL_NAME_IEC958_SWITCH			"Switch"
+#define SNDRV_CTL_NAME_IEC958_VOLUME			"Volume"
+#define SNDRV_CTL_NAME_IEC958_DEFAULT			"Default"
+#define SNDRV_CTL_NAME_IEC958_MASK			"Mask"
+#define SNDRV_CTL_NAME_IEC958_CON_MASK			"Con Mask"
+#define SNDRV_CTL_NAME_IEC958_PRO_MASK			"Pro Mask"
+#define SNDRV_CTL_NAME_IEC958_PCM_STREAM		"PCM Stream"
+#define SNDRV_CTL_NAME_IEC958(expl,direction,what)	"IEC958 " expl SNDRV_CTL_NAME_##direction SNDRV_CTL_NAME_IEC958_##what
+
+#endif /* _UAPI__SOUND_ASOUND_H */
diff --git a/tools/lguest/.gitignore b/tools/lguest/.gitignore
deleted file mode 100644
index 8d9a8383a52e..000000000000
--- a/tools/lguest/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-lguest
-include
diff --git a/tools/lguest/Makefile b/tools/lguest/Makefile
deleted file mode 100644
index d04599a79802..000000000000
--- a/tools/lguest/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-# This creates the demonstration utility "lguest" which runs a Linux guest.
-CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude
-
-all: lguest
-
-include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h
-	mkdir -p include/linux 2>&1 || true
-	ln -sf ../../../../include/uapi/linux/virtio_types.h $@
-
-lguest: include/linux/virtio_types.h
-
-clean:
-	rm -f lguest
-	rm -rf include
diff --git a/tools/lguest/extract b/tools/lguest/extract
deleted file mode 100644
index 7730bb6e4b94..000000000000
--- a/tools/lguest/extract
+++ /dev/null
@@ -1,58 +0,0 @@
-#! /bin/sh
-
-set -e
-
-PREFIX=$1
-shift
-
-trap 'rm -r $TMPDIR' 0
-TMPDIR=`mktemp -d`
-
-exec 3>/dev/null
-for f; do
-    while IFS="
-" read -r LINE; do
-	case "$LINE" in
-	    *$PREFIX:[0-9]*:\**)
-		NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
-		if [ -f $TMPDIR/$NUM ]; then
-		    echo "$TMPDIR/$NUM already exits prior to $f"
-		    exit 1
-		fi
-		exec 3>>$TMPDIR/$NUM
-		echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
-		/bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
-		;;
-	    *$PREFIX:[0-9]*)
-		NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
-		if [ -f $TMPDIR/$NUM ]; then
-		    echo "$TMPDIR/$NUM already exits prior to $f"
-		    exit 1
-		fi
-		exec 3>>$TMPDIR/$NUM
-		echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
-		/bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
-		;;
-	    *:\**)
-		/bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
-		echo >&3
-		exec 3>/dev/null
-		;;
-	    *)
-		/bin/echo "$LINE" >&3
-		;;
-	esac
-    done < $f
-    echo >&3
-    exec 3>/dev/null
-done
-
-LASTFILE=""
-for f in $TMPDIR/*; do
-    if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then
-	LASTFILE=$(cat $TMPDIR/.$(basename $f) )
-	echo "[ $LASTFILE ]"
-    fi
-    cat $f
-done
-
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
deleted file mode 100644
index 897cd6f3f687..000000000000
--- a/tools/lguest/lguest.c
+++ /dev/null
@@ -1,3420 +0,0 @@
-/*P:100
- * This is the Launcher code, a simple program which lays out the "physical"
- * memory for the new Guest by mapping the kernel image and the virtual
- * devices, then opens /dev/lguest to tell the kernel about the Guest and
- * control it.
-:*/
-#define _LARGEFILE64_SOURCE
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <err.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <elf.h>
-#include <sys/mman.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <sys/eventfd.h>
-#include <fcntl.h>
-#include <stdbool.h>
-#include <errno.h>
-#include <ctype.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <time.h>
-#include <netinet/in.h>
-#include <net/if.h>
-#include <linux/sockios.h>
-#include <linux/if_tun.h>
-#include <sys/uio.h>
-#include <termios.h>
-#include <getopt.h>
-#include <assert.h>
-#include <sched.h>
-#include <limits.h>
-#include <stddef.h>
-#include <signal.h>
-#include <pwd.h>
-#include <grp.h>
-#include <sys/user.h>
-#include <linux/pci_regs.h>
-
-#ifndef VIRTIO_F_ANY_LAYOUT
-#define VIRTIO_F_ANY_LAYOUT		27
-#endif
-
-/*L:110
- * We can ignore the 43 include files we need for this program, but I do want
- * to draw attention to the use of kernel-style types.
- *
- * As Linus said, "C is a Spartan language, and so should your naming be."  I
- * like these abbreviations, so we define them here.  Note that u64 is always
- * unsigned long long, which works on all Linux systems: this means that we can
- * use %llu in printf for any u64.
- */
-typedef unsigned long long u64;
-typedef uint32_t u32;
-typedef uint16_t u16;
-typedef uint8_t u8;
-/*:*/
-
-#define VIRTIO_CONFIG_NO_LEGACY
-#define VIRTIO_PCI_NO_LEGACY
-#define VIRTIO_BLK_NO_LEGACY
-#define VIRTIO_NET_NO_LEGACY
-
-/* Use in-kernel ones, which defines VIRTIO_F_VERSION_1 */
-#include "../../include/uapi/linux/virtio_config.h"
-#include "../../include/uapi/linux/virtio_net.h"
-#include "../../include/uapi/linux/virtio_blk.h"
-#include "../../include/uapi/linux/virtio_console.h"
-#include "../../include/uapi/linux/virtio_rng.h"
-#include <linux/virtio_ring.h>
-#include "../../include/uapi/linux/virtio_pci.h"
-#include <asm/bootparam.h>
-#include "../../include/linux/lguest_launcher.h"
-
-#define BRIDGE_PFX "bridge:"
-#ifndef SIOCBRADDIF
-#define SIOCBRADDIF	0x89a2		/* add interface to bridge      */
-#endif
-/* We can have up to 256 pages for devices. */
-#define DEVICE_PAGES 256
-/* This will occupy 3 pages: it must be a power of 2. */
-#define VIRTQUEUE_NUM 256
-
-/*L:120
- * verbose is both a global flag and a macro.  The C preprocessor allows
- * this, and although I wouldn't recommend it, it works quite nicely here.
- */
-static bool verbose;
-#define verbose(args...) \
-	do { if (verbose) printf(args); } while(0)
-/*:*/
-
-/* The pointer to the start of guest memory. */
-static void *guest_base;
-/* The maximum guest physical address allowed, and maximum possible. */
-static unsigned long guest_limit, guest_max, guest_mmio;
-/* The /dev/lguest file descriptor. */
-static int lguest_fd;
-
-/* a per-cpu variable indicating whose vcpu is currently running */
-static unsigned int __thread cpu_id;
-
-/* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */
-#define MAX_PCI_DEVICES 32
-
-/* This is our list of devices. */
-struct device_list {
-	/* Counter to assign interrupt numbers. */
-	unsigned int next_irq;
-
-	/* Counter to print out convenient device numbers. */
-	unsigned int device_num;
-
-	/* PCI devices. */
-	struct device *pci[MAX_PCI_DEVICES];
-};
-
-/* The list of Guest devices, based on command line arguments. */
-static struct device_list devices;
-
-/*
- * Just like struct virtio_pci_cfg_cap in uapi/linux/virtio_pci.h,
- * but uses a u32 explicitly for the data.
- */
-struct virtio_pci_cfg_cap_u32 {
-	struct virtio_pci_cap cap;
-	u32 pci_cfg_data; /* Data for BAR access. */
-};
-
-struct virtio_pci_mmio {
-	struct virtio_pci_common_cfg cfg;
-	u16 notify;
-	u8 isr;
-	u8 padding;
-	/* Device-specific configuration follows this. */
-};
-
-/* This is the layout (little-endian) of the PCI config space. */
-struct pci_config {
-	u16 vendor_id, device_id;
-	u16 command, status;
-	u8 revid, prog_if, subclass, class;
-	u8 cacheline_size, lat_timer, header_type, bist;
-	u32 bar[6];
-	u32 cardbus_cis_ptr;
-	u16 subsystem_vendor_id, subsystem_device_id;
-	u32 expansion_rom_addr;
-	u8 capabilities, reserved1[3];
-	u32 reserved2;
-	u8 irq_line, irq_pin, min_grant, max_latency;
-
-	/* Now, this is the linked capability list. */
-	struct virtio_pci_cap common;
-	struct virtio_pci_notify_cap notify;
-	struct virtio_pci_cap isr;
-	struct virtio_pci_cap device;
-	struct virtio_pci_cfg_cap_u32 cfg_access;
-};
-
-/* The device structure describes a single device. */
-struct device {
-	/* The name of this device, for --verbose. */
-	const char *name;
-
-	/* Any queues attached to this device */
-	struct virtqueue *vq;
-
-	/* Is it operational */
-	bool running;
-
-	/* Has it written FEATURES_OK but not re-checked it? */
-	bool wrote_features_ok;
-
-	/* PCI configuration */
-	union {
-		struct pci_config config;
-		u32 config_words[sizeof(struct pci_config) / sizeof(u32)];
-	};
-
-	/* Features we offer, and those accepted. */
-	u64 features, features_accepted;
-
-	/* Device-specific config hangs off the end of this. */
-	struct virtio_pci_mmio *mmio;
-
-	/* PCI MMIO resources (all in BAR0) */
-	size_t mmio_size;
-	u32 mmio_addr;
-
-	/* Device-specific data. */
-	void *priv;
-};
-
-/* The virtqueue structure describes a queue attached to a device. */
-struct virtqueue {
-	struct virtqueue *next;
-
-	/* Which device owns me. */
-	struct device *dev;
-
-	/* Name for printing errors. */
-	const char *name;
-
-	/* The actual ring of buffers. */
-	struct vring vring;
-
-	/* The information about this virtqueue (we only use queue_size on) */
-	struct virtio_pci_common_cfg pci_config;
-
-	/* Last available index we saw. */
-	u16 last_avail_idx;
-
-	/* How many are used since we sent last irq? */
-	unsigned int pending_used;
-
-	/* Eventfd where Guest notifications arrive. */
-	int eventfd;
-
-	/* Function for the thread which is servicing this virtqueue. */
-	void (*service)(struct virtqueue *vq);
-	pid_t thread;
-};
-
-/* Remember the arguments to the program so we can "reboot" */
-static char **main_args;
-
-/* The original tty settings to restore on exit. */
-static struct termios orig_term;
-
-/*
- * We have to be careful with barriers: our devices are all run in separate
- * threads and so we need to make sure that changes visible to the Guest happen
- * in precise order.
- */
-#define wmb() __asm__ __volatile__("" : : : "memory")
-#define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
-#define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
-
-/* Wrapper for the last available index.  Makes it easier to change. */
-#define lg_last_avail(vq)	((vq)->last_avail_idx)
-
-/*
- * The virtio configuration space is defined to be little-endian.  x86 is
- * little-endian too, but it's nice to be explicit so we have these helpers.
- */
-#define cpu_to_le16(v16) (v16)
-#define cpu_to_le32(v32) (v32)
-#define cpu_to_le64(v64) (v64)
-#define le16_to_cpu(v16) (v16)
-#define le32_to_cpu(v32) (v32)
-#define le64_to_cpu(v64) (v64)
-
-/*
- * A real device would ignore weird/non-compliant driver behaviour.  We
- * stop and flag it, to help debugging Linux problems.
- */
-#define bad_driver(d, fmt, ...) \
-	errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__)
-#define bad_driver_vq(vq, fmt, ...)			       \
-	errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \
-	     vq->name, ## __VA_ARGS__)
-
-/* Is this iovec empty? */
-static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
-{
-	unsigned int i;
-
-	for (i = 0; i < num_iov; i++)
-		if (iov[i].iov_len)
-			return false;
-	return true;
-}
-
-/* Take len bytes from the front of this iovec. */
-static void iov_consume(struct device *d,
-			struct iovec iov[], unsigned num_iov,
-			void *dest, unsigned len)
-{
-	unsigned int i;
-
-	for (i = 0; i < num_iov; i++) {
-		unsigned int used;
-
-		used = iov[i].iov_len < len ? iov[i].iov_len : len;
-		if (dest) {
-			memcpy(dest, iov[i].iov_base, used);
-			dest += used;
-		}
-		iov[i].iov_base += used;
-		iov[i].iov_len -= used;
-		len -= used;
-	}
-	if (len != 0)
-		bad_driver(d, "iovec too short!");
-}
-
-/*L:100
- * The Launcher code itself takes us out into userspace, that scary place where
- * pointers run wild and free!  Unfortunately, like most userspace programs,
- * it's quite boring (which is why everyone likes to hack on the kernel!).
- * Perhaps if you make up an Lguest Drinking Game at this point, it will get
- * you through this section.  Or, maybe not.
- *
- * The Launcher sets up a big chunk of memory to be the Guest's "physical"
- * memory and stores it in "guest_base".  In other words, Guest physical ==
- * Launcher virtual with an offset.
- *
- * This can be tough to get your head around, but usually it just means that we
- * use these trivial conversion functions when the Guest gives us its
- * "physical" addresses:
- */
-static void *from_guest_phys(unsigned long addr)
-{
-	return guest_base + addr;
-}
-
-static unsigned long to_guest_phys(const void *addr)
-{
-	return (addr - guest_base);
-}
-
-/*L:130
- * Loading the Kernel.
- *
- * We start with couple of simple helper routines.  open_or_die() avoids
- * error-checking code cluttering the callers:
- */
-static int open_or_die(const char *name, int flags)
-{
-	int fd = open(name, flags);
-	if (fd < 0)
-		err(1, "Failed to open %s", name);
-	return fd;
-}
-
-/* map_zeroed_pages() takes a number of pages. */
-static void *map_zeroed_pages(unsigned int num)
-{
-	int fd = open_or_die("/dev/zero", O_RDONLY);
-	void *addr;
-
-	/*
-	 * We use a private mapping (ie. if we write to the page, it will be
-	 * copied). We allocate an extra two pages PROT_NONE to act as guard
-	 * pages against read/write attempts that exceed allocated space.
-	 */
-	addr = mmap(NULL, getpagesize() * (num+2),
-		    PROT_NONE, MAP_PRIVATE, fd, 0);
-
-	if (addr == MAP_FAILED)
-		err(1, "Mmapping %u pages of /dev/zero", num);
-
-	if (mprotect(addr + getpagesize(), getpagesize() * num,
-		     PROT_READ|PROT_WRITE) == -1)
-		err(1, "mprotect rw %u pages failed", num);
-
-	/*
-	 * One neat mmap feature is that you can close the fd, and it
-	 * stays mapped.
-	 */
-	close(fd);
-
-	/* Return address after PROT_NONE page */
-	return addr + getpagesize();
-}
-
-/* Get some bytes which won't be mapped into the guest. */
-static unsigned long get_mmio_region(size_t size)
-{
-	unsigned long addr = guest_mmio;
-	size_t i;
-
-	if (!size)
-		return addr;
-
-	/* Size has to be a power of 2 (and multiple of 16) */
-	for (i = 1; i < size; i <<= 1);
-
-	guest_mmio += i;
-
-	return addr;
-}
-
-/*
- * This routine is used to load the kernel or initrd.  It tries mmap, but if
- * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
- * it falls back to reading the memory in.
- */
-static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
-{
-	ssize_t r;
-
-	/*
-	 * We map writable even though for some segments are marked read-only.
-	 * The kernel really wants to be writable: it patches its own
-	 * instructions.
-	 *
-	 * MAP_PRIVATE means that the page won't be copied until a write is
-	 * done to it.  This allows us to share untouched memory between
-	 * Guests.
-	 */
-	if (mmap(addr, len, PROT_READ|PROT_WRITE,
-		 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
-		return;
-
-	/* pread does a seek and a read in one shot: saves a few lines. */
-	r = pread(fd, addr, len, offset);
-	if (r != len)
-		err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
-}
-
-/*
- * This routine takes an open vmlinux image, which is in ELF, and maps it into
- * the Guest memory.  ELF = Embedded Linking Format, which is the format used
- * by all modern binaries on Linux including the kernel.
- *
- * The ELF headers give *two* addresses: a physical address, and a virtual
- * address.  We use the physical address; the Guest will map itself to the
- * virtual address.
- *
- * We return the starting address.
- */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
-{
-	Elf32_Phdr phdr[ehdr->e_phnum];
-	unsigned int i;
-
-	/*
-	 * Sanity checks on the main ELF header: an x86 executable with a
-	 * reasonable number of correctly-sized program headers.
-	 */
-	if (ehdr->e_type != ET_EXEC
-	    || ehdr->e_machine != EM_386
-	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
-	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
-		errx(1, "Malformed elf header");
-
-	/*
-	 * An ELF executable contains an ELF header and a number of "program"
-	 * headers which indicate which parts ("segments") of the program to
-	 * load where.
-	 */
-
-	/* We read in all the program headers at once: */
-	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
-		err(1, "Seeking to program headers");
-	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
-		err(1, "Reading program headers");
-
-	/*
-	 * Try all the headers: there are usually only three.  A read-only one,
-	 * a read-write one, and a "note" section which we don't load.
-	 */
-	for (i = 0; i < ehdr->e_phnum; i++) {
-		/* If this isn't a loadable segment, we ignore it */
-		if (phdr[i].p_type != PT_LOAD)
-			continue;
-
-		verbose("Section %i: size %i addr %p\n",
-			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
-
-		/* We map this section of the file at its physical address. */
-		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
-		       phdr[i].p_offset, phdr[i].p_filesz);
-	}
-
-	/* The entry point is given in the ELF header. */
-	return ehdr->e_entry;
-}
-
-/*L:150
- * A bzImage, unlike an ELF file, is not meant to be loaded.  You're supposed
- * to jump into it and it will unpack itself.  We used to have to perform some
- * hairy magic because the unpacking code scared me.
- *
- * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
- * a small patch to jump over the tricky bits in the Guest, so now we just read
- * the funky header so we know where in the file to load, and away we go!
- */
-static unsigned long load_bzimage(int fd)
-{
-	struct boot_params boot;
-	int r;
-	/* Modern bzImages get loaded at 1M. */
-	void *p = from_guest_phys(0x100000);
-
-	/*
-	 * Go back to the start of the file and read the header.  It should be
-	 * a Linux boot header (see Documentation/x86/boot.txt)
-	 */
-	lseek(fd, 0, SEEK_SET);
-	read(fd, &boot, sizeof(boot));
-
-	/* Inside the setup_hdr, we expect the magic "HdrS" */
-	if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
-		errx(1, "This doesn't look like a bzImage to me");
-
-	/* Skip over the extra sectors of the header. */
-	lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
-
-	/* Now read everything into memory. in nice big chunks. */
-	while ((r = read(fd, p, 65536)) > 0)
-		p += r;
-
-	/* Finally, code32_start tells us where to enter the kernel. */
-	return boot.hdr.code32_start;
-}
-
-/*L:140
- * Loading the kernel is easy when it's a "vmlinux", but most kernels
- * come wrapped up in the self-decompressing "bzImage" format.  With a little
- * work, we can load those, too.
- */
-static unsigned long load_kernel(int fd)
-{
-	Elf32_Ehdr hdr;
-
-	/* Read in the first few bytes. */
-	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
-		err(1, "Reading kernel");
-
-	/* If it's an ELF file, it starts with "\177ELF" */
-	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-		return map_elf(fd, &hdr);
-
-	/* Otherwise we assume it's a bzImage, and try to load it. */
-	return load_bzimage(fd);
-}
-
-/*
- * This is a trivial little helper to align pages.  Andi Kleen hated it because
- * it calls getpagesize() twice: "it's dumb code."
- *
- * Kernel guys get really het up about optimization, even when it's not
- * necessary.  I leave this code as a reaction against that.
- */
-static inline unsigned long page_align(unsigned long addr)
-{
-	/* Add upwards and truncate downwards. */
-	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
-}
-
-/*L:180
- * An "initial ram disk" is a disk image loaded into memory along with the
- * kernel which the kernel can use to boot from without needing any drivers.
- * Most distributions now use this as standard: the initrd contains the code to
- * load the appropriate driver modules for the current machine.
- *
- * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
- * kernels.  He sent me this (and tells me when I break it).
- */
-static unsigned long load_initrd(const char *name, unsigned long mem)
-{
-	int ifd;
-	struct stat st;
-	unsigned long len;
-
-	ifd = open_or_die(name, O_RDONLY);
-	/* fstat() is needed to get the file size. */
-	if (fstat(ifd, &st) < 0)
-		err(1, "fstat() on initrd '%s'", name);
-
-	/*
-	 * We map the initrd at the top of memory, but mmap wants it to be
-	 * page-aligned, so we round the size up for that.
-	 */
-	len = page_align(st.st_size);
-	map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
-	/*
-	 * Once a file is mapped, you can close the file descriptor.  It's a
-	 * little odd, but quite useful.
-	 */
-	close(ifd);
-	verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
-
-	/* We return the initrd size. */
-	return len;
-}
-/*:*/
-
-/*
- * Simple routine to roll all the commandline arguments together with spaces
- * between them.
- */
-static void concat(char *dst, char *args[])
-{
-	unsigned int i, len = 0;
-
-	for (i = 0; args[i]; i++) {
-		if (i) {
-			strcat(dst+len, " ");
-			len++;
-		}
-		strcpy(dst+len, args[i]);
-		len += strlen(args[i]);
-	}
-	/* In case it's empty. */
-	dst[len] = '\0';
-}
-
-/*L:185
- * This is where we actually tell the kernel to initialize the Guest.  We
- * saw the arguments it expects when we looked at initialize() in lguest_user.c:
- * the base of Guest "physical" memory, the top physical page to allow and the
- * entry point for the Guest.
- */
-static void tell_kernel(unsigned long start)
-{
-	unsigned long args[] = { LHREQ_INITIALIZE,
-				 (unsigned long)guest_base,
-				 guest_limit / getpagesize(), start,
-				 (guest_mmio+getpagesize()-1) / getpagesize() };
-	verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n",
-		guest_base, guest_base + guest_limit,
-		guest_limit, guest_mmio);
-	lguest_fd = open_or_die("/dev/lguest", O_RDWR);
-	if (write(lguest_fd, args, sizeof(args)) < 0)
-		err(1, "Writing to /dev/lguest");
-}
-/*:*/
-
-/*L:200
- * Device Handling.
- *
- * When the Guest gives us a buffer, it sends an array of addresses and sizes.
- * We need to make sure it's not trying to reach into the Launcher itself, so
- * we have a convenient routine which checks it and exits with an error message
- * if something funny is going on:
- */
-static void *_check_pointer(struct device *d,
-			    unsigned long addr, unsigned int size,
-			    unsigned int line)
-{
-	/*
-	 * Check if the requested address and size exceeds the allocated memory,
-	 * or addr + size wraps around.
-	 */
-	if ((addr + size) > guest_limit || (addr + size) < addr)
-		bad_driver(d, "%s:%i: Invalid address %#lx",
-			   __FILE__, line, addr);
-	/*
-	 * We return a pointer for the caller's convenience, now we know it's
-	 * safe to use.
-	 */
-	return from_guest_phys(addr);
-}
-/* A macro which transparently hands the line number to the real function. */
-#define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__)
-
-/*
- * Each buffer in the virtqueues is actually a chain of descriptors.  This
- * function returns the next descriptor in the chain, or vq->vring.num if we're
- * at the end.
- */
-static unsigned next_desc(struct device *d, struct vring_desc *desc,
-			  unsigned int i, unsigned int max)
-{
-	unsigned int next;
-
-	/* If this descriptor says it doesn't chain, we're done. */
-	if (!(desc[i].flags & VRING_DESC_F_NEXT))
-		return max;
-
-	/* Check they're not leading us off end of descriptors. */
-	next = desc[i].next;
-	/* Make sure compiler knows to grab that: we don't want it changing! */
-	wmb();
-
-	if (next >= max)
-		bad_driver(d, "Desc next is %u", next);
-
-	return next;
-}
-
-/*
- * This actually sends the interrupt for this virtqueue, if we've used a
- * buffer.
- */
-static void trigger_irq(struct virtqueue *vq)
-{
-	unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line };
-
-	/* Don't inform them if nothing used. */
-	if (!vq->pending_used)
-		return;
-	vq->pending_used = 0;
-
-	/*
-	 * 2.4.7.1:
-	 *
-	 *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
-	 *    The driver MUST set flags to 0 or 1. 
-	 */
-	if (vq->vring.avail->flags > 1)
-		bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags);
-
-	/*
-	 * 2.4.7.2:
-	 *
-	 *  If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
-	 *
-	 *     - The device MUST ignore the used_event value.
-	 *     - After the device writes a descriptor index into the used ring:
-	 *         - If flags is 1, the device SHOULD NOT send an interrupt.
-	 *         - If flags is 0, the device MUST send an interrupt.
-	 */
-	if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
-		return;
-	}
-
-	/*
-	 * 4.1.4.5.1:
-	 *
-	 *  If MSI-X capability is disabled, the device MUST set the Queue
-	 *  Interrupt bit in ISR status before sending a virtqueue notification
-	 *  to the driver.
-	 */
-	vq->dev->mmio->isr = 0x1;
-
-	/* Send the Guest an interrupt tell them we used something up. */
-	if (write(lguest_fd, buf, sizeof(buf)) != 0)
-		err(1, "Triggering irq %i", vq->dev->config.irq_line);
-}
-
-/*
- * This looks in the virtqueue for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function waits if necessary, and returns the descriptor number found.
- */
-static unsigned wait_for_vq_desc(struct virtqueue *vq,
-				 struct iovec iov[],
-				 unsigned int *out_num, unsigned int *in_num)
-{
-	unsigned int i, head, max;
-	struct vring_desc *desc;
-	u16 last_avail = lg_last_avail(vq);
-
-	/*
-	 * 2.4.7.1:
-	 *
-	 *   The driver MUST handle spurious interrupts from the device.
-	 *
-	 * That's why this is a while loop.
-	 */
-
-	/* There's nothing available? */
-	while (last_avail == vq->vring.avail->idx) {
-		u64 event;
-
-		/*
-		 * Since we're about to sleep, now is a good time to tell the
-		 * Guest about what we've used up to now.
-		 */
-		trigger_irq(vq);
-
-		/* OK, now we need to know about added descriptors. */
-		vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
-
-		/*
-		 * They could have slipped one in as we were doing that: make
-		 * sure it's written, then check again.
-		 */
-		mb();
-		if (last_avail != vq->vring.avail->idx) {
-			vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-			break;
-		}
-
-		/* Nothing new?  Wait for eventfd to tell us they refilled. */
-		if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
-			errx(1, "Event read failed?");
-
-		/* We don't need to be notified again. */
-		vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-	}
-
-	/* Check it isn't doing very strange things with descriptor numbers. */
-	if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
-		bad_driver_vq(vq, "Guest moved used index from %u to %u",
-			      last_avail, vq->vring.avail->idx);
-
-	/* 
-	 * Make sure we read the descriptor number *after* we read the ring
-	 * update; don't let the cpu or compiler change the order.
-	 */
-	rmb();
-
-	/*
-	 * Grab the next descriptor number they're advertising, and increment
-	 * the index we've seen.
-	 */
-	head = vq->vring.avail->ring[last_avail % vq->vring.num];
-	lg_last_avail(vq)++;
-
-	/* If their number is silly, that's a fatal mistake. */
-	if (head >= vq->vring.num)
-		bad_driver_vq(vq, "Guest says index %u is available", head);
-
-	/* When we start there are none of either input nor output. */
-	*out_num = *in_num = 0;
-
-	max = vq->vring.num;
-	desc = vq->vring.desc;
-	i = head;
-
-	/*
-	 * We have to read the descriptor after we read the descriptor number,
-	 * but there's a data dependency there so the CPU shouldn't reorder
-	 * that: no rmb() required.
-	 */
-
-	do {
-		/*
-		 * If this is an indirect entry, then this buffer contains a
-		 * descriptor table which we handle as if it's any normal
-		 * descriptor chain.
-		 */
-		if (desc[i].flags & VRING_DESC_F_INDIRECT) {
-			/* 2.4.5.3.1:
-			 *
-			 *  The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
-			 *  flag unless the VIRTIO_F_INDIRECT_DESC feature was
-			 *  negotiated.
-			 */
-			if (!(vq->dev->features_accepted &
-			      (1<<VIRTIO_RING_F_INDIRECT_DESC)))
-				bad_driver_vq(vq, "vq indirect not negotiated");
-
-			/*
-			 * 2.4.5.3.1:
-			 *
-			 *   The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
-			 *   flag within an indirect descriptor (ie. only one
-			 *   table per descriptor).
-			 */
-			if (desc != vq->vring.desc)
-				bad_driver_vq(vq, "Indirect within indirect");
-
-			/*
-			 * Proposed update VIRTIO-134 spells this out:
-			 *
-			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
-			 *   and VIRTQ_DESC_F_NEXT in flags.
-			 */
-			if (desc[i].flags & VRING_DESC_F_NEXT)
-				bad_driver_vq(vq, "indirect and next together");
-
-			if (desc[i].len % sizeof(struct vring_desc))
-				bad_driver_vq(vq,
-					      "Invalid size for indirect table");
-			/*
-			 * 2.4.5.3.2:
-			 *
-			 *  The device MUST ignore the write-only flag
-			 *  (flags&VIRTQ_DESC_F_WRITE) in the descriptor that
-			 *  refers to an indirect table.
-			 *
-			 * We ignore it here: :)
-			 */
-
-			max = desc[i].len / sizeof(struct vring_desc);
-			desc = check_pointer(vq->dev, desc[i].addr, desc[i].len);
-			i = 0;
-
-			/* 2.4.5.3.1:
-			 *
-			 *  A driver MUST NOT create a descriptor chain longer
-			 *  than the Queue Size of the device.
-			 */
-			if (max > vq->pci_config.queue_size)
-				bad_driver_vq(vq,
-					      "indirect has too many entries");
-		}
-
-		/* Grab the first descriptor, and check it's OK. */
-		iov[*out_num + *in_num].iov_len = desc[i].len;
-		iov[*out_num + *in_num].iov_base
-			= check_pointer(vq->dev, desc[i].addr, desc[i].len);
-		/* If this is an input descriptor, increment that count. */
-		if (desc[i].flags & VRING_DESC_F_WRITE)
-			(*in_num)++;
-		else {
-			/*
-			 * If it's an output descriptor, they're all supposed
-			 * to come before any input descriptors.
-			 */
-			if (*in_num)
-				bad_driver_vq(vq,
-					      "Descriptor has out after in");
-			(*out_num)++;
-		}
-
-		/* If we've got too many, that implies a descriptor loop. */
-		if (*out_num + *in_num > max)
-			bad_driver_vq(vq, "Looped descriptor");
-	} while ((i = next_desc(vq->dev, desc, i, max)) != max);
-
-	return head;
-}
-
-/*
- * After we've used one of their buffers, we tell the Guest about it.  Sometime
- * later we'll want to send them an interrupt using trigger_irq(); note that
- * wait_for_vq_desc() does that for us if it has to wait.
- */
-static void add_used(struct virtqueue *vq, unsigned int head, int len)
-{
-	struct vring_used_elem *used;
-
-	/*
-	 * The virtqueue contains a ring of used buffers.  Get a pointer to the
-	 * next entry in that used ring.
-	 */
-	used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
-	used->id = head;
-	used->len = len;
-	/* Make sure buffer is written before we update index. */
-	wmb();
-	vq->vring.used->idx++;
-	vq->pending_used++;
-}
-
-/* And here's the combo meal deal.  Supersize me! */
-static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
-{
-	add_used(vq, head, len);
-	trigger_irq(vq);
-}
-
-/*
- * The Console
- *
- * We associate some data with the console for our exit hack.
- */
-struct console_abort {
-	/* How many times have they hit ^C? */
-	int count;
-	/* When did they start? */
-	struct timeval start;
-};
-
-/* This is the routine which handles console input (ie. stdin). */
-static void console_input(struct virtqueue *vq)
-{
-	int len;
-	unsigned int head, in_num, out_num;
-	struct console_abort *abort = vq->dev->priv;
-	struct iovec iov[vq->vring.num];
-
-	/* Make sure there's a descriptor available. */
-	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-	if (out_num)
-		bad_driver_vq(vq, "Output buffers in console in queue?");
-
-	/* Read into it.  This is where we usually wait. */
-	len = readv(STDIN_FILENO, iov, in_num);
-	if (len <= 0) {
-		/* Ran out of input? */
-		warnx("Failed to get console input, ignoring console.");
-		/*
-		 * For simplicity, dying threads kill the whole Launcher.  So
-		 * just nap here.
-		 */
-		for (;;)
-			pause();
-	}
-
-	/* Tell the Guest we used a buffer. */
-	add_used_and_trigger(vq, head, len);
-
-	/*
-	 * Three ^C within one second?  Exit.
-	 *
-	 * This is such a hack, but works surprisingly well.  Each ^C has to
-	 * be in a buffer by itself, so they can't be too fast.  But we check
-	 * that we get three within about a second, so they can't be too
-	 * slow.
-	 */
-	if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
-		abort->count = 0;
-		return;
-	}
-
-	abort->count++;
-	if (abort->count == 1)
-		gettimeofday(&abort->start, NULL);
-	else if (abort->count == 3) {
-		struct timeval now;
-		gettimeofday(&now, NULL);
-		/* Kill all Launcher processes with SIGINT, like normal ^C */
-		if (now.tv_sec <= abort->start.tv_sec+1)
-			kill(0, SIGINT);
-		abort->count = 0;
-	}
-}
-
-/* This is the routine which handles console output (ie. stdout). */
-static void console_output(struct virtqueue *vq)
-{
-	unsigned int head, out, in;
-	struct iovec iov[vq->vring.num];
-
-	/* We usually wait in here, for the Guest to give us something. */
-	head = wait_for_vq_desc(vq, iov, &out, &in);
-	if (in)
-		bad_driver_vq(vq, "Input buffers in console output queue?");
-
-	/* writev can return a partial write, so we loop here. */
-	while (!iov_empty(iov, out)) {
-		int len = writev(STDOUT_FILENO, iov, out);
-		if (len <= 0) {
-			warn("Write to stdout gave %i (%d)", len, errno);
-			break;
-		}
-		iov_consume(vq->dev, iov, out, NULL, len);
-	}
-
-	/*
-	 * We're finished with that buffer: if we're going to sleep,
-	 * wait_for_vq_desc() will prod the Guest with an interrupt.
-	 */
-	add_used(vq, head, 0);
-}
-
-/*
- * The Network
- *
- * Handling output for network is also simple: we get all the output buffers
- * and write them to /dev/net/tun.
- */
-struct net_info {
-	int tunfd;
-};
-
-static void net_output(struct virtqueue *vq)
-{
-	struct net_info *net_info = vq->dev->priv;
-	unsigned int head, out, in;
-	struct iovec iov[vq->vring.num];
-
-	/* We usually wait in here for the Guest to give us a packet. */
-	head = wait_for_vq_desc(vq, iov, &out, &in);
-	if (in)
-		bad_driver_vq(vq, "Input buffers in net output queue?");
-	/*
-	 * Send the whole thing through to /dev/net/tun.  It expects the exact
-	 * same format: what a coincidence!
-	 */
-	if (writev(net_info->tunfd, iov, out) < 0)
-		warnx("Write to tun failed (%d)?", errno);
-
-	/*
-	 * Done with that one; wait_for_vq_desc() will send the interrupt if
-	 * all packets are processed.
-	 */
-	add_used(vq, head, 0);
-}
-
-/*
- * Handling network input is a bit trickier, because I've tried to optimize it.
- *
- * First we have a helper routine which tells is if from this file descriptor
- * (ie. the /dev/net/tun device) will block:
- */
-static bool will_block(int fd)
-{
-	fd_set fdset;
-	struct timeval zero = { 0, 0 };
-	FD_ZERO(&fdset);
-	FD_SET(fd, &fdset);
-	return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
-}
-
-/*
- * This handles packets coming in from the tun device to our Guest.  Like all
- * service routines, it gets called again as soon as it returns, so you don't
- * see a while(1) loop here.
- */
-static void net_input(struct virtqueue *vq)
-{
-	int len;
-	unsigned int head, out, in;
-	struct iovec iov[vq->vring.num];
-	struct net_info *net_info = vq->dev->priv;
-
-	/*
-	 * Get a descriptor to write an incoming packet into.  This will also
-	 * send an interrupt if they're out of descriptors.
-	 */
-	head = wait_for_vq_desc(vq, iov, &out, &in);
-	if (out)
-		bad_driver_vq(vq, "Output buffers in net input queue?");
-
-	/*
-	 * If it looks like we'll block reading from the tun device, send them
-	 * an interrupt.
-	 */
-	if (vq->pending_used && will_block(net_info->tunfd))
-		trigger_irq(vq);
-
-	/*
-	 * Read in the packet.  This is where we normally wait (when there's no
-	 * incoming network traffic).
-	 */
-	len = readv(net_info->tunfd, iov, in);
-	if (len <= 0)
-		warn("Failed to read from tun (%d).", errno);
-
-	/*
-	 * Mark that packet buffer as used, but don't interrupt here.  We want
-	 * to wait until we've done as much work as we can.
-	 */
-	add_used(vq, head, len);
-}
-/*:*/
-
-/* This is the helper to create threads: run the service routine in a loop. */
-static int do_thread(void *_vq)
-{
-	struct virtqueue *vq = _vq;
-
-	for (;;)
-		vq->service(vq);
-	return 0;
-}
-
-/*
- * When a child dies, we kill our entire process group with SIGTERM.  This
- * also has the side effect that the shell restores the console for us!
- */
-static void kill_launcher(int signal)
-{
-	kill(0, SIGTERM);
-}
-
-static void reset_vq_pci_config(struct virtqueue *vq)
-{
-	vq->pci_config.queue_size = VIRTQUEUE_NUM;
-	vq->pci_config.queue_enable = 0;
-}
-
-static void reset_device(struct device *dev)
-{
-	struct virtqueue *vq;
-
-	verbose("Resetting device %s\n", dev->name);
-
-	/* Clear any features they've acked. */
-	dev->features_accepted = 0;
-
-	/* We're going to be explicitly killing threads, so ignore them. */
-	signal(SIGCHLD, SIG_IGN);
-
-	/*
-	 * 4.1.4.3.1:
-	 *
-	 *   The device MUST present a 0 in queue_enable on reset. 
-	 *
-	 * This means we set it here, and reset the saved ones in every vq.
-	 */
-	dev->mmio->cfg.queue_enable = 0;
-
-	/* Get rid of the virtqueue threads */
-	for (vq = dev->vq; vq; vq = vq->next) {
-		vq->last_avail_idx = 0;
-		reset_vq_pci_config(vq);
-		if (vq->thread != (pid_t)-1) {
-			kill(vq->thread, SIGTERM);
-			waitpid(vq->thread, NULL, 0);
-			vq->thread = (pid_t)-1;
-		}
-	}
-	dev->running = false;
-	dev->wrote_features_ok = false;
-
-	/* Now we care if threads die. */
-	signal(SIGCHLD, (void *)kill_launcher);
-}
-
-static void cleanup_devices(void)
-{
-	unsigned int i;
-
-	for (i = 1; i < MAX_PCI_DEVICES; i++) {
-		struct device *d = devices.pci[i];
-		if (!d)
-			continue;
-		reset_device(d);
-	}
-
-	/* If we saved off the original terminal settings, restore them now. */
-	if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
-		tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
-}
-
-/*L:217
- * We do PCI.  This is mainly done to let us test the kernel virtio PCI
- * code.
- */
-
-/* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */
-static struct device pci_host_bridge;
-
-static void init_pci_host_bridge(void)
-{
-	pci_host_bridge.name = "PCI Host Bridge";
-	pci_host_bridge.config.class = 0x06; /* bridge */
-	pci_host_bridge.config.subclass = 0; /* host bridge */
-	devices.pci[0] = &pci_host_bridge;
-}
-
-/* The IO ports used to read the PCI config space. */
-#define PCI_CONFIG_ADDR 0xCF8
-#define PCI_CONFIG_DATA 0xCFC
-
-/*
- * Not really portable, but does help readability: this is what the Guest
- * writes to the PCI_CONFIG_ADDR IO port.
- */
-union pci_config_addr {
-	struct {
-		unsigned mbz: 2;
-		unsigned offset: 6;
-		unsigned funcnum: 3;
-		unsigned devnum: 5;
-		unsigned busnum: 8;
-		unsigned reserved: 7;
-		unsigned enabled : 1;
-	} bits;
-	u32 val;
-};
-
-/*
- * We cache what they wrote to the address port, so we know what they're
- * talking about when they access the data port.
- */
-static union pci_config_addr pci_config_addr;
-
-static struct device *find_pci_device(unsigned int index)
-{
-	return devices.pci[index];
-}
-
-/* PCI can do 1, 2 and 4 byte reads; we handle that here. */
-static void ioread(u16 off, u32 v, u32 mask, u32 *val)
-{
-	assert(off < 4);
-	assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
-	*val = (v >> (off * 8)) & mask;
-}
-
-/* PCI can do 1, 2 and 4 byte writes; we handle that here. */
-static void iowrite(u16 off, u32 v, u32 mask, u32 *dst)
-{
-	assert(off < 4);
-	assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF);
-	*dst &= ~(mask << (off * 8));
-	*dst |= (v & mask) << (off * 8);
-}
-
-/*
- * Where PCI_CONFIG_DATA accesses depends on the previous write to
- * PCI_CONFIG_ADDR.
- */
-static struct device *dev_and_reg(u32 *reg)
-{
-	if (!pci_config_addr.bits.enabled)
-		return NULL;
-
-	if (pci_config_addr.bits.funcnum != 0)
-		return NULL;
-
-	if (pci_config_addr.bits.busnum != 0)
-		return NULL;
-
-	if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config))
-		return NULL;
-
-	*reg = pci_config_addr.bits.offset;
-	return find_pci_device(pci_config_addr.bits.devnum);
-}
-
-/*
- * We can get invalid combinations of values while they're writing, so we
- * only fault if they try to write with some invalid bar/offset/length.
- */
-static bool valid_bar_access(struct device *d,
-			     struct virtio_pci_cfg_cap_u32 *cfg_access)
-{
-	/* We only have 1 bar (BAR0) */
-	if (cfg_access->cap.bar != 0)
-		return false;
-
-	/* Check it's within BAR0. */
-	if (cfg_access->cap.offset >= d->mmio_size
-	    || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size)
-		return false;
-
-	/* Check length is 1, 2 or 4. */
-	if (cfg_access->cap.length != 1
-	    && cfg_access->cap.length != 2
-	    && cfg_access->cap.length != 4)
-		return false;
-
-	/*
-	 * 4.1.4.7.2:
-	 *
-	 *  The driver MUST NOT write a cap.offset which is not a multiple of
-	 *  cap.length (ie. all accesses MUST be aligned).
-	 */
-	if (cfg_access->cap.offset % cfg_access->cap.length != 0)
-		return false;
-
-	/* Return pointer into word in BAR0. */
-	return true;
-}
-
-/* Is this accessing the PCI config address port?. */
-static bool is_pci_addr_port(u16 port)
-{
-	return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4;
-}
-
-static bool pci_addr_iowrite(u16 port, u32 mask, u32 val)
-{
-	iowrite(port - PCI_CONFIG_ADDR, val, mask,
-		&pci_config_addr.val);
-	verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n",
-		pci_config_addr.bits.enabled ? "" : " DISABLED",
-		val, mask,
-		pci_config_addr.bits.busnum,
-		pci_config_addr.bits.devnum,
-		pci_config_addr.bits.funcnum,
-		pci_config_addr.bits.offset);
-	return true;
-}
-
-static void pci_addr_ioread(u16 port, u32 mask, u32 *val)
-{
-	ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val);
-}
-
-/* Is this accessing the PCI config data port?. */
-static bool is_pci_data_port(u16 port)
-{
-	return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4;
-}
-
-static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask);
-
-static bool pci_data_iowrite(u16 port, u32 mask, u32 val)
-{
-	u32 reg, portoff;
-	struct device *d = dev_and_reg(&reg);
-
-	/* Complain if they don't belong to a device. */
-	if (!d)
-		return false;
-
-	/* They can do 1 byte writes, etc. */
-	portoff = port - PCI_CONFIG_DATA;
-
-	/*
-	 * PCI uses a weird way to determine the BAR size: the OS
-	 * writes all 1's, and sees which ones stick.
-	 */
-	if (&d->config_words[reg] == &d->config.bar[0]) {
-		int i;
-
-		iowrite(portoff, val, mask, &d->config.bar[0]);
-		for (i = 0; (1 << i) < d->mmio_size; i++)
-			d->config.bar[0] &= ~(1 << i);
-		return true;
-	} else if ((&d->config_words[reg] > &d->config.bar[0]
-		    && &d->config_words[reg] <= &d->config.bar[6])
-		   || &d->config_words[reg] == &d->config.expansion_rom_addr) {
-		/* Allow writing to any other BAR, or expansion ROM */
-		iowrite(portoff, val, mask, &d->config_words[reg]);
-		return true;
-		/* We let them override latency timer and cacheline size */
-	} else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) {
-		/* Only let them change the first two fields. */
-		if (mask == 0xFFFFFFFF)
-			mask = 0xFFFF;
-		iowrite(portoff, val, mask, &d->config_words[reg]);
-		return true;
-	} else if (&d->config_words[reg] == (void *)&d->config.command
-		   && mask == 0xFFFF) {
-		/* Ignore command writes. */
-		return true;
-	} else if (&d->config_words[reg]
-		   == (void *)&d->config.cfg_access.cap.bar
-		   || &d->config_words[reg]
-		   == &d->config.cfg_access.cap.length
-		   || &d->config_words[reg]
-		   == &d->config.cfg_access.cap.offset) {
-
-		/*
-		 * The VIRTIO_PCI_CAP_PCI_CFG capability
-		 * provides a backdoor to access the MMIO
-		 * regions without mapping them.  Weird, but
-		 * useful.
-		 */
-		iowrite(portoff, val, mask, &d->config_words[reg]);
-		return true;
-	} else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
-		u32 write_mask;
-
-		/*
-		 * 4.1.4.7.1:
-		 *
-		 *  Upon detecting driver write access to pci_cfg_data, the
-		 *  device MUST execute a write access at offset cap.offset at
-		 *  BAR selected by cap.bar using the first cap.length bytes
-		 *  from pci_cfg_data.
-		 */
-
-		/* Must be bar 0 */
-		if (!valid_bar_access(d, &d->config.cfg_access))
-			return false;
-
-		iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data);
-
-		/*
-		 * Now emulate a write.  The mask we use is set by
-		 * len, *not* this write!
-		 */
-		write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1;
-		verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n",
-			d->config.cfg_access.pci_cfg_data, write_mask,
-			d->config.cfg_access.cap.bar,
-			d->config.cfg_access.cap.offset,
-			d->config.cfg_access.cap.length);
-
-		emulate_mmio_write(d, d->config.cfg_access.cap.offset,
-				   d->config.cfg_access.pci_cfg_data,
-				   write_mask);
-		return true;
-	}
-
-	/*
-	 * 4.1.4.1:
-	 *
-	 *  The driver MUST NOT write into any field of the capability
-	 *  structure, with the exception of those with cap_type
-	 *  VIRTIO_PCI_CAP_PCI_CFG...
-	 */
-	return false;
-}
-
-static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask);
-
-static void pci_data_ioread(u16 port, u32 mask, u32 *val)
-{
-	u32 reg;
-	struct device *d = dev_and_reg(&reg);
-
-	if (!d)
-		return;
-
-	/* Read through the PCI MMIO access window is special */
-	if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) {
-		u32 read_mask;
-
-		/*
-		 * 4.1.4.7.1:
-		 *
-		 *  Upon detecting driver read access to pci_cfg_data, the
-		 *  device MUST execute a read access of length cap.length at
-		 *  offset cap.offset at BAR selected by cap.bar and store the
-		 *  first cap.length bytes in pci_cfg_data.
-		 */
-		/* Must be bar 0 */
-		if (!valid_bar_access(d, &d->config.cfg_access))
-			bad_driver(d,
-			     "Invalid cfg_access to bar%u, offset %u len %u",
-			     d->config.cfg_access.cap.bar,
-			     d->config.cfg_access.cap.offset,
-			     d->config.cfg_access.cap.length);
-
-		/*
-		 * Read into the window.  The mask we use is set by
-		 * len, *not* this read!
-		 */
-		read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1;
-		d->config.cfg_access.pci_cfg_data
-			= emulate_mmio_read(d,
-					    d->config.cfg_access.cap.offset,
-					    read_mask);
-		verbose("Window read %#x/%#x from bar %u, offset %u len %u\n",
-			d->config.cfg_access.pci_cfg_data, read_mask,
-			d->config.cfg_access.cap.bar,
-			d->config.cfg_access.cap.offset,
-			d->config.cfg_access.cap.length);
-	}
-	ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val);
-}
-
-/*L:216
- * This is where we emulate a handful of Guest instructions.  It's ugly
- * and we used to do it in the kernel but it grew over time.
- */
-
-/*
- * We use the ptrace syscall's pt_regs struct to talk about registers
- * to lguest: these macros convert the names to the offsets.
- */
-#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
-#define setreg(name, val) \
-	setreg_off(offsetof(struct user_regs_struct, name), (val))
-
-static u32 getreg_off(size_t offset)
-{
-	u32 r;
-	unsigned long args[] = { LHREQ_GETREG, offset };
-
-	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
-		err(1, "Getting register %u", offset);
-	if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
-		err(1, "Reading register %u", offset);
-
-	return r;
-}
-
-static void setreg_off(size_t offset, u32 val)
-{
-	unsigned long args[] = { LHREQ_SETREG, offset, val };
-
-	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
-		err(1, "Setting register %u", offset);
-}
-
-/* Get register by instruction encoding */
-static u32 getreg_num(unsigned regnum, u32 mask)
-{
-	/* 8 bit ops use regnums 4-7 for high parts of word */
-	if (mask == 0xFF && (regnum & 0x4))
-		return getreg_num(regnum & 0x3, 0xFFFF) >> 8;
-
-	switch (regnum) {
-	case 0: return getreg(eax) & mask;
-	case 1: return getreg(ecx) & mask;
-	case 2: return getreg(edx) & mask;
-	case 3: return getreg(ebx) & mask;
-	case 4: return getreg(esp) & mask;
-	case 5: return getreg(ebp) & mask;
-	case 6: return getreg(esi) & mask;
-	case 7: return getreg(edi) & mask;
-	}
-	abort();
-}
-
-/* Set register by instruction encoding */
-static void setreg_num(unsigned regnum, u32 val, u32 mask)
-{
-	/* Don't try to set bits out of range */
-	assert(~(val & ~mask));
-
-	/* 8 bit ops use regnums 4-7 for high parts of word */
-	if (mask == 0xFF && (regnum & 0x4)) {
-		/* Construct the 16 bits we want. */
-		val = (val << 8) | getreg_num(regnum & 0x3, 0xFF);
-		setreg_num(regnum & 0x3, val, 0xFFFF);
-		return;
-	}
-
-	switch (regnum) {
-	case 0: setreg(eax, val | (getreg(eax) & ~mask)); return;
-	case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return;
-	case 2: setreg(edx, val | (getreg(edx) & ~mask)); return;
-	case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return;
-	case 4: setreg(esp, val | (getreg(esp) & ~mask)); return;
-	case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return;
-	case 6: setreg(esi, val | (getreg(esi) & ~mask)); return;
-	case 7: setreg(edi, val | (getreg(edi) & ~mask)); return;
-	}
-	abort();
-}
-
-/* Get bytes of displacement appended to instruction, from r/m encoding */
-static u32 insn_displacement_len(u8 mod_reg_rm)
-{
-	/* Switch on the mod bits */
-	switch (mod_reg_rm >> 6) {
-	case 0:
-		/* If mod == 0, and r/m == 101, 16-bit displacement follows */
-		if ((mod_reg_rm & 0x7) == 0x5)
-			return 2;
-		/* Normally, mod == 0 means no literal displacement */
-		return 0;
-	case 1:
-		/* One byte displacement */
-		return 1;
-	case 2:
-		/* Four byte displacement */
-		return 4;
-	case 3:
-		/* Register mode */
-		return 0;
-	}
-	abort();
-}
-
-static void emulate_insn(const u8 insn[])
-{
-	unsigned long args[] = { LHREQ_TRAP, 13 };
-	unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
-	unsigned int eax, port, mask;
-	/*
-	 * Default is to return all-ones on IO port reads, which traditionally
-	 * means "there's nothing there".
-	 */
-	u32 val = 0xFFFFFFFF;
-
-	/*
-	 * This must be the Guest kernel trying to do something, not userspace!
-	 * The bottom two bits of the CS segment register are the privilege
-	 * level.
-	 */
-	if ((getreg(xcs) & 3) != 0x1)
-		goto no_emulate;
-
-	/* Decoding x86 instructions is icky. */
-
-	/*
-	 * Around 2.6.33, the kernel started using an emulation for the
-	 * cmpxchg8b instruction in early boot on many configurations.  This
-	 * code isn't paravirtualized, and it tries to disable interrupts.
-	 * Ignore it, which will Mostly Work.
-	 */
-	if (insn[insnlen] == 0xfa) {
-		/* "cli", or Clear Interrupt Enable instruction.  Skip it. */
-		insnlen = 1;
-		goto skip_insn;
-	}
-
-	/*
-	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
-	 */
-	if (insn[insnlen] == 0x66) {
-		small_operand = 1;
-		/* The instruction is 1 byte so far, read the next byte. */
-		insnlen = 1;
-	}
-
-	/* If the lower bit isn't set, it's a single byte access */
-	byte_access = !(insn[insnlen] & 1);
-
-	/*
-	 * Now we can ignore the lower bit and decode the 4 opcodes
-	 * we need to emulate.
-	 */
-	switch (insn[insnlen] & 0xFE) {
-	case 0xE4: /* in     <next byte>,%al */
-		port = insn[insnlen+1];
-		insnlen += 2;
-		in = 1;
-		break;
-	case 0xEC: /* in     (%dx),%al */
-		port = getreg(edx) & 0xFFFF;
-		insnlen += 1;
-		in = 1;
-		break;
-	case 0xE6: /* out    %al,<next byte> */
-		port = insn[insnlen+1];
-		insnlen += 2;
-		break;
-	case 0xEE: /* out    %al,(%dx) */
-		port = getreg(edx) & 0xFFFF;
-		insnlen += 1;
-		break;
-	default:
-		/* OK, we don't know what this is, can't emulate. */
-		goto no_emulate;
-	}
-
-	/* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
-	if (byte_access)
-		mask = 0xFF;
-	else if (small_operand)
-		mask = 0xFFFF;
-	else
-		mask = 0xFFFFFFFF;
-
-	/*
-	 * If it was an "IN" instruction, they expect the result to be read
-	 * into %eax, so we change %eax.
-	 */
-	eax = getreg(eax);
-
-	if (in) {
-		/* This is the PS/2 keyboard status; 1 means ready for output */
-		if (port == 0x64)
-			val = 1;
-		else if (is_pci_addr_port(port))
-			pci_addr_ioread(port, mask, &val);
-		else if (is_pci_data_port(port))
-			pci_data_ioread(port, mask, &val);
-
-		/* Clear the bits we're about to read */
-		eax &= ~mask;
-		/* Copy bits in from val. */
-		eax |= val & mask;
-		/* Now update the register. */
-		setreg(eax, eax);
-	} else {
-		if (is_pci_addr_port(port)) {
-			if (!pci_addr_iowrite(port, mask, eax))
-				goto bad_io;
-		} else if (is_pci_data_port(port)) {
-			if (!pci_data_iowrite(port, mask, eax))
-				goto bad_io;
-		}
-		/* There are many other ports, eg. CMOS clock, serial
-		 * and parallel ports, so we ignore them all. */
-	}
-
-	verbose("IO %s of %x to %u: %#08x\n",
-		in ? "IN" : "OUT", mask, port, eax);
-skip_insn:
-	/* Finally, we've "done" the instruction, so move past it. */
-	setreg(eip, getreg(eip) + insnlen);
-	return;
-
-bad_io:
-	warnx("Attempt to %s port %u (%#x mask)",
-	      in ? "read from" : "write to", port, mask);
-
-no_emulate:
-	/* Inject trap into Guest. */
-	if (write(lguest_fd, args, sizeof(args)) < 0)
-		err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
-}
-
-static struct device *find_mmio_region(unsigned long paddr, u32 *off)
-{
-	unsigned int i;
-
-	for (i = 1; i < MAX_PCI_DEVICES; i++) {
-		struct device *d = devices.pci[i];
-
-		if (!d)
-			continue;
-		if (paddr < d->mmio_addr)
-			continue;
-		if (paddr >= d->mmio_addr + d->mmio_size)
-			continue;
-		*off = paddr - d->mmio_addr;
-		return d;
-	}
-	return NULL;
-}
-
-/* FIXME: Use vq array. */
-static struct virtqueue *vq_by_num(struct device *d, u32 num)
-{
-	struct virtqueue *vq = d->vq;
-
-	while (num-- && vq)
-		vq = vq->next;
-
-	return vq;
-}
-
-static void save_vq_config(const struct virtio_pci_common_cfg *cfg,
-			   struct virtqueue *vq)
-{
-	vq->pci_config = *cfg;
-}
-
-static void restore_vq_config(struct virtio_pci_common_cfg *cfg,
-			      struct virtqueue *vq)
-{
-	/* Only restore the per-vq part */
-	size_t off = offsetof(struct virtio_pci_common_cfg, queue_size);
-
-	memcpy((void *)cfg + off, (void *)&vq->pci_config + off,
-	       sizeof(*cfg) - off);
-}
-
-/*
- * 4.1.4.3.2:
- *
- *  The driver MUST configure the other virtqueue fields before
- *  enabling the virtqueue with queue_enable.
- *
- * When they enable the virtqueue, we check that their setup is valid.
- */
-static void check_virtqueue(struct device *d, struct virtqueue *vq)
-{
-	/* Because lguest is 32 bit, all the descriptor high bits must be 0 */
-	if (vq->pci_config.queue_desc_hi
-	    || vq->pci_config.queue_avail_hi
-	    || vq->pci_config.queue_used_hi)
-		bad_driver_vq(vq, "invalid 64-bit queue address");
-
-	/*
-	 * 2.4.1:
-	 *
-	 *  The driver MUST ensure that the physical address of the first byte
-	 *  of each virtqueue part is a multiple of the specified alignment
-	 *  value in the above table.
-	 */
-	if (vq->pci_config.queue_desc_lo % 16
-	    || vq->pci_config.queue_avail_lo % 2
-	    || vq->pci_config.queue_used_lo % 4)
-		bad_driver_vq(vq, "invalid alignment in queue addresses");
-
-	/* Initialize the virtqueue and check they're all in range. */
-	vq->vring.num = vq->pci_config.queue_size;
-	vq->vring.desc = check_pointer(vq->dev,
-				       vq->pci_config.queue_desc_lo,
-				       sizeof(*vq->vring.desc) * vq->vring.num);
-	vq->vring.avail = check_pointer(vq->dev,
-					vq->pci_config.queue_avail_lo,
-					sizeof(*vq->vring.avail)
-					+ (sizeof(vq->vring.avail->ring[0])
-					   * vq->vring.num));
-	vq->vring.used = check_pointer(vq->dev,
-				       vq->pci_config.queue_used_lo,
-				       sizeof(*vq->vring.used)
-				       + (sizeof(vq->vring.used->ring[0])
-					  * vq->vring.num));
-
-	/*
-	 * 2.4.9.1:
-	 *
-	 *   The driver MUST initialize flags in the used ring to 0
-	 *   when allocating the used ring.
-	 */
-	if (vq->vring.used->flags != 0)
-		bad_driver_vq(vq, "invalid initial used.flags %#x",
-			      vq->vring.used->flags);
-}
-
-static void start_virtqueue(struct virtqueue *vq)
-{
-	/*
-	 * Create stack for thread.  Since the stack grows upwards, we point
-	 * the stack pointer to the end of this region.
-	 */
-	char *stack = malloc(32768);
-
-	/* Create a zero-initialized eventfd. */
-	vq->eventfd = eventfd(0, 0);
-	if (vq->eventfd < 0)
-		err(1, "Creating eventfd");
-
-	/*
-	 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
-	 * we get a signal if it dies.
-	 */
-	vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
-	if (vq->thread == (pid_t)-1)
-		err(1, "Creating clone");
-}
-
-static void start_virtqueues(struct device *d)
-{
-	struct virtqueue *vq;
-
-	for (vq = d->vq; vq; vq = vq->next) {
-		if (vq->pci_config.queue_enable)
-			start_virtqueue(vq);
-	}
-}
-
-static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask)
-{
-	struct virtqueue *vq;
-
-	switch (off) {
-	case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
-		/*
-		 * 4.1.4.3.1:
-		 *
-		 * The device MUST present the feature bits it is offering in
-		 * device_feature, starting at bit device_feature_select ∗ 32
-		 * for any device_feature_select written by the driver
-		 */
-		if (val == 0)
-			d->mmio->cfg.device_feature = d->features;
-		else if (val == 1)
-			d->mmio->cfg.device_feature = (d->features >> 32);
-		else
-			d->mmio->cfg.device_feature = 0;
-		goto feature_write_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
-		if (val > 1)
-			bad_driver(d, "Unexpected driver select %u", val);
-		goto feature_write_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
-		if (d->mmio->cfg.guest_feature_select == 0) {
-			d->features_accepted &= ~((u64)0xFFFFFFFF);
-			d->features_accepted |= val;
-		} else {
-			assert(d->mmio->cfg.guest_feature_select == 1);
-			d->features_accepted &= 0xFFFFFFFF;
-			d->features_accepted |= ((u64)val) << 32;
-		}
-		/*
-		 * 2.2.1:
-		 *
-		 *   The driver MUST NOT accept a feature which the device did
-		 *   not offer
-		 */
-		if (d->features_accepted & ~d->features)
-			bad_driver(d, "over-accepted features %#llx of %#llx",
-				   d->features_accepted, d->features);
-		goto feature_write_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.device_status): {
-		u8 prev;
-
-		verbose("%s: device status -> %#x\n", d->name, val);
-		/*
-		 * 4.1.4.3.1:
-		 * 
-		 *  The device MUST reset when 0 is written to device_status,
-		 *  and present a 0 in device_status once that is done.
-		 */
-		if (val == 0) {
-			reset_device(d);
-			goto write_through8;
-		}
-
-		/* 2.1.1: The driver MUST NOT clear a device status bit. */
-		if (d->mmio->cfg.device_status & ~val)
-			bad_driver(d, "unset of device status bit %#x -> %#x",
-				   d->mmio->cfg.device_status, val);
-
-		/*
-		 * 2.1.2:
-		 *
-		 *  The device MUST NOT consume buffers or notify the driver
-		 *  before DRIVER_OK.
-		 */
-		if (val & VIRTIO_CONFIG_S_DRIVER_OK
-		    && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
-			start_virtqueues(d);
-
-		/*
-		 * 3.1.1:
-		 *
-		 *   The driver MUST follow this sequence to initialize a device:
-		 *   - Reset the device.
-		 *   - Set the ACKNOWLEDGE status bit: the guest OS has
-                 *     notice the device.
-		 *   - Set the DRIVER status bit: the guest OS knows how
-                 *     to drive the device.
-		 *   - Read device feature bits, and write the subset
-		 *     of feature bits understood by the OS and driver
-		 *     to the device. During this step the driver MAY
-		 *     read (but MUST NOT write) the device-specific
-		 *     configuration fields to check that it can
-		 *     support the device before accepting it.
-		 *   - Set the FEATURES_OK status bit.  The driver
-		 *     MUST not accept new feature bits after this
-		 *     step.
-		 *   - Re-read device status to ensure the FEATURES_OK
-		 *     bit is still set: otherwise, the device does
-		 *     not support our subset of features and the
-		 *     device is unusable.
-		 *   - Perform device-specific setup, including
-		 *     discovery of virtqueues for the device,
-		 *     optional per-bus setup, reading and possibly
-		 *     writing the device’s virtio configuration
-		 *     space, and population of virtqueues.
-		 *   - Set the DRIVER_OK status bit. At this point the
-                 *     device is “live”.
-		 */
-		prev = 0;
-		switch (val & ~d->mmio->cfg.device_status) {
-		case VIRTIO_CONFIG_S_DRIVER_OK:
-			prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */
-		case VIRTIO_CONFIG_S_FEATURES_OK:
-			prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */
-		case VIRTIO_CONFIG_S_DRIVER:
-			prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */
-		case VIRTIO_CONFIG_S_ACKNOWLEDGE:
-			break;
-		default:
-			bad_driver(d, "unknown device status bit %#x -> %#x",
-				   d->mmio->cfg.device_status, val);
-		}
-		if (d->mmio->cfg.device_status != prev)
-			bad_driver(d, "unexpected status transition %#x -> %#x",
-				   d->mmio->cfg.device_status, val);
-
-		/* If they just wrote FEATURES_OK, we make sure they read */
-		switch (val & ~d->mmio->cfg.device_status) {
-		case VIRTIO_CONFIG_S_FEATURES_OK:
-			d->wrote_features_ok = true;
-			break;
-		case VIRTIO_CONFIG_S_DRIVER_OK:
-			if (d->wrote_features_ok)
-				bad_driver(d, "did not re-read FEATURES_OK");
-			break;
-		}
-		goto write_through8;
-	}
-	case offsetof(struct virtio_pci_mmio, cfg.queue_select):
-		vq = vq_by_num(d, val);
-		/*
-		 * 4.1.4.3.1:
-		 *
-		 *  The device MUST present a 0 in queue_size if the virtqueue
-		 *  corresponding to the current queue_select is unavailable.
-		 */
-		if (!vq) {
-			d->mmio->cfg.queue_size = 0;
-			goto write_through16;
-		}
-		/* Save registers for old vq, if it was a valid vq */
-		if (d->mmio->cfg.queue_size)
-			save_vq_config(&d->mmio->cfg,
-				       vq_by_num(d, d->mmio->cfg.queue_select));
-		/* Restore the registers for the queue they asked for */
-		restore_vq_config(&d->mmio->cfg, vq);
-		goto write_through16;
-	case offsetof(struct virtio_pci_mmio, cfg.queue_size):
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *  The driver MUST NOT write a value which is not a power of 2
-		 *  to queue_size.
-		 */
-		if (val & (val-1))
-			bad_driver(d, "invalid queue size %u", val);
-		if (d->mmio->cfg.queue_enable)
-			bad_driver(d, "changing queue size on live device");
-		goto write_through16;
-	case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector):
-		bad_driver(d, "attempt to set MSIX vector to %u", val);
-	case offsetof(struct virtio_pci_mmio, cfg.queue_enable): {
-		struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select);
-
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *  The driver MUST NOT write a 0 to queue_enable.
-		 */
-		if (val != 1)
-			bad_driver(d, "setting queue_enable to %u", val);
-
-		/*
-		 * 3.1.1:
-		 *
-		 *  7. Perform device-specific setup, including discovery of
-		 *     virtqueues for the device, optional per-bus setup,
-		 *     reading and possibly writing the device’s virtio
-		 *     configuration space, and population of virtqueues.
-		 *  8. Set the DRIVER_OK status bit.
-		 *
-		 * All our devices require all virtqueues to be enabled, so
-		 * they should have done that before setting DRIVER_OK.
-		 */
-		if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)
-			bad_driver(d, "enabling vq after DRIVER_OK");
-
-		d->mmio->cfg.queue_enable = val;
-		save_vq_config(&d->mmio->cfg, vq);
-		check_virtqueue(d, vq);
-		goto write_through16;
-	}
-	case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off):
-		bad_driver(d, "attempt to write to queue_notify_off");
-	case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo):
-	case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi):
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *  The driver MUST configure the other virtqueue fields before
-		 *  enabling the virtqueue with queue_enable.
-		 */
-		if (d->mmio->cfg.queue_enable)
-			bad_driver(d, "changing queue on live device");
-
-		/*
-		 * 3.1.1:
-		 *
-		 *  The driver MUST follow this sequence to initialize a device:
-		 *...
-		 *  5. Set the FEATURES_OK status bit. The driver MUST not
-		 *  accept new feature bits after this step.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK))
-			bad_driver(d, "setting up vq before FEATURES_OK");
-
-		/*
-		 *  6. Re-read device status to ensure the FEATURES_OK bit is
-		 *     still set...
-		 */
-		if (d->wrote_features_ok)
-			bad_driver(d, "didn't re-read FEATURES_OK before setup");
-
-		goto write_through32;
-	case offsetof(struct virtio_pci_mmio, notify):
-		vq = vq_by_num(d, val);
-		if (!vq)
-			bad_driver(d, "Invalid vq notification on %u", val);
-		/* Notify the process handling this vq by adding 1 to eventfd */
-		write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8);
-		goto write_through16;
-	case offsetof(struct virtio_pci_mmio, isr):
-		bad_driver(d, "Unexpected write to isr");
-	/* Weird corner case: write to emerg_wr of console */
-	case sizeof(struct virtio_pci_mmio)
-		+ offsetof(struct virtio_console_config, emerg_wr):
-		if (strcmp(d->name, "console") == 0) {
-			char c = val;
-			write(STDOUT_FILENO, &c, 1);
-			goto write_through32;
-		}
-		/* Fall through... */
-	default:
-		/*
-		 * 4.1.4.3.2:
-		 *
-		 *   The driver MUST NOT write to device_feature, num_queues,
-		 *   config_generation or queue_notify_off.
-		 */
-		bad_driver(d, "Unexpected write to offset %u", off);
-	}
-
-feature_write_through32:
-	/*
-	 * 3.1.1:
-	 *
-	 *   The driver MUST follow this sequence to initialize a device:
-	 *...
-	 *   - Set the DRIVER status bit: the guest OS knows how
-	 *     to drive the device.
-	 *   - Read device feature bits, and write the subset
-	 *     of feature bits understood by the OS and driver
-	 *     to the device.
-	 *...
-	 *   - Set the FEATURES_OK status bit. The driver MUST not
-	 *     accept new feature bits after this step.
-	 */
-	if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-		bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER");
-	if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)
-		bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK");
-
-	/*
-	 * 4.1.3.1:
-	 *
-	 *  The driver MUST access each field using the “natural” access
-	 *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
-	 *  16-bit fields and 8-bit accesses for 8-bit fields.
-	 */
-write_through32:
-	if (mask != 0xFFFFFFFF) {
-		bad_driver(d, "non-32-bit write to offset %u (%#x)",
-			   off, getreg(eip));
-		return;
-	}
-	memcpy((char *)d->mmio + off, &val, 4);
-	return;
-
-write_through16:
-	if (mask != 0xFFFF)
-		bad_driver(d, "non-16-bit write to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy((char *)d->mmio + off, &val, 2);
-	return;
-
-write_through8:
-	if (mask != 0xFF)
-		bad_driver(d, "non-8-bit write to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy((char *)d->mmio + off, &val, 1);
-	return;
-}
-
-static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask)
-{
-	u8 isr;
-	u32 val = 0;
-
-	switch (off) {
-	case offsetof(struct virtio_pci_mmio, cfg.device_feature_select):
-	case offsetof(struct virtio_pci_mmio, cfg.device_feature):
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select):
-	case offsetof(struct virtio_pci_mmio, cfg.guest_feature):
-		/*
-		 * 3.1.1:
-		 *
-		 *   The driver MUST follow this sequence to initialize a device:
-		 *...
-		 *   - Set the DRIVER status bit: the guest OS knows how
-		 *     to drive the device.
-		 *   - Read device feature bits, and write the subset
-		 *     of feature bits understood by the OS and driver
-		 *     to the device.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-			bad_driver(d,
-				   "feature read before VIRTIO_CONFIG_S_DRIVER");
-		goto read_through32;
-	case offsetof(struct virtio_pci_mmio, cfg.msix_config):
-		bad_driver(d, "read of msix_config");
-	case offsetof(struct virtio_pci_mmio, cfg.num_queues):
-		goto read_through16;
-	case offsetof(struct virtio_pci_mmio, cfg.device_status):
-		/* As they did read, any write of FEATURES_OK is now fine. */
-		d->wrote_features_ok = false;
-		goto read_through8;
-	case offsetof(struct virtio_pci_mmio, cfg.config_generation):
-		/*
-		 * 4.1.4.3.1:
-		 *
-		 *  The device MUST present a changed config_generation after
-		 *  the driver has read a device-specific configuration value
-		 *  which has changed since any part of the device-specific
-		 *  configuration was last read.
-		 *
-		 * This is simple: none of our devices change config, so this
-		 * is always 0.
-		 */
-		goto read_through8;
-	case offsetof(struct virtio_pci_mmio, notify):
-		/*
-		 * 3.1.1:
-		 *
-		 *   The driver MUST NOT notify the device before setting
-		 *   DRIVER_OK.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK))
-			bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK");
-		goto read_through16;
-	case offsetof(struct virtio_pci_mmio, isr):
-		if (mask != 0xFF)
-			bad_driver(d, "non-8-bit read from offset %u (%#x)",
-				   off, getreg(eip));
-		isr = d->mmio->isr;
-		/*
-		 * 4.1.4.5.1:
-		 *
-		 *  The device MUST reset ISR status to 0 on driver read. 
-		 */
-		d->mmio->isr = 0;
-		return isr;
-	case offsetof(struct virtio_pci_mmio, padding):
-		bad_driver(d, "read from padding (%#x)", getreg(eip));
-	default:
-		/* Read from device config space, beware unaligned overflow */
-		if (off > d->mmio_size - 4)
-			bad_driver(d, "read past end (%#x)", getreg(eip));
-
-		/*
-		 * 3.1.1:
-		 *  The driver MUST follow this sequence to initialize a device:
-		 *...
-		 *  3. Set the DRIVER status bit: the guest OS knows how to
-		 *  drive the device.
-		 *  4. Read device feature bits, and write the subset of
-		 *  feature bits understood by the OS and driver to the
-		 *  device. During this step the driver MAY read (but MUST NOT
-		 *  write) the device-specific configuration fields to check
-		 *  that it can support the device before accepting it.
-		 */
-		if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER))
-			bad_driver(d,
-				   "config read before VIRTIO_CONFIG_S_DRIVER");
-
-		if (mask == 0xFFFFFFFF)
-			goto read_through32;
-		else if (mask == 0xFFFF)
-			goto read_through16;
-		else
-			goto read_through8;
-	}
-
-	/*
-	 * 4.1.3.1:
-	 *
-	 *  The driver MUST access each field using the “natural” access
-	 *  method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for
-	 *  16-bit fields and 8-bit accesses for 8-bit fields.
-	 */
-read_through32:
-	if (mask != 0xFFFFFFFF)
-		bad_driver(d, "non-32-bit read to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy(&val, (char *)d->mmio + off, 4);
-	return val;
-
-read_through16:
-	if (mask != 0xFFFF)
-		bad_driver(d, "non-16-bit read to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy(&val, (char *)d->mmio + off, 2);
-	return val;
-
-read_through8:
-	if (mask != 0xFF)
-		bad_driver(d, "non-8-bit read to offset %u (%#x)",
-			   off, getreg(eip));
-	memcpy(&val, (char *)d->mmio + off, 1);
-	return val;
-}
-
-static void emulate_mmio(unsigned long paddr, const u8 *insn)
-{
-	u32 val, off, mask = 0xFFFFFFFF, insnlen = 0;
-	struct device *d = find_mmio_region(paddr, &off);
-	unsigned long args[] = { LHREQ_TRAP, 14 };
-
-	if (!d) {
-		warnx("MMIO touching %#08lx (not a device)", paddr);
-		goto reinject;
-	}
-
-	/* Prefix makes it a 16 bit op */
-	if (insn[0] == 0x66) {
-		mask = 0xFFFF;
-		insnlen++;
-	}
-
-	/* iowrite */
-	if (insn[insnlen] == 0x89) {
-		/* Next byte is r/m byte: bits 3-5 are register. */
-		val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask);
-		emulate_mmio_write(d, off, val, mask);
-		insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
-	} else if (insn[insnlen] == 0x8b) { /* ioread */
-		/* Next byte is r/m byte: bits 3-5 are register. */
-		val = emulate_mmio_read(d, off, mask);
-		setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask);
-		insnlen += 2 + insn_displacement_len(insn[insnlen+1]);
-	} else if (insn[0] == 0x88) { /* 8-bit iowrite */
-		mask = 0xff;
-		/* Next byte is r/m byte: bits 3-5 are register. */
-		val = getreg_num((insn[1] >> 3) & 0x7, mask);
-		emulate_mmio_write(d, off, val, mask);
-		insnlen = 2 + insn_displacement_len(insn[1]);
-	} else if (insn[0] == 0x8a) { /* 8-bit ioread */
-		mask = 0xff;
-		val = emulate_mmio_read(d, off, mask);
-		setreg_num((insn[1] >> 3) & 0x7, val, mask);
-		insnlen = 2 + insn_displacement_len(insn[1]);
-	} else {
-		warnx("Unknown MMIO instruction touching %#08lx:"
-		     " %02x %02x %02x %02x at %u",
-		     paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip));
-	reinject:
-		/* Inject trap into Guest. */
-		if (write(lguest_fd, args, sizeof(args)) < 0)
-			err(1, "Reinjecting trap 14 for fault at %#x",
-			    getreg(eip));
-		return;
-	}
-
-	/* Finally, we've "done" the instruction, so move past it. */
-	setreg(eip, getreg(eip) + insnlen);
-}
-
-/*L:190
- * Device Setup
- *
- * All devices need a descriptor so the Guest knows it exists, and a "struct
- * device" so the Launcher can keep track of it.  We have common helper
- * routines to allocate and manage them.
- */
-static void add_pci_virtqueue(struct device *dev,
-			      void (*service)(struct virtqueue *),
-			      const char *name)
-{
-	struct virtqueue **i, *vq = malloc(sizeof(*vq));
-
-	/* Initialize the virtqueue */
-	vq->next = NULL;
-	vq->last_avail_idx = 0;
-	vq->dev = dev;
-	vq->name = name;
-
-	/*
-	 * This is the routine the service thread will run, and its Process ID
-	 * once it's running.
-	 */
-	vq->service = service;
-	vq->thread = (pid_t)-1;
-
-	/* Initialize the configuration. */
-	reset_vq_pci_config(vq);
-	vq->pci_config.queue_notify_off = 0;
-
-	/* Add one to the number of queues */
-	vq->dev->mmio->cfg.num_queues++;
-
-	/*
-	 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
-	 * second.
-	 */
-	for (i = &dev->vq; *i; i = &(*i)->next);
-	*i = vq;
-}
-
-/* The Guest accesses the feature bits via the PCI common config MMIO region */
-static void add_pci_feature(struct device *dev, unsigned bit)
-{
-	dev->features |= (1ULL << bit);
-}
-
-/* For devices with no config. */
-static void no_device_config(struct device *dev)
-{
-	dev->mmio_addr = get_mmio_region(dev->mmio_size);
-
-	dev->config.bar[0] = dev->mmio_addr;
-	/* Bottom 4 bits must be zero */
-	assert(~(dev->config.bar[0] & 0xF));
-}
-
-/* This puts the device config into BAR0 */
-static void set_device_config(struct device *dev, const void *conf, size_t len)
-{
-	/* Set up BAR 0 */
-	dev->mmio_size += len;
-	dev->mmio = realloc(dev->mmio, dev->mmio_size);
-	memcpy(dev->mmio + 1, conf, len);
-
-	/*
-	 * 4.1.4.6:
-	 *
-	 *  The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG
-	 *  capability for any device type which has a device-specific
-	 *  configuration.
-	 */
-	/* Hook up device cfg */
-	dev->config.cfg_access.cap.cap_next
-		= offsetof(struct pci_config, device);
-
-	/*
-	 * 4.1.4.6.1:
-	 *
-	 *  The offset for the device-specific configuration MUST be 4-byte
-	 *  aligned.
-	 */
-	assert(dev->config.cfg_access.cap.cap_next % 4 == 0);
-
-	/* Fix up device cfg field length. */
-	dev->config.device.length = len;
-
-	/* The rest is the same as the no-config case */
-	no_device_config(dev);
-}
-
-static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type,
-		     size_t bar_offset, size_t bar_bytes, u8 next)
-{
-	cap->cap_vndr = PCI_CAP_ID_VNDR;
-	cap->cap_next = next;
-	cap->cap_len = caplen;
-	cap->cfg_type = type;
-	cap->bar = 0;
-	memset(cap->padding, 0, sizeof(cap->padding));
-	cap->offset = bar_offset;
-	cap->length = bar_bytes;
-}
-
-/*
- * This sets up the pci_config structure, as defined in the virtio 1.0
- * standard (and PCI standard).
- */
-static void init_pci_config(struct pci_config *pci, u16 type,
-			    u8 class, u8 subclass)
-{
-	size_t bar_offset, bar_len;
-
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The device MUST either present notify_off_multiplier as an even
-	 *  power of 2, or present notify_off_multiplier as 0.
-	 *
-	 * 2.1.2:
-	 *
-	 *   The device MUST initialize device status to 0 upon reset. 
-	 */
-	memset(pci, 0, sizeof(*pci));
-
-	/* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */
-	pci->vendor_id = 0x1AF4;
-	/* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */
-	pci->device_id = 0x1040 + type;
-
-	/*
-	 * PCI have specific codes for different types of devices.
-	 * Linux doesn't care, but it's a good clue for people looking
-	 * at the device.
-	 */
-	pci->class = class;
-	pci->subclass = subclass;
-
-	/*
-	 * 4.1.2.1:
-	 *
-	 *  Non-transitional devices SHOULD have a PCI Revision ID of 1 or
-	 *  higher
-	 */
-	pci->revid = 1;
-
-	/*
-	 * 4.1.2.1:
-	 *
-	 *  Non-transitional devices SHOULD have a PCI Subsystem Device ID of
-	 *  0x40 or higher.
-	 */
-	pci->subsystem_device_id = 0x40;
-
-	/* We use our dummy interrupt controller, and irq_line is the irq */
-	pci->irq_line = devices.next_irq++;
-	pci->irq_pin = 0;
-
-	/* Support for extended capabilities. */
-	pci->status = (1 << 4);
-
-	/* Link them in. */
-	/*
-	 * 4.1.4.3.1:
-	 *
-	 *  The device MUST present at least one common configuration
-	 *  capability.
-	 */
-	pci->capabilities = offsetof(struct pci_config, common);
-
-	/* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */
-	assert(pci->capabilities % 4 == 0);
-
-	bar_offset = offsetof(struct virtio_pci_mmio, cfg);
-	bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg);
-	init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG,
-		 bar_offset, bar_len,
-		 offsetof(struct pci_config, notify));
-
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The device MUST present at least one notification capability.
-	 */
-	bar_offset += bar_len;
-	bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify);
-
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The cap.offset MUST be 2-byte aligned.
-	 */
-	assert(pci->common.cap_next % 2 == 0);
-
-	/* FIXME: Use a non-zero notify_off, for per-queue notification? */
-	/*
-	 * 4.1.4.4.1:
-	 *
-	 *  The value cap.length presented by the device MUST be at least 2 and
-	 *  MUST be large enough to support queue notification offsets for all
-	 *  supported queues in all possible configurations.
-	 */
-	assert(bar_len >= 2);
-
-	init_cap(&pci->notify.cap, sizeof(pci->notify),
-		 VIRTIO_PCI_CAP_NOTIFY_CFG,
-		 bar_offset, bar_len,
-		 offsetof(struct pci_config, isr));
-
-	bar_offset += bar_len;
-	bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr);
-	/*
-	 * 4.1.4.5.1:
-	 *
-	 *  The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG
-	 *  capability.
-	 */
-	init_cap(&pci->isr, sizeof(pci->isr),
-		 VIRTIO_PCI_CAP_ISR_CFG,
-		 bar_offset, bar_len,
-		 offsetof(struct pci_config, cfg_access));
-
-	/*
-	 * 4.1.4.7.1:
-	 *
-	 * The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG
-	 * capability.
-	 */
-	/* This doesn't have any presence in the BAR */
-	init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access),
-		 VIRTIO_PCI_CAP_PCI_CFG,
-		 0, 0, 0);
-
-	bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding);
-	assert(bar_offset == sizeof(struct virtio_pci_mmio));
-
-	/*
-	 * This gets sewn in and length set in set_device_config().
-	 * Some devices don't have a device configuration interface, so
-	 * we never expose this if we don't call set_device_config().
-	 */
-	init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG,
-		 bar_offset, 0, 0);
-}
-
-/*
- * This routine does all the creation and setup of a new device, but we don't
- * actually place the MMIO region until we know the size (if any) of the
- * device-specific config.  And we don't actually start the service threads
- * until later.
- *
- * See what I mean about userspace being boring?
- */
-static struct device *new_pci_device(const char *name, u16 type,
-				     u8 class, u8 subclass)
-{
-	struct device *dev = malloc(sizeof(*dev));
-
-	/* Now we populate the fields one at a time. */
-	dev->name = name;
-	dev->vq = NULL;
-	dev->running = false;
-	dev->wrote_features_ok = false;
-	dev->mmio_size = sizeof(struct virtio_pci_mmio);
-	dev->mmio = calloc(1, dev->mmio_size);
-	dev->features = (u64)1 << VIRTIO_F_VERSION_1;
-	dev->features_accepted = 0;
-
-	if (devices.device_num + 1 >= MAX_PCI_DEVICES)
-		errx(1, "Can only handle 31 PCI devices");
-
-	init_pci_config(&dev->config, type, class, subclass);
-	assert(!devices.pci[devices.device_num+1]);
-	devices.pci[++devices.device_num] = dev;
-
-	return dev;
-}
-
-/*
- * Our first setup routine is the console.  It's a fairly simple device, but
- * UNIX tty handling makes it uglier than it could be.
- */
-static void setup_console(void)
-{
-	struct device *dev;
-	struct virtio_console_config conf;
-
-	/* If we can save the initial standard input settings... */
-	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
-		struct termios term = orig_term;
-		/*
-		 * Then we turn off echo, line buffering and ^C etc: We want a
-		 * raw input stream to the Guest.
-		 */
-		term.c_lflag &= ~(ISIG|ICANON|ECHO);
-		tcsetattr(STDIN_FILENO, TCSANOW, &term);
-	}
-
-	dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00);
-
-	/* We store the console state in dev->priv, and initialize it. */
-	dev->priv = malloc(sizeof(struct console_abort));
-	((struct console_abort *)dev->priv)->count = 0;
-
-	/*
-	 * The console needs two virtqueues: the input then the output.  When
-	 * they put something the input queue, we make sure we're listening to
-	 * stdin.  When they put something in the output queue, we write it to
-	 * stdout.
-	 */
-	add_pci_virtqueue(dev, console_input, "input");
-	add_pci_virtqueue(dev, console_output, "output");
-
-	/* We need a configuration area for the emerg_wr early writes. */
-	add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE);
-	set_device_config(dev, &conf, sizeof(conf));
-
-	verbose("device %u: console\n", devices.device_num);
-}
-/*:*/
-
-/*M:010
- * Inter-guest networking is an interesting area.  Simplest is to have a
- * --sharenet=<name> option which opens or creates a named pipe.  This can be
- * used to send packets to another guest in a 1:1 manner.
- *
- * More sophisticated is to use one of the tools developed for project like UML
- * to do networking.
- *
- * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be
- * completely generic ("here's my vring, attach to your vring") and would work
- * for any traffic.  Of course, namespace and permissions issues need to be
- * dealt with.  A more sophisticated "multi-channel" virtio_net.c could hide
- * multiple inter-guest channels behind one interface, although it would
- * require some manner of hotplugging new virtio channels.
- *
- * Finally, we could use a virtio network switch in the kernel, ie. vhost.
-:*/
-
-static u32 str2ip(const char *ipaddr)
-{
-	unsigned int b[4];
-
-	if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
-		errx(1, "Failed to parse IP address '%s'", ipaddr);
-	return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
-}
-
-static void str2mac(const char *macaddr, unsigned char mac[6])
-{
-	unsigned int m[6];
-	if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
-		   &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
-		errx(1, "Failed to parse mac address '%s'", macaddr);
-	mac[0] = m[0];
-	mac[1] = m[1];
-	mac[2] = m[2];
-	mac[3] = m[3];
-	mac[4] = m[4];
-	mac[5] = m[5];
-}
-
-/*
- * This code is "adapted" from libbridge: it attaches the Host end of the
- * network device to the bridge device specified by the command line.
- *
- * This is yet another James Morris contribution (I'm an IP-level guy, so I
- * dislike bridging), and I just try not to break it.
- */
-static void add_to_bridge(int fd, const char *if_name, const char *br_name)
-{
-	int ifidx;
-	struct ifreq ifr;
-
-	if (!*br_name)
-		errx(1, "must specify bridge name");
-
-	ifidx = if_nametoindex(if_name);
-	if (!ifidx)
-		errx(1, "interface %s does not exist!", if_name);
-
-	strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
-	ifr.ifr_name[IFNAMSIZ-1] = '\0';
-	ifr.ifr_ifindex = ifidx;
-	if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
-		err(1, "can't add %s to bridge %s", if_name, br_name);
-}
-
-/*
- * This sets up the Host end of the network device with an IP address, brings
- * it up so packets will flow, the copies the MAC address into the hwaddr
- * pointer.
- */
-static void configure_device(int fd, const char *tapif, u32 ipaddr)
-{
-	struct ifreq ifr;
-	struct sockaddr_in sin;
-
-	memset(&ifr, 0, sizeof(ifr));
-	strcpy(ifr.ifr_name, tapif);
-
-	/* Don't read these incantations.  Just cut & paste them like I did! */
-	sin.sin_family = AF_INET;
-	sin.sin_addr.s_addr = htonl(ipaddr);
-	memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
-	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
-		err(1, "Setting %s interface address", tapif);
-	ifr.ifr_flags = IFF_UP;
-	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
-		err(1, "Bringing interface %s up", tapif);
-}
-
-static int get_tun_device(char tapif[IFNAMSIZ])
-{
-	struct ifreq ifr;
-	int vnet_hdr_sz;
-	int netfd;
-
-	/* Start with this zeroed.  Messy but sure. */
-	memset(&ifr, 0, sizeof(ifr));
-
-	/*
-	 * We open the /dev/net/tun device and tell it we want a tap device.  A
-	 * tap device is like a tun device, only somehow different.  To tell
-	 * the truth, I completely blundered my way through this code, but it
-	 * works now!
-	 */
-	netfd = open_or_die("/dev/net/tun", O_RDWR);
-	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
-	strcpy(ifr.ifr_name, "tap%d");
-	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
-		err(1, "configuring /dev/net/tun");
-
-	if (ioctl(netfd, TUNSETOFFLOAD,
-		  TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
-		err(1, "Could not set features for tun device");
-
-	/*
-	 * We don't need checksums calculated for packets coming in this
-	 * device: trust us!
-	 */
-	ioctl(netfd, TUNSETNOCSUM, 1);
-
-	/*
-	 * In virtio before 1.0 (aka legacy virtio), we added a 16-bit
-	 * field at the end of the network header iff
-	 * VIRTIO_NET_F_MRG_RXBUF was negotiated.  For virtio 1.0,
-	 * that became the norm, but we need to tell the tun device
-	 * about our expanded header (which is called
-	 * virtio_net_hdr_mrg_rxbuf in the legacy system).
-	 */
-	vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1);
-	if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0)
-		err(1, "Setting tun header size to %u", vnet_hdr_sz);
-
-	memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
-	return netfd;
-}
-
-/*L:195
- * Our network is a Host<->Guest network.  This can either use bridging or
- * routing, but the principle is the same: it uses the "tun" device to inject
- * packets into the Host as if they came in from a normal network card.  We
- * just shunt packets between the Guest and the tun device.
- */
-static void setup_tun_net(char *arg)
-{
-	struct device *dev;
-	struct net_info *net_info = malloc(sizeof(*net_info));
-	int ipfd;
-	u32 ip = INADDR_ANY;
-	bool bridging = false;
-	char tapif[IFNAMSIZ], *p;
-	struct virtio_net_config conf;
-
-	net_info->tunfd = get_tun_device(tapif);
-
-	/* First we create a new network device. */
-	dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00);
-	dev->priv = net_info;
-
-	/* Network devices need a recv and a send queue, just like console. */
-	add_pci_virtqueue(dev, net_input, "rx");
-	add_pci_virtqueue(dev, net_output, "tx");
-
-	/*
-	 * We need a socket to perform the magic network ioctls to bring up the
-	 * tap interface, connect to the bridge etc.  Any socket will do!
-	 */
-	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
-	if (ipfd < 0)
-		err(1, "opening IP socket");
-
-	/* If the command line was --tunnet=bridge:<name> do bridging. */
-	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
-		arg += strlen(BRIDGE_PFX);
-		bridging = true;
-	}
-
-	/* A mac address may follow the bridge name or IP address */
-	p = strchr(arg, ':');
-	if (p) {
-		str2mac(p+1, conf.mac);
-		add_pci_feature(dev, VIRTIO_NET_F_MAC);
-		*p = '\0';
-	}
-
-	/* arg is now either an IP address or a bridge name */
-	if (bridging)
-		add_to_bridge(ipfd, tapif, arg);
-	else
-		ip = str2ip(arg);
-
-	/* Set up the tun device. */
-	configure_device(ipfd, tapif, ip);
-
-	/* Expect Guest to handle everything except UFO */
-	add_pci_feature(dev, VIRTIO_NET_F_CSUM);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
-	add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN);
-	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4);
-	add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6);
-	add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN);
-	/* We handle indirect ring entries */
-	add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
-	set_device_config(dev, &conf, sizeof(conf));
-
-	/* We don't need the socket any more; setup is done. */
-	close(ipfd);
-
-	if (bridging)
-		verbose("device %u: tun %s attached to bridge: %s\n",
-			devices.device_num, tapif, arg);
-	else
-		verbose("device %u: tun %s: %s\n",
-			devices.device_num, tapif, arg);
-}
-/*:*/
-
-/* This hangs off device->priv. */
-struct vblk_info {
-	/* The size of the file. */
-	off64_t len;
-
-	/* The file descriptor for the file. */
-	int fd;
-
-};
-
-/*L:210
- * The Disk
- *
- * The disk only has one virtqueue, so it only has one thread.  It is really
- * simple: the Guest asks for a block number and we read or write that position
- * in the file.
- *
- * Before we serviced each virtqueue in a separate thread, that was unacceptably
- * slow: the Guest waits until the read is finished before running anything
- * else, even if it could have been doing useful work.
- *
- * We could have used async I/O, except it's reputed to suck so hard that
- * characters actually go missing from your code when you try to use it.
- */
-static void blk_request(struct virtqueue *vq)
-{
-	struct vblk_info *vblk = vq->dev->priv;
-	unsigned int head, out_num, in_num, wlen;
-	int ret, i;
-	u8 *in;
-	struct virtio_blk_outhdr out;
-	struct iovec iov[vq->vring.num];
-	off64_t off;
-
-	/*
-	 * Get the next request, where we normally wait.  It triggers the
-	 * interrupt to acknowledge previously serviced requests (if any).
-	 */
-	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-
-	/* Copy the output header from the front of the iov (adjusts iov) */
-	iov_consume(vq->dev, iov, out_num, &out, sizeof(out));
-
-	/* Find and trim end of iov input array, for our status byte. */
-	in = NULL;
-	for (i = out_num + in_num - 1; i >= out_num; i--) {
-		if (iov[i].iov_len > 0) {
-			in = iov[i].iov_base + iov[i].iov_len - 1;
-			iov[i].iov_len--;
-			break;
-		}
-	}
-	if (!in)
-		bad_driver_vq(vq, "Bad virtblk cmd with no room for status");
-
-	/*
-	 * For historical reasons, block operations are expressed in 512 byte
-	 * "sectors".
-	 */
-	off = out.sector * 512;
-
-	if (out.type & VIRTIO_BLK_T_OUT) {
-		/*
-		 * Write
-		 *
-		 * Move to the right location in the block file.  This can fail
-		 * if they try to write past end.
-		 */
-		if (lseek64(vblk->fd, off, SEEK_SET) != off)
-			err(1, "Bad seek to sector %llu", out.sector);
-
-		ret = writev(vblk->fd, iov, out_num);
-		verbose("WRITE to sector %llu: %i\n", out.sector, ret);
-
-		/*
-		 * Grr... Now we know how long the descriptor they sent was, we
-		 * make sure they didn't try to write over the end of the block
-		 * file (possibly extending it).
-		 */
-		if (ret > 0 && off + ret > vblk->len) {
-			/* Trim it back to the correct length */
-			ftruncate64(vblk->fd, vblk->len);
-			/* Die, bad Guest, die. */
-			bad_driver_vq(vq, "Write past end %llu+%u", off, ret);
-		}
-
-		wlen = sizeof(*in);
-		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
-	} else if (out.type & VIRTIO_BLK_T_FLUSH) {
-		/* Flush */
-		ret = fdatasync(vblk->fd);
-		verbose("FLUSH fdatasync: %i\n", ret);
-		wlen = sizeof(*in);
-		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
-	} else {
-		/*
-		 * Read
-		 *
-		 * Move to the right location in the block file.  This can fail
-		 * if they try to read past end.
-		 */
-		if (lseek64(vblk->fd, off, SEEK_SET) != off)
-			err(1, "Bad seek to sector %llu", out.sector);
-
-		ret = readv(vblk->fd, iov + out_num, in_num);
-		if (ret >= 0) {
-			wlen = sizeof(*in) + ret;
-			*in = VIRTIO_BLK_S_OK;
-		} else {
-			wlen = sizeof(*in);
-			*in = VIRTIO_BLK_S_IOERR;
-		}
-	}
-
-	/* Finished that request. */
-	add_used(vq, head, wlen);
-}
-
-/*L:198 This actually sets up a virtual block device. */
-static void setup_block_file(const char *filename)
-{
-	struct device *dev;
-	struct vblk_info *vblk;
-	struct virtio_blk_config conf;
-
-	/* Create the device. */
-	dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80);
-
-	/* The device has one virtqueue, where the Guest places requests. */
-	add_pci_virtqueue(dev, blk_request, "request");
-
-	/* Allocate the room for our own bookkeeping */
-	vblk = dev->priv = malloc(sizeof(*vblk));
-
-	/* First we open the file and store the length. */
-	vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
-	vblk->len = lseek64(vblk->fd, 0, SEEK_END);
-
-	/* Tell Guest how many sectors this device has. */
-	conf.capacity = cpu_to_le64(vblk->len / 512);
-
-	/*
-	 * Tell Guest not to put in too many descriptors at once: two are used
-	 * for the in and out elements.
-	 */
-	add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX);
-	conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
-
-	set_device_config(dev, &conf, sizeof(struct virtio_blk_config));
-
-	verbose("device %u: virtblock %llu sectors\n",
-		devices.device_num, le64_to_cpu(conf.capacity));
-}
-
-/*L:211
- * Our random number generator device reads from /dev/urandom into the Guest's
- * input buffers.  The usual case is that the Guest doesn't want random numbers
- * and so has no buffers although /dev/urandom is still readable, whereas
- * console is the reverse.
- *
- * The same logic applies, however.
- */
-struct rng_info {
-	int rfd;
-};
-
-static void rng_input(struct virtqueue *vq)
-{
-	int len;
-	unsigned int head, in_num, out_num, totlen = 0;
-	struct rng_info *rng_info = vq->dev->priv;
-	struct iovec iov[vq->vring.num];
-
-	/* First we need a buffer from the Guests's virtqueue. */
-	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
-	if (out_num)
-		bad_driver_vq(vq, "Output buffers in rng?");
-
-	/*
-	 * Just like the console write, we loop to cover the whole iovec.
-	 * In this case, short reads actually happen quite a bit.
-	 */
-	while (!iov_empty(iov, in_num)) {
-		len = readv(rng_info->rfd, iov, in_num);
-		if (len <= 0)
-			err(1, "Read from /dev/urandom gave %i", len);
-		iov_consume(vq->dev, iov, in_num, NULL, len);
-		totlen += len;
-	}
-
-	/* Tell the Guest about the new input. */
-	add_used(vq, head, totlen);
-}
-
-/*L:199
- * This creates a "hardware" random number device for the Guest.
- */
-static void setup_rng(void)
-{
-	struct device *dev;
-	struct rng_info *rng_info = malloc(sizeof(*rng_info));
-
-	/* Our device's private info simply contains the /dev/urandom fd. */
-	rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY);
-
-	/* Create the new device. */
-	dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0);
-	dev->priv = rng_info;
-
-	/* The device has one virtqueue, where the Guest places inbufs. */
-	add_pci_virtqueue(dev, rng_input, "input");
-
-	/* We don't have any configuration space */
-	no_device_config(dev);
-
-	verbose("device %u: rng\n", devices.device_num);
-}
-/* That's the end of device setup. */
-
-/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
-static void __attribute__((noreturn)) restart_guest(void)
-{
-	unsigned int i;
-
-	/*
-	 * Since we don't track all open fds, we simply close everything beyond
-	 * stderr.
-	 */
-	for (i = 3; i < FD_SETSIZE; i++)
-		close(i);
-
-	/* Reset all the devices (kills all threads). */
-	cleanup_devices();
-
-	execv(main_args[0], main_args);
-	err(1, "Could not exec %s", main_args[0]);
-}
-
-/*L:220
- * Finally we reach the core of the Launcher which runs the Guest, serves
- * its input and output, and finally, lays it to rest.
- */
-static void __attribute__((noreturn)) run_guest(void)
-{
-	for (;;) {
-		struct lguest_pending notify;
-		int readval;
-
-		/* We read from the /dev/lguest device to run the Guest. */
-		readval = pread(lguest_fd, &notify, sizeof(notify), cpu_id);
-		if (readval == sizeof(notify)) {
-			if (notify.trap == 13) {
-				verbose("Emulating instruction at %#x\n",
-					getreg(eip));
-				emulate_insn(notify.insn);
-			} else if (notify.trap == 14) {
-				verbose("Emulating MMIO at %#x\n",
-					getreg(eip));
-				emulate_mmio(notify.addr, notify.insn);
-			} else
-				errx(1, "Unknown trap %i addr %#08x\n",
-				     notify.trap, notify.addr);
-		/* ENOENT means the Guest died.  Reading tells us why. */
-		} else if (errno == ENOENT) {
-			char reason[1024] = { 0 };
-			pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
-			errx(1, "%s", reason);
-		/* ERESTART means that we need to reboot the guest */
-		} else if (errno == ERESTART) {
-			restart_guest();
-		/* Anything else means a bug or incompatible change. */
-		} else
-			err(1, "Running guest failed");
-	}
-}
-/*L:240
- * This is the end of the Launcher.  The good news: we are over halfway
- * through!  The bad news: the most fiendish part of the code still lies ahead
- * of us.
- *
- * Are you ready?  Take a deep breath and join me in the core of the Host, in
- * "make Host".
-:*/
-
-static struct option opts[] = {
-	{ "verbose", 0, NULL, 'v' },
-	{ "tunnet", 1, NULL, 't' },
-	{ "block", 1, NULL, 'b' },
-	{ "rng", 0, NULL, 'r' },
-	{ "initrd", 1, NULL, 'i' },
-	{ "username", 1, NULL, 'u' },
-	{ "chroot", 1, NULL, 'c' },
-	{ NULL },
-};
-static void usage(void)
-{
-	errx(1, "Usage: lguest [--verbose] "
-	     "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
-	     "|--block=<filename>|--initrd=<filename>]...\n"
-	     "<mem-in-mb> vmlinux [args...]");
-}
-
-/*L:105 The main routine is where the real work begins: */
-int main(int argc, char *argv[])
-{
-	/* Memory, code startpoint and size of the (optional) initrd. */
-	unsigned long mem = 0, start, initrd_size = 0;
-	/* Two temporaries. */
-	int i, c;
-	/* The boot information for the Guest. */
-	struct boot_params *boot;
-	/* If they specify an initrd file to load. */
-	const char *initrd_name = NULL;
-
-	/* Password structure for initgroups/setres[gu]id */
-	struct passwd *user_details = NULL;
-
-	/* Directory to chroot to */
-	char *chroot_path = NULL;
-
-	/* Save the args: we "reboot" by execing ourselves again. */
-	main_args = argv;
-
-	/*
-	 * First we initialize the device list.  We remember next interrupt
-	 * number to use for devices (1: remember that 0 is used by the timer).
-	 */
-	devices.next_irq = 1;
-
-	/* We're CPU 0.  In fact, that's the only CPU possible right now. */
-	cpu_id = 0;
-
-	/*
-	 * We need to know how much memory so we can set up the device
-	 * descriptor and memory pages for the devices as we parse the command
-	 * line.  So we quickly look through the arguments to find the amount
-	 * of memory now.
-	 */
-	for (i = 1; i < argc; i++) {
-		if (argv[i][0] != '-') {
-			mem = atoi(argv[i]) * 1024 * 1024;
-			/*
-			 * We start by mapping anonymous pages over all of
-			 * guest-physical memory range.  This fills it with 0,
-			 * and ensures that the Guest won't be killed when it
-			 * tries to access it.
-			 */
-			guest_base = map_zeroed_pages(mem / getpagesize()
-						      + DEVICE_PAGES);
-			guest_limit = mem;
-			guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize();
-			break;
-		}
-	}
-
-	/* If we exit via err(), this kills all the threads, restores tty. */
-	atexit(cleanup_devices);
-
-	/* We always have a console device, and it's always device 1. */
-	setup_console();
-
-	/* The options are fairly straight-forward */
-	while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
-		switch (c) {
-		case 'v':
-			verbose = true;
-			break;
-		case 't':
-			setup_tun_net(optarg);
-			break;
-		case 'b':
-			setup_block_file(optarg);
-			break;
-		case 'r':
-			setup_rng();
-			break;
-		case 'i':
-			initrd_name = optarg;
-			break;
-		case 'u':
-			user_details = getpwnam(optarg);
-			if (!user_details)
-				err(1, "getpwnam failed, incorrect username?");
-			break;
-		case 'c':
-			chroot_path = optarg;
-			break;
-		default:
-			warnx("Unknown argument %s", argv[optind]);
-			usage();
-		}
-	}
-	/*
-	 * After the other arguments we expect memory and kernel image name,
-	 * followed by command line arguments for the kernel.
-	 */
-	if (optind + 2 > argc)
-		usage();
-
-	verbose("Guest base is at %p\n", guest_base);
-
-	/* Initialize the (fake) PCI host bridge device. */
-	init_pci_host_bridge();
-
-	/* Now we load the kernel */
-	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
-
-	/* Boot information is stashed at physical address 0 */
-	boot = from_guest_phys(0);
-
-	/* Map the initrd image if requested (at top of physical memory) */
-	if (initrd_name) {
-		initrd_size = load_initrd(initrd_name, mem);
-		/*
-		 * These are the location in the Linux boot header where the
-		 * start and size of the initrd are expected to be found.
-		 */
-		boot->hdr.ramdisk_image = mem - initrd_size;
-		boot->hdr.ramdisk_size = initrd_size;
-		/* The bootloader type 0xFF means "unknown"; that's OK. */
-		boot->hdr.type_of_loader = 0xFF;
-	}
-
-	/*
-	 * The Linux boot header contains an "E820" memory map: ours is a
-	 * simple, single region.
-	 */
-	boot->e820_entries = 1;
-	boot->e820_table[0] = ((struct e820_entry) { 0, mem, E820_TYPE_RAM });
-	/*
-	 * The boot header contains a command line pointer: we put the command
-	 * line after the boot header.
-	 */
-	boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
-	/* We use a simple helper to copy the arguments separated by spaces. */
-	concat((char *)(boot + 1), argv+optind+2);
-
-	/* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
-	boot->hdr.kernel_alignment = 0x1000000;
-
-	/* Boot protocol version: 2.07 supports the fields for lguest. */
-	boot->hdr.version = 0x207;
-
-	/* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */
-	boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST;
-
-	/* Tell the entry path not to try to reload segment registers. */
-	boot->hdr.loadflags |= KEEP_SEGMENTS;
-
-	/* We don't support tboot: */
-	boot->tboot_addr = 0;
-
-	/* Ensure this is 0 to prevent APM from loading: */
-	boot->apm_bios_info.version = 0;
-
-	/* We tell the kernel to initialize the Guest. */
-	tell_kernel(start);
-
-	/* Ensure that we terminate if a device-servicing child dies. */
-	signal(SIGCHLD, kill_launcher);
-
-	/* If requested, chroot to a directory */
-	if (chroot_path) {
-		if (chroot(chroot_path) != 0)
-			err(1, "chroot(\"%s\") failed", chroot_path);
-
-		if (chdir("/") != 0)
-			err(1, "chdir(\"/\") failed");
-
-		verbose("chroot done\n");
-	}
-
-	/* If requested, drop privileges */
-	if (user_details) {
-		uid_t u;
-		gid_t g;
-
-		u = user_details->pw_uid;
-		g = user_details->pw_gid;
-
-		if (initgroups(user_details->pw_name, g) != 0)
-			err(1, "initgroups failed");
-
-		if (setresgid(g, g, g) != 0)
-			err(1, "setresgid failed");
-
-		if (setresuid(u, u, u) != 0)
-			err(1, "setresuid failed");
-
-		verbose("Dropping privileges completed\n");
-	}
-
-	/* Finally, run the Guest.  This doesn't return. */
-	run_guest();
-}
-/*:*/
-
-/*M:999
- * Mastery is done: you now know everything I do.
- *
- * But surely you have seen code, features and bugs in your wanderings which
- * you now yearn to attack?  That is the real game, and I look forward to you
- * patching and forking lguest into the Your-Name-Here-visor.
- *
- * Farewell, and good coding!
- * Rusty Russell.
- */
diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt
deleted file mode 100644
index 06e1f4649511..000000000000
--- a/tools/lguest/lguest.txt
+++ /dev/null
@@ -1,125 +0,0 @@
-      __
- (___()'`;  Rusty's Remarkably Unreliable Guide to Lguest
- /,    /`      - or, A Young Coder's Illustrated Hypervisor
- \\"--\\    http://lguest.ozlabs.org
-
-Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
-for Linux developers and users to experiment with virtualization with the
-minimum of complexity.  Nonetheless, it should have sufficient features to
-make it useful for specific tasks, and, of course, you are encouraged to fork
-and enhance it (see drivers/lguest/README).
-
-Features:
-
-- Kernel module which runs in a normal kernel.
-- Simple I/O model for communication.
-- Simple program to create new guests.
-- Logo contains cute puppies: http://lguest.ozlabs.org
-
-Developer features:
-
-- Fun to hack on.
-- No ABI: being tied to a specific kernel anyway, you can change anything.
-- Many opportunities for improvement or feature implementation.
-
-Running Lguest:
-
-- The easiest way to run lguest is to use same kernel as guest and host.
-  You can configure them differently, but usually it's easiest not to.
-
-  You will need to configure your kernel with the following options:
-
-  "Processor type and features":
-     "Paravirtualized guest support" = Y
-        "Lguest guest support" = Y
-     "High Memory Support" = off/4GB
-     "Alignment value to which kernel should be aligned" = 0x100000
-        (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
-         CONFIG_PHYSICAL_ALIGN=0x100000)
-
-  "Device Drivers":
-     "Block devices"
-        "Virtio block driver" = M/Y
-     "Network device support"
-        "Universal TUN/TAP device driver support" = M/Y
-        "Virtio network driver" = M/Y
-           (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m)
-
-  "Virtualization"
-     "Linux hypervisor example code" = M/Y
-        (CONFIG_LGUEST=m)
-
-- A tool called "lguest" is available in this directory: type "make"
-  to build it.  If you didn't build your kernel in-tree, use "make
-  O=<builddir>".
-
-- Create or find a root disk image.  There are several useful ones
-  around, such as the xm-test tiny root image at
-	  http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
-
-  For more serious work, I usually use a distribution ISO image and
-  install it under qemu, then make multiple copies:
-
-	  dd if=/dev/zero of=rootfile bs=1M count=2048
-	  qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
-
-  Make sure that you install a getty on /dev/hvc0 if you want to log in on the
-  console!
-
-- "modprobe lg" if you built it as a module.
-
-- Run an lguest as root:
-
-      tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
-        --block=rootfile root=/dev/vda
-
-   Explanation:
-    64: the amount of memory to use, in MB.
-
-    vmlinux: the kernel image found in the top of your build directory.  You
-       can also use a standard bzImage.
-
-    --tunnet=192.168.19.1: configures a "tap" device for networking with this
-       IP address.
-
-    --block=rootfile: a file or block device which becomes /dev/vda
-       inside the guest.
-
-    root=/dev/vda: this (and anything else on the command line) are
-       kernel boot parameters.
-
-- Configuring networking.  I usually have the host masquerade, using
-  "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
-  /proc/sys/net/ipv4/ip_forward".  In this example, I would configure
-  eth0 inside the guest at 192.168.19.2.
-
-  Another method is to bridge the tap device to an external interface
-  using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
-  to obtain an IP address.  The bridge needs to be configured first:
-  this option simply adds the tap interface to it.
-
-  A simple example on my system:
-
-    ifconfig eth0 0.0.0.0
-    brctl addbr lg0
-    ifconfig lg0 up
-    brctl addif lg0 eth0
-    dhclient lg0
-
-  Then use --tunnet=bridge:lg0 when launching the guest.
-
-  See:
-  
-    http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge
-    
-  for general information on how to get bridging to work.
-
-- Random number generation. Using the --rng option will provide a
-  /dev/hwrng in the guest that will read from the host's /dev/random.
-  Use this option in conjunction with rng-tools (see ../hw_random.txt)
-  to provide entropy to the guest kernel's /dev/random.
-
-There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
-
-Good luck!
-Rusty Russell rusty@rustcorp.com.au.
diff --git a/tools/lib/api/Makefile b/tools/lib/api/Makefile
index eb6e0b36bfc1..4563ba7ede6f 100644
--- a/tools/lib/api/Makefile
+++ b/tools/lib/api/Makefile
@@ -8,9 +8,9 @@ srctree := $(patsubst %/,%,$(dir $(srctree)))
 #$(info Determined 'srctree' to be $(srctree))
 endif
 
-CC = $(CROSS_COMPILE)gcc
-AR = $(CROSS_COMPILE)ar
-LD = $(CROSS_COMPILE)ld
+CC ?= $(CROSS_COMPILE)gcc
+AR ?= $(CROSS_COMPILE)ar
+LD ?= $(CROSS_COMPILE)ld
 
 MAKEFLAGS += --no-print-directory
 
@@ -19,7 +19,7 @@ LIBFILE = $(OUTPUT)libapi.a
 CFLAGS := $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
 CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -fPIC
 
-ifeq ($(CC), clang)
+ifeq ($(CC_NO_CLANG), 0)
   CFLAGS += -O3
 else
   CFLAGS += -O6
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 1f5300e56b44..4ed0257dc1f3 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -154,12 +154,12 @@ all: fixdep $(VERSION_FILES) all_cmd
 all_cmd: $(CMD_TARGETS)
 
 $(BPF_IN): force elfdep bpfdep
-	@(test -f ../../../include/uapi/linux/bpf.h -a -f ../../../include/uapi/linux/bpf.h && ( \
+	@(test -f ../../include/uapi/linux/bpf.h -a -f ../../../include/uapi/linux/bpf.h && ( \
 	(diff -B ../../include/uapi/linux/bpf.h ../../../include/uapi/linux/bpf.h >/dev/null) || \
-	echo "Warning: tools/include/uapi/linux/bpf.h differs from kernel" >&2 )) || true
-	@(test -f ../../../include/uapi/linux/bpf_common.h -a -f ../../../include/uapi/linux/bpf_common.h && ( \
+	echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf.h' differs from latest version at 'include/uapi/linux/bpf.h'" >&2 )) || true
+	@(test -f ../../include/uapi/linux/bpf_common.h -a -f ../../../include/uapi/linux/bpf_common.h && ( \
 	(diff -B ../../include/uapi/linux/bpf_common.h ../../../include/uapi/linux/bpf_common.h >/dev/null) || \
-	echo "Warning: tools/include/uapi/linux/bpf_common.h differs from kernel" >&2 )) || true
+	echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/bpf_common.h' differs from latest version at 'include/uapi/linux/bpf_common.h'" >&2 )) || true
 	$(Q)$(MAKE) $(build)=libbpf
 
 $(OUTPUT)libbpf.so: $(BPF_IN)
diff --git a/tools/lib/string.c b/tools/lib/string.c
index 8e678af1c6ee..a4246f14ded1 100644
--- a/tools/lib/string.c
+++ b/tools/lib/string.c
@@ -39,27 +39,45 @@ void *memdup(const void *src, size_t len)
  * @s: input string
  * @res: result
  *
- * This routine returns 0 iff the first character is one of 'Yy1Nn0'.
- * Otherwise it will return -EINVAL.  Value pointed to by res is
- * updated upon finding a match.
+ * This routine returns 0 iff the first character is one of 'Yy1Nn0', or
+ * [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL.  Value
+ * pointed to by res is updated upon finding a match.
  */
 int strtobool(const char *s, bool *res)
 {
+	if (!s)
+		return -EINVAL;
+
 	switch (s[0]) {
 	case 'y':
 	case 'Y':
 	case '1':
 		*res = true;
-		break;
+		return 0;
 	case 'n':
 	case 'N':
 	case '0':
 		*res = false;
-		break;
+		return 0;
+	case 'o':
+	case 'O':
+		switch (s[1]) {
+		case 'n':
+		case 'N':
+			*res = true;
+			return 0;
+		case 'f':
+		case 'F':
+			*res = false;
+			return 0;
+		default:
+			break;
+		}
 	default:
-		return -EINVAL;
+		break;
 	}
-	return 0;
+
+	return -EINVAL;
 }
 
 /**
@@ -87,12 +105,3 @@ size_t __weak strlcpy(char *dest, const char *src, size_t size)
 	}
 	return ret;
 }
-
-int prefixcmp(const char *str, const char *prefix)
-{
-	for (; ; str++, prefix++)
-		if (!*prefix)
-			return 0;
-		else if (*str != *prefix)
-			return (unsigned char)*prefix - (unsigned char)*str;
-}
diff --git a/tools/lib/subcmd/Makefile b/tools/lib/subcmd/Makefile
index 3d1c3b5b5150..7e9f03c97e4c 100644
--- a/tools/lib/subcmd/Makefile
+++ b/tools/lib/subcmd/Makefile
@@ -21,7 +21,7 @@ LIBFILE = $(OUTPUT)libsubcmd.a
 CFLAGS := $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
 CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -fPIC
 
-ifeq ($(CC), clang)
+ifeq ($(CC_NO_CLANG), 0)
   CFLAGS += -O3
 else
   CFLAGS += -O6
diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c
index ba970a73d053..0310520f918e 100644
--- a/tools/lib/subcmd/help.c
+++ b/tools/lib/subcmd/help.c
@@ -171,7 +171,7 @@ static void list_commands_in_dir(struct cmdnames *cmds,
 	while ((de = readdir(dir)) != NULL) {
 		int entlen;
 
-		if (prefixcmp(de->d_name, prefix))
+		if (!strstarts(de->d_name, prefix))
 			continue;
 
 		astrcat(&buf, de->d_name);
diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c
index 359bfa77f39c..2bd6fd0c1d40 100644
--- a/tools/lib/subcmd/parse-options.c
+++ b/tools/lib/subcmd/parse-options.c
@@ -368,7 +368,7 @@ retry:
 			return 0;
 		}
 		if (!rest) {
-			if (!prefixcmp(options->long_name, "no-")) {
+			if (strstarts(options->long_name, "no-")) {
 				/*
 				 * The long name itself starts with "no-", so
 				 * accept the option without "no-" so that users
@@ -381,7 +381,7 @@ retry:
 					goto match;
 				}
 				/* Abbreviated case */
-				if (!prefixcmp(options->long_name + 3, arg)) {
+				if (strstarts(options->long_name + 3, arg)) {
 					flags |= OPT_UNSET;
 					goto is_abbreviated;
 				}
@@ -406,7 +406,7 @@ is_abbreviated:
 				continue;
 			}
 			/* negated and abbreviated very much? */
-			if (!prefixcmp("no-", arg)) {
+			if (strstarts("no-", arg)) {
 				flags |= OPT_UNSET;
 				goto is_abbreviated;
 			}
@@ -416,7 +416,7 @@ is_abbreviated:
 			flags |= OPT_UNSET;
 			rest = skip_prefix(arg + 3, options->long_name);
 			/* abbreviated and negated? */
-			if (!rest && !prefixcmp(options->long_name, arg + 3))
+			if (!rest && strstarts(options->long_name, arg + 3))
 				goto is_abbreviated;
 			if (!rest)
 				continue;
@@ -456,7 +456,7 @@ static void check_typos(const char *arg, const struct option *options)
 	if (strlen(arg) < 3)
 		return;
 
-	if (!prefixcmp(arg, "no-")) {
+	if (strstarts(arg, "no-")) {
 		fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg);
 		exit(129);
 	}
@@ -464,7 +464,7 @@ static void check_typos(const char *arg, const struct option *options)
 	for (; options->type != OPTION_END; options++) {
 		if (!options->long_name)
 			continue;
-		if (!prefixcmp(options->long_name, arg)) {
+		if (strstarts(options->long_name, arg)) {
 			fprintf(stderr, " Error: did you mean `--%s` (with two dashes ?)", arg);
 			exit(129);
 		}
@@ -933,10 +933,10 @@ opt:
 		if (opts->long_name == NULL)
 			continue;
 
-		if (!prefixcmp(opts->long_name, optstr))
+		if (strstarts(opts->long_name, optstr))
 			print_option_help(opts, 0);
-		if (!prefixcmp("no-", optstr) &&
-		    !prefixcmp(opts->long_name, optstr + 3))
+		if (strstarts("no-", optstr) &&
+		    strstarts(opts->long_name, optstr + 3))
 			print_option_help(opts, 0);
 	}
 
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 3a6425fefc43..6976c73e60c4 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -25,7 +25,8 @@ OBJTOOL_IN := $(OBJTOOL)-in.o
 all: $(OBJTOOL)
 
 INCLUDES := -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi
-CFLAGS   += -Wall -Werror $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -fomit-frame-pointer -O2 -g $(INCLUDES)
+WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed
+CFLAGS   += -Wall -Werror $(WARNINGS) -fomit-frame-pointer -O2 -g $(INCLUDES)
 LDFLAGS  += -lelf $(LIBSUBCMD)
 
 # Allow old libelf to be used:
diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h
index 21aeca874edb..b0d7dc3d71b5 100644
--- a/tools/objtool/arch.h
+++ b/tools/objtool/arch.h
@@ -31,8 +31,9 @@
 #define INSN_RETURN		6
 #define INSN_CONTEXT_SWITCH	7
 #define INSN_STACK		8
-#define INSN_NOP		9
-#define INSN_OTHER		10
+#define INSN_BUG		9
+#define INSN_NOP		10
+#define INSN_OTHER		11
 #define INSN_LAST		INSN_OTHER
 
 enum op_dest_type {
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 4559a21a8de2..0e8c8ec4fd4e 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -86,8 +86,8 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
 	struct insn insn;
 	int x86_64, sign;
 	unsigned char op1, op2, rex = 0, rex_b = 0, rex_r = 0, rex_w = 0,
-		      modrm = 0, modrm_mod = 0, modrm_rm = 0, modrm_reg = 0,
-		      sib = 0;
+		      rex_x = 0, modrm = 0, modrm_mod = 0, modrm_rm = 0,
+		      modrm_reg = 0, sib = 0;
 
 	x86_64 = is_x86_64(elf);
 	if (x86_64 == -1)
@@ -114,6 +114,7 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
 		rex = insn.rex_prefix.bytes[0];
 		rex_w = X86_REX_W(rex) >> 3;
 		rex_r = X86_REX_R(rex) >> 2;
+		rex_x = X86_REX_X(rex) >> 1;
 		rex_b = X86_REX_B(rex);
 	}
 
@@ -217,6 +218,18 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
 			op->dest.reg = CFI_BP;
 			break;
 		}
+
+		if (rex_w && !rex_b && modrm_mod == 3 && modrm_rm == 4) {
+
+			/* mov reg, %rsp */
+			*type = INSN_STACK;
+			op->src.type = OP_SRC_REG;
+			op->src.reg = op_to_cfi_reg[modrm_reg][rex_r];
+			op->dest.type = OP_DEST_REG;
+			op->dest.reg = CFI_SP;
+			break;
+		}
+
 		/* fallthrough */
 	case 0x88:
 		if (!rex_b &&
@@ -269,80 +282,28 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
 		break;
 
 	case 0x8d:
-		if (rex == 0x48 && modrm == 0x65) {
+		if (sib == 0x24 && rex_w && !rex_b && !rex_x) {
 
-			/* lea disp(%rbp), %rsp */
+			/* lea disp(%rsp), reg */
 			*type = INSN_STACK;
 			op->src.type = OP_SRC_ADD;
-			op->src.reg = CFI_BP;
+			op->src.reg = CFI_SP;
 			op->src.offset = insn.displacement.value;
 			op->dest.type = OP_DEST_REG;
-			op->dest.reg = CFI_SP;
-			break;
-		}
+			op->dest.reg = op_to_cfi_reg[modrm_reg][rex_r];
 
-		if (rex == 0x48 && (modrm == 0xa4 || modrm == 0x64) &&
-		    sib == 0x24) {
+		} else if (rex == 0x48 && modrm == 0x65) {
 
-			/* lea disp(%rsp), %rsp */
+			/* lea disp(%rbp), %rsp */
 			*type = INSN_STACK;
 			op->src.type = OP_SRC_ADD;
-			op->src.reg = CFI_SP;
+			op->src.reg = CFI_BP;
 			op->src.offset = insn.displacement.value;
 			op->dest.type = OP_DEST_REG;
 			op->dest.reg = CFI_SP;
-			break;
-		}
-
-		if (rex == 0x48 && modrm == 0x2c && sib == 0x24) {
 
-			/* lea (%rsp), %rbp */
-			*type = INSN_STACK;
-			op->src.type = OP_SRC_REG;
-			op->src.reg = CFI_SP;
-			op->dest.type = OP_DEST_REG;
-			op->dest.reg = CFI_BP;
-			break;
-		}
-
-		if (rex == 0x4c && modrm == 0x54 && sib == 0x24 &&
-		    insn.displacement.value == 8) {
-
-			/*
-			 * lea 0x8(%rsp), %r10
-			 *
-			 * Here r10 is the "drap" pointer, used as a stack
-			 * pointer helper when the stack gets realigned.
-			 */
-			*type = INSN_STACK;
-			op->src.type = OP_SRC_ADD;
-			op->src.reg = CFI_SP;
-			op->src.offset = 8;
-			op->dest.type = OP_DEST_REG;
-			op->dest.reg = CFI_R10;
-			break;
-		}
-
-		if (rex == 0x4c && modrm == 0x6c && sib == 0x24 &&
-		    insn.displacement.value == 16) {
-
-			/*
-			 * lea 0x10(%rsp), %r13
-			 *
-			 * Here r13 is the "drap" pointer, used as a stack
-			 * pointer helper when the stack gets realigned.
-			 */
-			*type = INSN_STACK;
-			op->src.type = OP_SRC_ADD;
-			op->src.reg = CFI_SP;
-			op->src.offset = 16;
-			op->dest.type = OP_DEST_REG;
-			op->dest.reg = CFI_R13;
-			break;
-		}
-
-		if (rex == 0x49 && modrm == 0x62 &&
-		    insn.displacement.value == -8) {
+		} else if (rex == 0x49 && modrm == 0x62 &&
+			   insn.displacement.value == -8) {
 
 			/*
 			 * lea -0x8(%r10), %rsp
@@ -356,11 +317,9 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
 			op->src.offset = -8;
 			op->dest.type = OP_DEST_REG;
 			op->dest.reg = CFI_SP;
-			break;
-		}
 
-		if (rex == 0x49 && modrm == 0x65 &&
-		    insn.displacement.value == -16) {
+		} else if (rex == 0x49 && modrm == 0x65 &&
+			   insn.displacement.value == -16) {
 
 			/*
 			 * lea -0x10(%r13), %rsp
@@ -374,7 +333,6 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
 			op->src.offset = -16;
 			op->dest.type = OP_DEST_REG;
 			op->dest.reg = CFI_SP;
-			break;
 		}
 
 		break;
@@ -406,20 +364,27 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
 
 	case 0x0f:
 
-		if (op2 >= 0x80 && op2 <= 0x8f)
+		if (op2 >= 0x80 && op2 <= 0x8f) {
+
 			*type = INSN_JUMP_CONDITIONAL;
-		else if (op2 == 0x05 || op2 == 0x07 || op2 == 0x34 ||
-			 op2 == 0x35)
+
+		} else if (op2 == 0x05 || op2 == 0x07 || op2 == 0x34 ||
+			   op2 == 0x35) {
 
 			/* sysenter, sysret */
 			*type = INSN_CONTEXT_SWITCH;
 
-		else if (op2 == 0x0d || op2 == 0x1f)
+		} else if (op2 == 0x0b || op2 == 0xb9) {
+
+			/* ud2 (0f 0b), ud1 (0f b9) */
+			*type = INSN_BUG;
+
+		} else if (op2 == 0x0d || op2 == 0x1f) {
 
 			/* nopl/nopw */
 			*type = INSN_NOP;
 
-		else if (op2 == 0xa0 || op2 == 0xa8) {
+		} else if (op2 == 0xa0 || op2 == 0xa8) {
 
 			/* push fs/gs */
 			*type = INSN_STACK;
diff --git a/tools/objtool/cfi.h b/tools/objtool/cfi.h
index 443ab2c69992..2fe883c665c7 100644
--- a/tools/objtool/cfi.h
+++ b/tools/objtool/cfi.h
@@ -40,7 +40,7 @@
 #define CFI_R14			14
 #define CFI_R15			15
 #define CFI_RA			16
-#define CFI_NUM_REGS	17
+#define CFI_NUM_REGS		17
 
 struct cfi_reg {
 	int base;
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 3436a942b606..f744617c9946 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -218,9 +218,12 @@ static void clear_insn_state(struct insn_state *state)
 
 	memset(state, 0, sizeof(*state));
 	state->cfa.base = CFI_UNDEFINED;
-	for (i = 0; i < CFI_NUM_REGS; i++)
+	for (i = 0; i < CFI_NUM_REGS; i++) {
 		state->regs[i].base = CFI_UNDEFINED;
+		state->vals[i].base = CFI_UNDEFINED;
+	}
 	state->drap_reg = CFI_UNDEFINED;
+	state->drap_offset = -1;
 }
 
 /*
@@ -296,7 +299,7 @@ static int decode_instructions(struct objtool_file *file)
 }
 
 /*
- * Find all uses of the unreachable() macro, which are code path dead ends.
+ * Mark "ud2" instructions and manually annotated dead ends.
  */
 static int add_dead_ends(struct objtool_file *file)
 {
@@ -305,9 +308,20 @@ static int add_dead_ends(struct objtool_file *file)
 	struct instruction *insn;
 	bool found;
 
+	/*
+	 * By default, "ud2" is a dead end unless otherwise annotated, because
+	 * GCC 7 inserts it for certain divide-by-zero cases.
+	 */
+	for_each_insn(file, insn)
+		if (insn->type == INSN_BUG)
+			insn->dead_end = true;
+
+	/*
+	 * Check for manually annotated dead ends.
+	 */
 	sec = find_section_by_name(file->elf, ".rela.discard.unreachable");
 	if (!sec)
-		return 0;
+		goto reachable;
 
 	list_for_each_entry(rela, &sec->rela_list, list) {
 		if (rela->sym->type != STT_SECTION) {
@@ -340,6 +354,48 @@ static int add_dead_ends(struct objtool_file *file)
 		insn->dead_end = true;
 	}
 
+reachable:
+	/*
+	 * These manually annotated reachable checks are needed for GCC 4.4,
+	 * where the Linux unreachable() macro isn't supported.  In that case
+	 * GCC doesn't know the "ud2" is fatal, so it generates code as if it's
+	 * not a dead end.
+	 */
+	sec = find_section_by_name(file->elf, ".rela.discard.reachable");
+	if (!sec)
+		return 0;
+
+	list_for_each_entry(rela, &sec->rela_list, list) {
+		if (rela->sym->type != STT_SECTION) {
+			WARN("unexpected relocation symbol type in %s", sec->name);
+			return -1;
+		}
+		insn = find_insn(file, rela->sym->sec, rela->addend);
+		if (insn)
+			insn = list_prev_entry(insn, list);
+		else if (rela->addend == rela->sym->sec->len) {
+			found = false;
+			list_for_each_entry_reverse(insn, &file->insn_list, list) {
+				if (insn->sec == rela->sym->sec) {
+					found = true;
+					break;
+				}
+			}
+
+			if (!found) {
+				WARN("can't find reachable insn at %s+0x%x",
+				     rela->sym->sec->name, rela->addend);
+				return -1;
+			}
+		} else {
+			WARN("can't find reachable insn at %s+0x%x",
+			     rela->sym->sec->name, rela->addend);
+			return -1;
+		}
+
+		insn->dead_end = false;
+	}
+
 	return 0;
 }
 
@@ -1057,8 +1113,7 @@ static int update_insn_state_regs(struct instruction *insn, struct insn_state *s
 static void save_reg(struct insn_state *state, unsigned char reg, int base,
 		     int offset)
 {
-	if ((arch_callee_saved_reg(reg) ||
-	    (state->drap && reg == state->drap_reg)) &&
+	if (arch_callee_saved_reg(reg) &&
 	    state->regs[reg].base == CFI_UNDEFINED) {
 		state->regs[reg].base = base;
 		state->regs[reg].offset = offset;
@@ -1148,24 +1203,47 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
 		switch (op->src.type) {
 
 		case OP_SRC_REG:
-			if (cfa->base == op->src.reg && cfa->base == CFI_SP &&
-			    op->dest.reg == CFI_BP && regs[CFI_BP].base == CFI_CFA &&
-			    regs[CFI_BP].offset == -cfa->offset) {
-
-				/* mov %rsp, %rbp */
-				cfa->base = op->dest.reg;
-				state->bp_scratch = false;
-			} else if (state->drap) {
-
-				/* drap: mov %rsp, %rbp */
-				regs[CFI_BP].base = CFI_BP;
-				regs[CFI_BP].offset = -state->stack_size;
-				state->bp_scratch = false;
-			} else if (!no_fp) {
-
-				WARN_FUNC("unknown stack-related register move",
-					  insn->sec, insn->offset);
-				return -1;
+			if (op->src.reg == CFI_SP && op->dest.reg == CFI_BP) {
+
+				if (cfa->base == CFI_SP &&
+				    regs[CFI_BP].base == CFI_CFA &&
+				    regs[CFI_BP].offset == -cfa->offset) {
+
+					/* mov %rsp, %rbp */
+					cfa->base = op->dest.reg;
+					state->bp_scratch = false;
+				}
+
+				else if (state->drap) {
+
+					/* drap: mov %rsp, %rbp */
+					regs[CFI_BP].base = CFI_BP;
+					regs[CFI_BP].offset = -state->stack_size;
+					state->bp_scratch = false;
+				}
+			}
+
+			else if (op->dest.reg == cfa->base) {
+
+				/* mov %reg, %rsp */
+				if (cfa->base == CFI_SP &&
+				    state->vals[op->src.reg].base == CFI_CFA) {
+
+					/*
+					 * This is needed for the rare case
+					 * where GCC does something dumb like:
+					 *
+					 *   lea    0x8(%rsp), %rcx
+					 *   ...
+					 *   mov    %rcx, %rsp
+					 */
+					cfa->offset = -state->vals[op->src.reg].offset;
+					state->stack_size = cfa->offset;
+
+				} else {
+					cfa->base = CFI_UNDEFINED;
+					cfa->offset = 0;
+				}
 			}
 
 			break;
@@ -1187,11 +1265,25 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
 				break;
 			}
 
-			if (op->dest.reg != CFI_BP && op->src.reg == CFI_SP &&
-			    cfa->base == CFI_SP) {
+			if (op->src.reg == CFI_SP && cfa->base == CFI_SP) {
 
 				/* drap: lea disp(%rsp), %drap */
 				state->drap_reg = op->dest.reg;
+
+				/*
+				 * lea disp(%rsp), %reg
+				 *
+				 * This is needed for the rare case where GCC
+				 * does something dumb like:
+				 *
+				 *   lea    0x8(%rsp), %rcx
+				 *   ...
+				 *   mov    %rcx, %rsp
+				 */
+				state->vals[op->dest.reg].base = CFI_CFA;
+				state->vals[op->dest.reg].offset = \
+					-state->stack_size + op->src.offset;
+
 				break;
 			}
 
@@ -1228,7 +1320,6 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
 				cfa->base = state->drap_reg;
 				cfa->offset = state->stack_size = 0;
 				state->drap = true;
-
 			}
 
 			/*
@@ -1246,17 +1337,19 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
 				cfa->base = CFI_SP;
 			}
 
-			if (regs[op->dest.reg].offset == -state->stack_size) {
+			if (state->drap && cfa->base == CFI_BP_INDIRECT &&
+			    op->dest.type == OP_DEST_REG &&
+			    op->dest.reg == state->drap_reg &&
+			    state->drap_offset == -state->stack_size) {
 
-				if (state->drap && cfa->base == CFI_BP_INDIRECT &&
-				    op->dest.type == OP_DEST_REG &&
-				    op->dest.reg == state->drap_reg) {
+				/* drap: pop %drap */
+				cfa->base = state->drap_reg;
+				cfa->offset = 0;
+				state->drap_offset = -1;
 
-					/* drap: pop %drap */
-					cfa->base = state->drap_reg;
-					cfa->offset = 0;
-				}
+			} else if (regs[op->dest.reg].offset == -state->stack_size) {
 
+				/* pop %reg */
 				restore_reg(state, op->dest.reg);
 			}
 
@@ -1268,14 +1361,18 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
 
 		case OP_SRC_REG_INDIRECT:
 			if (state->drap && op->src.reg == CFI_BP &&
+			    op->src.offset == state->drap_offset) {
+
+				/* drap: mov disp(%rbp), %drap */
+				cfa->base = state->drap_reg;
+				cfa->offset = 0;
+				state->drap_offset = -1;
+			}
+
+			if (state->drap && op->src.reg == CFI_BP &&
 			    op->src.offset == regs[op->dest.reg].offset) {
 
 				/* drap: mov disp(%rbp), %reg */
-				if (op->dest.reg == state->drap_reg) {
-					cfa->base = state->drap_reg;
-					cfa->offset = 0;
-				}
-
 				restore_reg(state, op->dest.reg);
 
 			} else if (op->src.reg == cfa->base &&
@@ -1311,8 +1408,8 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
 				cfa->base = CFI_BP_INDIRECT;
 				cfa->offset = -state->stack_size;
 
-				/* save drap so we know when to undefine it */
-				save_reg(state, op->src.reg, CFI_CFA, -state->stack_size);
+				/* save drap so we know when to restore it */
+				state->drap_offset = -state->stack_size;
 
 			} else if (op->src.reg == CFI_BP && cfa->base == state->drap_reg) {
 
@@ -1346,8 +1443,8 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
 				cfa->base = CFI_BP_INDIRECT;
 				cfa->offset = op->dest.offset;
 
-				/* save drap so we know when to undefine it */
-				save_reg(state, op->src.reg, CFI_CFA, op->dest.offset);
+				/* save drap offset so we know when to restore it */
+				state->drap_offset = op->dest.offset;
 			}
 
 			else if (regs[op->src.reg].base == CFI_UNDEFINED) {
@@ -1438,11 +1535,12 @@ static bool insn_state_match(struct instruction *insn, struct insn_state *state)
 			  insn->sec, insn->offset, state1->type, state2->type);
 
 	} else if (state1->drap != state2->drap ||
-		 (state1->drap && state1->drap_reg != state2->drap_reg)) {
-		WARN_FUNC("stack state mismatch: drap1=%d(%d) drap2=%d(%d)",
+		 (state1->drap && state1->drap_reg != state2->drap_reg) ||
+		 (state1->drap && state1->drap_offset != state2->drap_offset)) {
+		WARN_FUNC("stack state mismatch: drap1=%d(%d,%d) drap2=%d(%d,%d)",
 			  insn->sec, insn->offset,
-			  state1->drap, state1->drap_reg,
-			  state2->drap, state2->drap_reg);
+			  state1->drap, state1->drap_reg, state1->drap_offset,
+			  state2->drap, state2->drap_reg, state2->drap_offset);
 
 	} else
 		return true;
@@ -1471,26 +1569,26 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 	if (insn->alt_group && list_empty(&insn->alts)) {
 		WARN_FUNC("don't know how to handle branch to middle of alternative instruction group",
 			  sec, insn->offset);
-		return -1;
+		return 1;
 	}
 
 	while (1) {
 		next_insn = next_insn_same_sec(file, insn);
 
-		if (file->c_file && insn->func) {
-			if (func && func != insn->func) {
-				WARN("%s() falls through to next function %s()",
-				     func->name, insn->func->name);
-				return 1;
-			}
+
+		if (file->c_file && func && insn->func && func != insn->func) {
+			WARN("%s() falls through to next function %s()",
+			     func->name, insn->func->name);
+			return 1;
 		}
 
-		func = insn->func;
+		if (insn->func)
+			func = insn->func;
 
 		if (func && insn->ignore) {
 			WARN_FUNC("BUG: why am I validating an ignored function?",
 				  sec, insn->offset);
-			return -1;
+			return 1;
 		}
 
 		if (insn->visited) {
@@ -1628,7 +1726,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 
 		case INSN_STACK:
 			if (update_insn_state(insn, &state))
-				return -1;
+				return 1;
 
 			break;
 
@@ -1693,8 +1791,13 @@ static bool ignore_unreachable_insn(struct instruction *insn)
 	/*
 	 * Ignore any unused exceptions.  This can happen when a whitelisted
 	 * function has an exception table entry.
+	 *
+	 * Also ignore alternative replacement instructions.  This can happen
+	 * when a whitelisted function uses one of the ALTERNATIVE macros.
 	 */
-	if (!strcmp(insn->sec->name, ".fixup"))
+	if (!strcmp(insn->sec->name, ".fixup") ||
+	    !strcmp(insn->sec->name, ".altinstr_replacement") ||
+	    !strcmp(insn->sec->name, ".altinstr_aux"))
 		return true;
 
 	/*
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index c9af11f0c8af..47d9ea70a83d 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -32,7 +32,8 @@ struct insn_state {
 	unsigned char type;
 	bool bp_scratch;
 	bool drap;
-	int drap_reg;
+	int drap_reg, drap_offset;
+	struct cfi_reg vals[CFI_NUM_REGS];
 };
 
 struct instruction {
diff --git a/tools/perf/Build b/tools/perf/Build
index bd8eeb60533c..b48ca40fccf9 100644
--- a/tools/perf/Build
+++ b/tools/perf/Build
@@ -50,6 +50,6 @@ libperf-y += util/
 libperf-y += arch/
 libperf-y += ui/
 libperf-y += scripts/
-libperf-y += trace/beauty/
+libperf-$(CONFIG_AUDIT) += trace/beauty/
 
 gtk-y += ui/gtk/
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
index 098cfb9ca8f0..db11478e30b4 100644
--- a/tools/perf/Documentation/Makefile
+++ b/tools/perf/Documentation/Makefile
@@ -192,7 +192,7 @@ do-install-man: man
 #		$(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir); \
 #		$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
 
-install-man: check-man-tools man
+install-man: check-man-tools man do-install-man
 
 ifdef missing_tools
   DO_INSTALL_MAN = $(warning Please install $(missing_tools) to have the man pages installed)
diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
index 4b6cdbf8f935..ab1b0825130a 100644
--- a/tools/perf/Documentation/intel-pt.txt
+++ b/tools/perf/Documentation/intel-pt.txt
@@ -104,9 +104,9 @@ system, asynchronous, interrupt, transaction abort, trace begin, trace end, and
 in transaction, respectively.
 
 While it is possible to create scripts to analyze the data, an alternative
-approach is available to export the data to a postgresql database.  Refer to
-script export-to-postgresql.py for more details, and to script
-call-graph-from-postgresql.py for an example of using the database.
+approach is available to export the data to a sqlite or postgresql database.
+Refer to script export-to-sqlite.py or export-to-postgresql.py for more details,
+and to script call-graph-from-sql.py for an example of using the database.
 
 There is also script intel-pt-events.py which provides an example of how to
 unpack the raw data for power events and PTWRITE.
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index a89273d8e744..c635eab6af54 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -43,6 +43,10 @@ OPTIONS
 --quiet::
 	Do not show any message.  (Suppress -v)
 
+-n::
+--show-nr-samples::
+	Show the number of samples for each symbol
+
 -D::
 --dump-raw-trace::
         Dump raw trace in ASCII.
@@ -88,6 +92,8 @@ OPTIONS
 --asm-raw::
 	Show raw instruction encoding of assembly instructions.
 
+--show-total-period:: Show a column with the sum of periods.
+
 --source::
 	Interleave source code with assembly code. Enabled by default,
 	disable with --no-source.
diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt
index 058064db39d2..84681007f80f 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -61,6 +61,11 @@ OPTIONS
 --verbose::
 	Be more verbose.
 
+--target-ns=PID::
+	Obtain mount namespace information from the target pid.  This is
+	used when creating a uprobe for a process that resides in a
+	different mount namespace from the perf(1) utility.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-buildid-list[1]
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 165c2b1d4317..d7e4869905f1 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -130,6 +130,11 @@ OPTIONS
 --max-probes=NUM::
 	Set the maximum number of probe points for an event. Default is 128.
 
+--target-ns=PID::
+	Obtain mount namespace information from the target pid.  This is
+	used when creating a uprobe for a process that resides in a
+	different mount namespace from the perf(1) utility.
+
 -x::
 --exec=PATH::
 	Specify path to the executable or shared library file for user
@@ -264,6 +269,15 @@ Add probes at malloc() function on libc
 
  ./perf probe -x /lib/libc.so.6 malloc or ./perf probe /lib/libc.so.6 malloc
 
+Add a uprobe to a target process running in a different mount namespace
+
+ ./perf probe --target-ns <target pid> -x /lib64/libc.so.6 malloc
+
+Add a USDT probe to a target process running in a different mount namespace
+
+ ./perf probe --target-ns <target pid> -x /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.121-0.b13.el7_3.x86_64/jre/lib/amd64/server/libjvm.so %sdt_hotspot:thread__sleep__end
+
+
 SEE ALSO
 --------
 linkperf:perf-trace[1], linkperf:perf-record[1], linkperf:perf-buildid-cache[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index b0e9e921d534..9bdea047c5db 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -332,6 +332,7 @@ following filters are defined:
 	- no_tx: only when the target is not in a hardware transaction
 	- abort_tx: only when the target is a hardware transaction abort
 	- cond: conditional branches
+	- save_type: save branch type during sampling in case binary is not available later
 
 +
 The option requires at least one branch type among any, any_call, any_ret, ind_call, cond.
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 698076313606..c37d61682dfb 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -41,13 +41,13 @@ report::
 
 	- a symbolically formed event like 'pmu/param1=0x3,param2/' where
 	  param1 and param2 are defined as formats for the PMU in
-	  /sys/bus/event_sources/devices/<pmu>/format/*
+	  /sys/bus/event_source/devices/<pmu>/format/*
 
 	- a symbolically formed event like 'pmu/config=M,config1=N,config2=K/'
 	  where M, N, K are numbers (in decimal, hex, octal format).
 	  Acceptable values for each of 'config', 'config1' and 'config2'
 	  parameters are defined by corresponding entries in
-	  /sys/bus/event_sources/devices/<pmu>/format/*
+	  /sys/bus/event_source/devices/<pmu>/format/*
 
 -i::
 --no-inherit::
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index e71d63843f45..d864ea6fd367 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -237,6 +237,10 @@ Default is to monitor all CPUS.
 --hierarchy::
 	Enable hierarchy output.
 
+--force::
+	Don't do ownership validation.
+
+
 INTERACTIVE PROMPTING KEYS
 --------------------------
 
diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt
index de8b39dda7b8..e90c59c6d815 100644
--- a/tools/perf/Documentation/perf.data-file-format.txt
+++ b/tools/perf/Documentation/perf.data-file-format.txt
@@ -398,6 +398,11 @@ struct auxtrace_error_event {
 	char msg[MAX_AUXTRACE_ERROR_MSG];
 };
 
+	PERF_RECORD_HEADER_FEATURE		= 80,
+
+Describes a header feature. These are records used in pipe-mode that
+contain information that otherwise would be in perf.data file's header.
+
 Event types
 
 Define the event attributes with their IDs.
@@ -422,8 +427,9 @@ struct perf_pipe_file_header {
 };
 
 The information about attrs, data, and event_types is instead in the
-synthesized events PERF_RECORD_ATTR, PERF_RECORD_HEADER_TRACING_DATA and
-PERF_RECORD_HEADER_EVENT_TYPE that are generated by perf record in pipe-mode.
+synthesized events PERF_RECORD_ATTR, PERF_RECORD_HEADER_TRACING_DATA,
+PERF_RECORD_HEADER_EVENT_TYPE, and PERF_RECORD_HEADER_FEATURE
+that are generated by perf record in pipe-mode.
 
 
 References:
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST
index a29da46d180f..62072822dc85 100644
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -70,15 +70,23 @@ tools/include/linux/hash.h
 tools/include/linux/kernel.h
 tools/include/linux/list.h
 tools/include/linux/log2.h
+tools/include/uapi/asm-generic/fcntl.h
+tools/include/uapi/asm-generic/ioctls.h
 tools/include/uapi/asm-generic/mman-common.h
 tools/include/uapi/asm-generic/mman.h
+tools/include/uapi/drm/drm.h
+tools/include/uapi/drm/i915_drm.h
 tools/include/uapi/linux/bpf.h
 tools/include/uapi/linux/bpf_common.h
 tools/include/uapi/linux/fcntl.h
 tools/include/uapi/linux/hw_breakpoint.h
+tools/include/uapi/linux/kvm.h
 tools/include/uapi/linux/mman.h
 tools/include/uapi/linux/perf_event.h
+tools/include/uapi/linux/sched.h
 tools/include/uapi/linux/stat.h
+tools/include/uapi/linux/vhost.h
+tools/include/uapi/sound/asound.h
 tools/include/linux/poison.h
 tools/include/linux/rbtree.h
 tools/include/linux/rbtree_augmented.h
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index bdf0e87f9b29..63f534a0902f 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -35,7 +35,7 @@ ifeq ($(SRCARCH),x86)
   ifeq (${IS_64_BIT}, 1)
     CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT -DHAVE_SYSCALL_TABLE -I$(OUTPUT)arch/x86/include/generated
     ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S
-    LIBUNWIND_LIBS = -lunwind -lunwind-x86_64
+    LIBUNWIND_LIBS = -lunwind-x86_64 -lunwind -llzma
     $(call detected,CONFIG_X86_64)
   else
     LIBUNWIND_LIBS = -lunwind-x86 -llzma -lunwind
@@ -103,8 +103,12 @@ ifdef LIBDW_DIR
   LIBDW_CFLAGS  := -I$(LIBDW_DIR)/include
   LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib
 endif
+DWARFLIBS := -ldw
+ifeq ($(findstring -static,${LDFLAGS}),-static)
+  DWARFLIBS += -lelf -lebl -ldl -lz -llzma -lbz2
+endif
 FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS)
-FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) -ldw
+FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) $(DWARFLIBS)
 
 # for linking with debug library, run like:
 # make DEBUG=1 LIBBABELTRACE_DIR=/opt/libbabeltrace/
@@ -144,7 +148,7 @@ ifndef DEBUG
 endif
 
 ifeq ($(DEBUG),0)
-ifeq ($(CC), clang)
+ifeq ($(CC_NO_CLANG), 0)
   CFLAGS += -O3
 else
   CFLAGS += -O6
@@ -180,7 +184,7 @@ ifdef PYTHON_CONFIG
   PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
   PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS)) -lutil
   PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
-  ifeq ($(CC), clang)
+  ifeq ($(CC_NO_CLANG), 0)
     PYTHON_EMBED_CCOPTS := $(filter-out -specs=%,$(PYTHON_EMBED_CCOPTS))
   endif
   FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
@@ -330,6 +334,11 @@ ifeq ($(feature-sched_getcpu), 1)
   CFLAGS += -DHAVE_SCHED_GETCPU_SUPPORT
 endif
 
+ifeq ($(feature-setns), 1)
+  CFLAGS += -DHAVE_SETNS_SUPPORT
+  $(call detected,CONFIG_SETNS)
+endif
+
 ifndef NO_LIBELF
   CFLAGS += -DHAVE_LIBELF_SUPPORT
   EXTLIBS += -lelf
@@ -360,10 +369,6 @@ ifndef NO_LIBELF
     else
       CFLAGS += -DHAVE_DWARF_SUPPORT $(LIBDW_CFLAGS)
       LDFLAGS += $(LIBDW_LDFLAGS)
-      DWARFLIBS := -ldw
-      ifeq ($(findstring -static,${LDFLAGS}),-static)
-	DWARFLIBS += -lelf -lebl -lz -llzma -lbz2
-      endif
       EXTLIBS += ${DWARFLIBS}
       $(call detected,CONFIG_DWARF)
     endif # PERF_HAVE_DWARF_REGS
@@ -500,6 +505,10 @@ ifndef NO_LOCAL_LIBUNWIND
   EXTLIBS += $(LIBUNWIND_LIBS)
   LDFLAGS += $(LIBUNWIND_LIBS)
 endif
+ifeq ($(findstring -static,${LDFLAGS}),-static)
+  # gcc -static links libgcc_eh which contans piece of libunwind
+  LIBUNWIND_LDFLAGS += -Wl,--allow-multiple-definition
+endif
 
 ifndef NO_LIBUNWIND
   CFLAGS  += -DHAVE_LIBUNWIND_SUPPORT
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 5008f51a08a2..91ef44bfaf3e 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -33,6 +33,11 @@ include ../scripts/utilities.mak
 #
 # Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
 #
+# Define EXCLUDE_EXTLIBS=-lmylib to exclude libmylib from the auto-generated
+# EXTLIBS.
+#
+# Define EXTRA_PERFLIBS to pass extra libraries to PERFLIBS.
+#
 # Define NO_DWARF if you do not want debug-info analysis feature at all.
 #
 # Define WERROR=0 to disable treating any warnings as errors.
@@ -159,8 +164,8 @@ LN      = ln -f
 MKDIR   = mkdir
 FIND    = find
 INSTALL = install
-FLEX    = flex
-BISON   = bison
+FLEX    ?= flex
+BISON   ?= bison
 STRIP   = strip
 AWK     = awk
 
@@ -235,7 +240,7 @@ endif
 ifeq ($(FEATURES_DUMP),)
 FEATURE_DUMP_EXPORT := $(realpath $(OUTPUT)FEATURE-DUMP)
 else
-FEATURE_DUMP_EXPORT := $(FEATURES_DUMP)
+FEATURE_DUMP_EXPORT := $(realpath $(FEATURES_DUMP))
 endif
 
 export prefix bindir sharedir sysconfdir DESTDIR
@@ -274,7 +279,13 @@ LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
 export LIBTRACEEVENT
 
 LIBTRACEEVENT_DYNAMIC_LIST = $(TE_PATH)libtraceevent-dynamic-list
-LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS = -Xlinker --dynamic-list=$(LIBTRACEEVENT_DYNAMIC_LIST)
+
+#
+# The static build has no dynsym table, so this does not work for
+# static build. Looks like linker starts to scream about that now
+# (in Fedora 26) so we need to switch it off for static build.
+DYNAMIC_LIST_LDFLAGS               = -Xlinker --dynamic-list=$(LIBTRACEEVENT_DYNAMIC_LIST)
+LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS = $(if $(findstring -static,$(LDFLAGS)),,$(DYNAMIC_LIST_LDFLAGS))
 
 LIBAPI = $(API_PATH)libapi.a
 export LIBAPI
@@ -352,7 +363,8 @@ ifdef ASCIIDOC8
   export ASCIIDOC8
 endif
 
-LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive -Wl,--start-group $(EXTLIBS) -Wl,--end-group
+EXTLIBS := $(call filter-out,$(EXCLUDE_EXTLIBS),$(EXTLIBS))
+LIBS = -Wl,--whole-archive $(PERFLIBS) $(EXTRA_PERFLIBS) -Wl,--no-whole-archive -Wl,--start-group $(EXTLIBS) -Wl,--end-group
 
 ifeq ($(USE_CLANG), 1)
   CLANGLIBS_LIST = AST Basic CodeGen Driver Frontend Lex Tooling Edit Sema Analysis Parse Serialization
@@ -375,6 +387,60 @@ export INSTALL SHELL_PATH
 
 SHELL = $(SHELL_PATH)
 
+beauty_outdir := $(OUTPUT)trace/beauty/generated
+beauty_ioctl_outdir := $(beauty_outdir)/ioctl
+drm_ioctl_array := $(beauty_ioctl_outdir)/drm_ioctl_array.c
+drm_hdr_dir := $(srctree)/tools/include/uapi/drm
+drm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/drm_ioctl.sh
+
+# Create output directory if not already present
+_dummy := $(shell [ -d '$(beauty_ioctl_outdir)' ] || mkdir -p '$(beauty_ioctl_outdir)')
+
+$(drm_ioctl_array): $(drm_hdr_dir)/drm.h $(drm_hdr_dir)/i915_drm.h $(drm_ioctl_tbl)
+	$(Q)$(SHELL) '$(drm_ioctl_tbl)' $(drm_hdr_dir) > $@
+
+pkey_alloc_access_rights_array := $(beauty_outdir)/pkey_alloc_access_rights_array.c
+asm_generic_hdr_dir := $(srctree)/tools/include/uapi/asm-generic/
+pkey_alloc_access_rights_tbl := $(srctree)/tools/perf/trace/beauty/pkey_alloc_access_rights.sh
+
+$(pkey_alloc_access_rights_array): $(asm_generic_hdr_dir)/mman-common.h $(pkey_alloc_access_rights_tbl)
+	$(Q)$(SHELL) '$(pkey_alloc_access_rights_tbl)' $(asm_generic_hdr_dir) > $@
+
+sndrv_ctl_ioctl_array := $(beauty_ioctl_outdir)/sndrv_ctl_ioctl_array.c
+sndrv_ctl_hdr_dir := $(srctree)/tools/include/uapi/sound
+sndrv_ctl_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh
+
+$(sndrv_ctl_ioctl_array): $(sndrv_ctl_hdr_dir)/asound.h $(sndrv_ctl_ioctl_tbl)
+	$(Q)$(SHELL) '$(sndrv_ctl_ioctl_tbl)' $(sndrv_ctl_hdr_dir) > $@
+
+sndrv_pcm_ioctl_array := $(beauty_ioctl_outdir)/sndrv_pcm_ioctl_array.c
+sndrv_pcm_hdr_dir := $(srctree)/tools/include/uapi/sound
+sndrv_pcm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh
+
+$(sndrv_pcm_ioctl_array): $(sndrv_pcm_hdr_dir)/asound.h $(sndrv_pcm_ioctl_tbl)
+	$(Q)$(SHELL) '$(sndrv_pcm_ioctl_tbl)' $(sndrv_pcm_hdr_dir) > $@
+
+kvm_ioctl_array := $(beauty_ioctl_outdir)/kvm_ioctl_array.c
+kvm_hdr_dir := $(srctree)/tools/include/uapi/linux
+kvm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/kvm_ioctl.sh
+
+$(kvm_ioctl_array): $(kvm_hdr_dir)/kvm.h $(kvm_ioctl_tbl)
+	$(Q)$(SHELL) '$(kvm_ioctl_tbl)' $(kvm_hdr_dir) > $@
+
+vhost_virtio_ioctl_array := $(beauty_ioctl_outdir)/vhost_virtio_ioctl_array.c
+vhost_virtio_hdr_dir := $(srctree)/tools/include/uapi/linux
+vhost_virtio_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/vhost_virtio_ioctl.sh
+
+$(vhost_virtio_ioctl_array): $(vhost_virtio_hdr_dir)/vhost.h $(vhost_virtio_ioctl_tbl)
+	$(Q)$(SHELL) '$(vhost_virtio_ioctl_tbl)' $(vhost_virtio_hdr_dir) > $@
+
+perf_ioctl_array := $(beauty_ioctl_outdir)/perf_ioctl_array.c
+perf_hdr_dir := $(srctree)/tools/include/uapi/linux
+perf_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/perf_ioctl.sh
+
+$(perf_ioctl_array): $(perf_hdr_dir)/perf_event.h $(perf_ioctl_tbl)
+	$(Q)$(SHELL) '$(perf_ioctl_tbl)' $(perf_hdr_dir) > $@
+
 all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS)
 
 $(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(LIBTRACEEVENT_DYNAMIC_LIST)
@@ -469,7 +535,13 @@ endif
 __build-dir = $(subst $(OUTPUT),,$(dir $@))
 build-dir   = $(if $(__build-dir),$(__build-dir),.)
 
-prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders
+prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders $(drm_ioctl_array) \
+	$(pkey_alloc_access_rights_array) \
+	$(sndrv_pcm_ioctl_array) \
+	$(sndrv_ctl_ioctl_array) \
+	$(kvm_ioctl_array) \
+	$(vhost_virtio_ioctl_array) \
+	$(perf_ioctl_array)
 
 $(OUTPUT)%.o: %.c prepare FORCE
 	$(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@
@@ -512,7 +584,7 @@ $(LIBJVMTI_IN): FORCE
 	$(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=jvmti obj=jvmti
 
 $(OUTPUT)$(LIBJVMTI): $(LIBJVMTI_IN)
-	$(QUIET_LINK)$(CC) -shared -Wl,-soname -Wl,$(LIBJVMTI) -o $@ $< -lelf -lrt
+	$(QUIET_LINK)$(CC) -shared -Wl,-soname -Wl,$(LIBJVMTI) -o $@ $<
 endif
 
 $(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h)
@@ -703,7 +775,11 @@ install-tests: all install-gtk
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \
 		$(INSTALL) tests/attr.py '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'; \
-		$(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'
+		$(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'; \
+		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell'; \
+		$(INSTALL) tests/shell/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell'; \
+		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \
+		$(INSTALL) tests/shell/lib/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'
 
 install-bin: install-tools install-tests install-traceevent-plugins
 
@@ -734,7 +810,14 @@ clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clea
 	$(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \
 		$(OUTPUT)util/intel-pt-decoder/inat-tables.c \
 		$(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \
-		$(OUTPUT)pmu-events/pmu-events.c
+		$(OUTPUT)pmu-events/pmu-events.c \
+		$(drm_ioctl_array) \
+		$(pkey_alloc_access_rights_array) \
+		$(sndrv_ctl_ioctl_array) \
+		$(sndrv_pcm_ioctl_array) \
+		$(kvm_ioctl_array) \
+		$(vhost_virtio_ioctl_array) \
+		$(perf_ioctl_array)
 	$(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean
 	$(python-clean)
 
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c
index bf9a2594572c..9c4e23d8c8ce 100644
--- a/tools/perf/arch/powerpc/util/sym-handling.c
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -126,7 +126,7 @@ void arch__post_process_probe_trace_events(struct perf_probe_event *pev,
 	struct rb_node *tmp;
 	int i = 0;
 
-	map = get_target_map(pev->target, pev->uprobes);
+	map = get_target_map(pev->target, pev->nsi, pev->uprobes);
 	if (!map || map__load(map) < 0)
 		return;
 
diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build
index 5bd7b9260cc0..bd518b623d7a 100644
--- a/tools/perf/arch/s390/util/Build
+++ b/tools/perf/arch/s390/util/Build
@@ -1,4 +1,5 @@
 libperf-y += header.o
+libperf-y += sym-handling.o
 libperf-y += kvm-stat.o
 
 libperf-$(CONFIG_DWARF) += dwarf-regs.o
diff --git a/tools/perf/arch/s390/util/sym-handling.c b/tools/perf/arch/s390/util/sym-handling.c
new file mode 100644
index 000000000000..e103f6e46afe
--- /dev/null
+++ b/tools/perf/arch/s390/util/sym-handling.c
@@ -0,0 +1,29 @@
+/*
+ * Architecture specific ELF symbol handling and relocation mapping.
+ *
+ * Copyright 2017 IBM Corp.
+ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ */
+
+#include "symbol.h"
+
+#ifdef HAVE_LIBELF_SUPPORT
+/* True for ET_REL and ET_DYN objects only (so never for ET_EXEC). */
+bool elf__needs_adjust_symbols(GElf_Ehdr ehdr)
+{
+	return ehdr.e_type == ET_REL ||
+	       ehdr.e_type == ET_DYN;
+}
+
+void arch__adjust_sym_map_offset(GElf_Sym *sym,
+				 GElf_Shdr *shdr __maybe_unused,
+				 struct map *map)
+{
+	/* Only MAP__FUNCTION symbols are shifted by the map start */
+	sym->st_value += map->type == MAP__FUNCTION ? map->start : 0;
+}
+#endif
diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile
index 6c9211b18ec0..9a628a24c5c9 100644
--- a/tools/perf/arch/x86/Makefile
+++ b/tools/perf/arch/x86/Makefile
@@ -20,7 +20,7 @@ _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
 $(header): $(sys)/syscall_64.tbl $(systbl)
 	@(test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \
         (diff -B arch/x86/entry/syscalls/syscall_64.tbl ../../arch/x86/entry/syscalls/syscall_64.tbl >/dev/null) \
-        || echo "Warning: x86_64's syscall_64.tbl differs from kernel" >&2 )) || true
+        || echo "Warning: Kernel ABI header at 'tools/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl'" >&2 )) || true
 	$(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@
 
 clean::
diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c
index c1625f256df3..d84b72063a30 100644
--- a/tools/perf/arch/x86/annotate/instructions.c
+++ b/tools/perf/arch/x86/annotate/instructions.c
@@ -76,3 +76,49 @@ static struct ins x86__instructions[] = {
 	{ .name = "xbeginq",	.ops = &jump_ops, },
 	{ .name = "retq",	.ops = &ret_ops,  },
 };
+
+/*
+ * Tell whether ins1 followed by ins2 is treated as a fused pair.
+ * Only family 6 with model >= 0x1e is considered, and never when
+ * ins2 contains "jmp".
+ */
+static bool x86__ins_is_fused(struct arch *arch, const char *ins1,
+			      const char *ins2)
+{
+	if (arch->family != 6 || arch->model < 0x1e || strstr(ins2, "jmp"))
+		return false;
+
+	/* "cmp" (unless "xchg" is present too) and "test": all models */
+	if ((strstr(ins1, "cmp") && !strstr(ins1, "xchg")) ||
+	    strstr(ins1, "test"))
+		return true;
+
+	/* Nehalem: nothing else qualifies */
+	if (arch->model == 0x1e)
+		return false;
+
+	/* Newer platform: a few more first instructions qualify */
+	return strstr(ins1, "add") ||
+	       strstr(ins1, "sub") ||
+	       strstr(ins1, "and") ||
+	       strstr(ins1, "inc") ||
+	       strstr(ins1, "dec");
+}
+
+/*
+ * Parse a cpuid string of the form "GenuineIntel,family,model,stepping"
+ * and store the family and model in arch.
+ *
+ * Returns 0 on success, -1 when the three numeric fields cannot be read.
+ */
+static int x86__cpuid_parse(struct arch *arch, char *cpuid)
+{
+	unsigned int family, model, stepping;
+
+	if (sscanf(cpuid, "%*[^,],%u,%u,%u", &family, &model, &stepping) != 3)
+		return -1;
+
+	arch->family = family;
+	arch->model = model;
+	return 0;
+}
diff --git a/tools/perf/arch/x86/include/arch-tests.h b/tools/perf/arch/x86/include/arch-tests.h
index b48de2f5813c..4e0b806a7a0f 100644
--- a/tools/perf/arch/x86/include/arch-tests.h
+++ b/tools/perf/arch/x86/include/arch-tests.h
@@ -1,11 +1,14 @@
 #ifndef ARCH_TESTS_H
 #define ARCH_TESTS_H
 
+#include <linux/compiler.h>
+struct test;
+
 /* Tests */
-int test__rdpmc(int subtest);
-int test__perf_time_to_tsc(int subtest);
-int test__insn_x86(int subtest);
-int test__intel_cqm_count_nmi_context(int subtest);
+int test__rdpmc(struct test *test __maybe_unused, int subtest);
+int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest);
+int test__insn_x86(struct test *test __maybe_unused, int subtest);
+int test__intel_cqm_count_nmi_context(struct test *test __maybe_unused, int subtest);
 
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
 struct thread;
diff --git a/tools/perf/arch/x86/tests/insn-x86.c b/tools/perf/arch/x86/tests/insn-x86.c
index 08d9b2bc185c..b3860586a0c2 100644
--- a/tools/perf/arch/x86/tests/insn-x86.c
+++ b/tools/perf/arch/x86/tests/insn-x86.c
@@ -171,7 +171,7 @@ static int test_data_set(struct test_data *dat_set, int x86_64)
  * verbose (-v) option to see all the instructions and whether or not they
  * decoded successfuly.
  */
-int test__insn_x86(int subtest __maybe_unused)
+int test__insn_x86(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int ret = 0;
 
diff --git a/tools/perf/arch/x86/tests/intel-cqm.c b/tools/perf/arch/x86/tests/intel-cqm.c
index f9713a71d77e..57f86b6e7d6f 100644
--- a/tools/perf/arch/x86/tests/intel-cqm.c
+++ b/tools/perf/arch/x86/tests/intel-cqm.c
@@ -36,7 +36,7 @@ static pid_t spawn(void)
  * the last read counter value to avoid triggering a WARN_ON_ONCE() in
  * smp_call_function_many() caused by sending IPIs from NMI context.
  */
-int test__intel_cqm_count_nmi_context(int subtest __maybe_unused)
+int test__intel_cqm_count_nmi_context(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct perf_evlist *evlist = NULL;
 	struct perf_evsel *evsel = NULL;
diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c
index e3ae9cff2b67..5dd7efb192ce 100644
--- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c
+++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c
@@ -37,7 +37,7 @@
  * %0 is returned, otherwise %-1 is returned.  If TSC conversion is not
  * supported then then the test passes but " (not supported)" is printed.
  */
-int test__perf_time_to_tsc(int subtest __maybe_unused)
+int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct record_opts opts = {
 		.mmap_pages	     = UINT_MAX,
diff --git a/tools/perf/arch/x86/tests/rdpmc.c b/tools/perf/arch/x86/tests/rdpmc.c
index 500cf96db979..17fec30a0b31 100644
--- a/tools/perf/arch/x86/tests/rdpmc.c
+++ b/tools/perf/arch/x86/tests/rdpmc.c
@@ -154,7 +154,7 @@ out_close:
 	return 0;
 }
 
-int test__rdpmc(int subtest __maybe_unused)
+int test__rdpmc(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int status = 0;
 	int wret = 0;
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
index 9535be57033f..db0ba8caf5a2 100644
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -701,6 +701,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
 				perf_evsel__set_sample_bit(switch_evsel, TID);
 				perf_evsel__set_sample_bit(switch_evsel, TIME);
 				perf_evsel__set_sample_bit(switch_evsel, CPU);
+				perf_evsel__reset_sample_bit(switch_evsel, BRANCH_STACK);
 
 				opts->record_switch_events = false;
 				ptr->have_sched_switch = 3;
@@ -752,6 +753,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
 		tracking_evsel->attr.freq = 0;
 		tracking_evsel->attr.sample_period = 1;
 
+		tracking_evsel->no_aux_samples = true;
 		if (need_immediate)
 			tracking_evsel->immediate = true;
 
@@ -761,6 +763,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
 			/* And the CPU for switch events */
 			perf_evsel__set_sample_bit(tracking_evsel, CPU);
 		}
+		perf_evsel__reset_sample_bit(tracking_evsel, BRANCH_STACK);
 	}
 
 	/*
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 7a5dc7e5c577..c38373195c4a 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -177,14 +177,11 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel,
 	 */
 	process_branch_stack(sample->branch_stack, al, sample);
 
-	sample->period = 1;
-	sample->weight = 1;
-
 	he = hists__add_entry(hists, al, NULL, NULL, NULL, sample, true);
 	if (he == NULL)
 		return -ENOMEM;
 
-	ret = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
+	ret = hist_entry__inc_addr_samples(he, sample, evsel->idx, al->addr);
 	hists__inc_nr_samples(hists, true);
 	return ret;
 }
@@ -397,6 +394,8 @@ int cmd_annotate(int argc, const char **argv)
 			.namespaces = perf_event__process_namespaces,
 			.attr	= perf_event__process_attr,
 			.build_id = perf_event__process_build_id,
+			.tracing_data   = perf_event__process_tracing_data,
+			.feature	= perf_event__process_feature,
 			.ordered_events = true,
 			.ordering_requires_timestamps = true,
 		},
@@ -404,7 +403,7 @@ int cmd_annotate(int argc, const char **argv)
 	struct perf_data_file file = {
 		.mode  = PERF_DATA_MODE_READ,
 	};
-	const struct option options[] = {
+	struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
 	OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
@@ -446,13 +445,20 @@ int cmd_annotate(int argc, const char **argv)
 		    "Show event group information together"),
 	OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
 		    "Show a column with the sum of periods"),
+	OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
+		    "Show a column with the number of samples"),
 	OPT_CALLBACK_DEFAULT(0, "stdio-color", NULL, "mode",
 			     "'always' (default), 'never' or 'auto' only applicable to --stdio mode",
 			     stdio__config_color, "always"),
 	OPT_END()
 	};
-	int ret = hists__init();
+	int ret;
+
+	set_option_flag(options, 0, "show-total-period", PARSE_OPT_EXCLUSIVE);
+	set_option_flag(options, 0, "show-nr-samples", PARSE_OPT_EXCLUSIVE);
+
 
+	ret = hists__init();
 	if (ret < 0)
 		return ret;
 
@@ -468,6 +474,11 @@ int cmd_annotate(int argc, const char **argv)
 		annotate.sym_hist_filter = argv[0];
 	}
 
+	if (symbol_conf.show_nr_samples && annotate.use_gtk) {
+		pr_err("--show-nr-samples is not available in --gtk mode at this time\n");
+		return ret;
+	}
+
 	if (quiet)
 		perf_quiet_option();
 
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index 9eba7f1add1f..e3eb6240ced0 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -14,6 +14,7 @@
 #include <unistd.h>
 #include "builtin.h"
 #include "perf.h"
+#include "namespaces.h"
 #include "util/cache.h"
 #include "util/debug.h"
 #include "util/header.h"
@@ -165,33 +166,41 @@ static int build_id_cache__add_kcore(const char *filename, bool force)
 	return 0;
 }
 
-static int build_id_cache__add_file(const char *filename)
+static int build_id_cache__add_file(const char *filename, struct nsinfo *nsi)
 {
 	char sbuild_id[SBUILD_ID_SIZE];
 	u8 build_id[BUILD_ID_SIZE];
 	int err;
+	struct nscookie nsc;
 
-	if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) {
+	nsinfo__mountns_enter(nsi, &nsc);
+	err = filename__read_build_id(filename, &build_id, sizeof(build_id));
+	nsinfo__mountns_exit(&nsc);
+	if (err < 0) {
 		pr_debug("Couldn't read a build-id in %s\n", filename);
 		return -1;
 	}
 
 	build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
-	err = build_id_cache__add_s(sbuild_id, filename,
+	err = build_id_cache__add_s(sbuild_id, filename, nsi,
 				    false, false);
 	pr_debug("Adding %s %s: %s\n", sbuild_id, filename,
 		 err ? "FAIL" : "Ok");
 	return err;
 }
 
-static int build_id_cache__remove_file(const char *filename)
+static int build_id_cache__remove_file(const char *filename, struct nsinfo *nsi)
 {
 	u8 build_id[BUILD_ID_SIZE];
 	char sbuild_id[SBUILD_ID_SIZE];
+	struct nscookie nsc;
 
 	int err;
 
-	if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) {
+	nsinfo__mountns_enter(nsi, &nsc);
+	err = filename__read_build_id(filename, &build_id, sizeof(build_id));
+	nsinfo__mountns_exit(&nsc);
+	if (err < 0) {
 		pr_debug("Couldn't read a build-id in %s\n", filename);
 		return -1;
 	}
@@ -204,13 +213,13 @@ static int build_id_cache__remove_file(const char *filename)
 	return err;
 }
 
-static int build_id_cache__purge_path(const char *pathname)
+static int build_id_cache__purge_path(const char *pathname, struct nsinfo *nsi)
 {
 	struct strlist *list;
 	struct str_node *pos;
 	int err;
 
-	err = build_id_cache__list_build_ids(pathname, &list);
+	err = build_id_cache__list_build_ids(pathname, nsi, &list);
 	if (err)
 		goto out;
 
@@ -234,7 +243,7 @@ static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused)
 	char filename[PATH_MAX];
 	u8 build_id[BUILD_ID_SIZE];
 
-	if (dso__build_id_filename(dso, filename, sizeof(filename)) &&
+	if (dso__build_id_filename(dso, filename, sizeof(filename), false) &&
 	    filename__read_build_id(filename, build_id,
 				    sizeof(build_id)) != sizeof(build_id)) {
 		if (errno == ENOENT)
@@ -256,24 +265,30 @@ static int build_id_cache__fprintf_missing(struct perf_session *session, FILE *f
 	return 0;
 }
 
-static int build_id_cache__update_file(const char *filename)
+static int build_id_cache__update_file(const char *filename, struct nsinfo *nsi)
 {
 	u8 build_id[BUILD_ID_SIZE];
 	char sbuild_id[SBUILD_ID_SIZE];
+	struct nscookie nsc;
 
-	int err = 0;
+	int err;
 
-	if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) {
+	nsinfo__mountns_enter(nsi, &nsc);
+	err = filename__read_build_id(filename, &build_id, sizeof(build_id));
+	nsinfo__mountns_exit(&nsc);
+	if (err < 0) {
 		pr_debug("Couldn't read a build-id in %s\n", filename);
 		return -1;
 	}
+	err = 0;
 
 	build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
 	if (build_id_cache__cached(sbuild_id))
 		err = build_id_cache__remove_s(sbuild_id);
 
 	if (!err)
-		err = build_id_cache__add_s(sbuild_id, filename, false, false);
+		err = build_id_cache__add_s(sbuild_id, filename, nsi, false,
+					    false);
 
 	pr_debug("Updating %s %s: %s\n", sbuild_id, filename,
 		 err ? "FAIL" : "Ok");
@@ -286,6 +301,7 @@ int cmd_buildid_cache(int argc, const char **argv)
 	struct strlist *list;
 	struct str_node *pos;
 	int ret = 0;
+	int ns_id = -1;
 	bool force = false;
 	char const *add_name_list_str = NULL,
 		   *remove_name_list_str = NULL,
@@ -299,6 +315,7 @@ int cmd_buildid_cache(int argc, const char **argv)
 		.mode  = PERF_DATA_MODE_READ,
 	};
 	struct perf_session *session = NULL;
+	struct nsinfo *nsi = NULL;
 
 	const struct option buildid_cache_options[] = {
 	OPT_STRING('a', "add", &add_name_list_str,
@@ -315,6 +332,7 @@ int cmd_buildid_cache(int argc, const char **argv)
 	OPT_STRING('u', "update", &update_name_list_str, "file list",
 		    "file(s) to update"),
 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
+	OPT_INTEGER(0, "target-ns", &ns_id, "target pid for namespace context"),
 	OPT_END()
 	};
 	const char * const buildid_cache_usage[] = {
@@ -330,6 +348,9 @@ int cmd_buildid_cache(int argc, const char **argv)
 		     !missing_filename && !update_name_list_str))
 		usage_with_options(buildid_cache_usage, buildid_cache_options);
 
+	if (ns_id > 0)
+		nsi = nsinfo__new(ns_id);
+
 	if (missing_filename) {
 		file.path = missing_filename;
 		file.force = force;
@@ -348,7 +369,7 @@ int cmd_buildid_cache(int argc, const char **argv)
 		list = strlist__new(add_name_list_str, NULL);
 		if (list) {
 			strlist__for_each_entry(pos, list)
-				if (build_id_cache__add_file(pos->s)) {
+				if (build_id_cache__add_file(pos->s, nsi)) {
 					if (errno == EEXIST) {
 						pr_debug("%s already in the cache\n",
 							 pos->s);
@@ -366,7 +387,7 @@ int cmd_buildid_cache(int argc, const char **argv)
 		list = strlist__new(remove_name_list_str, NULL);
 		if (list) {
 			strlist__for_each_entry(pos, list)
-				if (build_id_cache__remove_file(pos->s)) {
+				if (build_id_cache__remove_file(pos->s, nsi)) {
 					if (errno == ENOENT) {
 						pr_debug("%s wasn't in the cache\n",
 							 pos->s);
@@ -384,7 +405,7 @@ int cmd_buildid_cache(int argc, const char **argv)
 		list = strlist__new(purge_name_list_str, NULL);
 		if (list) {
 			strlist__for_each_entry(pos, list)
-				if (build_id_cache__purge_path(pos->s)) {
+				if (build_id_cache__purge_path(pos->s, nsi)) {
 					if (errno == ENOENT) {
 						pr_debug("%s wasn't in the cache\n",
 							 pos->s);
@@ -405,7 +426,7 @@ int cmd_buildid_cache(int argc, const char **argv)
 		list = strlist__new(update_name_list_str, NULL);
 		if (list) {
 			strlist__for_each_entry(pos, list)
-				if (build_id_cache__update_file(pos->s)) {
+				if (build_id_cache__update_file(pos->s, nsi)) {
 					if (errno == ENOENT) {
 						pr_debug("%s wasn't in the cache\n",
 							 pos->s);
@@ -424,6 +445,7 @@ int cmd_buildid_cache(int argc, const char **argv)
 
 out:
 	perf_session__delete(session);
+	nsinfo__zput(nsi);
 
 	return ret;
 }
diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index ece45582a48d..3ddcc6e2abeb 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -13,6 +13,7 @@
 #include "util/util.h"
 #include "util/debug.h"
 #include "util/config.h"
+#include <linux/string.h>
 
 static bool use_system_config, use_user_config;
 
@@ -79,7 +80,7 @@ static int show_spec_config(struct perf_config_set *set, const char *var)
 		return -1;
 
 	perf_config_items__for_each_entry(&set->sections, section) {
-		if (prefixcmp(var, section->name) != 0)
+		if (!strstarts(var, section->name))
 			continue;
 
 		perf_config_items__for_each_entry(&section->items, item) {
diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c
index 0adb5f82335a..46cd8490baf4 100644
--- a/tools/perf/builtin-data.c
+++ b/tools/perf/builtin-data.c
@@ -69,7 +69,7 @@ static int cmd_data_convert(int argc, const char **argv)
 	};
 
 #ifndef HAVE_LIBBABELTRACE_SUPPORT
-	pr_err("No conversion support compiled in.\n");
+	pr_err("No conversion support compiled in. perf should be compiled with environment variables LIBBABELTRACE=1 and LIBBABELTRACE_DIR=/path/to/libbabeltrace/\n");
 	return -1;
 #endif
 
diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index dd26c62c9893..25a42acabee1 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -381,7 +381,7 @@ static int perf_ftrace_config(const char *var, const char *value, void *cb)
 {
 	struct perf_ftrace *ftrace = cb;
 
-	if (prefixcmp(var, "ftrace."))
+	if (!strstarts(var, "ftrace."))
 		return 0;
 
 	if (strcmp(var, "ftrace.tracer"))
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 530a7f2fa0f3..dbe4e4153bcf 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -90,7 +90,7 @@ static int check_emacsclient_version(void)
 	 */
 	finish_command(&ec_process);
 
-	if (prefixcmp(buffer.buf, "emacsclient")) {
+	if (!strstarts(buffer.buf, "emacsclient")) {
 		fprintf(stderr, "Failed to parse emacsclient version.\n");
 		goto out;
 	}
@@ -283,7 +283,7 @@ static int perf_help_config(const char *var, const char *value, void *cb)
 		add_man_viewer(value);
 		return 0;
 	}
-	if (!prefixcmp(var, "man."))
+	if (strstarts(var, "man."))
 		return add_man_viewer_info(var, value);
 
 	return 0;
@@ -313,7 +313,7 @@ static const char *cmd_to_page(const char *perf_cmd)
 
 	if (!perf_cmd)
 		return "perf";
-	else if (!prefixcmp(perf_cmd, "perf"))
+	else if (strstarts(perf_cmd, "perf"))
 		return perf_cmd;
 
 	return asprintf(&s, "perf-%s", perf_cmd) < 0 ? NULL : s;
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index ea8db38eedd1..2b8032908fb2 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -770,6 +770,7 @@ int cmd_inject(int argc, const char **argv)
 			.finished_round	= perf_event__repipe_oe_synth,
 			.build_id	= perf_event__repipe_op2_synth,
 			.id_index	= perf_event__repipe_op2_synth,
+			.feature	= perf_event__repipe_op2_synth,
 		},
 		.input_name  = "-",
 		.samples = LIST_HEAD_INIT(inject.samples),
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index cf9f9e9c2fc0..c0065923a525 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -58,6 +58,7 @@ static struct {
 	struct line_range line_range;
 	char *target;
 	struct strfilter *filter;
+	struct nsinfo *nsi;
 } params;
 
 /* Parse an event definition. Note that any error must die. */
@@ -80,6 +81,9 @@ static int parse_probe_event(const char *str)
 		params.target_used = true;
 	}
 
+	if (params.nsi)
+		pev->nsi = nsinfo__get(params.nsi);
+
 	/* Parse a perf-probe command into event */
 	ret = parse_perf_probe_command(str, pev);
 	pr_debug("%d arguments\n", pev->nargs);
@@ -189,7 +193,7 @@ static int opt_set_target(const struct option *opt, const char *str,
 
 		/* Expand given path to absolute path, except for modulename */
 		if (params.uprobes || strchr(str, '/')) {
-			tmp = realpath(str, NULL);
+			tmp = nsinfo__realpath(str, params.nsi);
 			if (!tmp) {
 				pr_warning("Failed to get the absolute path of %s: %m\n", str);
 				return ret;
@@ -208,6 +212,34 @@ static int opt_set_target(const struct option *opt, const char *str,
 	return ret;
 }
 
+static int opt_set_target_ns(const struct option *opt __maybe_unused,
+			     const char *str, int unset __maybe_unused)
+{
+	int ret = -ENOENT;
+	pid_t ns_pid;
+	struct nsinfo *nsip;
+
+	if (str) {
+		errno = 0;
+		ns_pid = (pid_t)strtol(str, NULL, 10);
+		if (errno != 0) {
+			ret = -errno;
+			pr_warning("Failed to parse %s as a pid: %s\n", str,
+				   strerror(errno));
+			return ret;
+		}
+		nsip = nsinfo__new(ns_pid);
+		if (nsip && nsip->need_setns)
+			params.nsi = nsinfo__get(nsip);
+		nsinfo__put(nsip);
+
+		ret = 0;
+	}
+
+	return ret;
+}
+
+
 /* Command option callbacks */
 
 #ifdef HAVE_DWARF_SUPPORT
@@ -299,6 +331,7 @@ static void cleanup_params(void)
 	line_range__clear(&params.line_range);
 	free(params.target);
 	strfilter__delete(params.filter);
+	nsinfo__put(params.nsi);
 	memset(&params, 0, sizeof(params));
 }
 
@@ -383,7 +416,7 @@ static int del_perf_probe_caches(struct strfilter *filter)
 	}
 
 	strlist__for_each_entry(nd, bidlist) {
-		cache = probe_cache__new(nd->s);
+		cache = probe_cache__new(nd->s, NULL);
 		if (!cache)
 			continue;
 		if (probe_cache__filter_purge(cache, filter) < 0 ||
@@ -554,6 +587,8 @@ __cmd_probe(int argc, const char **argv)
 	OPT_BOOLEAN(0, "cache", &probe_conf.cache, "Manipulate probe cache"),
 	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
 		   "Look for files with symbols relative to this directory"),
+	OPT_CALLBACK(0, "target-ns", NULL, "pid",
+		     "target pid for namespace contexts", opt_set_target_ns),
 	OPT_END()
 	};
 	int ret;
@@ -634,15 +669,15 @@ __cmd_probe(int argc, const char **argv)
 			pr_err_with_code("  Error: Failed to show event list.", ret);
 		return ret;
 	case 'F':
-		ret = show_available_funcs(params.target, params.filter,
-					params.uprobes);
+		ret = show_available_funcs(params.target, params.nsi,
+					   params.filter, params.uprobes);
 		if (ret < 0)
 			pr_err_with_code("  Error: Failed to show functions.", ret);
 		return ret;
 #ifdef HAVE_DWARF_SUPPORT
 	case 'L':
 		ret = show_line_range(&params.line_range, params.target,
-				      params.uprobes);
+				      params.nsi, params.uprobes);
 		if (ret < 0)
 			pr_err_with_code("  Error: Failed to show lines.", ret);
 		return ret;
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 17a14bcce34a..36d7117a7562 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -799,6 +799,13 @@ static int record__synthesize(struct record *rec, bool tail)
 		return 0;
 
 	if (file->is_pipe) {
+		err = perf_event__synthesize_features(
+			tool, session, rec->evlist, process_synthesized_event);
+		if (err < 0) {
+			pr_err("Couldn't synthesize features.\n");
+			return err;
+		}
+
 		err = perf_event__synthesize_attrs(tool, session,
 						   process_synthesized_event);
 		if (err < 0) {
@@ -1821,7 +1828,7 @@ int cmd_record(int argc, const char **argv)
 		record.opts.tail_synthesize = true;
 
 	if (rec->evlist->nr_entries == 0 &&
-	    perf_evlist__add_default(rec->evlist) < 0) {
+	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
 		pr_err("Not enough memory for event selector list\n");
 		goto out;
 	}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 79a33eb1a10d..f9dff652dcbd 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -38,6 +38,7 @@
 #include "util/time-utils.h"
 #include "util/auxtrace.h"
 #include "util/units.h"
+#include "util/branch.h"
 
 #include <dlfcn.h>
 #include <errno.h>
@@ -73,6 +74,7 @@ struct report {
 	u64			queue_size;
 	int			socket_filter;
 	DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
+	struct branch_type_stat	brtype_stat;
 };
 
 static int report__config(const char *var, const char *value, void *cb)
@@ -113,43 +115,60 @@ static int hist_iter__report_callback(struct hist_entry_iter *iter,
 	struct report *rep = arg;
 	struct hist_entry *he = iter->he;
 	struct perf_evsel *evsel = iter->evsel;
+	struct perf_sample *sample = iter->sample;
 	struct mem_info *mi;
 	struct branch_info *bi;
 
 	if (!ui__has_annotation())
 		return 0;
 
-	hist__account_cycles(iter->sample->branch_stack, al, iter->sample,
+	hist__account_cycles(sample->branch_stack, al, sample,
 			     rep->nonany_branch_mode);
 
 	if (sort__mode == SORT_MODE__BRANCH) {
 		bi = he->branch_info;
-		err = addr_map_symbol__inc_samples(&bi->from, evsel->idx);
+		err = addr_map_symbol__inc_samples(&bi->from, sample, evsel->idx);
 		if (err)
 			goto out;
 
-		err = addr_map_symbol__inc_samples(&bi->to, evsel->idx);
+		err = addr_map_symbol__inc_samples(&bi->to, sample, evsel->idx);
 
 	} else if (rep->mem_mode) {
 		mi = he->mem_info;
-		err = addr_map_symbol__inc_samples(&mi->daddr, evsel->idx);
+		err = addr_map_symbol__inc_samples(&mi->daddr, sample, evsel->idx);
 		if (err)
 			goto out;
 
-		err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
+		err = hist_entry__inc_addr_samples(he, sample, evsel->idx, al->addr);
 
 	} else if (symbol_conf.cumulate_callchain) {
 		if (single)
-			err = hist_entry__inc_addr_samples(he, evsel->idx,
+			err = hist_entry__inc_addr_samples(he, sample, evsel->idx,
 							   al->addr);
 	} else {
-		err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
+		err = hist_entry__inc_addr_samples(he, sample, evsel->idx, al->addr);
 	}
 
 out:
 	return err;
 }
 
+static int hist_iter__branch_callback(struct hist_entry_iter *iter,
+				      struct addr_location *al __maybe_unused,
+				      bool single __maybe_unused,
+				      void *arg)
+{
+	struct hist_entry *he = iter->he;
+	struct report *rep = arg;
+	struct branch_info *bi;
+
+	bi = he->branch_info;
+	branch_type_count(&rep->brtype_stat, &bi->flags,
+			  bi->from.addr, bi->to.addr);
+
+	return 0;
+}
+
 static int process_sample_event(struct perf_tool *tool,
 				union perf_event *event,
 				struct perf_sample *sample,
@@ -188,6 +207,8 @@ static int process_sample_event(struct perf_tool *tool,
 		 */
 		if (!sample->branch_stack)
 			goto out_put;
+
+		iter.add_entry_cb = hist_iter__branch_callback;
 		iter.ops = &hist_iter_branch;
 	} else if (rep->mem_mode) {
 		iter.ops = &hist_iter_mem;
@@ -220,7 +241,7 @@ static int process_read_event(struct perf_tool *tool,
 		const char *name = evsel ? perf_evsel__name(evsel) : "unknown";
 		int err = perf_read_values_add_value(&rep->show_threads_values,
 					   event->read.pid, event->read.tid,
-					   event->read.id,
+					   evsel->idx,
 					   name,
 					   event->read.value);
 
@@ -228,10 +249,6 @@ static int process_read_event(struct perf_tool *tool,
 			return err;
 	}
 
-	dump_printf(": %d %d %s %" PRIu64 "\n", event->read.pid, event->read.tid,
-		    evsel ? perf_evsel__name(evsel) : "FAIL",
-		    event->read.value);
-
 	return 0;
 }
 
@@ -258,10 +275,11 @@ static int report__setup_sample_type(struct report *rep)
 				    "'perf record' without -g?\n");
 			return -EINVAL;
 		}
-		if (symbol_conf.use_callchain) {
-			ui__error("Selected -g or --branch-history but no "
-				  "callchain data. Did\n"
-				  "you call 'perf record' without -g?\n");
+		if (symbol_conf.use_callchain &&
+			!symbol_conf.show_branchflag_count) {
+			ui__error("Selected -g or --branch-history.\n"
+				  "But no callchain or branch data.\n"
+				  "Did you call 'perf record' without -g or -b?\n");
 			return -1;
 		}
 	} else if (!callchain_param.enabled &&
@@ -396,7 +414,8 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
 
 		hists__fprintf_nr_sample_events(hists, rep, evname, stdout);
 		hists__fprintf(hists, !quiet, 0, 0, rep->min_percent, stdout,
-			       symbol_conf.use_callchain);
+			       symbol_conf.use_callchain ||
+			       symbol_conf.show_branchflag_count);
 		fprintf(stdout, "\n\n");
 	}
 
@@ -410,6 +429,9 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
 		perf_read_values_destroy(&rep->show_threads_values);
 	}
 
+	if (sort__mode == SORT_MODE__BRANCH)
+		branch_type_stat_display(stdout, &rep->brtype_stat);
+
 	return 0;
 }
 
@@ -718,6 +740,7 @@ int cmd_report(int argc, const char **argv)
 			.id_index	 = perf_event__process_id_index,
 			.auxtrace_info	 = perf_event__process_auxtrace_info,
 			.auxtrace	 = perf_event__process_auxtrace,
+			.feature	 = perf_event__process_feature,
 			.ordered_events	 = true,
 			.ordering_requires_timestamps = true,
 		},
@@ -943,6 +966,8 @@ repeat:
 	if (has_br_stack && branch_call_mode)
 		symbol_conf.show_branchflag_count = true;
 
+	memset(&report.brtype_stat, 0, sizeof(struct branch_type_stat));
+
 	/*
 	 * Branch mode is a tristate:
 	 * -1 means default, so decide based on the file having branch data.
@@ -988,6 +1013,10 @@ repeat:
 	/* Force tty output for header output and per-thread stat. */
 	if (report.header || report.header_only || report.show_threads)
 		use_browser = 0;
+	if (report.header || report.header_only)
+		report.tool.show_feat_hdr = SHOW_FEAT_HEADER;
+	if (report.show_full_info)
+		report.tool.show_feat_hdr = SHOW_FEAT_HEADER_FULL_INFO;
 
 	if (strcmp(input_name, "-") != 0)
 		setup_browser(true);
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 83cdc0a61fd6..378f76cdf923 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2199,16 +2199,11 @@ static struct script_desc *script_desc__findnew(const char *name)
 
 	s = script_desc__new(name);
 	if (!s)
-		goto out_delete_desc;
+		return NULL;
 
 	script_desc__add(s);
 
 	return s;
-
-out_delete_desc:
-	script_desc__delete(s);
-
-	return NULL;
 }
 
 static const char *ends_with(const char *str, const char *suffix)
@@ -2682,6 +2677,7 @@ int cmd_script(int argc, const char **argv)
 			.attr		 = process_attr,
 			.event_update   = perf_event__process_event_update,
 			.tracing_data	 = perf_event__process_tracing_data,
+			.feature	 = perf_event__process_feature,
 			.build_id	 = perf_event__process_build_id,
 			.id_index	 = perf_event__process_id_index,
 			.auxtrace_info	 = perf_event__process_auxtrace_info,
@@ -2972,10 +2968,13 @@ int cmd_script(int argc, const char **argv)
 		return -1;
 
 	if (header || header_only) {
+		script.tool.show_feat_hdr = SHOW_FEAT_HEADER;
 		perf_session__fprintf_info(session, stdout, show_full_info);
 		if (header_only)
 			goto out_delete;
 	}
+	if (show_full_info)
+		script.tool.show_feat_hdr = SHOW_FEAT_HEADER_FULL_INFO;
 
 	if (symbol__init(&session->header.env) < 0)
 		goto out_delete;
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 48ac53b199fc..866da7aa54bf 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -213,10 +213,20 @@ static void perf_stat__reset_stats(void)
 static int create_perf_stat_counter(struct perf_evsel *evsel)
 {
 	struct perf_event_attr *attr = &evsel->attr;
+	struct perf_evsel *leader = evsel->leader;
 
-	if (stat_config.scale)
+	if (stat_config.scale) {
 		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 				    PERF_FORMAT_TOTAL_TIME_RUNNING;
+	}
+
+	/*
+	 * The event is part of a non-trivial group, let's enable
+	 * the group read (for leader) and ID retrieval for all
+	 * members.
+	 */
+	if (leader->nr_members > 1)
+		attr->read_format |= PERF_FORMAT_ID|PERF_FORMAT_GROUP;
 
 	attr->inherit = !no_inherit;
 
@@ -333,13 +343,21 @@ static int read_counter(struct perf_evsel *counter)
 			struct perf_counts_values *count;
 
 			count = perf_counts(counter->counts, cpu, thread);
-			if (perf_evsel__read(counter, cpu, thread, count)) {
+
+			/*
+			 * The leader's group read loads data into its group members
+			 * (via perf_evsel__read_counter) and sets their count->loaded.
+			 */
+			if (!count->loaded &&
+			    perf_evsel__read_counter(counter, cpu, thread)) {
 				counter->counts->scaled = -1;
 				perf_counts(counter->counts, cpu, thread)->ena = 0;
 				perf_counts(counter->counts, cpu, thread)->run = 0;
 				return -1;
 			}
 
+			count->loaded = false;
+
 			if (STAT_RECORD) {
 				if (perf_evsel__write_stat_event(counter, cpu, thread, count)) {
 					pr_err("failed to write stat event\n");
@@ -559,6 +577,11 @@ static int store_counter_ids(struct perf_evsel *counter)
 	return __store_counter_ids(counter, cpus, threads);
 }
 
+static bool perf_evsel__should_store_id(struct perf_evsel *counter)
+{
+	return STAT_RECORD || counter->attr.read_format & PERF_FORMAT_ID;
+}
+
 static int __run_perf_stat(int argc, const char **argv)
 {
 	int interval = stat_config.interval;
@@ -631,7 +654,8 @@ try_again:
 		if (l > unit_width)
 			unit_width = l;
 
-		if (STAT_RECORD && store_counter_ids(counter))
+		if (perf_evsel__should_store_id(counter) &&
+		    store_counter_ids(counter))
 			return -1;
 	}
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 6052376634c0..ee954bde7e3e 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -134,7 +134,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he)
 		return err;
 	}
 
-	err = symbol__disassemble(sym, map, NULL, 0, NULL);
+	err = symbol__disassemble(sym, map, NULL, 0, NULL, NULL);
 	if (err == 0) {
 out_assign:
 		top->sym_filter_entry = he;
@@ -183,6 +183,7 @@ static void ui__warn_map_erange(struct map *map, struct symbol *sym, u64 ip)
 
 static void perf_top__record_precise_ip(struct perf_top *top,
 					struct hist_entry *he,
+					struct perf_sample *sample,
 					int counter, u64 ip)
 {
 	struct annotation *notes;
@@ -199,7 +200,7 @@ static void perf_top__record_precise_ip(struct perf_top *top,
 	if (pthread_mutex_trylock(&notes->lock))
 		return;
 
-	err = hist_entry__inc_addr_samples(he, counter, ip);
+	err = hist_entry__inc_addr_samples(he, sample, counter, ip);
 
 	pthread_mutex_unlock(&notes->lock);
 
@@ -586,6 +587,13 @@ static void *display_thread_tui(void *arg)
 		.refresh	= top->delay_secs,
 	};
 
+	/* In order to read symbols from other namespaces perf needs to call
+	 * setns(2).  This isn't permitted if the fs_struct has multiple users.
+	 * unshare(2) the fs so that we may continue to setns into namespaces
+	 * that we're observing.
+	 */
+	unshare(CLONE_FS);
+
 	perf_top__sort_new_samples(top);
 
 	/*
@@ -627,6 +635,13 @@ static void *display_thread(void *arg)
 	struct perf_top *top = arg;
 	int delay_msecs, c;
 
+	/* In order to read symbols from other namespaces perf needs to call
+	 * setns(2).  This isn't permitted if the fs_struct has multiple users.
+	 * unshare(2) the fs so that we may continue to setns into namespaces
+	 * that we're observing.
+	 */
+	unshare(CLONE_FS);
+
 	display_setup_sig();
 	pthread__unblock_sigwinch();
 repeat:
@@ -671,7 +686,7 @@ static int hist_iter__top_callback(struct hist_entry_iter *iter,
 	struct perf_evsel *evsel = iter->evsel;
 
 	if (perf_hpp_list.sym && single)
-		perf_top__record_precise_ip(top, he, evsel->idx, al->addr);
+		perf_top__record_precise_ip(top, he, iter->sample, evsel->idx, al->addr);
 
 	hist__account_cycles(iter->sample->branch_stack, al, iter->sample,
 		     !(top->record_opts.branch_stack & PERF_SAMPLE_BRANCH_ANY));
@@ -1205,6 +1220,7 @@ int cmd_top(int argc, const char **argv)
 		    "Show raw trace event output (do not use print fmt or plugins)"),
 	OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
 		    "Show entries in a hierarchy"),
+	OPT_BOOLEAN(0, "force", &symbol_conf.force, "don't complain, do it"),
 	OPT_END()
 	};
 	const char * const top_usage[] = {
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 4b2a5d298197..d59cdadf3a79 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -64,6 +64,10 @@
 # define O_CLOEXEC		02000000
 #endif
 
+#ifndef F_LINUX_SPECIFIC_BASE
+# define F_LINUX_SPECIFIC_BASE	1024
+#endif
+
 struct trace {
 	struct perf_tool	tool;
 	struct syscalltbl	*sctbl;
@@ -279,34 +283,21 @@ out_delete:
 	({ struct syscall_tp *fields = evsel->priv; \
 	   fields->name.pointer(&fields->name, sample); })
 
-struct strarray {
-	int	    offset;
-	int	    nr_entries;
-	const char **entries;
-};
+size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
+{
+	int idx = val - sa->offset;
 
-#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
-	.nr_entries = ARRAY_SIZE(array), \
-	.entries = array, \
-}
+	if (idx < 0 || idx >= sa->nr_entries)
+		return scnprintf(bf, size, intfmt, val);
 
-#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
-	.offset	    = off, \
-	.nr_entries = ARRAY_SIZE(array), \
-	.entries = array, \
+	return scnprintf(bf, size, "%s", sa->entries[idx]);
 }
 
 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
 						const char *intfmt,
 					        struct syscall_arg *arg)
 {
-	struct strarray *sa = arg->parm;
-	int idx = arg->val - sa->offset;
-
-	if (idx < 0 || idx >= sa->nr_entries)
-		return scnprintf(bf, size, intfmt, arg->val);
-
-	return scnprintf(bf, size, "%s", sa->entries[idx]);
+	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
 }
 
 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
@@ -317,24 +308,35 @@ static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
 
 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
 
-#if defined(__i386__) || defined(__x86_64__)
-/*
- * FIXME: Make this available to all arches as soon as the ioctl beautifier
- * 	  gets rewritten to support all arches.
- */
-static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
-						 struct syscall_arg *arg)
-{
-	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
+struct strarrays {
+	int		nr_entries;
+	struct strarray **entries;
+};
+
+#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
+	.nr_entries = ARRAY_SIZE(array), \
+	.entries = array, \
 }
 
-#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
-#endif /* defined(__i386__) || defined(__x86_64__) */
+size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
+					struct syscall_arg *arg)
+{
+	struct strarrays *sas = arg->parm;
+	int i;
 
-static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
-					struct syscall_arg *arg);
+	for (i = 0; i < sas->nr_entries; ++i) {
+		struct strarray *sa = sas->entries[i];
+		int idx = arg->val - sa->offset;
 
-#define SCA_FD syscall_arg__scnprintf_fd
+		if (idx >= 0 && idx < sa->nr_entries) {
+			if (sa->entries[idx] == NULL)
+				break;
+			return scnprintf(bf, size, "%s", sa->entries[idx]);
+		}
+	}
+
+	return scnprintf(bf, size, "%d", arg->val);
+}
 
 #ifndef AT_FDCWD
 #define AT_FDCWD	-100
@@ -358,21 +360,20 @@ static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
 
 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
 
-static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
-					 struct syscall_arg *arg)
+size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
 {
 	return scnprintf(bf, size, "%#lx", arg->val);
 }
 
-#define SCA_HEX syscall_arg__scnprintf_hex
-
-static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
-					 struct syscall_arg *arg)
+size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
 {
 	return scnprintf(bf, size, "%d", arg->val);
 }
 
-#define SCA_INT syscall_arg__scnprintf_int
+size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
+{
+	return scnprintf(bf, size, "%ld", arg->val);
+}
 
 static const char *bpf_cmd[] = {
 	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
@@ -407,12 +408,27 @@ static DEFINE_STRARRAY(whences);
 
 static const char *fcntl_cmds[] = {
 	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
-	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
-	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
-	"F_GETOWNER_UIDS",
+	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
+	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
+	"GETOWNER_UIDS",
 };
 static DEFINE_STRARRAY(fcntl_cmds);
 
+static const char *fcntl_linux_specific_cmds[] = {
+	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
+	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
+	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
+};
+
+static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
+
+static struct strarray *fcntl_cmds_arrays[] = {
+	&strarray__fcntl_cmds,
+	&strarray__fcntl_linux_specific_cmds,
+};
+
+static DEFINE_STRARRAYS(fcntl_cmds_arrays);
+
 static const char *rlimit_resources[] = {
 	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
 	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
@@ -495,33 +511,6 @@ static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
 
 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
 
-#if defined(__i386__) || defined(__x86_64__)
-/*
- * FIXME: Make this available to all arches.
- */
-#define TCGETS		0x5401
-
-static const char *tioctls[] = {
-	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
-	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
-	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
-	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
-	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
-	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
-	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
-	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
-	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
-	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
-	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
-	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
-	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
-	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
-	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
-};
-
-static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
-#endif /* defined(__i386__) || defined(__x86_64__) */
-
 #ifndef GRND_NONBLOCK
 #define GRND_NONBLOCK	0x0001
 #endif
@@ -552,9 +541,9 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 
 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
 
-#define STRARRAY(arg, name, array) \
-	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
-	  .arg_parm	 = { [arg] = &strarray__##array, }
+#define STRARRAY(name, array) \
+	  { .scnprintf	= SCA_STRARRAY, \
+	    .parm	= &strarray__##array, }
 
 #include "trace/beauty/eventfd.c"
 #include "trace/beauty/flock.c"
@@ -571,242 +560,219 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
 #include "trace/beauty/socket_type.c"
 #include "trace/beauty/waitid_options.c"
 
+struct syscall_arg_fmt {
+	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
+	void	   *parm;
+	const char *name;
+	bool	   show_zero;
+};
+
 static struct syscall_fmt {
 	const char *name;
 	const char *alias;
-	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
-	void	   *arg_parm[6];
-	bool	   errmsg;
+	struct syscall_arg_fmt arg[6];
+	u8	   nr_args;
 	bool	   errpid;
 	bool	   timeout;
 	bool	   hexret;
 } syscall_fmts[] = {
-	{ .name	    = "access",	    .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
-	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
-	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
+	{ .name	    = "access",
+	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
+	{ .name	    = "arch_prctl", .alias = "prctl", },
+	{ .name	    = "bpf",
+	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
 	{ .name	    = "brk",	    .hexret = true,
-	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
-	{ .name	    = "chdir",	    .errmsg = true, },
-	{ .name	    = "chmod",	    .errmsg = true, },
-	{ .name	    = "chroot",	    .errmsg = true, },
-	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
-	{ .name	    = "clone",	    .errpid = true, },
-	{ .name	    = "close",	    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
-	{ .name	    = "connect",    .errmsg = true, },
-	{ .name	    = "creat",	    .errmsg = true, },
-	{ .name	    = "dup",	    .errmsg = true, },
-	{ .name	    = "dup2",	    .errmsg = true, },
-	{ .name	    = "dup3",	    .errmsg = true, },
-	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
-	{ .name	    = "eventfd2",   .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
-	{ .name	    = "faccessat",  .errmsg = true, },
-	{ .name	    = "fadvise64",  .errmsg = true, },
-	{ .name	    = "fallocate",  .errmsg = true, },
-	{ .name	    = "fchdir",	    .errmsg = true, },
-	{ .name	    = "fchmod",	    .errmsg = true, },
-	{ .name	    = "fchmodat",   .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
-	{ .name	    = "fchown",	    .errmsg = true, },
-	{ .name	    = "fchownat",   .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
-	{ .name	    = "fcntl",	    .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
-	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
-	{ .name	    = "fdatasync",  .errmsg = true, },
-	{ .name	    = "flock",	    .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
-	{ .name	    = "fsetxattr",  .errmsg = true, },
-	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
-	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
-	{ .name	    = "fstatfs",    .errmsg = true, },
-	{ .name	    = "fsync",    .errmsg = true, },
-	{ .name	    = "ftruncate", .errmsg = true, },
-	{ .name	    = "futex",	    .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
-	{ .name	    = "futimesat", .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
-	{ .name	    = "getdents",   .errmsg = true, },
-	{ .name	    = "getdents64", .errmsg = true, },
-	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
+	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
+	{ .name     = "clock_gettime",
+	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
+	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
+	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
+		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
+		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
+		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
+		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
+	{ .name	    = "close",
+	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
+	{ .name	    = "epoll_ctl",
+	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
+	{ .name	    = "eventfd2",
+	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
+	{ .name	    = "fchmodat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
+	{ .name	    = "fchownat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
+	{ .name	    = "fcntl",
+	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
+			   .parm      = &strarrays__fcntl_cmds_arrays,
+			   .show_zero = true, },
+		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
+	{ .name	    = "flock",
+	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
+	{ .name	    = "fstat", .alias = "newfstat", },
+	{ .name	    = "fstatat", .alias = "newfstatat", },
+	{ .name	    = "futex",
+	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
+	{ .name	    = "futimesat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
+	{ .name	    = "getitimer",
+	  .arg = { [0] = STRARRAY(which, itimers), }, },
 	{ .name	    = "getpid",	    .errpid = true, },
 	{ .name	    = "getpgid",    .errpid = true, },
 	{ .name	    = "getppid",    .errpid = true, },
-	{ .name	    = "getrandom",  .errmsg = true,
-	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
-	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
-	{ .name	    = "getxattr",   .errmsg = true, },
-	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
-	{ .name	    = "ioctl",	    .errmsg = true,
-	  .arg_scnprintf = {
+	{ .name	    = "getrandom",
+	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
+	{ .name	    = "getrlimit",
+	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
+	{ .name	    = "ioctl",
+	  .arg = {
 #if defined(__i386__) || defined(__x86_64__)
 /*
  * FIXME: Make this available to all arches.
  */
-			     [1] = SCA_STRHEXARRAY, /* cmd */
-			     [2] = SCA_HEX, /* arg */ },
-	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
+		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
+		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
 #else
-			     [2] = SCA_HEX, /* arg */ }, },
+		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
 #endif
-	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
-	{ .name	    = "kill",	    .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
-	{ .name	    = "lchown",    .errmsg = true, },
-	{ .name	    = "lgetxattr",  .errmsg = true, },
-	{ .name	    = "linkat",	    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
-	{ .name	    = "listxattr",  .errmsg = true, },
-	{ .name	    = "llistxattr", .errmsg = true, },
-	{ .name	    = "lremovexattr",  .errmsg = true, },
-	{ .name	    = "lseek",	    .errmsg = true,
-	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
-	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
-	{ .name	    = "lsetxattr",  .errmsg = true, },
-	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
-	{ .name	    = "lsxattr",    .errmsg = true, },
-	{ .name     = "madvise",    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
-			     [2] = SCA_MADV_BHV, /* behavior */ }, },
-	{ .name	    = "mkdir",    .errmsg = true, },
-	{ .name	    = "mkdirat",    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
-	{ .name	    = "mknod",      .errmsg = true, },
-	{ .name	    = "mknodat",    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
-	{ .name	    = "mlock",	    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
-	{ .name	    = "mlockall",   .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
+	{ .name	    = "keyctl",
+	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
+	{ .name	    = "kill",
+	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
+	{ .name	    = "linkat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
+	{ .name	    = "lseek",
+	  .arg = { [2] = STRARRAY(whence, whences), }, },
+	{ .name	    = "lstat", .alias = "newlstat", },
+	{ .name     = "madvise",
+	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
+		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
+	{ .name	    = "mkdirat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
+	{ .name	    = "mknodat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
+	{ .name	    = "mlock",
+	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
+	{ .name	    = "mlockall",
+	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
 	{ .name	    = "mmap",	    .hexret = true,
 /* The standard mmap maps to old_mmap on s390x */
 #if defined(__s390x__)
 	.alias = "old_mmap",
 #endif
-	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
-			     [2] = SCA_MMAP_PROT, /* prot */
-			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
-	{ .name	    = "mprotect",   .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
-			     [2] = SCA_MMAP_PROT, /* prot */ }, },
-	{ .name	    = "mq_unlink", .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
+	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
+		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
+		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
+	{ .name	    = "mprotect",
+	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
+		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
+	{ .name	    = "mq_unlink",
+	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
 	{ .name	    = "mremap",	    .hexret = true,
-	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
-			     [3] = SCA_MREMAP_FLAGS, /* flags */
-			     [4] = SCA_HEX, /* new_addr */ }, },
-	{ .name	    = "munlock",    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
-	{ .name	    = "munmap",	    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
-	{ .name	    = "name_to_handle_at", .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
-	{ .name	    = "newfstatat", .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
-	{ .name	    = "open",	    .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
-	{ .name	    = "open_by_handle_at", .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
-	{ .name	    = "openat",	    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
-			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
-	{ .name	    = "perf_event_open", .errmsg = true,
-	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
-			     [3] = SCA_FD,  /* group_fd */
-			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
-	{ .name	    = "pipe2",	    .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
-	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
-	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
-	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
-	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
-	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
-	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
-	{ .name	    = "pwritev",    .errmsg = true, },
-	{ .name	    = "read",	    .errmsg = true, },
-	{ .name	    = "readlink",   .errmsg = true, },
-	{ .name	    = "readlinkat", .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
-	{ .name	    = "readv",	    .errmsg = true, },
-	{ .name	    = "recvfrom",   .errmsg = true,
-	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
-	{ .name	    = "recvmmsg",   .errmsg = true,
-	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
-	{ .name	    = "recvmsg",    .errmsg = true,
-	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
-	{ .name	    = "removexattr", .errmsg = true, },
-	{ .name	    = "renameat",   .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
-	{ .name	    = "rmdir",    .errmsg = true, },
-	{ .name	    = "rt_sigaction", .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
-	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
-	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
-	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
-	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
-	{ .name	    = "sched_getattr",	      .errmsg = true, },
-	{ .name	    = "sched_setattr",	      .errmsg = true, },
-	{ .name	    = "sched_setscheduler",   .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
-	{ .name	    = "seccomp", .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
-			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
-	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
-	{ .name	    = "sendmmsg",    .errmsg = true,
-	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
-	{ .name	    = "sendmsg",    .errmsg = true,
-	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
-	{ .name	    = "sendto",	    .errmsg = true,
-	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
+	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
+		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
+		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
+	{ .name	    = "munlock",
+	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
+	{ .name	    = "munmap",
+	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
+	{ .name	    = "name_to_handle_at",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
+	{ .name	    = "newfstatat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
+	{ .name	    = "open",
+	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
+	{ .name	    = "open_by_handle_at",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
+		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
+	{ .name	    = "openat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
+		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
+	{ .name	    = "perf_event_open",
+	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
+		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
+		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
+	{ .name	    = "pipe2",
+	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
+	{ .name	    = "pkey_alloc",
+	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
+	{ .name	    = "pkey_free",
+	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
+	{ .name	    = "pkey_mprotect",
+	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
+		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
+		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
+	{ .name	    = "poll", .timeout = true, },
+	{ .name	    = "ppoll", .timeout = true, },
+	{ .name	    = "pread", .alias = "pread64", },
+	{ .name	    = "preadv", .alias = "pread", },
+	{ .name	    = "prlimit64",
+	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
+	{ .name	    = "pwrite", .alias = "pwrite64", },
+	{ .name	    = "readlinkat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
+	{ .name	    = "recvfrom",
+	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
+	{ .name	    = "recvmmsg",
+	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
+	{ .name	    = "recvmsg",
+	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
+	{ .name	    = "renameat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
+	{ .name	    = "rt_sigaction",
+	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
+	{ .name	    = "rt_sigprocmask",
+	  .arg = { [0] = STRARRAY(how, sighow), }, },
+	{ .name	    = "rt_sigqueueinfo",
+	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
+	{ .name	    = "rt_tgsigqueueinfo",
+	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
+	{ .name	    = "sched_setscheduler",
+	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
+	{ .name	    = "seccomp",
+	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
+		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
+	{ .name	    = "select", .timeout = true, },
+	{ .name	    = "sendmmsg",
+	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
+	{ .name	    = "sendmsg",
+	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
+	{ .name	    = "sendto",
+	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
 	{ .name	    = "set_tid_address", .errpid = true, },
-	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
-	{ .name	    = "setpgid",    .errmsg = true, },
-	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
-	{ .name	    = "setxattr",   .errmsg = true, },
-	{ .name	    = "shutdown",   .errmsg = true, },
-	{ .name	    = "socket",	    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
-			     [1] = SCA_SK_TYPE, /* type */ },
-	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
-	{ .name	    = "socketpair", .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
-			     [1] = SCA_SK_TYPE, /* type */ },
-	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
-	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
-	{ .name	    = "statfs",	    .errmsg = true, },
-	{ .name	    = "statx",	    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
-			     [2] = SCA_STATX_FLAGS, /* flags */
-			     [3] = SCA_STATX_MASK, /* mask */ }, },
-	{ .name	    = "swapoff",    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
-	{ .name	    = "swapon",	    .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
-	{ .name	    = "symlinkat",  .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
-	{ .name	    = "tgkill",	    .errmsg = true,
-	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
-	{ .name	    = "tkill",	    .errmsg = true,
-	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
-	{ .name	    = "truncate",   .errmsg = true, },
-	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
-	{ .name	    = "unlinkat",   .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
-	{ .name	    = "utime",  .errmsg = true, },
-	{ .name	    = "utimensat",  .errmsg = true,
-	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
-	{ .name	    = "utimes",  .errmsg = true, },
-	{ .name	    = "vmsplice",  .errmsg = true, },
+	{ .name	    = "setitimer",
+	  .arg = { [0] = STRARRAY(which, itimers), }, },
+	{ .name	    = "setrlimit",
+	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
+	{ .name	    = "socket",
+	  .arg = { [0] = STRARRAY(family, socket_families),
+		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
+	{ .name	    = "socketpair",
+	  .arg = { [0] = STRARRAY(family, socket_families),
+		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
+	{ .name	    = "stat", .alias = "newstat", },
+	{ .name	    = "statx",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
+		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
+		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
+	{ .name	    = "swapoff",
+	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
+	{ .name	    = "swapon",
+	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
+	{ .name	    = "symlinkat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
+	{ .name	    = "tgkill",
+	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
+	{ .name	    = "tkill",
+	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
+	{ .name	    = "uname", .alias = "newuname", },
+	{ .name	    = "unlinkat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
+	{ .name	    = "utimensat",
+	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
 	{ .name	    = "wait4",	    .errpid = true,
-	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
+	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
 	{ .name	    = "waitid",	    .errpid = true,
-	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
-	{ .name	    = "write",	    .errmsg = true, },
-	{ .name	    = "writev",	    .errmsg = true, },
+	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
 };
 
 static int syscall_fmt__cmp(const void *name, const void *fmtp)
@@ -828,8 +794,7 @@ struct syscall {
 	const char	    *name;
 	bool		    is_exit;
 	struct syscall_fmt  *fmt;
-	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
-	void		    **arg_parm;
+	struct syscall_arg_fmt *arg_fmt;
 };
 
 /*
@@ -859,6 +824,8 @@ static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
  * filename.ptr: The filename char pointer that will be vfs_getname'd
  * filename.entry_str_pos: Where to insert the string translated from
  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
+ * ret_scnprintf: syscall args may set this to a different syscall return
+ *                formatter, for instance, fcntl may return fds, file flags, etc.
  */
 struct thread_trace {
 	u64		  entry_time;
@@ -867,6 +834,7 @@ struct thread_trace {
 	unsigned long	  pfmaj, pfmin;
 	char		  *entry_str;
 	double		  runtime_ms;
+	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
         struct {
 		unsigned long ptr;
 		short int     entry_str_pos;
@@ -917,6 +885,15 @@ fail:
 	return NULL;
 }
 
+/* Lets a syscall arg formatter choose how this thread's syscall return value gets formatted. */
+void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
+				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
+{
+	struct thread_trace *ttrace = thread__priv(arg->thread);
+
+	ttrace->ret_scnprintf = ret_scnprintf;
+}
+
 #define TRACE_PFMAJ		(1 << 0)
 #define TRACE_PFMIN		(1 << 1)
 
@@ -996,8 +973,7 @@ static const char *thread__fd_path(struct thread *thread, int fd,
 	return ttrace->paths.table[fd];
 }
 
-static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
-					struct syscall_arg *arg)
+size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
 {
 	int fd = arg->val;
 	size_t printed = scnprintf(bf, size, "%d", fd);
@@ -1162,32 +1138,46 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
 	return err;
 }
 
+/* Allocate sc->arg_fmt (nr_args zeroed slots), seed it from sc->fmt if set; -1 if calloc fails. */
+static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
+{
+	int idx;
+
+	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
+		nr_args = sc->fmt->nr_args;
+
+	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
+	if (sc->arg_fmt == NULL)
+		return -1;
+
+	/* nr_args can count the tracepoint's syscall nr field too: stay inside sc->fmt->arg[] */
+	for (idx = 0; sc->fmt && idx < nr_args && idx < (int)ARRAY_SIZE(sc->fmt->arg); ++idx)
+		sc->arg_fmt[idx] = sc->fmt->arg[idx];
+
+	sc->nr_args = nr_args;
+	return 0;
+}
+
 static int syscall__set_arg_fmts(struct syscall *sc)
 {
 	struct format_field *field;
 	int idx = 0, len;
 
-	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
-	if (sc->arg_scnprintf == NULL)
-		return -1;
-
-	if (sc->fmt)
-		sc->arg_parm = sc->fmt->arg_parm;
+	for (field = sc->args; field; field = field->next, ++idx) {
+		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
+			continue;
 
-	for (field = sc->args; field; field = field->next) {
-		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
-			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
-		else if (strcmp(field->type, "const char *") == 0 &&
+		if (strcmp(field->type, "const char *") == 0 &&
 			 (strcmp(field->name, "filename") == 0 ||
 			  strcmp(field->name, "path") == 0 ||
 			  strcmp(field->name, "pathname") == 0))
-			sc->arg_scnprintf[idx] = SCA_FILENAME;
+			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
 		else if (field->flags & FIELD_IS_POINTER)
-			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
+			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
 		else if (strcmp(field->type, "pid_t") == 0)
-			sc->arg_scnprintf[idx] = SCA_PID;
+			sc->arg_fmt[idx].scnprintf = SCA_PID;
 		else if (strcmp(field->type, "umode_t") == 0)
-			sc->arg_scnprintf[idx] = SCA_MODE_T;
+			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
 		else if ((strcmp(field->type, "int") == 0 ||
 			  strcmp(field->type, "unsigned int") == 0 ||
 			  strcmp(field->type, "long") == 0) &&
@@ -1200,9 +1190,8 @@ static int syscall__set_arg_fmts(struct syscall *sc)
 			 * 23 unsigned int
 			 * 7 unsigned long
 			 */
-			sc->arg_scnprintf[idx] = SCA_FD;
+			sc->arg_fmt[idx].scnprintf = SCA_FD;
 		}
-		++idx;
 	}
 
 	return 0;
@@ -1247,11 +1236,13 @@ static int trace__read_syscall_info(struct trace *trace, int id)
 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
 	}
 
+	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
+		return -1;
+
 	if (IS_ERR(sc->tp_format))
 		return -1;
 
 	sc->args = sc->tp_format->format.fields;
-	sc->nr_args = sc->tp_format->format.nr_fields;
 	/*
 	 * We need to check and discard the first variable '__syscall_nr'
 	 * or 'nr' that mean the syscall number. It is needless here.
@@ -1321,33 +1312,68 @@ out:
  * variable to read it. Most notably this avoids extended load instructions
  * on unaligned addresses
  */
+unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
+{
+	unsigned long value;
+
+	/* copy the idx-th unsigned long sized slot out of the raw args buffer */
+	memcpy(&value, arg->args + idx * sizeof(value), sizeof(value));
+	return value;
+}
+
+/* Print "<name>: " for the current arg, or "arg<idx>: " when no name is set for it. */
+static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
+				      struct syscall_arg *arg)
+{
+	const char *name = sc->arg_fmt ? sc->arg_fmt[arg->idx].name : NULL;
+
+	return name ? scnprintf(bf, size, "%s: ", name) : scnprintf(bf, size, "arg%d: ", arg->idx);
+}
+
+static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
+				     struct syscall_arg *arg, unsigned long val)
+{
+	if (!sc->arg_fmt || !sc->arg_fmt[arg->idx].scnprintf)
+		return scnprintf(bf, size, "%ld", val);
+	arg->val = val;
+	/* an arg without a parm of its own leaves arg->parm as it was */
+	if (sc->arg_fmt[arg->idx].parm)
+		arg->parm = sc->arg_fmt[arg->idx].parm;
+	return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
+}
 
 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
 				      unsigned char *args, struct trace *trace,
 				      struct thread *thread)
 {
 	size_t printed = 0;
-	unsigned char *p;
 	unsigned long val;
+	u8 bit = 1;
+	struct syscall_arg arg = {
+		.args	= args,
+		.idx	= 0,
+		.mask	= 0,
+		.trace  = trace,
+		.thread = thread,
+	};
+	struct thread_trace *ttrace = thread__priv(thread);
+
+	/*
+	 * Things like fcntl will set this in its 'cmd' formatter to pick the
+	 * right formatter for the return value (an fd? file flags?), which is
+	 * not needed for syscalls that always return a given type, say an fd.
+	 */
+	ttrace->ret_scnprintf = NULL;
 
 	if (sc->args != NULL) {
 		struct format_field *field;
-		u8 bit = 1;
-		struct syscall_arg arg = {
-			.idx	= 0,
-			.mask	= 0,
-			.trace  = trace,
-			.thread = thread,
-		};
 
 		for (field = sc->args; field;
 		     field = field->next, ++arg.idx, bit <<= 1) {
 			if (arg.mask & bit)
 				continue;
 
-			/* special care for unaligned accesses */
-			p = args + sizeof(unsigned long) * arg.idx;
-			memcpy(&val, p, sizeof(val));
+			val = syscall_arg__val(&arg, arg.idx);
 
 			/*
  			 * Suppress this argument if its value is zero and
@@ -1355,23 +1381,16 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
  			 * strarray for it.
  			 */
 			if (val == 0 &&
-			    !(sc->arg_scnprintf &&
-			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
-			      sc->arg_parm[arg.idx]))
+			    !(sc->arg_fmt &&
+			      (sc->arg_fmt[arg.idx].show_zero ||
+			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
+			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
+			      sc->arg_fmt[arg.idx].parm))
 				continue;
 
 			printed += scnprintf(bf + printed, size - printed,
 					     "%s%s: ", printed ? ", " : "", field->name);
-			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
-				arg.val = val;
-				if (sc->arg_parm)
-					arg.parm = sc->arg_parm[arg.idx];
-				printed += sc->arg_scnprintf[arg.idx](bf + printed,
-								      size - printed, &arg);
-			} else {
-				printed += scnprintf(bf + printed, size - printed,
-						     "%ld", val);
-			}
+			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
 		}
 	} else if (IS_ERR(sc->tp_format)) {
 		/*
@@ -1379,16 +1398,17 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
 		 * may end up not having any args, like with gettid(), so only
 		 * print the raw args when we didn't manage to read it.
 		 */
-		int i = 0;
-
-		while (i < 6) {
-			/* special care for unaligned accesses */
-			p = args + sizeof(unsigned long) * i;
-			memcpy(&val, p, sizeof(val));
-			printed += scnprintf(bf + printed, size - printed,
-					     "%sarg%d: %ld",
-					     printed ? ", " : "", i, val);
-			++i;
+		while (arg.idx < sc->nr_args) {
+			if (arg.mask & bit)
+				goto next_arg;
+			val = syscall_arg__val(&arg, arg.idx);
+			if (printed)
+				printed += scnprintf(bf + printed, size - printed, ", ");
+			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
+			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
+next_arg:
+			++arg.idx;
+			bit <<= 1;
 		}
 	}
 
@@ -1635,17 +1655,31 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
 	}
 
 	if (sc->fmt == NULL) {
+		if (ret < 0)
+			goto errno_print;
 signed_print:
 		fprintf(trace->output, ") = %ld", ret);
-	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
+	} else if (ret < 0) {
+errno_print: {
 		char bf[STRERR_BUFSIZE];
 		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
 			   *e = audit_errno_to_name(-ret);
 
 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
+	}
 	} else if (ret == 0 && sc->fmt->timeout)
 		fprintf(trace->output, ") = 0 Timeout");
-	else if (sc->fmt->hexret)
+	else if (ttrace->ret_scnprintf) {
+		char bf[1024];
+		struct syscall_arg arg = {
+			.val	= ret,
+			.thread	= thread,
+			.trace	= trace,
+		};
+		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
+		ttrace->ret_scnprintf = NULL;
+		fprintf(trace->output, ") = %s", bf);
+	} else if (sc->fmt->hexret)
 		fprintf(trace->output, ") = %#lx", ret);
 	else if (sc->fmt->errpid) {
 		struct thread *child = machine__find_thread(trace->host, ret, ret);
@@ -2171,6 +2205,30 @@ out_enomem:
 	goto out;
 }
 
+/*
+ * Pass perf_evlist__set_filter_pids() our pid and the tid of the nearest "sshd" ancestor, if any.
+ */
+static int trace__set_filter_loop_pids(struct trace *trace)
+{
+	pid_t pids[32] = { getpid(), };
+	unsigned int nr = 1;
+	struct thread *thread, *parent;
+
+	for (thread = machine__find_thread(trace->host, pids[0], pids[0]);
+	     thread && nr < ARRAY_SIZE(pids); thread = parent) {
+		parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
+		if (parent == NULL)
+			break;
+
+		if (strcmp(thread__comm_str(parent), "sshd") == 0) {
+			pids[nr++] = parent->tid;
+			break;
+		}
+	}
+
+	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
+}
+
 static int trace__run(struct trace *trace, int argc, const char **argv)
 {
 	struct perf_evlist *evlist = trace->evlist;
@@ -2294,7 +2352,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	if (trace->filter_pids.nr > 0)
 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
 	else if (thread_map__pid(evlist->threads, 0) == -1)
-		err = perf_evlist__set_filter_pid(evlist, getpid());
+		err = trace__set_filter_loop_pids(trace);
 
 	if (err < 0)
 		goto out_error_mem;
@@ -2756,7 +2814,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str,
 	struct trace *trace = (struct trace *)opt->value;
 	const char *s = str;
 	char *sep = NULL, *lists[2] = { NULL, NULL, };
-	int len = strlen(str), err = -1, list;
+	int len = strlen(str) + 1, err = -1, list;
 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
 	char group_name[PATH_MAX];
 
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index 83fe2202382e..932fda54b8a6 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -1,9 +1,15 @@
 #!/bin/sh
 
 HEADERS='
+include/uapi/drm/drm.h
+include/uapi/drm/i915_drm.h
 include/uapi/linux/fcntl.h
+include/uapi/linux/kvm.h
 include/uapi/linux/perf_event.h
+include/uapi/linux/sched.h
 include/uapi/linux/stat.h
+include/uapi/linux/vhost.h
+include/uapi/sound/asound.h
 include/linux/hash.h
 include/uapi/linux/hw_breakpoint.h
 arch/x86/include/asm/disabled-features.h
@@ -16,6 +22,7 @@ arch/x86/include/uapi/asm/perf_regs.h
 arch/x86/include/uapi/asm/kvm.h
 arch/x86/include/uapi/asm/kvm_perf.h
 arch/x86/include/uapi/asm/svm.h
+arch/x86/include/uapi/asm/unistd.h
 arch/x86/include/uapi/asm/vmx.h
 arch/powerpc/include/uapi/asm/kvm.h
 arch/s390/include/uapi/asm/kvm.h
@@ -29,12 +36,13 @@ include/asm-generic/bitops/__fls.h
 include/asm-generic/bitops/fls.h
 include/asm-generic/bitops/fls64.h
 include/linux/coresight-pmu.h
+include/uapi/asm-generic/ioctls.h
 include/uapi/asm-generic/mman-common.h
 '
 
 check () {
   file=$1
-  opts=
+  opts="--ignore-blank-lines --ignore-space-change"
 
   shift
   while [ -n "$*" ]; do
@@ -45,7 +53,7 @@ check () {
   cmd="diff $opts ../$file ../../$file > /dev/null"
 
   test -f ../../$file &&
-  eval $cmd || echo "Warning: $file differs from kernel" >&2
+  eval $cmd || echo "Warning: Kernel ABI header at 'tools/$file' differs from latest version at '$file'" >&2
 }
 
 
@@ -55,7 +63,7 @@ for i in $HEADERS; do
 done
 
 # diff with extra ignore lines
-check arch/x86/lib/memcpy_64.S        -B -I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>"
-check arch/x86/lib/memset_64.S        -B -I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>"
-check include/uapi/asm-generic/mman.h -B -I "^#include <\(uapi/\)*asm-generic/mman-common.h>"
-check include/uapi/linux/mman.h       -B -I "^#include <\(uapi/\)*asm/mman.h>"
+check arch/x86/lib/memcpy_64.S        -I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>"
+check arch/x86/lib/memset_64.S        -I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>"
+check include/uapi/asm-generic/mman.h -I "^#include <\(uapi/\)*asm-generic/mman-common.h>"
+check include/uapi/linux/mman.h       -I "^#include <\(uapi/\)*asm/mman.h>"
diff --git a/tools/perf/perf-sys.h b/tools/perf/perf-sys.h
index e4b717e9eb6c..c11f0c76e90c 100644
--- a/tools/perf/perf-sys.h
+++ b/tools/perf/perf-sys.h
@@ -9,16 +9,6 @@
 #include <linux/perf_event.h>
 #include <asm/barrier.h>
 
-#if defined(__i386__)
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#define CPUINFO_PROC	{"model name"}
-#endif
-
-#if defined(__x86_64__)
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#define CPUINFO_PROC	{"model name"}
-#endif
-
 #ifdef __powerpc__
 #define CPUINFO_PROC	{"cpu"}
 #endif
@@ -43,19 +33,10 @@
 #define CPUINFO_PROC	{"cpu model"}
 #endif
 
-#ifdef __ia64__
-#define cpu_relax()	asm volatile ("hint @pause" ::: "memory")
-#define CPUINFO_PROC	{"model name"}
-#endif
-
 #ifdef __arm__
 #define CPUINFO_PROC	{"model name", "Processor"}
 #endif
 
-#ifdef __aarch64__
-#define cpu_relax()	asm volatile("yield" ::: "memory")
-#endif
-
 #ifdef __mips__
 #define CPUINFO_PROC	{"cpu model"}
 #endif
@@ -72,13 +53,8 @@
 #define CPUINFO_PROC	{"core ID"}
 #endif
 
-#ifdef __tile__
-#define cpu_relax()	asm volatile ("mfspr zero, PASS" ::: "memory")
-#define CPUINFO_PROC    {"model name"}
-#endif
-
-#ifndef cpu_relax
-#define cpu_relax() barrier()
+#ifndef CPUINFO_PROC
+#define CPUINFO_PROC	{ "model name", }
 #endif
 
 static inline int
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 628a5e412cb1..e0279babe0c0 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -89,7 +89,7 @@ struct pager_config {
 static int pager_command_config(const char *var, const char *value, void *data)
 {
 	struct pager_config *c = data;
-	if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd))
+	if (strstarts(var, "pager.") && !strcmp(var + 6, c->cmd))
 		c->val = perf_config_bool(var, value);
 	return 0;
 }
@@ -108,9 +108,9 @@ static int check_pager_config(const char *cmd)
 static int browser_command_config(const char *var, const char *value, void *data)
 {
 	struct pager_config *c = data;
-	if (!prefixcmp(var, "tui.") && !strcmp(var + 4, c->cmd))
+	if (strstarts(var, "tui.") && !strcmp(var + 4, c->cmd))
 		c->val = perf_config_bool(var, value);
-	if (!prefixcmp(var, "gtk.") && !strcmp(var + 4, c->cmd))
+	if (strstarts(var, "gtk.") && !strcmp(var + 4, c->cmd))
 		c->val = perf_config_bool(var, value) ? 2 : 0;
 	return 0;
 }
@@ -192,7 +192,7 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
 		/*
 		 * Check remaining flags.
 		 */
-		if (!prefixcmp(cmd, CMD_EXEC_PATH)) {
+		if (strstarts(cmd, CMD_EXEC_PATH)) {
 			cmd += strlen(CMD_EXEC_PATH);
 			if (*cmd == '=')
 				set_argv_exec_path(cmd + 1);
@@ -229,7 +229,7 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
 				*envchanged = 1;
 			(*argv)++;
 			(*argc)--;
-		} else if (!prefixcmp(cmd, CMD_DEBUGFS_DIR)) {
+		} else if (strstarts(cmd, CMD_DEBUGFS_DIR)) {
 			tracing_path_set(cmd + strlen(CMD_DEBUGFS_DIR));
 			fprintf(stderr, "dir: %s\n", tracing_path);
 			if (envchanged)
@@ -470,14 +470,14 @@ int main(int argc, const char **argv)
 	 * So we just directly call the internal command handler, and
 	 * die if that one cannot handle it.
 	 */
-	if (!prefixcmp(cmd, "perf-")) {
+	if (strstarts(cmd, "perf-")) {
 		cmd += 5;
 		argv[0] = cmd;
 		handle_internal_command(argc, argv);
 		fprintf(stderr, "cannot handle %s internally", cmd);
 		goto out;
 	}
-	if (!prefixcmp(cmd, "trace")) {
+	if (strstarts(cmd, "trace")) {
 #ifdef HAVE_LIBAUDIT_SUPPORT
 		setup_path();
 		argv[0] = "trace";
@@ -495,7 +495,7 @@ int main(int argc, const char **argv)
 	commit_pager_choice();
 
 	if (argc > 0) {
-		if (!prefixcmp(argv[0], "--"))
+		if (strstarts(argv[0], "--"))
 			argv[0] += 2;
 	} else {
 		/* The user didn't specify a command; give them help */
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 806c216a1078..2c010dd6a79d 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -7,6 +7,7 @@
 #include <linux/perf_event.h>
 
 extern bool test_attr__enabled;
+void test_attr__ready(void);
 void test_attr__init(void);
 void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu,
 		     int fd, int group_fd, unsigned long flags);
diff --git a/tools/perf/pmu-events/README b/tools/perf/pmu-events/README
index 1408ade0d773..c2ee3e4417fe 100644
--- a/tools/perf/pmu-events/README
+++ b/tools/perf/pmu-events/README
@@ -85,10 +85,6 @@ users to specify events by their name:
 
 where 'pm_1plus_ppc_cmpl' is a Power8 PMU event.
 
-In case of errors when processing files in the tools/perf/pmu-events/arch
-directory, 'jevents' tries to create an empty mapping file to allow the perf
-build to succeed even if the PMU event aliases cannot be used.
-
 However some errors in processing may cause the perf build to fail.
 
 Mapfile format
diff --git a/tools/perf/pmu-events/arch/powerpc/mapfile.csv b/tools/perf/pmu-events/arch/powerpc/mapfile.csv
index e925baa0c30b..a0f3a11ca19f 100644
--- a/tools/perf/pmu-events/arch/powerpc/mapfile.csv
+++ b/tools/perf/pmu-events/arch/powerpc/mapfile.csv
@@ -13,9 +13,13 @@
 #
 
 # Power8 entries
-004b0000,1,power8.json,core
-004b0201,1,power8.json,core
-004c0000,1,power8.json,core
-004d0000,1,power8.json,core
-004d0100,1,power8.json,core
-004d0200,1,power8.json,core
+004b0000,1,power8,core
+004b0201,1,power8,core
+004c0000,1,power8,core
+004d0000,1,power8,core
+004d0100,1,power8,core
+004d0200,1,power8,core
+004c0100,1,power8,core
+004e0100,1,power9,core
+004e0200,1,power9,core
+004e1200,1,power9,core
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/cache.json b/tools/perf/pmu-events/arch/powerpc/power9/cache.json
new file mode 100644
index 000000000000..18f6645f2897
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power9/cache.json
@@ -0,0 +1,137 @@
+[
+  {
+    "EventCode": "0x300F4",
+    "EventName": "PM_THRD_CONC_RUN_INST",
+    "BriefDescription": "PPC Instructions Finished by this thread when all threads in the core had the run-latch set"
+  },
+  {
+    "EventCode": "0x1E056",
+    "EventName": "PM_CMPLU_STALL_FLUSH_ANY_THREAD",
+    "BriefDescription": "Cycles in which the NTC instruction is not allowed to complete because any of the 4 threads in the same core suffered a flush, which blocks completion"
+  },
+  {
+    "EventCode": "0x4D016",
+    "EventName": "PM_CMPLU_STALL_FXLONG",
+    "BriefDescription": "Completion stall due to a long latency scalar fixed point instruction (division, square root)"
+  },
+  {
+    "EventCode": "0x2D016",
+    "EventName": "PM_CMPLU_STALL_FXU",
+    "BriefDescription": "Finish stall due to a scalar fixed point or CR instruction in the execution pipeline. These instructions get routed to the ALU, ALU2, and DIV pipes"
+  },
+  {
+    "EventCode": "0x1D15C",
+    "EventName": "PM_MRK_DTLB_MISS_1G",
+    "BriefDescription": "Marked Data TLB reload (after a miss) page size 2M. Implies radix translation was used"
+  },
+  {
+    "EventCode": "0x4D12A",
+    "EventName": "PM_MRK_DATA_FROM_RL4_CYC",
+    "BriefDescription": "Duration in cycles to reload from another chip's L4 on the same Node or Group ( Remote) due to a marked load"
+  },
+  {
+    "EventCode": "0x1003C",
+    "EventName": "PM_CMPLU_STALL_DMISS_L2L3",
+    "BriefDescription": "Completion stall by Dcache miss which resolved in L2/L3"
+  },
+  {
+    "EventCode": "0x4C014",
+    "EventName": "PM_CMPLU_STALL_LMQ_FULL",
+    "BriefDescription": "Finish stall because the NTF instruction was a load that missed in the L1 and the LMQ was unable to accept this load miss request because it was full"
+  },
+  {
+    "EventCode": "0x14048",
+    "EventName": "PM_INST_FROM_ON_CHIP_CACHE",
+    "BriefDescription": "The processor's Instruction cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to an instruction fetch (not prefetch)"
+  },
+  {
+    "EventCode": "0x4D014",
+    "EventName": "PM_CMPLU_STALL_LOAD_FINISH",
+    "BriefDescription": "Finish stall because the NTF instruction was a load instruction with all its dependencies satisfied just going through the LSU pipe to finish"
+  },
+  {
+    "EventCode": "0x2404A",
+    "EventName": "PM_INST_FROM_RL4",
+    "BriefDescription": "The processor's Instruction cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to an instruction fetch (not prefetch)"
+  },
+  {
+    "EventCode": "0x1404A",
+    "EventName": "PM_INST_FROM_RL2L3_SHR",
+    "BriefDescription": "The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to an instruction fetch (not prefetch)"
+  },
+  {
+    "EventCode": "0x401EA",
+    "EventName": "PM_THRESH_EXC_128",
+    "BriefDescription": "Threshold counter exceeded a value of 128"
+  },
+  {
+    "EventCode": "0x400F6",
+    "EventName": "PM_BR_MPRED_CMPL",
+    "BriefDescription": "Number of Branch Mispredicts"
+  },
+  {
+    "EventCode": "0x2F140",
+    "EventName": "PM_MRK_DPTEG_FROM_L2_MEPF",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {
+    "EventCode": "0x101E6",
+    "EventName": "PM_THRESH_EXC_4096",
+    "BriefDescription": "Threshold counter exceed a count of 4096"
+  },
+  {
+    "EventCode": "0x3D156",
+    "EventName": "PM_MRK_DTLB_MISS_64K",
+    "BriefDescription": "Marked Data TLB Miss page size 64K"
+  },
+  {
+    "EventCode": "0x4C15E",
+    "EventName": "PM_MRK_DTLB_MISS_16M",
+    "BriefDescription": "Marked Data TLB Miss page size 16M"
+  },
+  {
+    "EventCode": "0x2D15E",
+    "EventName": "PM_MRK_DTLB_MISS_16G",
+    "BriefDescription": "Marked Data TLB Miss page size 16G"
+  },
+  {
+    "EventCode": "0x3F14A",
+    "EventName": "PM_MRK_DPTEG_FROM_RMEM",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {
+    "EventCode": "0x4C016",
+    "EventName": "PM_CMPLU_STALL_DMISS_L2L3_CONFLICT",
+    "BriefDescription": "Completion stall due to cache miss that resolves in the L2 or L3 with a conflict"
+  },
+  {
+    "EventCode": "0x2C01A",
+    "EventName": "PM_CMPLU_STALL_LHS",
+    "BriefDescription": "Finish stall because the NTF instruction was a load that hit on an older store and it was waiting for store data"
+  },
+  {
+    "EventCode": "0x401E4",
+    "EventName": "PM_MRK_DTLB_MISS",
+    "BriefDescription": "Marked dtlb miss"
+  },
+  {
+    "EventCode": "0x24046",
+    "EventName": "PM_INST_FROM_RL2L3_MOD",
+    "BriefDescription": "The processor's Instruction cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to an instruction fetch (not prefetch)"
+  },
+  {
+    "EventCode": "0x1002A",
+    "EventName": "PM_CMPLU_STALL_LARX",
+    "BriefDescription": "Finish stall because the NTF instruction was a larx waiting to be satisfied"
+  },
+  {
+    "EventCode": "0x3006C",
+    "EventName": "PM_RUN_CYC_SMT2_MODE",
+    "BriefDescription": "Cycles in which this thread's run latch is set and the core is in SMT2 mode"
+  },
+  {
+    "EventCode": "0x1C058",
+    "EventName": "PM_DTLB_MISS_16G",
+    "BriefDescription": "Data TLB Miss page size 16G"
+  }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/floating-point.json b/tools/perf/pmu-events/arch/powerpc/power9/floating-point.json
new file mode 100644
index 000000000000..8a83bca26552
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power9/floating-point.json
@@ -0,0 +1,32 @@
+[
+  {,
+    "EventCode": "0x1415A",
+    "EventName": "PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST_CYC",
+    "BriefDescription": "Duration in cycles to reload from local core's L2 with load hit store conflict due to a marked load"
+  },
+  {,
+    "EventCode": "0x10058",
+    "EventName": "PM_MEM_LOC_THRESH_IFU",
+    "BriefDescription": "Local Memory above threshold for IFU speculation control"
+  },
+  {,
+    "EventCode": "0x2D028",
+    "EventName": "PM_RADIX_PWC_L2_PDE_FROM_L2",
+    "BriefDescription": "A Page Directory Entry was reloaded to a level 2 page walk cache from the core's L2 data cache"
+  },
+  {,
+    "EventCode": "0x30012",
+    "EventName": "PM_FLUSH_COMPLETION",
+    "BriefDescription": "The instruction that was next to complete did not complete because it suffered a flush"
+  },
+  {,
+    "EventCode": "0x2D154",
+    "EventName": "PM_MRK_DERAT_MISS_64K",
+    "BriefDescription": "Marked Data ERAT Miss (Data TLB Access) page size 64K"
+  },
+  {,
+    "EventCode": "0x4016E",
+    "EventName": "PM_THRESH_NOT_MET",
+    "BriefDescription": "Threshold counter did not meet threshold"
+  }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/frontend.json b/tools/perf/pmu-events/arch/powerpc/power9/frontend.json
new file mode 100644
index 000000000000..7e62c46d7a20
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power9/frontend.json
@@ -0,0 +1,377 @@
+[
+  {,
+    "EventCode": "0x3E15C",
+    "EventName": "PM_MRK_L2_TM_ST_ABORT_SISTER",
+    "BriefDescription": "TM marked store abort for this thread"
+  },
+  {,
+    "EventCode": "0x25044",
+    "EventName": "PM_IPTEG_FROM_L31_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x101E8",
+    "EventName": "PM_THRESH_EXC_256",
+    "BriefDescription": "Threshold counter exceeded a count of 256"
+  },
+  {,
+    "EventCode": "0x4504E",
+    "EventName": "PM_IPTEG_FROM_L3MISS",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from a location other than the local core's L3 due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x1006A",
+    "EventName": "PM_NTC_ISSUE_HELD_DARQ_FULL",
+    "BriefDescription": "The NTC instruction is being held at dispatch because there are no slots in the DARQ for it"
+  },
+  {,
+    "EventCode": "0x4E016",
+    "EventName": "PM_CMPLU_STALL_LSAQ_ARB",
+    "BriefDescription": "Finish stall because the NTF instruction was a load or store that was held in LSAQ because an older instruction from SRQ or LRQ won arbitration to the LSU pipe when this instruction tried to launch"
+  },
+  {,
+    "EventCode": "0x1001A",
+    "EventName": "PM_LSU_SRQ_FULL_CYC",
+    "BriefDescription": "Cycles in which the Store Queue is full on all 4 slices. This event is not per thread. All the threads will see the same count for this core resource"
+  },
+  {,
+    "EventCode": "0x1E15E",
+    "EventName": "PM_MRK_L2_TM_REQ_ABORT",
+    "BriefDescription": "TM abort"
+  },
+  {,
+    "EventCode": "0x34052",
+    "EventName": "PM_INST_SYS_PUMP_MPRED",
+    "BriefDescription": "Final Pump Scope (system) mispredicted. Either the original scope was too small (Chip/Group) or the original scope was System and it should have been smaller. Counts for an instruction fetch"
+  },
+  {,
+    "EventCode": "0x20114",
+    "EventName": "PM_MRK_L2_RC_DISP",
+    "BriefDescription": "Marked Instruction RC dispatched in L2"
+  },
+  {,
+    "EventCode": "0x4C044",
+    "EventName": "PM_DATA_FROM_L31_ECO_MOD",
+    "BriefDescription": "The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to a demand load"
+  },
+  {,
+    "EventCode": "0x1C044",
+    "EventName": "PM_DATA_FROM_L3_NO_CONFLICT",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L3 without conflict due to a demand load"
+  },
+  {,
+    "EventCode": "0x44050",
+    "EventName": "PM_INST_SYS_PUMP_MPRED_RTY",
+    "BriefDescription": "Final Pump Scope (system) ended up larger than Initial Pump Scope (Chip/Group) for an instruction fetch"
+  },
+  {,
+    "EventCode": "0x30154",
+    "EventName": "PM_MRK_FAB_RSP_DCLAIM",
+    "BriefDescription": "Marked store had to do a dclaim"
+  },
+  {,
+    "EventCode": "0x30014",
+    "EventName": "PM_CMPLU_STALL_STORE_FIN_ARB",
+    "BriefDescription": "Finish stall because the NTF instruction was a store waiting for a slot in the store finish pipe. This means the instruction is ready to finish but there are instructions ahead of it, using the finish pipe"
+  },
+  {,
+    "EventCode": "0x3E054",
+    "EventName": "PM_LD_MISS_L1",
+    "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count (i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load."
+  },
+  {,
+    "EventCode": "0x400F0",
+    "EventName": "PM_LD_MISS_L1",
+    "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count (i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load."
+  },
+  {,
+    "EventCode": "0x2E01A",
+    "EventName": "PM_CMPLU_STALL_LSU_FLUSH_NEXT",
+    "BriefDescription": "Completion stall of one cycle because the LSU requested to flush the next iop in the sequence. It takes 1 cycle for the ISU to process this request before the LSU instruction is allowed to complete"
+  },
+  {,
+    "EventCode": "0x2D01C",
+    "EventName": "PM_CMPLU_STALL_STCX",
+    "BriefDescription": "Finish stall because the NTF instruction was a stcx waiting for response from L2"
+  },
+  {,
+    "EventCode": "0x2C010",
+    "EventName": "PM_CMPLU_STALL_LSU",
+    "BriefDescription": "Completion stall by LSU instruction"
+  },
+  {,
+    "EventCode": "0x2C042",
+    "EventName": "PM_DATA_FROM_L3_MEPF",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to a demand load"
+  },
+  {,
+    "EventCode": "0x4E012",
+    "EventName": "PM_CMPLU_STALL_MTFPSCR",
+    "BriefDescription": "Completion stall because the ISU is updating the register and notifying the Effective Address Table (EAT)"
+  },
+  {,
+    "EventCode": "0x100F2",
+    "EventName": "PM_1PLUS_PPC_CMPL",
+    "BriefDescription": "1 or more ppc insts finished"
+  },
+  {,
+    "EventCode": "0x3001C",
+    "EventName": "PM_LSU_REJECT_LMQ_FULL",
+    "BriefDescription": "LSU Reject due to LMQ full (up to 4 per cycle)"
+  },
+  {,
+    "EventCode": "0x15046",
+    "EventName": "PM_IPTEG_FROM_L31_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x1015E",
+    "EventName": "PM_MRK_FAB_RSP_RD_T_INTV",
+    "BriefDescription": "Sampled Read got a T intervention"
+  },
+  {,
+    "EventCode": "0x101EC",
+    "EventName": "PM_THRESH_MET",
+    "BriefDescription": "threshold exceeded"
+  },
+  {,
+    "EventCode": "0x10020",
+    "EventName": "PM_PMC4_REWIND",
+    "BriefDescription": "PMC4 Rewind Event"
+  },
+  {,
+    "EventCode": "0x301EA",
+    "EventName": "PM_THRESH_EXC_1024",
+    "BriefDescription": "Threshold counter exceeded a value of 1024"
+  },
+  {,
+    "EventCode": "0x34056",
+    "EventName": "PM_CMPLU_STALL_LSU_MFSPR",
+    "BriefDescription": "Finish stall because the NTF instruction was a mfspr instruction targeting an LSU SPR and it was waiting for the register data to be returned"
+  },
+  {,
+    "EventCode": "0x44056",
+    "EventName": "PM_VECTOR_ST_CMPL",
+    "BriefDescription": "Number of vector store instructions completed"
+  },
+  {,
+    "EventCode": "0x2C124",
+    "EventName": "PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L2 with dispatch conflict due to a marked load"
+  },
+  {,
+    "EventCode": "0x4C12A",
+    "EventName": "PM_MRK_DATA_FROM_RL2L3_SHR_CYC",
+    "BriefDescription": "Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x3C056",
+    "EventName": "PM_DTLB_MISS_64K",
+    "BriefDescription": "Data TLB Miss page size 64K"
+  },
+  {,
+    "EventCode": "0x30060",
+    "EventName": "PM_TM_TRANS_RUN_INST",
+    "BriefDescription": "Run instructions completed in transactional state (gated by the run latch)"
+  },
+  {,
+    "EventCode": "0x2C014",
+    "EventName": "PM_CMPLU_STALL_STORE_FINISH",
+    "BriefDescription": "Finish stall because the NTF instruction was a store with all its dependencies met, just waiting to go through the LSU pipe to finish"
+  },
+  {,
+    "EventCode": "0x3515A",
+    "EventName": "PM_MRK_DATA_FROM_ON_CHIP_CACHE_CYC",
+    "BriefDescription": "Duration in cycles to reload either shared or modified data from another core's L2/L3 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x34050",
+    "EventName": "PM_INST_SYS_PUMP_CPRED",
+    "BriefDescription": "Initial and Final Pump Scope was system pump (prediction=correct) for an instruction fetch"
+  },
+  {,
+    "EventCode": "0x3015E",
+    "EventName": "PM_MRK_FAB_RSP_CLAIM_RTY",
+    "BriefDescription": "Sampled store did a rwitm and got a rty"
+  },
+  {,
+    "EventCode": "0x0",
+    "EventName": "PM_SUSPENDED",
+    "BriefDescription": "Counter OFF"
+  },
+  {,
+    "EventCode": "0x10010",
+    "EventName": "PM_PMC4_OVERFLOW",
+    "BriefDescription": "Overflow from counter 4"
+  },
+  {,
+    "EventCode": "0x3E04A",
+    "EventName": "PM_DPTEG_FROM_RMEM",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group ( Remote) due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x2F152",
+    "EventName": "PM_MRK_FAB_RSP_DCLAIM_CYC",
+    "BriefDescription": "cycles L2 RC took for a dclaim"
+  },
+  {,
+    "EventCode": "0x10004",
+    "EventName": "PM_CMPLU_STALL_LRQ_OTHER",
+    "BriefDescription": "Finish stall due to LRQ miscellaneous reasons, lost arbitration to LMQ slot, bank collisions, set prediction cleanup, set prediction multihit and others"
+  },
+  {,
+    "EventCode": "0x4F150",
+    "EventName": "PM_MRK_FAB_RSP_RWITM_CYC",
+    "BriefDescription": "cycles L2 RC took for a rwitm"
+  },
+  {,
+    "EventCode": "0x4E042",
+    "EventName": "PM_DPTEG_FROM_L3",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x1F054",
+    "EventName": "PM_TLB_HIT",
+    "BriefDescription": "Number of times the TLB had the data required by the instruction. Applies to both HPT and RPT"
+  },
+  {,
+    "EventCode": "0x2C01E",
+    "EventName": "PM_CMPLU_STALL_SYNC_PMU_INT",
+    "BriefDescription": "Cycles in which the NTC instruction is waiting for a synchronous PMU interrupt"
+  },
+  {,
+    "EventCode": "0x24050",
+    "EventName": "PM_IOPS_CMPL",
+    "BriefDescription": "Internal Operations completed"
+  },
+  {,
+    "EventCode": "0x1515C",
+    "EventName": "PM_SYNC_MRK_BR_MPRED",
+    "BriefDescription": "Marked Branch mispredict that can cause a synchronous interrupt"
+  },
+  {,
+    "EventCode": "0x300FA",
+    "EventName": "PM_INST_FROM_L3MISS",
+    "BriefDescription": "Marked instruction was reloaded from a location beyond the local chiplet"
+  },
+  {,
+    "EventCode": "0x15044",
+    "EventName": "PM_IPTEG_FROM_L3_NO_CONFLICT",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x15152",
+    "EventName": "PM_SYNC_MRK_BR_LINK",
+    "BriefDescription": "Marked Branch and link branch that can cause a synchronous interrupt"
+  },
+  {,
+    "EventCode": "0x1E050",
+    "EventName": "PM_CMPLU_STALL_TEND",
+    "BriefDescription": "Finish stall because the NTF instruction was a tend instruction awaiting response from L2"
+  },
+  {,
+    "EventCode": "0x1013E",
+    "EventName": "PM_MRK_LD_MISS_EXPOSED_CYC",
+    "BriefDescription": "Marked Load exposed Miss (use edge detect to count #)"
+  },
+  {,
+    "EventCode": "0x25042",
+    "EventName": "PM_IPTEG_FROM_L3_MEPF",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x14054",
+    "EventName": "PM_INST_PUMP_CPRED",
+    "BriefDescription": "Pump prediction correct. Counts across all types of pumps for an instruction fetch"
+  },
+  {,
+    "EventCode": "0x4015E",
+    "EventName": "PM_MRK_FAB_RSP_RD_RTY",
+    "BriefDescription": "Sampled L2 reads retry count"
+  },
+  {,
+    "EventCode": "0x45048",
+    "EventName": "PM_IPTEG_FROM_DL2L3_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x44052",
+    "EventName": "PM_INST_PUMP_MPRED",
+    "BriefDescription": "Pump misprediction. Counts across all types of pumps for an instruction fetch"
+  },
+  {,
+    "EventCode": "0x30026",
+    "EventName": "PM_CMPLU_STALL_STORE_DATA",
+    "BriefDescription": "Finish stall because the next to finish instruction was a store waiting on data"
+  },
+  {,
+    "EventCode": "0x301E6",
+    "EventName": "PM_MRK_DERAT_MISS",
+    "BriefDescription": "Erat Miss (TLB Access) All page sizes"
+  },
+  {,
+    "EventCode": "0x24154",
+    "EventName": "PM_THRESH_ACC",
+    "BriefDescription": "This event increments every time the threshold event counter ticks. Thresholding must be enabled (via MMCRA) and the thresholding start event must occur for this counter to increment. It will stop incrementing when the thresholding stop event occurs or when thresholding is disabled, until the next time a configured thresholding start event occurs."
+  },
+  {,
+    "EventCode": "0x2015E",
+    "EventName": "PM_MRK_FAB_RSP_RWITM_RTY",
+    "BriefDescription": "Sampled store did a rwitm and got a rty"
+  },
+  {,
+    "EventCode": "0x200FA",
+    "EventName": "PM_BR_TAKEN_CMPL",
+    "BriefDescription": "New event for Branch Taken"
+  },
+  {,
+    "EventCode": "0x35044",
+    "EventName": "PM_IPTEG_FROM_L31_ECO_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x4C010",
+    "EventName": "PM_CMPLU_STALL_STORE_PIPE_ARB",
+    "BriefDescription": "Finish stall because the NTF instruction was a store waiting for the next relaunch opportunity after an internal reject. This means the instruction is ready to relaunch and tried once but lost arbitration"
+  },
+  {,
+    "EventCode": "0x4C01C",
+    "EventName": "PM_CMPLU_STALL_ST_FWD",
+    "BriefDescription": "Completion stall due to store forward"
+  },
+  {,
+    "EventCode": "0x3515C",
+    "EventName": "PM_MRK_DATA_FROM_RL4",
+    "BriefDescription": "The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to a marked load"
+  },
+  {,
+    "EventCode": "0x2D14C",
+    "EventName": "PM_MRK_DATA_FROM_L31_ECO_SHR",
+    "BriefDescription": "The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x40116",
+    "EventName": "PM_MRK_LARX_FIN",
+    "BriefDescription": "Larx finished"
+  },
+  {,
+    "EventCode": "0x4C056",
+    "EventName": "PM_DTLB_MISS_16M",
+    "BriefDescription": "Data TLB Miss page size 16M"
+  },
+  {,
+    "EventCode": "0x1003A",
+    "EventName": "PM_CMPLU_STALL_LSU_FIN",
+    "BriefDescription": "Finish stall because the NTF instruction was an LSU op (other than a load or a store) with all its dependencies met and just going through the LSU pipe to finish"
+  },
+  {,
+    "EventCode": "0x3012A",
+    "EventName": "PM_MRK_L2_RC_DONE",
+    "BriefDescription": "Marked RC done"
+  },
+  {,
+    "EventCode": "0x45044",
+    "EventName": "PM_IPTEG_FROM_L31_ECO_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to an instruction side request"
+  }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/marked.json b/tools/perf/pmu-events/arch/powerpc/power9/marked.json
new file mode 100644
index 000000000000..b9df54fb37e3
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power9/marked.json
@@ -0,0 +1,647 @@
+[
+  {,
+    "EventCode": "0x3C052",
+    "EventName": "PM_DATA_SYS_PUMP_MPRED",
+    "BriefDescription": "Final Pump Scope (system) mispredicted. Either the original scope was too small (Chip/Group) or the original scope was System and it should have been smaller. Counts for a demand load"
+  },
+  {,
+    "EventCode": "0x3013E",
+    "EventName": "PM_MRK_STALL_CMPLU_CYC",
+    "BriefDescription": "Number of cycles the marked instruction is experiencing a stall while it is next to complete (NTC)"
+  },
+  {,
+    "EventCode": "0x4F056",
+    "EventName": "PM_RADIX_PWC_L1_PDE_FROM_L3MISS",
+    "BriefDescription": "A Page Directory Entry was reloaded to a level 1 page walk cache from beyond the core's L3 data cache. The source could be local/remote/distant memory or another core's cache"
+  },
+  {,
+    "EventCode": "0x24158",
+    "EventName": "PM_MRK_INST",
+    "BriefDescription": "An instruction was marked. Includes both Random Instruction Sampling (RIS) at decode time and Random Event Sampling (RES) at the time the configured event happens"
+  },
+  {,
+    "EventCode": "0x1E046",
+    "EventName": "PM_DPTEG_FROM_L31_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x3C04A",
+    "EventName": "PM_DATA_FROM_RMEM",
+    "BriefDescription": "The processor's data cache was reloaded from another chip's memory on the same Node or Group ( Remote) due to a demand load"
+  },
+  {,
+    "EventCode": "0x2C01C",
+    "EventName": "PM_CMPLU_STALL_DMISS_REMOTE",
+    "BriefDescription": "Completion stall by Dcache miss which resolved from remote chip (cache or memory)"
+  },
+  {,
+    "EventCode": "0x44040",
+    "EventName": "PM_INST_FROM_L2_DISP_CONFLICT_OTHER",
+    "BriefDescription": "The processor's Instruction cache was reloaded from local core's L2 with dispatch conflict due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x2E050",
+    "EventName": "PM_DARQ0_7_9_ENTRIES",
+    "BriefDescription": "Cycles in which 7,8, or 9 DARQ entries (out of 12) are in use"
+  },
+  {,
+    "EventCode": "0x2D02E",
+    "EventName": "PM_RADIX_PWC_L3_PTE_FROM_L2",
+    "BriefDescription": "A Page Table Entry was reloaded to a level 3 page walk cache from the core's L2 data cache. This implies that a level 4 PWC access was not necessary for this translation"
+  },
+  {,
+    "EventCode": "0x3F05E",
+    "EventName": "PM_RADIX_PWC_L3_PTE_FROM_L3",
+    "BriefDescription": "A Page Table Entry was reloaded to a level 3 page walk cache from the core's L3 data cache. This implies that a level 4 PWC access was not necessary for this translation"
+  },
+  {,
+    "EventCode": "0x2E01E",
+    "EventName": "PM_CMPLU_STALL_NTC_FLUSH",
+    "BriefDescription": "Completion stall due to ntc flush"
+  },
+  {,
+    "EventCode": "0x1F14C",
+    "EventName": "PM_MRK_DPTEG_FROM_LL4",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x20130",
+    "EventName": "PM_MRK_INST_DECODED",
+    "BriefDescription": "An instruction was marked at decode time. Random Instruction Sampling (RIS) only"
+  },
+  {,
+    "EventCode": "0x3F144",
+    "EventName": "PM_MRK_DPTEG_FROM_L31_ECO_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4D058",
+    "EventName": "PM_VECTOR_FLOP_CMPL",
+    "BriefDescription": "Vector FP instruction completed"
+  },
+  {,
+    "EventCode": "0x14040",
+    "EventName": "PM_INST_FROM_L2_NO_CONFLICT",
+    "BriefDescription": "The processor's Instruction cache was reloaded from local core's L2 without conflict due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x4404E",
+    "EventName": "PM_INST_FROM_L3MISS_MOD",
+    "BriefDescription": "The processor's Instruction cache was reloaded from a location other than the local core's L3 due to an instruction fetch"
+  },
+  {,
+    "EventCode": "0x3003A",
+    "EventName": "PM_CMPLU_STALL_EXCEPTION",
+    "BriefDescription": "Cycles in which the NTC instruction is not allowed to complete because it was interrupted by ANY exception, which has to be serviced before the instruction can complete"
+  },
+  {,
+    "EventCode": "0x4F144",
+    "EventName": "PM_MRK_DPTEG_FROM_L31_ECO_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x3E044",
+    "EventName": "PM_DPTEG_FROM_L31_ECO_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another core's ECO L3 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x300F6",
+    "EventName": "PM_L1_DCACHE_RELOAD_VALID",
+    "BriefDescription": "DL1 reloaded due to Demand Load"
+  },
+  {,
+    "EventCode": "0x1415E",
+    "EventName": "PM_MRK_DATA_FROM_L3MISS_CYC",
+    "BriefDescription": "Duration in cycles to reload from a location other than the local core's L3 due to a marked load"
+  },
+  {,
+    "EventCode": "0x1E052",
+    "EventName": "PM_CMPLU_STALL_SLB",
+    "BriefDescription": "Finish stall because the NTF instruction was awaiting L2 response for an SLB"
+  },
+  {,
+    "EventCode": "0x4404C",
+    "EventName": "PM_INST_FROM_DMEM",
+    "BriefDescription": "The processor's Instruction cache was reloaded from another chip's memory on a different Node or Group (Distant) due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x3000E",
+    "EventName": "PM_FXU_1PLUS_BUSY",
+    "BriefDescription": "At least one of the 4 FXU units is busy"
+  },
+  {,
+    "EventCode": "0x2C048",
+    "EventName": "PM_DATA_FROM_LMEM",
+    "BriefDescription": "The processor's data cache was reloaded from the local chip's Memory due to a demand load"
+  },
+  {,
+    "EventCode": "0x3000A",
+    "EventName": "PM_CMPLU_STALL_PM",
+    "BriefDescription": "Finish stall because the NTF instruction was issued to the Permute execution pipe and waiting to finish. Includes permute and decimal fixed point instructions (128 bit BCD arithmetic) + a few 128 bit fixpoint add/subtract instructions with carry. Not qualified by vector or multicycle"
+  },
+  {,
+    "EventCode": "0x1504E",
+    "EventName": "PM_IPTEG_FROM_L2MISS",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from a location other than the local core's L2 due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x1C052",
+    "EventName": "PM_DATA_GRP_PUMP_MPRED_RTY",
+    "BriefDescription": "Final Pump Scope (Group) ended up larger than Initial Pump Scope (Chip) for a demand load"
+  },
+  {,
+    "EventCode": "0x30008",
+    "EventName": "PM_DISP_STARVED",
+    "BriefDescription": "Dispatched Starved"
+  },
+  {,
+    "EventCode": "0x14042",
+    "EventName": "PM_INST_FROM_L2",
+    "BriefDescription": "The processor's Instruction cache was reloaded from local core's L2 due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x4000C",
+    "EventName": "PM_FREQ_UP",
+    "BriefDescription": "Power Management: Above Threshold A"
+  },
+  {,
+    "EventCode": "0x3C050",
+    "EventName": "PM_DATA_SYS_PUMP_CPRED",
+    "BriefDescription": "Initial and Final Pump Scope was system pump (prediction=correct) for a demand load"
+  },
+  {,
+    "EventCode": "0x25040",
+    "EventName": "PM_IPTEG_FROM_L2_MEPF",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x10132",
+    "EventName": "PM_MRK_INST_ISSUED",
+    "BriefDescription": "Marked instruction issued"
+  },
+  {,
+    "EventCode": "0x1C046",
+    "EventName": "PM_DATA_FROM_L31_SHR",
+    "BriefDescription": "The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to a demand load"
+  },
+  {,
+    "EventCode": "0x2C044",
+    "EventName": "PM_DATA_FROM_L31_MOD",
+    "BriefDescription": "The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to a demand load"
+  },
+  {,
+    "EventCode": "0x2C04A",
+    "EventName": "PM_DATA_FROM_RL4",
+    "BriefDescription": "The processor's data cache was reloaded from another chip's L4 on the same Node or Group ( Remote) due to a demand load"
+  },
+  {,
+    "EventCode": "0x24044",
+    "EventName": "PM_INST_FROM_L31_MOD",
+    "BriefDescription": "The processor's Instruction cache was reloaded with Modified (M) data from another core's L3 on the same chip due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x4C050",
+    "EventName": "PM_DATA_SYS_PUMP_MPRED_RTY",
+    "BriefDescription": "Final Pump Scope (system) ended up larger than Initial Pump Scope (Chip/Group) for a demand load"
+  },
+  {,
+    "EventCode": "0x2C052",
+    "EventName": "PM_DATA_GRP_PUMP_MPRED",
+    "BriefDescription": "Final Pump Scope (Group) ended up either larger or smaller than Initial Pump Scope for a demand load"
+  },
+  {,
+    "EventCode": "0x2F148",
+    "EventName": "PM_MRK_DPTEG_FROM_LMEM",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from the local chip's Memory due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4D01A",
+    "EventName": "PM_CMPLU_STALL_EIEIO",
+    "BriefDescription": "Finish stall because the NTF instruction is an EIEIO waiting for response from L2"
+  },
+  {,
+    "EventCode": "0x4F14E",
+    "EventName": "PM_MRK_DPTEG_FROM_L3MISS",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from a location other than the local core's L3 due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4F05A",
+    "EventName": "PM_RADIX_PWC_L4_PTE_FROM_L3",
+    "BriefDescription": "A Page Table Entry was reloaded to a level 4 page walk cache from the core's L3 data cache. This is the deepest level of PWC possible for a translation"
+  },
+  {,
+    "EventCode": "0x1F05A",
+    "EventName": "PM_RADIX_PWC_L4_PTE_FROM_L2",
+    "BriefDescription": "A Page Table Entry was reloaded to a level 4 page walk cache from the core's L2 data cache. This is the deepest level of PWC possible for a translation"
+  },
+  {,
+    "EventCode": "0x30068",
+    "EventName": "PM_L1_ICACHE_RELOADED_PREF",
+    "BriefDescription": "Counts all Icache prefetch reloads ( includes demand turned into prefetch)"
+  },
+  {,
+    "EventCode": "0x4C04A",
+    "EventName": "PM_DATA_FROM_OFF_CHIP_CACHE",
+    "BriefDescription": "The processor's data cache was reloaded with either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a demand load"
+  },
+  {,
+    "EventCode": "0x400FE",
+    "EventName": "PM_DATA_FROM_MEMORY",
+    "BriefDescription": "The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a demand load"
+  },
+  {,
+    "EventCode": "0x3F058",
+    "EventName": "PM_RADIX_PWC_L1_PDE_FROM_L3",
+    "BriefDescription": "A Page Directory Entry was reloaded to a level 1 page walk cache from the core's L3 data cache"
+  },
+  {,
+    "EventCode": "0x4D142",
+    "EventName": "PM_MRK_DATA_FROM_L3",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L3 due to a marked load"
+  },
+  {,
+    "EventCode": "0x30050",
+    "EventName": "PM_SYS_PUMP_CPRED",
+    "BriefDescription": "Initial and Final Pump Scope was system pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate)"
+  },
+  {,
+    "EventCode": "0x30028",
+    "EventName": "PM_CMPLU_STALL_SPEC_FINISH",
+    "BriefDescription": "Finish stall while waiting for the non-speculative finish of either a stcx waiting for its result or a load waiting for non-critical sectors of data and ECC"
+  },
+  {,
+    "EventCode": "0x400F4",
+    "EventName": "PM_RUN_PURR",
+    "BriefDescription": "Run_PURR"
+  },
+  {,
+    "EventCode": "0x3404C",
+    "EventName": "PM_INST_FROM_DL4",
+    "BriefDescription": "The processor's Instruction cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x3D05A",
+    "EventName": "PM_NTC_ISSUE_HELD_OTHER",
+    "BriefDescription": "The NTC instruction is being held at dispatch during regular pipeline cycles, or because the VSU is busy with multi-cycle instructions, or because of a write-back collision with VSU"
+  },
+  {,
+    "EventCode": "0x2E048",
+    "EventName": "PM_DPTEG_FROM_LMEM",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from the local chip's Memory due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x2D02A",
+    "EventName": "PM_RADIX_PWC_L3_PDE_FROM_L2",
+    "BriefDescription": "A Page Directory Entry was reloaded to a level 3 page walk cache from the core's L2 data cache"
+  },
+  {,
+    "EventCode": "0x1F05C",
+    "EventName": "PM_RADIX_PWC_L3_PDE_FROM_L3",
+    "BriefDescription": "A Page Directory Entry was reloaded to a level 3 page walk cache from the core's L3 data cache"
+  },
+  {,
+    "EventCode": "0x4D04A",
+    "EventName": "PM_DARQ0_0_3_ENTRIES",
+    "BriefDescription": "Cycles in which 3 or fewer DARQ entries (out of 12) are in use"
+  },
+  {,
+    "EventCode": "0x1404C",
+    "EventName": "PM_INST_FROM_LL4",
+    "BriefDescription": "The processor's Instruction cache was reloaded from the local chip's L4 cache due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x200FD",
+    "EventName": "PM_L1_ICACHE_MISS",
+    "BriefDescription": "Demand iCache Miss"
+  },
+  {,
+    "EventCode": "0x34040",
+    "EventName": "PM_INST_FROM_L2_DISP_CONFLICT_LDHITST",
+    "BriefDescription": "The processor's Instruction cache was reloaded from local core's L2 with load hit store conflict due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x20138",
+    "EventName": "PM_MRK_ST_NEST",
+    "BriefDescription": "Marked store sent to nest"
+  },
+  {,
+    "EventCode": "0x44048",
+    "EventName": "PM_INST_FROM_DL2L3_MOD",
+    "BriefDescription": "The processor's Instruction cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x35046",
+    "EventName": "PM_IPTEG_FROM_L21_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x4C04E",
+    "EventName": "PM_DATA_FROM_L3MISS_MOD",
+    "BriefDescription": "The processor's data cache was reloaded from a location other than the local core's L3 due to a demand load"
+  },
+  {,
+    "EventCode": "0x401E0",
+    "EventName": "PM_MRK_INST_CMPL",
+    "BriefDescription": "marked instruction completed"
+  },
+  {,
+    "EventCode": "0x2C128",
+    "EventName": "PM_MRK_DATA_FROM_DL2L3_SHR_CYC",
+    "BriefDescription": "Duration in cycles to reload with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x34044",
+    "EventName": "PM_INST_FROM_L31_ECO_SHR",
+    "BriefDescription": "The processor's Instruction cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x4E018",
+    "EventName": "PM_CMPLU_STALL_NTC_DISP_FIN",
+    "BriefDescription": "Finish stall because the NTF instruction was one that must finish at dispatch."
+  },
+  {,
+    "EventCode": "0x2E05E",
+    "EventName": "PM_LMQ_EMPTY_CYC",
+    "BriefDescription": "Cycles in which the LMQ has no pending load misses for this thread"
+  },
+  {,
+    "EventCode": "0x4C122",
+    "EventName": "PM_DARQ1_0_3_ENTRIES",
+    "BriefDescription": "Cycles in which 3 or fewer DARQ1 entries (out of 12) are in use"
+  },
+  {,
+    "EventCode": "0x4F058",
+    "EventName": "PM_RADIX_PWC_L2_PTE_FROM_L3",
+    "BriefDescription": "A Page Table Entry was reloaded to a level 2 page walk cache from the core's L3 data cache. This implies that level 3 and level 4 PWC accesses were not necessary for this translation"
+  },
+  {,
+    "EventCode": "0x14046",
+    "EventName": "PM_INST_FROM_L31_SHR",
+    "BriefDescription": "The processor's Instruction cache was reloaded with Shared (S) data from another core's L3 on the same chip due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x3012C",
+    "EventName": "PM_MRK_ST_FWD",
+    "BriefDescription": "Marked st forwards"
+  },
+  {,
+    "EventCode": "0x101E0",
+    "EventName": "PM_MRK_INST_DISP",
+    "BriefDescription": "The thread has dispatched a randomly sampled marked instruction"
+  },
+  {,
+    "EventCode": "0x1D058",
+    "EventName": "PM_DARQ0_10_12_ENTRIES",
+    "BriefDescription": "Cycles in which 10 or more DARQ entries (out of 12) are in use"
+  },
+  {,
+    "EventCode": "0x300FE",
+    "EventName": "PM_DATA_FROM_L3MISS",
+    "BriefDescription": "Demand LD - L3 Miss (not L2 hit and not L3 hit)"
+  },
+  {,
+    "EventCode": "0x30006",
+    "EventName": "PM_CMPLU_STALL_OTHER_CMPL",
+    "BriefDescription": "Instructions the core completed while this thread was stalled"
+  },
+  {,
+    "EventCode": "0x1005C",
+    "EventName": "PM_CMPLU_STALL_DP",
+    "BriefDescription": "Finish stall because the NTF instruction was a scalar instruction issued to the Double Precision execution pipe and waiting to finish. Includes binary floating point instructions in 32 and 64 bit binary floating point format. Not qualified multicycle. Qualified by NOT vector"
+  },
+  {,
+    "EventCode": "0x1E042",
+    "EventName": "PM_DPTEG_FROM_L2",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x1016E",
+    "EventName": "PM_MRK_BR_CMPL",
+    "BriefDescription": "Branch Instruction completed"
+  },
+  {,
+    "EventCode": "0x2013A",
+    "EventName": "PM_MRK_BRU_FIN",
+    "BriefDescription": "bru marked instr finish"
+  },
+  {,
+    "EventCode": "0x4F05E",
+    "EventName": "PM_RADIX_PWC_L3_PTE_FROM_L3MISS",
+    "BriefDescription": "A Page Table Entry was reloaded to a level 3 page walk cache from beyond the core's L3 data cache. This implies that a level 4 PWC access was not necessary for this translation. The source could be local/remote/distant memory or another core's cache"
+  },
+  {,
+    "EventCode": "0x400FC",
+    "EventName": "PM_ITLB_MISS",
+    "BriefDescription": "ITLB Reloaded. Counts 1 per ITLB miss for HPT but multiple for radix depending on number of levels traversed"
+  },
+  {,
+    "EventCode": "0x2D024",
+    "EventName": "PM_RADIX_PWC_L2_HIT",
+    "BriefDescription": "A radix translation attempt missed in the TLB but hit on both the first and second levels of page walk cache."
+  },
+  {,
+    "EventCode": "0x3F056",
+    "EventName": "PM_RADIX_PWC_L3_HIT",
+    "BriefDescription": "A radix translation attempt missed in the TLB but hit on the first, second, and third levels of page walk cache."
+  },
+  {,
+    "EventCode": "0x4E014",
+    "EventName": "PM_TM_TX_PASS_RUN_INST",
+    "BriefDescription": "Run instructions spent in successful transactions"
+  },
+  {,
+    "EventCode": "0x1E044",
+    "EventName": "PM_DPTEG_FROM_L3_NO_CONFLICT",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4D05A",
+    "EventName": "PM_NON_MATH_FLOP_CMPL",
+    "BriefDescription": "Non FLOP operation completed"
+  },
+  {,
+    "EventCode": "0x101E2",
+    "EventName": "PM_MRK_BR_TAKEN_CMPL",
+    "BriefDescription": "Marked Branch Taken completed"
+  },
+  {,
+    "EventCode": "0x3E158",
+    "EventName": "PM_MRK_STCX_FAIL",
+    "BriefDescription": "marked stcx failed"
+  },
+  {,
+    "EventCode": "0x1C048",
+    "EventName": "PM_DATA_FROM_ON_CHIP_CACHE",
+    "BriefDescription": "The processor's data cache was reloaded either shared or modified data from another core's L2/L3 on the same chip due to a demand load"
+  },
+  {,
+    "EventCode": "0x1C054",
+    "EventName": "PM_DATA_PUMP_CPRED",
+    "BriefDescription": "Pump prediction correct. Counts across all types of pumps for a demand load"
+  },
+  {,
+    "EventCode": "0x4405E",
+    "EventName": "PM_DARQ_STORE_REJECT",
+    "BriefDescription": "The DARQ attempted to transmit a store into an LSAQ or SRQ entry but it was rejected. Divide by PM_DARQ_STORE_XMIT to get reject ratio"
+  },
+  {,
+    "EventCode": "0x1C042",
+    "EventName": "PM_DATA_FROM_L2",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L2 due to a demand load"
+  },
+  {,
+    "EventCode": "0x1D14C",
+    "EventName": "PM_MRK_DATA_FROM_LL4",
+    "BriefDescription": "The processor's data cache was reloaded from the local chip's L4 cache due to a marked load"
+  },
+  {,
+    "EventCode": "0x1006C",
+    "EventName": "PM_RUN_CYC_ST_MODE",
+    "BriefDescription": "Cycles run latch is set and core is in ST mode"
+  },
+  {,
+    "EventCode": "0x3C044",
+    "EventName": "PM_DATA_FROM_L31_ECO_SHR",
+    "BriefDescription": "The processor's data cache was reloaded with Shared (S) data from another core's ECO L3 on the same chip due to a demand load"
+  },
+  {,
+    "EventCode": "0x4C052",
+    "EventName": "PM_DATA_PUMP_MPRED",
+    "BriefDescription": "Pump misprediction. Counts across all types of pumps for a demand load"
+  },
+  {,
+    "EventCode": "0x20050",
+    "EventName": "PM_GRP_PUMP_CPRED",
+    "BriefDescription": "Initial and Final Pump Scope and data sourced across this scope was group pump for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate)"
+  },
+  {,
+    "EventCode": "0x1F150",
+    "EventName": "PM_MRK_ST_L2DISP_TO_CMPL_CYC",
+    "BriefDescription": "cycles from L2 rc disp to l2 rc completion"
+  },
+  {,
+    "EventCode": "0x4505A",
+    "EventName": "PM_SP_FLOP_CMPL",
+    "BriefDescription": "SP instruction completed"
+  },
+  {,
+    "EventCode": "0x4000A",
+    "EventName": "PM_ISQ_36_44_ENTRIES",
+    "BriefDescription": "Cycles in which 36 or more Issue Queue entries are in use. This is a shared event, not per thread. There are 44 issue queue entries across 4 slices in the whole core"
+  },
+  {,
+    "EventCode": "0x2C12E",
+    "EventName": "PM_MRK_DATA_FROM_LL4_CYC",
+    "BriefDescription": "Duration in cycles to reload from the local chip's L4 cache due to a marked load"
+  },
+  {,
+    "EventCode": "0x2C058",
+    "EventName": "PM_MEM_PREF",
+    "BriefDescription": "Memory prefetch for this thread. Includes L4"
+  },
+  {,
+    "EventCode": "0x40012",
+    "EventName": "PM_L1_ICACHE_RELOADED_ALL",
+    "BriefDescription": "Counts all Icache reloads includes demand, prefetch, prefetch turned into demand and demand turned into prefetch"
+  },
+  {,
+    "EventCode": "0x4003C",
+    "EventName": "PM_DISP_HELD_SYNC_HOLD",
+    "BriefDescription": "Cycles in which dispatch is held because of a synchronizing instruction in the pipeline"
+  },
+  {,
+    "EventCode": "0x3003C",
+    "EventName": "PM_CMPLU_STALL_NESTED_TEND",
+    "BriefDescription": "Completion stall because the ISU is updating the TEXASR to keep track of the nested tend and decrement the TEXASR nested level. This is a short delay"
+  },
+  {,
+    "EventCode": "0x3D05C",
+    "EventName": "PM_DISP_HELD_HB_FULL",
+    "BriefDescription": "Dispatch held due to History Buffer full. Could be GPR/VSR/VMR/FPR/CR/XVF; CR; XVF (XER/VSCR/FPSCR)"
+  },
+  {,
+    "EventCode": "0x30052",
+    "EventName": "PM_SYS_PUMP_MPRED",
+    "BriefDescription": "Final Pump Scope (system) mispredicted. Either the original scope was too small (Chip/Group) or the original scope was System and it should have been smaller. Counts for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate)"
+  },
+  {,
+    "EventCode": "0x2E044",
+    "EventName": "PM_DPTEG_FROM_L31_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x34048",
+    "EventName": "PM_INST_FROM_DL2L3_SHR",
+    "BriefDescription": "The processor's Instruction cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x45042",
+    "EventName": "PM_IPTEG_FROM_L3",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x15042",
+    "EventName": "PM_IPTEG_FROM_L2",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x1C05E",
+    "EventName": "PM_MEM_LOC_THRESH_LSU_MED",
+    "BriefDescription": "Local memory above threshold for data prefetch"
+  },
+  {,
+    "EventCode": "0x40134",
+    "EventName": "PM_MRK_INST_TIMEO",
+    "BriefDescription": "marked Instruction finish timeout (instruction lost)"
+  },
+  {,
+    "EventCode": "0x1002C",
+    "EventName": "PM_L1_DCACHE_RELOADED_ALL",
+    "BriefDescription": "L1 data cache reloaded for demand. If MMCR1[16] is 1, prefetches will be included as well"
+  },
+  {,
+    "EventCode": "0x30130",
+    "EventName": "PM_MRK_INST_FIN",
+    "BriefDescription": "marked instruction finished"
+  },
+  {,
+    "EventCode": "0x1F14A",
+    "EventName": "PM_MRK_DPTEG_FROM_RL2L3_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x3504E",
+    "EventName": "PM_DARQ0_4_6_ENTRIES",
+    "BriefDescription": "Cycles in which 4, 5, or 6 DARQ entries (out of 12) are in use"
+  },
+  {,
+    "EventCode": "0x30064",
+    "EventName": "PM_DARQ_STORE_XMIT",
+    "BriefDescription": "The DARQ attempted to transmit a store into an LSAQ or SRQ entry. Includes rejects. Not qualified by thread, so it includes counts for the whole core"
+  },
+  {,
+    "EventCode": "0x45046",
+    "EventName": "PM_IPTEG_FROM_L21_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x2C016",
+    "EventName": "PM_CMPLU_STALL_PASTE",
+    "BriefDescription": "Finish stall because the NTF instruction was a paste waiting for response from L2"
+  },
+  {,
+    "EventCode": "0x24156",
+    "EventName": "PM_MRK_STCX_FIN",
+    "BriefDescription": "Number of marked stcx instructions finished. This includes instructions in the speculative path of a branch that may be flushed"
+  },
+  {,
+    "EventCode": "0x15150",
+    "EventName": "PM_SYNC_MRK_PROBE_NOP",
+    "BriefDescription": "Marked probeNops which can cause synchronous interrupts"
+  },
+  {,
+    "EventCode": "0x301E4",
+    "EventName": "PM_MRK_BR_MPRED_CMPL",
+    "BriefDescription": "Marked Branch Mispredicted"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/memory.json b/tools/perf/pmu-events/arch/powerpc/power9/memory.json
new file mode 100644
index 000000000000..9960d1c0dd44
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power9/memory.json
@@ -0,0 +1,132 @@
+[
+  {,
+    "EventCode": "0x3006E",
+    "EventName": "PM_NEST_REF_CLK",
+    "BriefDescription": "Multiply by 4 to obtain the number of PB cycles"
+  },
+  {,
+    "EventCode": "0x20010",
+    "EventName": "PM_PMC1_OVERFLOW",
+    "BriefDescription": "Overflow from counter 1"
+  },
+  {,
+    "EventCode": "0x2005A",
+    "EventName": "PM_DARQ1_7_9_ENTRIES",
+    "BriefDescription": "Cycles in which 7 to 9 DARQ1 entries (out of 12) are in use"
+  },
+  {,
+    "EventCode": "0x3C048",
+    "EventName": "PM_DATA_FROM_DL2L3_SHR",
+    "BriefDescription": "The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a demand load"
+  },
+  {,
+    "EventCode": "0x10008",
+    "EventName": "PM_RUN_SPURR",
+    "BriefDescription": "Run SPURR"
+  },
+  {,
+    "EventCode": "0x200F6",
+    "EventName": "PM_LSU_DERAT_MISS",
+    "BriefDescription": "DERAT Reloaded due to a DERAT miss"
+  },
+  {,
+    "EventCode": "0x4C048",
+    "EventName": "PM_DATA_FROM_DL2L3_MOD",
+    "BriefDescription": "The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a demand load"
+  },
+  {,
+    "EventCode": "0x1D15E",
+    "EventName": "PM_MRK_RUN_CYC",
+    "BriefDescription": "Run cycles in which a marked instruction is in the pipeline"
+  },
+  {,
+    "EventCode": "0x4003E",
+    "EventName": "PM_LD_CMPL",
+    "BriefDescription": "count of Loads completed"
+  },
+  {,
+    "EventCode": "0x2D156",
+    "EventName": "PM_MRK_DTLB_MISS_4K",
+    "BriefDescription": "Marked Data TLB Miss page size 4k"
+  },
+  {,
+    "EventCode": "0x4C042",
+    "EventName": "PM_DATA_FROM_L3",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L3 due to a demand load"
+  },
+  {,
+    "EventCode": "0x4D02C",
+    "EventName": "PM_PMC1_REWIND",
+    "BriefDescription": ""
+  },
+  {,
+    "EventCode": "0x15158",
+    "EventName": "PM_SYNC_MRK_L2HIT",
+    "BriefDescription": "Marked L2 Hits that can throw a synchronous interrupt"
+  },
+  {,
+    "EventCode": "0x3404A",
+    "EventName": "PM_INST_FROM_RMEM",
+    "BriefDescription": "The processor's Instruction cache was reloaded from another chip's memory on the same Node or Group (Remote) due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x301E2",
+    "EventName": "PM_MRK_ST_CMPL",
+    "BriefDescription": "Marked store completed and sent to nest"
+  },
+  {,
+    "EventCode": "0x1C050",
+    "EventName": "PM_DATA_CHIP_PUMP_CPRED",
+    "BriefDescription": "Initial and Final Pump Scope was chip pump (prediction=correct) for a demand load"
+  },
+  {,
+    "EventCode": "0x4C040",
+    "EventName": "PM_DATA_FROM_L2_DISP_CONFLICT_OTHER",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L2 with dispatch conflict due to a demand load"
+  },
+  {,
+    "EventCode": "0x2E05C",
+    "EventName": "PM_LSU_REJECT_ERAT_MISS",
+    "BriefDescription": "LSU Reject due to ERAT (up to 4 per cycle)"
+  },
+  {,
+    "EventCode": "0x1000A",
+    "EventName": "PM_PMC3_REWIND",
+    "BriefDescription": "PMC3 rewind event. A rewind happens when a speculative event (such as latency or CPI stack) is selected on PMC3 and the stall reason or reload source did not match the one programmed in PMC3. When this occurs, the count in PMC3 will not change."
+  },
+  {,
+    "EventCode": "0x3C058",
+    "EventName": "PM_LARX_FIN",
+    "BriefDescription": "Larx finished"
+  },
+  {,
+    "EventCode": "0x1C040",
+    "EventName": "PM_DATA_FROM_L2_NO_CONFLICT",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L2 without conflict due to a demand load"
+  },
+  {,
+    "EventCode": "0x2C040",
+    "EventName": "PM_DATA_FROM_L2_MEPF",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to a demand load"
+  },
+  {,
+    "EventCode": "0x2E05A",
+    "EventName": "PM_LRQ_REJECT",
+    "BriefDescription": "Internal LSU reject from LRQ. Rejects cause the load to go back to LRQ, but it stays contained within the LSU once it gets issued. This event counts the number of times the LRQ attempts to relaunch an instruction after a reject. Any load can suffer multiple rejects"
+  },
+  {,
+    "EventCode": "0x2C05C",
+    "EventName": "PM_INST_GRP_PUMP_CPRED",
+    "BriefDescription": "Initial and Final Pump Scope was group pump (prediction=correct) for an instruction fetch (demand only)"
+  },
+  {,
+    "EventCode": "0x4D056",
+    "EventName": "PM_NON_FMA_FLOP_CMPL",
+    "BriefDescription": "Non FMA instruction completed"
+  },
+  {,
+    "EventCode": "0x3E050",
+    "EventName": "PM_DARQ1_4_6_ENTRIES",
+    "BriefDescription": "Cycles in which 4, 5, or 6 DARQ1 entries (out of 12) are in use"
+  }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/other.json b/tools/perf/pmu-events/arch/powerpc/power9/other.json
new file mode 100644
index 000000000000..00f3d2a21f31
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power9/other.json
@@ -0,0 +1,2512 @@
+[
+  {,
+    "EventCode": "0x3084",
+    "EventName": "PM_ISU1_ISS_HOLD_ALL",
+    "BriefDescription": "All ISU rejects"
+  },
+  {,
+    "EventCode": "0xF880",
+    "EventName": "PM_SNOOP_TLBIE",
+    "BriefDescription": "TLBIE snoop"
+  },
+  {,
+    "EventCode": "0x4088",
+    "EventName": "PM_IC_DEMAND_REQ",
+    "BriefDescription": "Demand Instruction fetch request"
+  },
+  {,
+    "EventCode": "0x20A4",
+    "EventName": "PM_TM_TRESUME",
+    "BriefDescription": "TM resume instruction completed"
+  },
+  {,
+    "EventCode": "0x40008",
+    "EventName": "PM_SRQ_EMPTY_CYC",
+    "BriefDescription": "Cycles in which the SRQ has at least one (out of four) empty slice"
+  },
+  {,
+    "EventCode": "0x20064",
+    "EventName": "PM_IERAT_RELOAD_4K",
+    "BriefDescription": "IERAT reloaded (after a miss) for 4K pages"
+  },
+  {,
+    "EventCode": "0x260B4",
+    "EventName": "PM_L3_P2_LCO_RTY",
+    "BriefDescription": "L3 initiated LCO received retry on port 2 (can try 4 times)"
+  },
+  {,
+    "EventCode": "0x20006",
+    "EventName": "PM_DISP_HELD_ISSQ_FULL",
+    "BriefDescription": "Dispatch held due to Issue q full. Includes issue queue and branch queue"
+  },
+  {,
+    "EventCode": "0x201E4",
+    "EventName": "PM_MRK_DATA_FROM_L3MISS",
+    "BriefDescription": "The processor's data cache was reloaded from a location other than the local core's L3 due to a marked load"
+  },
+  {,
+    "EventCode": "0x4E044",
+    "EventName": "PM_DPTEG_FROM_L31_ECO_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x40B8",
+    "EventName": "PM_BR_MPRED_TAKEN_CR",
+    "BriefDescription": "A Conditional Branch that resolved to taken was mispredicted as not taken (due to the BHT Direction Prediction)."
+  },
+  {,
+    "EventCode": "0xF8AC",
+    "EventName": "PM_DC_DEALLOC_NO_CONF",
+    "BriefDescription": "A demand load referenced a line in an active fuzzy prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software. Fuzzy stream confirm (out of order effects, or pf can't keep up)"
+  },
+  {,
+    "EventCode": "0xD090",
+    "EventName": "PM_LS0_DC_COLLISIONS",
+    "BriefDescription": "Read-write data cache collisions"
+  },
+  {,
+    "EventCode": "0x40BC",
+    "EventName": "PM_THRD_PRIO_0_1_CYC",
+    "BriefDescription": "Cycles thread running at priority level 0 or 1"
+  },
+  {,
+    "EventCode": "0x2084",
+    "EventName": "PM_FLUSH_HB_RESTORE_CYC",
+    "BriefDescription": "Cycles in which no new instructions can be dispatched to the ICT after a flush.  History buffer recovery"
+  },
+  {,
+    "EventCode": "0x4F054",
+    "EventName": "PM_RADIX_PWC_MISS",
+    "BriefDescription": "A radix translation attempt missed in the TLB and all levels of page walk cache."
+  },
+  {,
+    "EventCode": "0x24048",
+    "EventName": "PM_INST_FROM_LMEM",
+    "BriefDescription": "The processor's Instruction cache was reloaded from the local chip's Memory due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0xD8B4",
+    "EventName": "PM_LSU0_LRQ_S0_VALID_CYC",
+    "BriefDescription": "Slot 0 of LRQ valid"
+  },
+  {,
+    "EventCode": "0x2E052",
+    "EventName": "PM_TM_PASSED",
+    "BriefDescription": "Number of TM transactions that passed"
+  },
+  {,
+    "EventCode": "0xD1A0",
+    "EventName": "PM_MRK_LSU_FLUSH_LHS",
+    "BriefDescription": "Effective Address alias flush : no EA match but Real Address match.  If the data has not yet been returned for this load, the instruction will just be rejected, but if it has returned data, it will be flushed"
+  },
+  {,
+    "EventCode": "0xF088",
+    "EventName": "PM_LSU0_STORE_REJECT",
+    "BriefDescription": "All internal store rejects cause the instruction to go back to the SRQ and go to sleep until woken up to try again after the condition has been met"
+  },
+  {,
+    "EventCode": "0x360B2",
+    "EventName": "PM_L3_GRP_GUESS_WRONG_LOW",
+    "BriefDescription": "Initial scope=group (GS or NNS) but data from outside group (far or rem). Prediction too Low"
+  },
+  {,
+    "EventCode": "0x168A6",
+    "EventName": "PM_TM_CAM_OVERFLOW",
+    "BriefDescription": "L3 TM cam overflow during L2 co of SC"
+  },
+  {,
+    "EventCode": "0xE8B0",
+    "EventName": "PM_TEND_PEND_CYC",
+    "BriefDescription": "TEND latency per thread"
+  },
+  {,
+    "EventCode": "0x4884",
+    "EventName": "PM_IBUF_FULL_CYC",
+    "BriefDescription": "Cycles No room in ibuff"
+  },
+  {,
+    "EventCode": "0xD08C",
+    "EventName": "PM_LSU2_LDMX_FIN",
+    "BriefDescription": "New P9 instruction LDMX. The definition of this new PMU event is (from the ldmx RFC02491):  The thread has executed an ldmx instruction that accessed a doubleword that contains an effective address within an enabled section of the Load Monitored region. This event, therefore, should not occur if the FSCR has disabled the load monitored facility (FSCR[52]) or disabled the EBB facility (FSCR[56])"
+  },
+  {,
+    "EventCode": "0x300F8",
+    "EventName": "PM_TB_BIT_TRANS",
+    "BriefDescription": "timebase event"
+  },
+  {,
+    "EventCode": "0x3C040",
+    "EventName": "PM_DATA_FROM_L2_DISP_CONFLICT_LDHITST",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L2 with load hit store conflict due to a demand load"
+  },
+  {,
+    "EventCode": "0xE0BC",
+    "EventName": "PM_LS0_PTE_TABLEWALK_CYC",
+    "BriefDescription": "Cycles when a tablewalk is pending on this thread on table 0"
+  },
+  {,
+    "EventCode": "0x3884",
+    "EventName": "PM_ISU3_ISS_HOLD_ALL",
+    "BriefDescription": "All ISU rejects"
+  },
+  {,
+    "EventCode": "0x460A6",
+    "EventName": "PM_RD_FORMING_SC",
+    "BriefDescription": "Read forming SC"
+  },
+  {,
+    "EventCode": "0x468A0",
+    "EventName": "PM_L3_PF_OFF_CHIP_MEM",
+    "BriefDescription": "L3 PF from Off chip memory"
+  },
+  {,
+    "EventCode": "0x268AA",
+    "EventName": "PM_L3_P1_LCO_DATA",
+    "BriefDescription": "LCO sent with data port 1"
+  },
+  {,
+    "EventCode": "0xE894",
+    "EventName": "PM_LSU1_TM_L1_HIT",
+    "BriefDescription": "Load tm hit in L1"
+  },
+  {,
+    "EventCode": "0x5888",
+    "EventName": "PM_IC_INVALIDATE",
+    "BriefDescription": "Ic line invalidated"
+  },
+  {,
+    "EventCode": "0x2890",
+    "EventName": "PM_DISP_CLB_HELD_TLBIE",
+    "BriefDescription": "Dispatch Hold: Due to TLBIE"
+  },
+  {,
+    "EventCode": "0x1001C",
+    "EventName": "PM_CMPLU_STALL_THRD",
+    "BriefDescription": "Completion Stalled because the thread was blocked"
+  },
+  {,
+    "EventCode": "0x368A6",
+    "EventName": "PM_SNP_TM_HIT_T",
+    "BriefDescription": "Snp TM sthit T/Tn/Te"
+  },
+  {,
+    "EventCode": "0x3001A",
+    "EventName": "PM_DATA_TABLEWALK_CYC",
+    "BriefDescription": "Data Tablewalk Cycles.  Could be 1 or 2 active tablewalks. Includes data prefetches."
+  },
+  {,
+    "EventCode": "0xD894",
+    "EventName": "PM_LS3_DC_COLLISIONS",
+    "BriefDescription": "Read-write data cache collisions"
+  },
+  {,
+    "EventCode": "0x35158",
+    "EventName": "PM_MRK_DATA_FROM_L31_ECO_MOD_CYC",
+    "BriefDescription": "Duration in cycles to reload with Modified (M) data from another core's ECO L3 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0xF0B4",
+    "EventName": "PM_DC_PREF_CONS_ALLOC",
+    "BriefDescription": "Prefetch stream allocated in the conservative phase by either the hardware prefetch mechanism or software prefetch"
+  },
+  {,
+    "EventCode": "0xF894",
+    "EventName": "PM_LSU3_L1_CAM_CANCEL",
+    "BriefDescription": "ls3 l1 tm cam cancel"
+  },
+  {,
+    "EventCode": "0x2888",
+    "EventName": "PM_FLUSH_DISP_TLBIE",
+    "BriefDescription": "Dispatch Flush: TLBIE"
+  },
+  {,
+    "EventCode": "0xD1A4",
+    "EventName": "PM_MRK_LSU_FLUSH_SAO",
+    "BriefDescription": "A load-hit-load condition with Strong Address Ordering will have address compare disabled and flush"
+  },
+  {,
+    "EventCode": "0x4E11E",
+    "EventName": "PM_MRK_DATA_FROM_DMEM_CYC",
+    "BriefDescription": "Duration in cycles to reload from another chip's memory on the same Node or Group (Distant) due to a marked load"
+  },
+  {,
+    "EventCode": "0x5894",
+    "EventName": "PM_LWSYNC",
+    "BriefDescription": "Lwsync instruction decoded and transferred"
+  },
+  {,
+    "EventCode": "0x14156",
+    "EventName": "PM_MRK_DATA_FROM_L2_CYC",
+    "BriefDescription": "Duration in cycles to reload from local core's L2 due to a marked load"
+  },
+  {,
+    "EventCode": "0x468A6",
+    "EventName": "PM_RD_CLEARING_SC",
+    "BriefDescription": "Read clearing SC"
+  },
+  {,
+    "EventCode": "0x50A0",
+    "EventName": "PM_HWSYNC",
+    "BriefDescription": "Hwsync instruction decoded and transferred"
+  },
+  {,
+    "EventCode": "0x168B0",
+    "EventName": "PM_L3_P1_NODE_PUMP",
+    "BriefDescription": "L3 PF sent with nodal scope port 1, counts even retried requests"
+  },
+  {,
+    "EventCode": "0xD0BC",
+    "EventName": "PM_LSU0_1_LRQF_FULL_CYC",
+    "BriefDescription": "Counts the number of cycles the LRQF is full.  LRQF is the queue that holds loads between finish and completion.  If it fills up, instructions stay in LRQ until completion, potentially backing up the LRQ"
+  },
+  {,
+    "EventCode": "0x2D148",
+    "EventName": "PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L2 with load hit store conflict due to a marked load"
+  },
+  {,
+    "EventCode": "0x460A8",
+    "EventName": "PM_SN_HIT",
+    "BriefDescription": "Any port snooper hit L3.  Up to 4 can happen in a cycle but we only count 1"
+  },
+  {,
+    "EventCode": "0x360AA",
+    "EventName": "PM_L3_P0_CO_MEM",
+    "BriefDescription": "L3 CO to memory port 0 with or without data"
+  },
+  {,
+    "EventCode": "0xF0A4",
+    "EventName": "PM_DC_PREF_HW_ALLOC",
+    "BriefDescription": "Prefetch stream allocated by the hardware prefetch mechanism"
+  },
+  {,
+    "EventCode": "0xF0BC",
+    "EventName": "PM_LS2_UNALIGNED_ST",
+    "BriefDescription": "Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the Store of that size.  If the Store wraps from slice 3 to slice 0, there is an additional 3-cycle penalty"
+  },
+  {,
+    "EventCode": "0xD0AC",
+    "EventName": "PM_SRQ_SYNC_CYC",
+    "BriefDescription": "A sync is in the S2Q (edge detect to count)"
+  },
+  {,
+    "EventCode": "0x401E6",
+    "EventName": "PM_MRK_INST_FROM_L3MISS",
+    "BriefDescription": "Marked instruction was reloaded from a location beyond the local chiplet"
+  },
+  {,
+    "EventCode": "0x26082",
+    "EventName": "PM_L2_IC_INV",
+    "BriefDescription": "I-cache Invalidates sent over the reload bus to the core"
+  },
+  {,
+    "EventCode": "0xC8AC",
+    "EventName": "PM_LSU_FLUSH_RELAUNCH_MISS",
+    "BriefDescription": "If a load that has already returned data and has to relaunch for any reason then gets a miss (erat, setp, data cache), it will often be flushed at relaunch time because the data might be inconsistent"
+  },
+  {,
+    "EventCode": "0x260A4",
+    "EventName": "PM_L3_LD_HIT",
+    "BriefDescription": "L3 Hits for demand LDs"
+  },
+  {,
+    "EventCode": "0xF0A0",
+    "EventName": "PM_DATA_STORE",
+    "BriefDescription": "All ops that drain from s2q to L2 containing data"
+  },
+  {,
+    "EventCode": "0x1D148",
+    "EventName": "PM_MRK_DATA_FROM_RMEM",
+    "BriefDescription": "The processor's data cache was reloaded from another chip's memory on the same Node or Group (Remote) due to a marked load"
+  },
+  {,
+    "EventCode": "0x16088",
+    "EventName": "PM_L2_LOC_GUESS_CORRECT",
+    "BriefDescription": "L2 guess local (LNS) and guess was correct (ie data local)"
+  },
+  {,
+    "EventCode": "0x160A4",
+    "EventName": "PM_L3_HIT",
+    "BriefDescription": "L3 Hits (L2 miss hitting L3, including data/instrn/xlate)"
+  },
+  {,
+    "EventCode": "0xE09C",
+    "EventName": "PM_LSU0_TM_L1_MISS",
+    "BriefDescription": "Load tm L1 miss"
+  },
+  {,
+    "EventCode": "0x168B4",
+    "EventName": "PM_L3_P1_LCO_RTY",
+    "BriefDescription": "L3 initiated LCO received retry on port 1 (can try 4 times)"
+  },
+  {,
+    "EventCode": "0x268AC",
+    "EventName": "PM_L3_RD_USAGE",
+    "BriefDescription": "Rotating sample of 16 RD actives"
+  },
+  {,
+    "EventCode": "0x1415C",
+    "EventName": "PM_MRK_DATA_FROM_L3_MEPF_CYC",
+    "BriefDescription": "Duration in cycles to reload from local core's L3 without dispatch conflicts hit on Mepf state due to a marked load"
+  },
+  {,
+    "EventCode": "0xE880",
+    "EventName": "PM_L1_SW_PREF",
+    "BriefDescription": "Software L1 Prefetches, including SW Transient Prefetches"
+  },
+  {,
+    "EventCode": "0x288C",
+    "EventName": "PM_DISP_CLB_HELD_BAL",
+    "BriefDescription": "Dispatch/CLB Hold: Balance Flush"
+  },
+  {,
+    "EventCode": "0x101EA",
+    "EventName": "PM_MRK_L1_RELOAD_VALID",
+    "BriefDescription": "Marked demand reload"
+  },
+  {,
+    "EventCode": "0x1D156",
+    "EventName": "PM_MRK_LD_MISS_L1_CYC",
+    "BriefDescription": "Marked ld latency"
+  },
+  {,
+    "EventCode": "0x4C01A",
+    "EventName": "PM_CMPLU_STALL_DMISS_L3MISS",
+    "BriefDescription": "Completion stall due to cache miss resolving missed the L3"
+  },
+  {,
+    "EventCode": "0x2006C",
+    "EventName": "PM_RUN_CYC_SMT4_MODE",
+    "BriefDescription": "Cycles in which this thread's run latch is set and the core is in SMT4 mode"
+  },
+  {,
+    "EventCode": "0x5088",
+    "EventName": "PM_DECODE_FUSION_OP_PRESERV",
+    "BriefDescription": "Destructive op operand preservation"
+  },
+  {,
+    "EventCode": "0x1D14E",
+    "EventName": "PM_MRK_DATA_FROM_OFF_CHIP_CACHE_CYC",
+    "BriefDescription": "Duration in cycles to reload either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load"
+  },
+  {,
+    "EventCode": "0x509C",
+    "EventName": "PM_FORCED_NOP",
+    "BriefDescription": "Instruction was forced to execute as a nop because it was found to behave like a nop (have no effect) at decode time"
+  },
+  {,
+    "EventCode": "0xC098",
+    "EventName": "PM_LS2_UNALIGNED_LD",
+    "BriefDescription": "Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the load of that size.  If the load wraps from slice 3 to slice 0, there is an additional 3-cycle penalty"
+  },
+  {,
+    "EventCode": "0x20058",
+    "EventName": "PM_DARQ1_10_12_ENTRIES",
+    "BriefDescription": "Cycles in which 10 or  more DARQ1 entries (out of 12) are in use"
+  },
+  {,
+    "EventCode": "0x360A6",
+    "EventName": "PM_SNP_TM_HIT_M",
+    "BriefDescription": "Snp TM st hit M/Mu"
+  },
+  {,
+    "EventCode": "0x5898",
+    "EventName": "PM_LINK_STACK_INVALID_PTR",
+    "BriefDescription": "It is most often caused by certain types of flush where the pointer is not available. Can result in the data in the link stack becoming unusable."
+  },
+  {,
+    "EventCode": "0x46088",
+    "EventName": "PM_L2_CHIP_PUMP",
+    "BriefDescription": "RC requests that were local (aka chip) pump attempts"
+  },
+  {,
+    "EventCode": "0x28A0",
+    "EventName": "PM_TM_TSUSPEND",
+    "BriefDescription": "TM suspend instruction completed"
+  },
+  {,
+    "EventCode": "0x20054",
+    "EventName": "PM_L1_PREF",
+    "BriefDescription": "A data line was written to the L1 due to a hardware or software prefetch"
+  },
+  {,
+    "EventCode": "0xF888",
+    "EventName": "PM_LSU1_STORE_REJECT",
+    "BriefDescription": "All internal store rejects cause the instruction to go back to the SRQ and go to sleep until woken up to try again after the condition has been met"
+  },
+  {,
+    "EventCode": "0x4505E",
+    "EventName": "PM_FLOP_CMPL",
+    "BriefDescription": "Floating Point Operation Finished"
+  },
+  {,
+    "EventCode": "0x1D144",
+    "EventName": "PM_MRK_DATA_FROM_L3_DISP_CONFLICT",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L3 with dispatch conflict due to a marked load"
+  },
+  {,
+    "EventCode": "0x400FA",
+    "EventName": "PM_RUN_INST_CMPL",
+    "BriefDescription": "Run_Instructions"
+  },
+  {,
+    "EventCode": "0x15154",
+    "EventName": "PM_SYNC_MRK_L3MISS",
+    "BriefDescription": "Marked L3 misses that can throw a synchronous interrupt"
+  },
+  {,
+    "EventCode": "0xE0B4",
+    "EventName": "PM_LS0_TM_DISALLOW",
+    "BriefDescription": "A TM-ineligible instruction tries to execute inside a transaction and the LSU disallows it"
+  },
+  {,
+    "EventCode": "0x26884",
+    "EventName": "PM_DSIDE_MRU_TOUCH",
+    "BriefDescription": "D-side L2 MRU touch sent to L2"
+  },
+  {,
+    "EventCode": "0x30134",
+    "EventName": "PM_MRK_ST_CMPL_INT",
+    "BriefDescription": "marked store finished with intervention"
+  },
+  {,
+    "EventCode": "0xC0B8",
+    "EventName": "PM_LSU_FLUSH_SAO",
+    "BriefDescription": "A load-hit-load condition with Strong Address Ordering will have address compare disabled and flush"
+  },
+  {,
+    "EventCode": "0x50A8",
+    "EventName": "PM_EAT_FORCE_MISPRED",
+    "BriefDescription": "XL-form branch was mispredicted due to the predicted target address missing from EAT.  The EAT forces a mispredict in this case since there is no predicted target to validate.  This is a rare case that may occur when the EAT is full and a branch is issued"
+  },
+  {,
+    "EventCode": "0xC094",
+    "EventName": "PM_LS0_UNALIGNED_LD",
+    "BriefDescription": "Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the load of that size.  If the load wraps from slice 3 to slice 0, there is an additional 3-cycle penalty"
+  },
+  {,
+    "EventCode": "0xF8BC",
+    "EventName": "PM_LS3_UNALIGNED_ST",
+    "BriefDescription": "Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the Store of that size.  If the Store wraps from slice 3 to slice 0, there is an additional 3-cycle penalty"
+  },
+  {,
+    "EventCode": "0x58B0",
+    "EventName": "PM_BTAC_GOOD_RESULT",
+    "BriefDescription": "BTAC predicts a taken branch and the BHT agrees, and the target address is correct"
+  },
+  {,
+    "EventCode": "0x1C04C",
+    "EventName": "PM_DATA_FROM_LL4",
+    "BriefDescription": "The processor's data cache was reloaded from the local chip's L4 cache due to a demand load"
+  },
+  {,
+    "EventCode": "0x3608E",
+    "EventName": "PM_TM_ST_CONF",
+    "BriefDescription": "TM Store (fav or non-fav) ran into conflict (failed)"
+  },
+  {,
+    "EventCode": "0xD998",
+    "EventName": "PM_MRK_LSU_FLUSH_EMSH",
+    "BriefDescription": "An ERAT miss was detected after a set-p hit. Erat tracker indicates fail due to tlbmiss and the instruction gets flushed because the instruction was working on the wrong address"
+  },
+  {,
+    "EventCode": "0xF8A0",
+    "EventName": "PM_NON_DATA_STORE",
+    "BriefDescription": "All ops that drain from s2q to L2 and contain no data"
+  },
+  {,
+    "EventCode": "0x3F146",
+    "EventName": "PM_MRK_DPTEG_FROM_L21_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x40A0",
+    "EventName": "PM_BR_UNCOND",
+    "BriefDescription": "Unconditional Branch Completed. HW branch prediction was not used for this branch. This can be an I-form branch, a B-form branch with BO-field set to branch always, or a B-form branch which was converted to a Resolve."
+  },
+  {,
+    "EventCode": "0x1F056",
+    "EventName": "PM_RADIX_PWC_L1_HIT",
+    "BriefDescription": "A radix translation attempt missed in the TLB and only the first level page walk cache was a hit."
+  },
+  {,
+    "EventCode": "0xF8A8",
+    "EventName": "PM_DC_PREF_FUZZY_CONF",
+    "BriefDescription": "A demand load referenced a line in an active fuzzy prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software. Fuzzy stream confirm (out of order effects, or pf can't keep up)"
+  },
+  {,
+    "EventCode": "0xF8A4",
+    "EventName": "PM_DC_PREF_SW_ALLOC",
+    "BriefDescription": "Prefetch stream allocated by software prefetching"
+  },
+  {,
+    "EventCode": "0xE0A0",
+    "EventName": "PM_LSU2_TM_L1_MISS",
+    "BriefDescription": "Load tm L1 miss"
+  },
+  {,
+    "EventCode": "0x2894",
+    "EventName": "PM_TM_OUTER_TEND",
+    "BriefDescription": "Completion time outer tend"
+  },
+  {,
+    "EventCode": "0xF098",
+    "EventName": "PM_XLATE_HPT_MODE",
+    "BriefDescription": "LSU reports every cycle the thread is in HPT translation mode (as opposed to radix mode)"
+  },
+  {,
+    "EventCode": "0x2C04E",
+    "EventName": "PM_LD_MISS_L1_FIN",
+    "BriefDescription": "Number of load instructions that finished with an L1 miss. Note that even if a load spans multiple slices this event will increment only once per load op."
+  },
+  {,
+    "EventCode": "0x30162",
+    "EventName": "PM_MRK_LSU_DERAT_MISS",
+    "BriefDescription": "Marked derat reload (miss) for any page size"
+  },
+  {,
+    "EventCode": "0x160A0",
+    "EventName": "PM_L3_PF_MISS_L3",
+    "BriefDescription": "L3 PF missed in L3"
+  },
+  {,
+    "EventCode": "0x1C04A",
+    "EventName": "PM_DATA_FROM_RL2L3_SHR",
+    "BriefDescription": "The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a demand load"
+  },
+  {,
+    "EventCode": "0xD99C",
+    "EventName": "PM_MRK_LSU_FLUSH_UE",
+    "BriefDescription": "Correctable ECC error on reload data, reported at critical data forward time"
+  },
+  {,
+    "EventCode": "0x268B0",
+    "EventName": "PM_L3_P1_GRP_PUMP",
+    "BriefDescription": "L3 PF sent with grp scope port 1, counts even retried requests"
+  },
+  {,
+    "EventCode": "0x30016",
+    "EventName": "PM_CMPLU_STALL_SRQ_FULL",
+    "BriefDescription": "Finish stall because the NTF instruction was a store that was held in LSAQ because the SRQ was full"
+  },
+  {,
+    "EventCode": "0x40B4",
+    "EventName": "PM_BR_PRED_TA",
+    "BriefDescription": "Conditional Branch Completed that had its target address predicted. Only XL-form branches set this event.  This equals the sum of CCACHE, LSTACK, and PCACHE"
+  },
+  {,
+    "EventCode": "0x40AC",
+    "EventName": "PM_BR_MPRED_CCACHE",
+    "BriefDescription": "Conditional Branch Completed that was Mispredicted due to the Count Cache Target Prediction"
+  },
+  {,
+    "EventCode": "0x3688A",
+    "EventName": "PM_L2_RTY_LD",
+    "BriefDescription": "RC retries on PB for any load from core (excludes DCBFs)"
+  },
+  {,
+    "EventCode": "0x3689E",
+    "EventName": "PM_L2_RTY_LD",
+    "BriefDescription": "RC retries on PB for any load from core (excludes DCBFs)"
+  },
+  {,
+    "EventCode": "0xE08C",
+    "EventName": "PM_LSU0_ERAT_HIT",
+    "BriefDescription": "Primary ERAT hit.  There is no secondary ERAT"
+  },
+  {,
+    "EventCode": "0xE088",
+    "EventName": "PM_LS2_ERAT_MISS_PREF",
+    "BriefDescription": "LS0 Erat miss due to prefetch"
+  },
+  {,
+    "EventCode": "0xF0A8",
+    "EventName": "PM_DC_PREF_CONF",
+    "BriefDescription": "A demand load referenced a line in an active prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software. Includes forwards and backwards streams"
+  },
+  {,
+    "EventCode": "0x16888",
+    "EventName": "PM_L2_LOC_GUESS_WRONG",
+    "BriefDescription": "L2 guess local (LNS) and guess was not correct (ie data not on chip)"
+  },
+  {,
+    "EventCode": "0xE0A4",
+    "EventName": "PM_TMA_REQ_L2",
+    "BriefDescription": "addrs only req to L2 only on the first one,Indication that Load footprint is not expanding"
+  },
+  {,
+    "EventCode": "0x5884",
+    "EventName": "PM_DECODE_LANES_NOT_AVAIL",
+    "BriefDescription": "Decode has something to transmit but dispatch lanes are not available"
+  },
+  {,
+    "EventCode": "0x3C042",
+    "EventName": "PM_DATA_FROM_L3_DISP_CONFLICT",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L3 with dispatch conflict due to a demand load"
+  },
+  {,
+    "EventCode": "0x168AA",
+    "EventName": "PM_L3_P1_LCO_NO_DATA",
+    "BriefDescription": "Dataless L3 LCO sent port 1"
+  },
+  {,
+    "EventCode": "0x3D140",
+    "EventName": "PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER_CYC",
+    "BriefDescription": "Duration in cycles to reload from local core's L2 with dispatch conflict due to a marked load"
+  },
+  {,
+    "EventCode": "0xC89C",
+    "EventName": "PM_LS1_LAUNCH_HELD_PREF",
+    "BriefDescription": "Number of times a load or store instruction was unable to launch/relaunch because a high priority prefetch used that relaunch cycle"
+  },
+  {,
+    "EventCode": "0x4894",
+    "EventName": "PM_IC_RELOAD_PRIVATE",
+    "BriefDescription": "Reloading line was brought in private for a specific thread.  Most lines are brought in shared for all eight threads.  If RA does not match then invalidates and then brings it shared to other thread. In P7 line brought in private, then line was invalidated"
+  },
+  {,
+    "EventCode": "0x1688E",
+    "EventName": "PM_TM_LD_CAUSED_FAIL",
+    "BriefDescription": "Non-TM Load caused any thread to fail"
+  },
+  {,
+    "EventCode": "0x26084",
+    "EventName": "PM_L2_RCLD_DISP_FAIL_OTHER",
+    "BriefDescription": "All I-or-D side load dispatch attempts for this thread that failed due to reason other than address collision (excludes i_l2mru_tch_reqs)"
+  },
+  {,
+    "EventCode": "0x101E4",
+    "EventName": "PM_MRK_L1_ICACHE_MISS",
+    "BriefDescription": "sampled Instruction suffered an icache Miss"
+  },
+  {,
+    "EventCode": "0x20A0",
+    "EventName": "PM_TM_NESTED_TBEGIN",
+    "BriefDescription": "Completion Tm nested tbegin"
+  },
+  {,
+    "EventCode": "0x368AA",
+    "EventName": "PM_L3_P1_CO_MEM",
+    "BriefDescription": "L3 CO to memory port 1 with or without data"
+  },
+  {,
+    "EventCode": "0xC8A4",
+    "EventName": "PM_LSU3_FALSE_LHS",
+    "BriefDescription": "False LHS match detected"
+  },
+  {,
+    "EventCode": "0xD9A4",
+    "EventName": "PM_MRK_LSU_FLUSH_LARX_STCX",
+    "BriefDescription": "A larx is flushed because an older larx has an LMQ reservation for the same thread.  A stcx is flushed because an older stcx is in the LMQ.  The flush happens when the older larx/stcx relaunches"
+  },
+  {,
+    "EventCode": "0x4D012",
+    "EventName": "PM_PMC3_SAVED",
+    "BriefDescription": "PMC3 Rewind Value saved"
+  },
+  {,
+    "EventCode": "0xE888",
+    "EventName": "PM_LS3_ERAT_MISS_PREF",
+    "BriefDescription": "LS1 Erat miss due to prefetch"
+  },
+  {,
+    "EventCode": "0x368B4",
+    "EventName": "PM_L3_RD0_BUSY",
+    "BriefDescription": "Lifetime, sample of RD machine 0 valid"
+  },
+  {,
+    "EventCode": "0x468B4",
+    "EventName": "PM_L3_RD0_BUSY",
+    "BriefDescription": "Lifetime, sample of RD machine 0 valid"
+  },
+  {,
+    "EventCode": "0x46080",
+    "EventName": "PM_L2_DISP_ALL_L2MISS",
+    "BriefDescription": "All successful Ld/St dispatches for this thread that were an L2 miss (excludes i_l2mru_tch_reqs)"
+  },
+  {,
+    "EventCode": "0xF8B8",
+    "EventName": "PM_LS1_UNALIGNED_ST",
+    "BriefDescription": "Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the Store of that size.  If the Store wraps from slice 3 to slice 0, there is an additional 3-cycle penalty"
+  },
+  {,
+    "EventCode": "0x408C",
+    "EventName": "PM_L1_DEMAND_WRITE",
+    "BriefDescription": "Instruction Demand sectors written into IL1"
+  },
+  {,
+    "EventCode": "0x368A8",
+    "EventName": "PM_SN_INVL",
+    "BriefDescription": "Any port snooper detects a store to a line in the Sx state and invalidates the line.  Up to 4 can happen in a cycle but we only count 1"
+  },
+  {,
+    "EventCode": "0x160B2",
+    "EventName": "PM_L3_LOC_GUESS_CORRECT",
+    "BriefDescription": "initial scope=node/chip (LNS) and data from local node (local) (pred successful) - always PFs only"
+  },
+  {,
+    "EventCode": "0x48B4",
+    "EventName": "PM_DECODE_FUSION_CONST_GEN",
+    "BriefDescription": "32-bit constant generation"
+  },
+  {,
+    "EventCode": "0x4D146",
+    "EventName": "PM_MRK_DATA_FROM_L21_MOD",
+    "BriefDescription": "The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0xE080",
+    "EventName": "PM_S2Q_FULL",
+    "BriefDescription": "Cycles during which the S2Q is full"
+  },
+  {,
+    "EventCode": "0x268B4",
+    "EventName": "PM_L3_P3_LCO_RTY",
+    "BriefDescription": "L3 initiated LCO received retry on port 3 (can try 4 times)"
+  },
+  {,
+    "EventCode": "0xD8B8",
+    "EventName": "PM_LSU0_LMQ_S0_VALID",
+    "BriefDescription": "Slot 0 of LMQ valid"
+  },
+  {,
+    "EventCode": "0x2098",
+    "EventName": "PM_TM_NESTED_TEND",
+    "BriefDescription": "Completion time nested tend"
+  },
+  {,
+    "EventCode": "0x36084",
+    "EventName": "PM_L2_RCST_DISP",
+    "BriefDescription": "All D-side store dispatch attempts for this thread"
+  },
+  {,
+    "EventCode": "0x368A0",
+    "EventName": "PM_L3_PF_OFF_CHIP_CACHE",
+    "BriefDescription": "L3 PF from Off chip cache"
+  },
+  {,
+    "EventCode": "0x20056",
+    "EventName": "PM_TAKEN_BR_MPRED_CMPL",
+    "BriefDescription": "Total number of taken branches that were incorrectly predicted as not-taken. This event counts branches completed and does not include speculative instructions"
+  },
+  {,
+    "EventCode": "0x4688A",
+    "EventName": "PM_L2_SYS_PUMP",
+    "BriefDescription": "RC requests that were system pump attempts"
+  },
+  {,
+    "EventCode": "0xE090",
+    "EventName": "PM_LSU2_ERAT_HIT",
+    "BriefDescription": "Primary ERAT hit.  There is no secondary ERAT"
+  },
+  {,
+    "EventCode": "0x4001C",
+    "EventName": "PM_INST_IMC_MATCH_CMPL",
+    "BriefDescription": "IMC Match Count"
+  },
+  {,
+    "EventCode": "0x40A8",
+    "EventName": "PM_BR_PRED_LSTACK",
+    "BriefDescription": "Conditional Branch Completed  that used the Link Stack for Target Prediction"
+  },
+  {,
+    "EventCode": "0x268A2",
+    "EventName": "PM_L3_CI_MISS",
+    "BriefDescription": "L3 castins miss (total count)"
+  },
+  {,
+    "EventCode": "0x289C",
+    "EventName": "PM_TM_NON_FAV_TBEGIN",
+    "BriefDescription": "Dispatch time non favored tbegin"
+  },
+  {,
+    "EventCode": "0xF08C",
+    "EventName": "PM_LSU2_STORE_REJECT",
+    "BriefDescription": "All internal store rejects cause the instruction to go back to the SRQ and go to sleep until woken up to try again after the condition has been met"
+  },
+  {,
+    "EventCode": "0x360A0",
+    "EventName": "PM_L3_PF_ON_CHIP_CACHE",
+    "BriefDescription": "L3 PF from On chip cache"
+  },
+  {,
+    "EventCode": "0x35152",
+    "EventName": "PM_MRK_DATA_FROM_L2MISS_CYC",
+    "BriefDescription": "Duration in cycles to reload from a location other than the local core's L2 due to a marked load"
+  },
+  {,
+    "EventCode": "0x160AC",
+    "EventName": "PM_L3_SN_USAGE",
+    "BriefDescription": "Rotating sample of 16 snoop valids"
+  },
+  {,
+    "EventCode": "0x16084",
+    "EventName": "PM_L2_RCLD_DISP",
+    "BriefDescription": "All I-or-D side load dispatch attempts for this thread (excludes i_l2mru_tch_reqs)"
+  },
+  {,
+    "EventCode": "0x1608C",
+    "EventName": "PM_RC0_BUSY",
+    "BriefDescription": "RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point)"
+  },
+  {,
+    "EventCode": "0x2608C",
+    "EventName": "PM_RC0_BUSY",
+    "BriefDescription": "RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point)"
+  },
+  {,
+    "EventCode": "0x36082",
+    "EventName": "PM_L2_LD_DISP",
+    "BriefDescription": "All successful I-or-D side load dispatches for this thread (excludes i_l2mru_tch_reqs)."
+  },
+  {,
+    "EventCode": "0x1609E",
+    "EventName": "PM_L2_LD_DISP",
+    "BriefDescription": "All successful D side load dispatches for this thread (L2 miss + L2 hits)"
+  },
+  {,
+    "EventCode": "0xF8B0",
+    "EventName": "PM_L3_SW_PREF",
+    "BriefDescription": "L3 load prefetch, sourced from a software prefetch stream, was sent to the nest"
+  },
+  {,
+    "EventCode": "0xF884",
+    "EventName": "PM_TABLEWALK_CYC_PREF",
+    "BriefDescription": "tablewalk qualified for pte  prefetches"
+  },
+  {,
+    "EventCode": "0x4D144",
+    "EventName": "PM_MRK_DATA_FROM_L31_ECO_MOD",
+    "BriefDescription": "The processor's data cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x16884",
+    "EventName": "PM_L2_RCLD_DISP_FAIL_ADDR",
+    "BriefDescription": "All I-or-D side load dispatch attempts for this thread that failed due to address collision with RC/CO/SN/SQ machine (excludes i_l2mru_tch_reqs)"
+  },
+  {,
+    "EventCode": "0x460A0",
+    "EventName": "PM_L3_PF_ON_CHIP_MEM",
+    "BriefDescription": "L3 PF from On chip memory"
+  },
+  {,
+    "EventCode": "0xF084",
+    "EventName": "PM_PTE_PREFETCH",
+    "BriefDescription": "PTE prefetches"
+  },
+  {,
+    "EventCode": "0x2D026",
+    "EventName": "PM_RADIX_PWC_L1_PDE_FROM_L2",
+    "BriefDescription": "A Page Directory Entry was reloaded to a level 1 page walk cache from the core's L2 data cache"
+  },
+  {,
+    "EventCode": "0x48B0",
+    "EventName": "PM_BR_MPRED_PCACHE",
+    "BriefDescription": "Conditional Branch Completed that was Mispredicted due to pattern cache prediction"
+  },
+  {,
+    "EventCode": "0x2C126",
+    "EventName": "PM_MRK_DATA_FROM_L2",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L2 due to a marked load"
+  },
+  {,
+    "EventCode": "0xE0AC",
+    "EventName": "PM_TM_FAIL_TLBIE",
+    "BriefDescription": "Transaction failed because there was a TLBIE hit in the bloom filter"
+  },
+  {,
+    "EventCode": "0x260AA",
+    "EventName": "PM_L3_P0_LCO_DATA",
+    "BriefDescription": "LCO sent with data port 0"
+  },
+  {,
+    "EventCode": "0x4888",
+    "EventName": "PM_IC_PREF_REQ",
+    "BriefDescription": "Instruction prefetch requests"
+  },
+  {,
+    "EventCode": "0xC898",
+    "EventName": "PM_LS3_UNALIGNED_LD",
+    "BriefDescription": "Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the load of that size.  If the load wraps from slice 3 to slice 0, there is an additional 3-cycle penalty"
+  },
+  {,
+    "EventCode": "0x488C",
+    "EventName": "PM_IC_PREF_WRITE",
+    "BriefDescription": "Instruction prefetch written into IL1"
+  },
+  {,
+    "EventCode": "0xF89C",
+    "EventName": "PM_XLATE_MISS",
+    "BriefDescription": "The LSU requested a line from L2 for translation.  It may be satisfied from any source beyond L2.  Includes speculative instructions"
+  },
+  {,
+    "EventCode": "0x14158",
+    "EventName": "PM_MRK_DATA_FROM_L2_NO_CONFLICT_CYC",
+    "BriefDescription": "Duration in cycles to reload from local core's L2 without conflict due to a marked load"
+  },
+  {,
+    "EventCode": "0x35156",
+    "EventName": "PM_MRK_DATA_FROM_L31_SHR_CYC",
+    "BriefDescription": "Duration in cycles to reload with Shared (S) data from another core's L3 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x268A6",
+    "EventName": "PM_TM_RST_SC",
+    "BriefDescription": "TM-snp rst RM SC"
+  },
+  {,
+    "EventCode": "0x468A4",
+    "EventName": "PM_L3_TRANS_PF",
+    "BriefDescription": "L3 Transient prefetch received from L2"
+  },
+  {,
+    "EventCode": "0x4094",
+    "EventName": "PM_IC_PREF_CANCEL_L2",
+    "BriefDescription": "L2 Squashed a demand or prefetch request"
+  },
+  {,
+    "EventCode": "0x48AC",
+    "EventName": "PM_BR_MPRED_LSTACK",
+    "BriefDescription": "Conditional Branch Completed that was Mispredicted due to the Link Stack Target Prediction"
+  },
+  {,
+    "EventCode": "0xE88C",
+    "EventName": "PM_LSU1_ERAT_HIT",
+    "BriefDescription": "Primary ERAT hit.  There is no secondary ERAT"
+  },
+  {,
+    "EventCode": "0xC0B4",
+    "EventName": "PM_LSU_FLUSH_WRK_ARND",
+    "BriefDescription": "LSU workaround flush.  These flushes are setup with programmable scan only latches to perform various actions when the flush macro receives a trigger from the dbg macros. These actions include things like flushing the next op encountered for a particular thread or flushing the next op that is NTC op that is encountered on a particular slice. The kind of flush that the workaround is setup to perform is highly variable."
+  },
+  {,
+    "EventCode": "0x34054",
+    "EventName": "PM_PARTIAL_ST_FIN",
+    "BriefDescription": "Any store finished by an LSU slice"
+  },
+  {,
+    "EventCode": "0x5880",
+    "EventName": "PM_THRD_PRIO_6_7_CYC",
+    "BriefDescription": "Cycles thread running at priority level 6 or 7"
+  },
+  {,
+    "EventCode": "0x4898",
+    "EventName": "PM_IC_DEMAND_L2_BR_REDIRECT",
+    "BriefDescription": "L2 I cache demand request due to branch Mispredict (15 cycle path)"
+  },
+  {,
+    "EventCode": "0x4880",
+    "EventName": "PM_BANK_CONFLICT",
+    "BriefDescription": "Read blocked due to interleave conflict.  The ifar logic will detect an interleave conflict and kill the data that was read that cycle."
+  },
+  {,
+    "EventCode": "0x360B0",
+    "EventName": "PM_L3_P0_SYS_PUMP",
+    "BriefDescription": "L3 PF sent with sys scope port 0, counts even retried requests"
+  },
+  {,
+    "EventCode": "0x3006A",
+    "EventName": "PM_IERAT_RELOAD_64K",
+    "BriefDescription": "IERAT Reloaded (Miss) for a 64k page"
+  },
+  {,
+    "EventCode": "0xD8BC",
+    "EventName": "PM_LSU2_3_LRQF_FULL_CYC",
+    "BriefDescription": "Counts the number of cycles the LRQF is full.  LRQF is the queue that holds loads between finish and completion.  If it fills up, instructions stay in LRQ until completion, potentially backing up the LRQ"
+  },
+  {,
+    "EventCode": "0x46086",
+    "EventName": "PM_L2_SN_M_RD_DONE",
+    "BriefDescription": "SNP dispatched for a read and was M (true M)"
+  },
+  {,
+    "EventCode": "0x40154",
+    "EventName": "PM_MRK_FAB_RSP_BKILL",
+    "BriefDescription": "Marked store had to do a bkill"
+  },
+  {,
+    "EventCode": "0xF094",
+    "EventName": "PM_LSU2_L1_CAM_CANCEL",
+    "BriefDescription": "ls2 l1 tm cam cancel"
+  },
+  {,
+    "EventCode": "0x2D014",
+    "EventName": "PM_CMPLU_STALL_LRQ_FULL",
+    "BriefDescription": "Finish stall because the NTF instruction was a load that was held in LSAQ (load-store address queue) because the LRQ (load-reorder queue) was full"
+  },
+  {,
+    "EventCode": "0x3E05E",
+    "EventName": "PM_L3_CO_MEPF",
+    "BriefDescription": "L3 castouts in Mepf state for this thread"
+  },
+  {,
+    "EventCode": "0x168A0",
+    "EventName": "PM_L3_CO_MEPF",
+    "BriefDescription": "L3 CO of line in Mep state (includes casthrough to memory).  The Mepf state indicates that a line was brought in to satisfy an L3 prefetch request"
+  },
+  {,
+    "EventCode": "0x460A2",
+    "EventName": "PM_L3_LAT_CI_HIT",
+    "BriefDescription": "L3 Lateral Castins Hit"
+  },
+  {,
+    "EventCode": "0x3D14E",
+    "EventName": "PM_MRK_DATA_FROM_DL2L3_MOD",
+    "BriefDescription": "The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x3D15E",
+    "EventName": "PM_MULT_MRK",
+    "BriefDescription": "mult marked instr"
+  },
+  {,
+    "EventCode": "0x4084",
+    "EventName": "PM_EAT_FULL_CYC",
+    "BriefDescription": "Cycles No room in EAT"
+  },
+  {,
+    "EventCode": "0x5098",
+    "EventName": "PM_LINK_STACK_WRONG_ADD_PRED",
+    "BriefDescription": "Link stack predicts wrong address, because of link stack design limitation or software violating the coding conventions"
+  },
+  {,
+    "EventCode": "0x2C050",
+    "EventName": "PM_DATA_GRP_PUMP_CPRED",
+    "BriefDescription": "Initial and Final Pump Scope was group pump (prediction=correct) for a demand load"
+  },
+  {,
+    "EventCode": "0xC0A4",
+    "EventName": "PM_LSU2_FALSE_LHS",
+    "BriefDescription": "False LHS match detected"
+  },
+  {,
+    "EventCode": "0x58A0",
+    "EventName": "PM_LINK_STACK_CORRECT",
+    "BriefDescription": "Link stack predicts right address"
+  },
+  {,
+    "EventCode": "0x4C05A",
+    "EventName": "PM_DTLB_MISS_1G",
+    "BriefDescription": "Data TLB reload (after a miss) page size 1G. Implies radix translation was used"
+  },
+  {,
+    "EventCode": "0x36886",
+    "EventName": "PM_L2_SN_SX_I_DONE",
+    "BriefDescription": "SNP dispatched and went from Sx to Ix"
+  },
+  {,
+    "EventCode": "0x4E04A",
+    "EventName": "PM_DPTEG_FROM_OFF_CHIP_CACHE",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x2C12C",
+    "EventName": "PM_MRK_DATA_FROM_DL4_CYC",
+    "BriefDescription": "Duration in cycles to reload from another chip's L4 on a different Node or Group (Distant) due to a marked load"
+  },
+  {,
+    "EventCode": "0x2608E",
+    "EventName": "PM_TM_LD_CONF",
+    "BriefDescription": "TM Load (fav or non-fav) ran into conflict (failed)"
+  },
+  {,
+    "EventCode": "0x4080",
+    "EventName": "PM_INST_FROM_L1",
+    "BriefDescription": "Instruction fetches from L1.  L1 instruction hit"
+  },
+  {,
+    "EventCode": "0xE898",
+    "EventName": "PM_LSU3_TM_L1_HIT",
+    "BriefDescription": "Load tm hit in L1"
+  },
+  {,
+    "EventCode": "0x260A0",
+    "EventName": "PM_L3_CO_MEM",
+    "BriefDescription": "L3 CO to memory OR of port 0 and 1 (lossy = may undercount if two cresp come in the same cyc)"
+  },
+  {,
+    "EventCode": "0x16082",
+    "EventName": "PM_L2_CASTOUT_MOD",
+    "BriefDescription": "L2 Castouts - Modified (M,Mu,Me)"
+  },
+  {,
+    "EventCode": "0xC09C",
+    "EventName": "PM_LS0_LAUNCH_HELD_PREF",
+    "BriefDescription": "Number of times a load or store instruction was unable to launch/relaunch because a high priority prefetch used that relaunch cycle"
+  },
+  {,
+    "EventCode": "0xC8B8",
+    "EventName": "PM_LSU_FLUSH_LARX_STCX",
+    "BriefDescription": "A larx is flushed because an older larx has an LMQ reservation for the same thread.  A stcx is flushed because an older stcx is in the LMQ.  The flush happens when the older larx/stcx relaunches"
+  },
+  {,
+    "EventCode": "0x260A6",
+    "EventName": "PM_NON_TM_RST_SC",
+    "BriefDescription": "Non-TM snp rst TM SC"
+  },
+  {,
+    "EventCode": "0x3608A",
+    "EventName": "PM_L2_RTY_ST",
+    "BriefDescription": "RC retries on PB for any store from core (excludes DCBFs)"
+  },
+  {,
+    "EventCode": "0x4689E",
+    "EventName": "PM_L2_RTY_ST",
+    "BriefDescription": "RC retries on PB for any store from core (excludes DCBFs)"
+  },
+  {,
+    "EventCode": "0x24040",
+    "EventName": "PM_INST_FROM_L2_MEPF",
+    "BriefDescription": "The processor's Instruction cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x209C",
+    "EventName": "PM_TM_FAV_TBEGIN",
+    "BriefDescription": "Dispatch time Favored tbegin"
+  },
+  {,
+    "EventCode": "0x2D01E",
+    "EventName": "PM_ICT_NOSLOT_DISP_HELD_ISSQ",
+    "BriefDescription": "Ict empty for this thread due to dispatch hold on this thread due to Issue q full, BRQ full, XVCF Full, Count cache, Link, Tar full"
+  },
+  {,
+    "EventCode": "0x50A4",
+    "EventName": "PM_FLUSH_MPRED",
+    "BriefDescription": "Branch mispredict flushes.  Includes target and address misprediction"
+  },
+  {,
+    "EventCode": "0x508C",
+    "EventName": "PM_SHL_CREATED",
+    "BriefDescription": "Store-Hit-Load Table Entry Created"
+  },
+  {,
+    "EventCode": "0x1504C",
+    "EventName": "PM_IPTEG_FROM_LL4",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a instruction side request"
+  },
+  {,
+    "EventCode": "0x268A4",
+    "EventName": "PM_L3_LD_MISS",
+    "BriefDescription": "L3 Misses for demand LDs"
+  },
+  {,
+    "EventCode": "0x26088",
+    "EventName": "PM_L2_GRP_GUESS_CORRECT",
+    "BriefDescription": "L2 guess grp (GS or NNS) and guess was correct (data intra-group AND ^on-chip)"
+  },
+  {,
+    "EventCode": "0xD088",
+    "EventName": "PM_LSU0_LDMX_FIN",
+    "BriefDescription": "New P9 instruction LDMX. The definition of this new PMU event is (from the ldmx RFC02491):  The thread has executed an ldmx instruction that accessed a doubleword that contains an effective address within an enabled section of the Load Monitored region.  This event, therefore, should not occur if the FSCR has disabled the load monitored facility (FSCR[52]) or disabled the EBB facility (FSCR[56])."
+  },
+  {,
+    "EventCode": "0xE8B4",
+    "EventName": "PM_LS1_TM_DISALLOW",
+    "BriefDescription": "A TM-ineligible instruction tries to execute inside a transaction and the LSU disallows it"
+  },
+  {,
+    "EventCode": "0x1688C",
+    "EventName": "PM_RC_USAGE",
+    "BriefDescription": "Continuous 16 cycle (2to1) window where this signals rotates thru sampling each RC machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running"
+  },
+  {,
+    "EventCode": "0x3F054",
+    "EventName": "PM_RADIX_PWC_L4_PTE_FROM_L3MISS",
+    "BriefDescription": "A Page Table Entry was reloaded to a level 4 page walk cache from beyond the core's L3 data cache. This is the deepest level of PWC possible for a translation. The source could be local/remote/distant memory or another core's cache"
+  },
+  {,
+    "EventCode": "0x2608A",
+    "EventName": "PM_ISIDE_DISP_FAIL_ADDR",
+    "BriefDescription": "All I-side dispatch attempts for this thread that failed due to a addr collision with another machine (excludes i_l2mru_tch_reqs)"
+  },
+  {,
+    "EventCode": "0x50B4",
+    "EventName": "PM_TAGE_CORRECT_TAKEN_CMPL",
+    "BriefDescription": "The TAGE overrode BHT direction prediction and it was correct.  Counted at completion for taken branches only"
+  },
+  {,
+    "EventCode": "0x2090",
+    "EventName": "PM_DISP_CLB_HELD_SB",
+    "BriefDescription": "Dispatch/CLB Hold: Scoreboard"
+  },
+  {,
+    "EventCode": "0xE0B0",
+    "EventName": "PM_TM_FAIL_NON_TX_CONFLICT",
+    "BriefDescription": "Non transactional conflict from LSU, gets reported to TEXASR"
+  },
+  {,
+    "EventCode": "0xD198",
+    "EventName": "PM_MRK_LSU_FLUSH_ATOMIC",
+    "BriefDescription": "Quad-word loads (lq) are considered atomic because they always span at least 2 slices.  If a snoop or store from another thread changes the data the load is accessing between the 2 or 3 pieces of the lq instruction, the lq will be flushed"
+  },
+  {,
+    "EventCode": "0x201E0",
+    "EventName": "PM_MRK_DATA_FROM_MEMORY",
+    "BriefDescription": "The processor's data cache was reloaded from a memory location including L4 from local remote or distant due to a marked load"
+  },
+  {,
+    "EventCode": "0x368A2",
+    "EventName": "PM_L3_L2_CO_MISS",
+    "BriefDescription": "L2 CO miss"
+  },
+  {,
+    "EventCode": "0x3608C",
+    "EventName": "PM_CO0_BUSY",
+    "BriefDescription": "CO mach 0 Busy. Used by PMU to sample ave CO lifetime (mach0 used as sample point)"
+  },
+  {,
+    "EventCode": "0x4608C",
+    "EventName": "PM_CO0_BUSY",
+    "BriefDescription": "CO mach 0 Busy. Used by PMU to sample ave CO lifetime (mach0 used as sample point)"
+  },
+  {,
+    "EventCode": "0x2C122",
+    "EventName": "PM_MRK_DATA_FROM_L3_DISP_CONFLICT_CYC",
+    "BriefDescription": "Duration in cycles to reload from local core's L3 with dispatch conflict due to a marked load"
+  },
+  {,
+    "EventCode": "0x35154",
+    "EventName": "PM_MRK_DATA_FROM_L3_CYC",
+    "BriefDescription": "Duration in cycles to reload from local core's L3 due to a marked load"
+  },
+  {,
+    "EventCode": "0x1D140",
+    "EventName": "PM_MRK_DATA_FROM_L31_MOD_CYC",
+    "BriefDescription": "Duration in cycles to reload with Modified (M) data from another core's L3 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x4404A",
+    "EventName": "PM_INST_FROM_OFF_CHIP_CACHE",
+    "BriefDescription": "The processor's Instruction cache was reloaded either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x28AC",
+    "EventName": "PM_TM_FAIL_SELF",
+    "BriefDescription": "TM aborted because a self-induced conflict occurred in Suspended state, due to one of the following: a store to a storage location that was previously accessed transactionally; a dcbf, dcbi, or icbi specifying a block that was previously accessed transactionally; a dcbst specifying a block that was previously written transactionally; or a tlbie that specifies a translation that was previously used transactionally"
+  },
+  {,
+    "EventCode": "0x45056",
+    "EventName": "PM_SCALAR_FLOP_CMPL",
+    "BriefDescription": "Scalar flop operation completed"
+  },
+  {,
+    "EventCode": "0x16092",
+    "EventName": "PM_L2_LD_MISS_128B",
+    "BriefDescription": "All successful D-side load dispatches that were an L2 miss (NOT Sx,Tx,Mx) for this thread and the RC calculated the request should be for 128B (i.e., M=0)"
+  },
+  {,
+    "EventCode": "0x2E014",
+    "EventName": "PM_STCX_FIN",
+    "BriefDescription": "Number of stcx instructions finished. This includes instructions in the speculative path of a branch that may be flushed"
+  },
+  {,
+    "EventCode": "0xE0B8",
+    "EventName": "PM_LS2_TM_DISALLOW",
+    "BriefDescription": "A TM-ineligible instruction tries to execute inside a transaction and the LSU disallows it"
+  },
+  {,
+    "EventCode": "0x2094",
+    "EventName": "PM_TM_OUTER_TBEGIN",
+    "BriefDescription": "Completion time outer tbegin"
+  },
+  {,
+    "EventCode": "0x160B4",
+    "EventName": "PM_L3_P0_LCO_RTY",
+    "BriefDescription": "L3 initiated LCO received retry on port 0 (can try 4 times)"
+  },
+  {,
+    "EventCode": "0x36892",
+    "EventName": "PM_DSIDE_OTHER_64B_L2MEMACC",
+    "BriefDescription": "Valid when first beat of data comes in for an D-side fetch where data came EXCLUSIVELY from memory that was for hpc_read64, (RC had to fetch other 64B of a line from MC) i.e., number of times RC had to go to memory to get 'missing' 64B"
+  },
+  {,
+    "EventCode": "0x20A8",
+    "EventName": "PM_TM_FAIL_FOOTPRINT_OVERFLOW",
+    "BriefDescription": "TM aborted because the tracking limit for transactional storage accesses was exceeded. Asynchronous"
+  },
+  {,
+    "EventCode": "0x30018",
+    "EventName": "PM_ICT_NOSLOT_DISP_HELD_HB_FULL",
+    "BriefDescription": "Ict empty for this thread due to dispatch holds because the History Buffer was full. Could be GPR/VSR/VMR/FPR/CR/XVF; CR; XVF (XER/VSCR/FPSCR)"
+  },
+  {,
+    "EventCode": "0xC894",
+    "EventName": "PM_LS1_UNALIGNED_LD",
+    "BriefDescription": "Load instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the load of that size.  If the load wraps from slice 3 to slice 0, there is an additional 3-cycle penalty"
+  },
+  {,
+    "EventCode": "0x360A2",
+    "EventName": "PM_L3_L2_CO_HIT",
+    "BriefDescription": "L2 CO hits"
+  },
+  {,
+    "EventCode": "0x36092",
+    "EventName": "PM_DSIDE_L2MEMACC",
+    "BriefDescription": "Valid when first beat of data comes in for an D-side fetch where data came EXCLUSIVELY from memory (excluding hpcread64 accesses), i.e., total memory accesses by RCs"
+  },
+  {,
+    "EventCode": "0x10138",
+    "EventName": "PM_MRK_BR_2PATH",
+    "BriefDescription": "marked branches which are not strongly biased"
+  },
+  {,
+    "EventCode": "0x2884",
+    "EventName": "PM_ISYNC",
+    "BriefDescription": "Isync completion count per thread"
+  },
+  {,
+    "EventCode": "0x16882",
+    "EventName": "PM_L2_CASTOUT_SHR",
+    "BriefDescription": "L2 Castouts - Shared (Tx,Sx)"
+  },
+  {,
+    "EventCode": "0xD884",
+    "EventName": "PM_LSU3_SET_MPRED",
+    "BriefDescription": "Set prediction(set-p) miss.  The entry was not found in the Set prediction table"
+  },
+  {,
+    "EventCode": "0x26092",
+    "EventName": "PM_L2_LD_MISS_64B",
+    "BriefDescription": "All successful D-side load dispatches that were an L2 miss (NOT Sx,Tx,Mx) for this thread and the RC calculated the request should be for 64B(i.e., M=1)"
+  },
+  {,
+    "EventCode": "0x26080",
+    "EventName": "PM_L2_LD_MISS",
+    "BriefDescription": "All successful D-Side Load dispatches that were an L2 miss for this thread"
+  },
+  {,
+    "EventCode": "0x3D14C",
+    "EventName": "PM_MRK_DATA_FROM_DMEM",
+    "BriefDescription": "The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to a marked load"
+  },
+  {,
+    "EventCode": "0x100FA",
+    "EventName": "PM_ANY_THRD_RUN_CYC",
+    "BriefDescription": "Cycles in which at least one thread has the run latch set"
+  },
+  {,
+    "EventCode": "0x2C12A",
+    "EventName": "PM_MRK_DATA_FROM_RMEM_CYC",
+    "BriefDescription": "Duration in cycles to reload from another chip's memory on the same Node or Group (Remote) due to a marked load"
+  },
+  {,
+    "EventCode": "0x25048",
+    "EventName": "PM_IPTEG_FROM_LMEM",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from the local chip's Memory due to a instruction side request"
+  },
+  {,
+    "EventCode": "0x40006",
+    "EventName": "PM_ISLB_MISS",
+    "BriefDescription": "Number of ISLB misses for this thread"
+  },
+  {,
+    "EventCode": "0xD8A8",
+    "EventName": "PM_ISLB_MISS",
+    "BriefDescription": "Instruction SLB miss - Total of all segment sizes"
+  },
+  {,
+    "EventCode": "0xD19C",
+    "EventName": "PM_MRK_LSU_FLUSH_RELAUNCH_MISS",
+    "BriefDescription": "If a load that has already returned data and has to relaunch for any reason then gets a miss (erat, setp, data cache), it will often be flushed at relaunch time because the data might be inconsistent"
+  },
+  {,
+    "EventCode": "0x260A2",
+    "EventName": "PM_L3_CI_HIT",
+    "BriefDescription": "L3 Castins Hit (total count)"
+  },
+  {,
+    "EventCode": "0x44054",
+    "EventName": "PM_VECTOR_LD_CMPL",
+    "BriefDescription": "Number of vector load instructions completed"
+  },
+  {,
+    "EventCode": "0x1E05C",
+    "EventName": "PM_CMPLU_STALL_NESTED_TBEGIN",
+    "BriefDescription": "Completion stall because the ISU is updating the TEXASR to keep track of the nested tbegin. This is a short delay, and it includes ROT"
+  },
+  {,
+    "EventCode": "0x1608E",
+    "EventName": "PM_ST_CAUSED_FAIL",
+    "BriefDescription": "Non-TM Store caused any thread to fail"
+  },
+  {,
+    "EventCode": "0x3080",
+    "EventName": "PM_ISU0_ISS_HOLD_ALL",
+    "BriefDescription": "All ISU rejects"
+  },
+  {,
+    "EventCode": "0x1515A",
+    "EventName": "PM_SYNC_MRK_L2MISS",
+    "BriefDescription": "Marked L2 Miss that can throw a synchronous interrupt"
+  },
+  {,
+    "EventCode": "0x26892",
+    "EventName": "PM_L2_ST_MISS_64B",
+    "BriefDescription": "All successful D-side store dispatches that were an L2 miss (NOT Sx,Tx,Mx) for this thread and the RC calculated the request should be for 64B (i.e., M=1)"
+  },
+  {,
+    "EventCode": "0x2688C",
+    "EventName": "PM_CO_USAGE",
+    "BriefDescription": "Continuous 16 cycle (2to1) window where this signals rotates thru sampling each CO machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running"
+  },
+  {,
+    "EventCode": "0xD084",
+    "EventName": "PM_LSU2_SET_MPRED",
+    "BriefDescription": "Set prediction(set-p) miss.  The entry was not found in the Set prediction table"
+  },
+  {,
+    "EventCode": "0x48B8",
+    "EventName": "PM_BR_MPRED_TAKEN_TA",
+    "BriefDescription": "Conditional Branch Completed that was Mispredicted due to the Target Address Prediction from the Count Cache or Link Stack.  Only XL-form branches that resolved Taken set this event."
+  },
+  {,
+    "EventCode": "0x50B0",
+    "EventName": "PM_BTAC_BAD_RESULT",
+    "BriefDescription": "BTAC thinks branch will be taken but it is either predicted not-taken by the BHT, or the target address is wrong (less common).  In both cases, a redirect will happen"
+  },
+  {,
+    "EventCode": "0xD888",
+    "EventName": "PM_LSU1_LDMX_FIN",
+    "BriefDescription": "New P9 instruction LDMX. The definition of this new PMU event is (from the ldmx RFC02491):  The thread has executed an ldmx instruction that accessed a doubleword that contains an effective address within an enabled section of the Load Monitored region.  This event, therefore, should not occur if the FSCR has disabled the load monitored facility (FSCR[52]) or disabled the EBB facility (FSCR[56])."
+  },
+  {,
+    "EventCode": "0x58B4",
+    "EventName": "PM_TAGE_CORRECT",
+    "BriefDescription": "The TAGE overrode BHT direction prediction and it was correct.   Includes taken and not taken and is counted at execution time"
+  },
+  {,
+    "EventCode": "0x3688C",
+    "EventName": "PM_SN_USAGE",
+    "BriefDescription": "Continuous 16 cycle (2to1) window where this signals rotates thru sampling each SN machine busy. PMU uses this wave to then do 16 cyc count to sample total number of machs running"
+  },
+  {,
+    "EventCode": "0x46084",
+    "EventName": "PM_L2_RCST_DISP_FAIL_OTHER",
+    "BriefDescription": "All D-side store dispatch attempts for this thread that failed due to reason other than address collision"
+  },
+  {,
+    "EventCode": "0xF0AC",
+    "EventName": "PM_DC_PREF_STRIDED_CONF",
+    "BriefDescription": "A demand load referenced a line in an active strided prefetch stream. The stream could have been allocated through the hardware prefetch mechanism or through software."
+  },
+  {,
+    "EventCode": "0x45054",
+    "EventName": "PM_FMA_CMPL",
+    "BriefDescription": "two flops operation completed (fmadd, fnmadd, fmsub, fnmsub) Scalar instructions only. "
+  },
+  {,
+    "EventCode": "0x5090",
+    "EventName": "PM_SHL_ST_DISABLE",
+    "BriefDescription": "Store-Hit-Load Table Read Hit with entry Disabled (entry was disabled due to the entry shown to not prevent the flush)"
+  },
+  {,
+    "EventCode": "0x201E8",
+    "EventName": "PM_THRESH_EXC_512",
+    "BriefDescription": "Threshold counter exceeded a value of 512"
+  },
+  {,
+    "EventCode": "0x5084",
+    "EventName": "PM_DECODE_FUSION_EXT_ADD",
+    "BriefDescription": "32-bit extended addition"
+  },
+  {,
+    "EventCode": "0x36080",
+    "EventName": "PM_L2_INST",
+    "BriefDescription": "All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs)."
+  },
+  {,
+    "EventCode": "0x3609E",
+    "EventName": "PM_L2_INST",
+    "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)"
+  },
+  {,
+    "EventCode": "0x3504C",
+    "EventName": "PM_IPTEG_FROM_DL4",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a instruction side request"
+  },
+  {,
+    "EventCode": "0xD890",
+    "EventName": "PM_LS1_DC_COLLISIONS",
+    "BriefDescription": "Read-write data cache collisions"
+  },
+  {,
+    "EventCode": "0x1688A",
+    "EventName": "PM_ISIDE_DISP",
+    "BriefDescription": "All I-side dispatch attempts for this thread (excludes i_l2mru_tch_reqs)"
+  },
+  {,
+    "EventCode": "0x468AA",
+    "EventName": "PM_L3_P1_CO_L31",
+    "BriefDescription": "L3 CO to L3.1 (LCO) port 1 with or without data"
+  },
+  {,
+    "EventCode": "0x28B0",
+    "EventName": "PM_DISP_HELD_TBEGIN",
+    "BriefDescription": "This outer tbegin transaction cannot be dispatched until the previous tend instruction completes"
+  },
+  {,
+    "EventCode": "0xE8A0",
+    "EventName": "PM_LSU3_TM_L1_MISS",
+    "BriefDescription": "Load tm L1 miss"
+  },
+  {,
+    "EventCode": "0x2C05E",
+    "EventName": "PM_INST_GRP_PUMP_MPRED",
+    "BriefDescription": "Final Pump Scope (Group) ended up either larger or smaller than Initial Pump Scope for an instruction fetch (demand only)"
+  },
+  {,
+    "EventCode": "0xC8BC",
+    "EventName": "PM_STCX_SUCCESS_CMPL",
+    "BriefDescription": "Number of stcx instructions that completed successfully"
+  },
+  {,
+    "EventCode": "0xE098",
+    "EventName": "PM_LSU2_TM_L1_HIT",
+    "BriefDescription": "Load tm hit in L1"
+  },
+  {,
+    "EventCode": "0x44044",
+    "EventName": "PM_INST_FROM_L31_ECO_MOD",
+    "BriefDescription": "The processor's Instruction cache was reloaded with Modified (M) data from another core's ECO L3 on the same chip due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x16886",
+    "EventName": "PM_CO_DISP_FAIL",
+    "BriefDescription": "CO dispatch failed due to all CO machines being busy"
+  },
+  {,
+    "EventCode": "0x3D146",
+    "EventName": "PM_MRK_DATA_FROM_L3_NO_CONFLICT",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L3 without conflict due to a marked load"
+  },
+  {,
+    "EventCode": "0x16892",
+    "EventName": "PM_L2_ST_MISS_128B",
+    "BriefDescription": "All successful D-side store dispatches that were an L2 miss (NOT Sx,Tx,Mx) for this thread and the RC calculated the request should be for 128B (i.e., M=0)"
+  },
+  {,
+    "EventCode": "0x26890",
+    "EventName": "PM_ISIDE_L2MEMACC",
+    "BriefDescription": "Valid when first beat of data comes in for an I-side fetch where data came from memory"
+  },
+  {,
+    "EventCode": "0xD094",
+    "EventName": "PM_LS2_DC_COLLISIONS",
+    "BriefDescription": "Read-write data cache collisions"
+  },
+  {,
+    "EventCode": "0x3C05E",
+    "EventName": "PM_MEM_RWITM",
+    "BriefDescription": "Memory Read With Intent to Modify for this thread"
+  },
+  {,
+    "EventCode": "0x26882",
+    "EventName": "PM_L2_DC_INV",
+    "BriefDescription": "D-cache invalidates sent over the reload bus to the core"
+  },
+  {,
+    "EventCode": "0xC090",
+    "EventName": "PM_LSU_STCX",
+    "BriefDescription": "STCX sent to nest, i.e. total"
+  },
+  {,
+    "EventCode": "0xD080",
+    "EventName": "PM_LSU0_SET_MPRED",
+    "BriefDescription": "Set prediction(set-p) miss.  The entry was not found in the Set prediction table"
+  },
+  {,
+    "EventCode": "0x2C120",
+    "EventName": "PM_MRK_DATA_FROM_L2_NO_CONFLICT",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L2 without conflict due to a marked load"
+  },
+  {,
+    "EventCode": "0x36086",
+    "EventName": "PM_L2_RC_ST_DONE",
+    "BriefDescription": "RC did store to line that was Tx or Sx"
+  },
+  {,
+    "EventCode": "0xE8AC",
+    "EventName": "PM_TM_FAIL_TX_CONFLICT",
+    "BriefDescription": "Transactional conflict from LSU, gets reported to TEXASR"
+  },
+  {,
+    "EventCode": "0x48A8",
+    "EventName": "PM_DECODE_FUSION_LD_ST_DISP",
+    "BriefDescription": "32-bit displacement D-form and 16-bit displacement X-form"
+  },
+  {,
+    "EventCode": "0x3D144",
+    "EventName": "PM_MRK_DATA_FROM_L2_MEPF_CYC",
+    "BriefDescription": "Duration in cycles to reload from local core's L2 hit without dispatch conflicts on Mepf state due to a marked load"
+  },
+  {,
+    "EventCode": "0x44046",
+    "EventName": "PM_INST_FROM_L21_MOD",
+    "BriefDescription": "The processor's Instruction cache was reloaded with Modified (M) data from another core's L2 on the same chip due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x40B0",
+    "EventName": "PM_BR_PRED_TAKEN_CR",
+    "BriefDescription": "Conditional Branch that had its direction predicted. I-form branches do not set this event.  In addition, B-form branches which do not use the BHT do not set this event - these are branches with BO-field set to 'always taken' and branches"
+  },
+  {,
+    "EventCode": "0x15040",
+    "EventName": "PM_IPTEG_FROM_L2_NO_CONFLICT",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a instruction side request"
+  },
+  {,
+    "EventCode": "0xD9A0",
+    "EventName": "PM_MRK_LSU_FLUSH_LHL_SHL",
+    "BriefDescription": "The instruction was flushed because of a sequential load/store consistency.  If a load or store hits on an older load that has either been snooped (for loads) or has stale data (for stores)."
+  },
+  {,
+    "EventCode": "0x35042",
+    "EventName": "PM_IPTEG_FROM_L3_DISP_CONFLICT",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a instruction side request"
+  },
+  {,
+    "EventCode": "0xF898",
+    "EventName": "PM_XLATE_RADIX_MODE",
+    "BriefDescription": "LSU reports every cycle the thread is in radix translation mode (as opposed to HPT mode)"
+  },
+  {,
+    "EventCode": "0x2D142",
+    "EventName": "PM_MRK_DATA_FROM_L3_MEPF",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to a marked load"
+  },
+  {,
+    "EventCode": "0x160B0",
+    "EventName": "PM_L3_P0_NODE_PUMP",
+    "BriefDescription": "L3 PF sent with nodal scope port 0, counts even retried requests"
+  },
+  {,
+    "EventCode": "0xD88C",
+    "EventName": "PM_LSU3_LDMX_FIN",
+    "BriefDescription": "New P9 instruction LDMX. The definition of this new PMU event is (from the ldmx RFC02491):  The thread has executed an ldmx instruction that accessed a doubleword that contains an effective address within an enabled section of the Load Monitored region.  This event, therefore, should not occur if the FSCR has disabled the load monitored facility (FSCR[52]) or disabled the EBB facility (FSCR[56])."
+  },
+  {,
+    "EventCode": "0x36882",
+    "EventName": "PM_L2_LD_HIT",
+    "BriefDescription": "All successful I-or-D side load dispatches for this thread that were L2 hits (excludes i_l2mru_tch_reqs)"
+  },
+  {,
+    "EventCode": "0x2609E",
+    "EventName": "PM_L2_LD_HIT",
+    "BriefDescription": "All successful D side load dispatches for this thread that were L2 hits for this thread"
+  },
+  {,
+    "EventCode": "0x168AC",
+    "EventName": "PM_L3_CI_USAGE",
+    "BriefDescription": "Rotating sample of 16 CI or CO actives"
+  },
+  {,
+    "EventCode": "0x20134",
+    "EventName": "PM_MRK_FXU_FIN",
+    "BriefDescription": "fxu marked instr finish"
+  },
+  {,
+    "EventCode": "0x4608E",
+    "EventName": "PM_TM_CAP_OVERFLOW",
+    "BriefDescription": "TM Footprint Capacity Overflow"
+  },
+  {,
+    "EventCode": "0x4F05C",
+    "EventName": "PM_RADIX_PWC_L2_PTE_FROM_L3MISS",
+    "BriefDescription": "A Page Table Entry was reloaded to a level 2 page walk cache from beyond the core's L3 data cache. This implies that level 3 and level 4 PWC accesses were not necessary for this translation. The source could be local/remote/distant memory or another core's cache"
+  },
+  {,
+    "EventCode": "0x40014",
+    "EventName": "PM_PROBE_NOP_DISP",
+    "BriefDescription": "ProbeNops dispatched"
+  },
+  {,
+    "EventCode": "0x58A8",
+    "EventName": "PM_DECODE_HOLD_ICT_FULL",
+    "BriefDescription": "Counts the number of cycles in which the IFU was not able to decode and transmit one or more instructions because all itags were in use.  This means the ICT is full for this thread"
+  },
+  {,
+    "EventCode": "0x10052",
+    "EventName": "PM_GRP_PUMP_MPRED_RTY",
+    "BriefDescription": "Final Pump Scope (Group) ended up larger than Initial Pump Scope (Chip) for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate)"
+  },
+  {,
+    "EventCode": "0x2505E",
+    "EventName": "PM_BACK_BR_CMPL",
+    "BriefDescription": "Branch instruction completed with a target address less than current instruction address"
+  },
+  {,
+    "EventCode": "0x2688A",
+    "EventName": "PM_ISIDE_DISP_FAIL_OTHER",
+    "BriefDescription": "All I-side dispatch attempts for this thread that failed due to a reason other than addrs collision (excludes i_l2mru_tch_reqs)"
+  },
+  {,
+    "EventCode": "0x2001A",
+    "EventName": "PM_NTC_ALL_FIN",
+    "BriefDescription": "Cycles after all instructions have finished to group completed"
+  },
+  {,
+    "EventCode": "0x3005A",
+    "EventName": "PM_ISQ_0_8_ENTRIES",
+    "BriefDescription": "Cycles in which 8 or less Issue Queue entries are in use. This is a shared event, not per thread"
+  },
+  {,
+    "EventCode": "0x3515E",
+    "EventName": "PM_MRK_BACK_BR_CMPL",
+    "BriefDescription": "Marked branch instruction completed with a target address less than current instruction address"
+  },
+  {,
+    "EventCode": "0xF890",
+    "EventName": "PM_LSU1_L1_CAM_CANCEL",
+    "BriefDescription": "ls1 l1 tm cam cancel"
+  },
+  {,
+    "EventCode": "0xE884",
+    "EventName": "PM_LS1_ERAT_MISS_PREF",
+    "BriefDescription": "LS1 Erat miss due to prefetch"
+  },
+  {,
+    "EventCode": "0xE89C",
+    "EventName": "PM_LSU1_TM_L1_MISS",
+    "BriefDescription": "Load tm L1 miss"
+  },
+  {,
+    "EventCode": "0x28A8",
+    "EventName": "PM_TM_FAIL_CONF_NON_TM",
+    "BriefDescription": "TM aborted because a conflict occurred with a non-transactional access by another processor"
+  },
+  {,
+    "EventCode": "0x16890",
+    "EventName": "PM_L1PF_L2MEMACC",
+    "BriefDescription": "Valid when first beat of data comes in for an L1PF where data came from memory"
+  },
+  {,
+    "EventCode": "0x4504C",
+    "EventName": "PM_IPTEG_FROM_DMEM",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a instruction side request"
+  },
+  {,
+    "EventCode": "0x1002E",
+    "EventName": "PM_LMQ_MERGE",
+    "BriefDescription": "A demand miss collides with a prefetch for the same line"
+  },
+  {,
+    "EventCode": "0x160B6",
+    "EventName": "PM_L3_WI0_BUSY",
+    "BriefDescription": "Rotating sample of 8 WI valid"
+  },
+  {,
+    "EventCode": "0x260B6",
+    "EventName": "PM_L3_WI0_BUSY",
+    "BriefDescription": "Rotating sample of 8 WI valid (duplicate)"
+  },
+  {,
+    "EventCode": "0x368AC",
+    "EventName": "PM_L3_CO0_BUSY",
+    "BriefDescription": "Lifetime, sample of CO machine 0 valid"
+  },
+  {,
+    "EventCode": "0x468AC",
+    "EventName": "PM_L3_CO0_BUSY",
+    "BriefDescription": "Lifetime, sample of CO machine 0 valid"
+  },
+  {,
+    "EventCode": "0x2E040",
+    "EventName": "PM_DPTEG_FROM_L2_MEPF",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x1D152",
+    "EventName": "PM_MRK_DATA_FROM_DL4",
+    "BriefDescription": "The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to a marked load"
+  },
+  {,
+    "EventCode": "0x46880",
+    "EventName": "PM_ISIDE_MRU_TOUCH",
+    "BriefDescription": "I-side L2 MRU touch sent to L2 for this thread"
+  },
+  {,
+    "EventCode": "0x1C05C",
+    "EventName": "PM_DTLB_MISS_2M",
+    "BriefDescription": "Data TLB reload (after a miss) page size 2M. Implies radix translation was used"
+  },
+  {,
+    "EventCode": "0x50B8",
+    "EventName": "PM_TAGE_OVERRIDE_WRONG",
+    "BriefDescription": "The TAGE overrode BHT direction prediction but it was incorrect.  Counted at completion for taken branches only"
+  },
+  {,
+    "EventCode": "0x160AE",
+    "EventName": "PM_L3_P0_PF_RTY",
+    "BriefDescription": "L3 PF received retry port 0, every retry counted"
+  },
+  {,
+    "EventCode": "0x260AE",
+    "EventName": "PM_L3_P0_PF_RTY",
+    "BriefDescription": "L3 PF received retry port 0, every retry counted"
+  },
+  {,
+    "EventCode": "0x268B2",
+    "EventName": "PM_L3_LOC_GUESS_WRONG",
+    "BriefDescription": "Initial scope=node (LNS) but data from out side local node (near or far or rem). Prediction too Low"
+  },
+  {,
+    "EventCode": "0x36088",
+    "EventName": "PM_L2_SYS_GUESS_CORRECT",
+    "BriefDescription": "L2 guess system (VGS or RNS) and guess was correct (ie data beyond-group)"
+  },
+  {,
+    "EventCode": "0x589C",
+    "EventName": "PM_PTESYNC",
+    "BriefDescription": "ptesync instruction counted when the instruction is decoded and transmitted"
+  },
+  {,
+    "EventCode": "0x26086",
+    "EventName": "PM_CO_TM_SC_FOOTPRINT",
+    "BriefDescription": "L2 did a cleanifdirty CO to the L3 (ie created an SC line in the L3) OR L2 TM_store hit dirty HPC line and L3 indicated SC line formed in L3 on RDR bus"
+  },
+  {,
+    "EventCode": "0x1E05A",
+    "EventName": "PM_CMPLU_STALL_ANY_SYNC",
+    "BriefDescription": "Cycles in which the NTC sync instruction (isync, lwsync or hwsync) is not allowed to complete"
+  },
+  {,
+    "EventCode": "0xF090",
+    "EventName": "PM_LSU0_L1_CAM_CANCEL",
+    "BriefDescription": "ls0 l1 tm cam cancel"
+  },
+  {,
+    "EventCode": "0xC0A8",
+    "EventName": "PM_LSU_FLUSH_CI",
+    "BriefDescription": "Load was not issued to LSU as a cache inhibited (non-cacheable) load but it was later determined to be cache inhibited"
+  },
+  {,
+    "EventCode": "0x20AC",
+    "EventName": "PM_TM_FAIL_CONF_TM",
+    "BriefDescription": "TM aborted because a conflict occurred with another transaction."
+  },
+  {,
+    "EventCode": "0x588C",
+    "EventName": "PM_SHL_ST_DEP_CREATED",
+    "BriefDescription": "Store-Hit-Load Table Read Hit with entry Enabled"
+  },
+  {,
+    "EventCode": "0x360AC",
+    "EventName": "PM_L3_SN0_BUSY",
+    "BriefDescription": "Lifetime, sample of snooper machine 0 valid"
+  },
+  {,
+    "EventCode": "0x460AC",
+    "EventName": "PM_L3_SN0_BUSY",
+    "BriefDescription": "Lifetime, sample of snooper machine 0 valid"
+  },
+  {,
+    "EventCode": "0x3005C",
+    "EventName": "PM_BFU_BUSY",
+    "BriefDescription": "Cycles in which all 4 Binary Floating Point units are busy. The BFU is running at capacity"
+  },
+  {,
+    "EventCode": "0x48A0",
+    "EventName": "PM_BR_PRED_PCACHE",
+    "BriefDescription": "Conditional branch completed that used pattern cache prediction"
+  },
+  {,
+    "EventCode": "0x26880",
+    "EventName": "PM_L2_ST_MISS",
+    "BriefDescription": "All successful D-Side Store dispatches that were an L2 miss for this thread"
+  },
+  {,
+    "EventCode": "0xF8B4",
+    "EventName": "PM_DC_PREF_XCONS_ALLOC",
+    "BriefDescription": "Prefetch stream allocated in the Ultra conservative phase by either the hardware prefetch mechanism or software prefetch"
+  },
+  {,
+    "EventCode": "0x35048",
+    "EventName": "PM_IPTEG_FROM_DL2L3_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x260A8",
+    "EventName": "PM_L3_PF_HIT_L3",
+    "BriefDescription": "L3 PF hit in L3 (abandoned)"
+  },
+  {,
+    "EventCode": "0x360B4",
+    "EventName": "PM_L3_PF0_BUSY",
+    "BriefDescription": "Lifetime, sample of PF machine 0 valid"
+  },
+  {,
+    "EventCode": "0x460B4",
+    "EventName": "PM_L3_PF0_BUSY",
+    "BriefDescription": "Lifetime, sample of PF machine 0 valid"
+  },
+  {,
+    "EventCode": "0xC0B0",
+    "EventName": "PM_LSU_FLUSH_UE",
+    "BriefDescription": "Correctable ECC error on reload data, reported at critical data forward time"
+  },
+  {,
+    "EventCode": "0x4013A",
+    "EventName": "PM_MRK_IC_MISS",
+    "BriefDescription": "Marked instruction experienced I cache miss"
+  },
+  {,
+    "EventCode": "0x2088",
+    "EventName": "PM_FLUSH_DISP_SB",
+    "BriefDescription": "Dispatch Flush: Scoreboard"
+  },
+  {,
+    "EventCode": "0x401E8",
+    "EventName": "PM_MRK_DATA_FROM_L2MISS",
+    "BriefDescription": "The processor's data cache was reloaded from a location other than the local core's L2 due to a marked load"
+  },
+  {,
+    "EventCode": "0x3688E",
+    "EventName": "PM_TM_ST_CAUSED_FAIL",
+    "BriefDescription": "TM Store (fav or non-fav) caused another thread to fail"
+  },
+  {,
+    "EventCode": "0x460B2",
+    "EventName": "PM_L3_SYS_GUESS_WRONG",
+    "BriefDescription": "Initial scope=system (VGS or RNS) but data from local or near. Prediction too high"
+  },
+  {,
+    "EventCode": "0x58B8",
+    "EventName": "PM_TAGE_OVERRIDE_WRONG_SPEC",
+    "BriefDescription": "The TAGE overrode BHT direction prediction but it was incorrect.   Includes taken and not taken and is counted at execution time"
+  },
+  {,
+    "EventCode": "0xE890",
+    "EventName": "PM_LSU3_ERAT_HIT",
+    "BriefDescription": "Primary ERAT hit.  There is no secondary ERAT"
+  },
+  {,
+    "EventCode": "0x2898",
+    "EventName": "PM_TM_TABORT_TRECLAIM",
+    "BriefDescription": "Completion time tabortnoncd, tabortcd, treclaim"
+  },
+  {,
+    "EventCode": "0x4C054",
+    "EventName": "PM_DERAT_MISS_16G",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 16G"
+  },
+  {,
+    "EventCode": "0x268A0",
+    "EventName": "PM_L3_CO_L31",
+    "BriefDescription": "L3 CO to L3.1 OR of port 0 and 1 (lossy = may undercount if two cresps come in the same cyc)"
+  },
+  {,
+    "EventCode": "0x5080",
+    "EventName": "PM_THRD_PRIO_4_5_CYC",
+    "BriefDescription": "Cycles thread running at priority level 4 or 5"
+  },
+  {,
+    "EventCode": "0x2505C",
+    "EventName": "PM_VSU_FIN",
+    "BriefDescription": "VSU instruction finished. Up to 4 per cycle"
+  },
+  {,
+    "EventCode": "0x40A4",
+    "EventName": "PM_BR_PRED_CCACHE",
+    "BriefDescription": "Conditional Branch Completed that used the Count Cache for Target Prediction"
+  },
+  {,
+    "EventCode": "0x2E04A",
+    "EventName": "PM_DPTEG_FROM_RL4",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4D12E",
+    "EventName": "PM_MRK_DATA_FROM_DL2L3_MOD_CYC",
+    "BriefDescription": "Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load"
+  },
+  {,
+    "EventCode": "0xC8B4",
+    "EventName": "PM_LSU_FLUSH_LHL_SHL",
+    "BriefDescription": "The instruction was flushed because of a sequential load/store consistency.  If a load or store hits on an older load that has either been snooped (for loads) or has stale data (for stores)."
+  },
+  {,
+    "EventCode": "0x58A4",
+    "EventName": "PM_FLUSH_LSU",
+    "BriefDescription": "LSU flushes.  Includes all lsu flushes"
+  },
+  {,
+    "EventCode": "0x1D150",
+    "EventName": "PM_MRK_DATA_FROM_DL2L3_SHR",
+    "BriefDescription": "The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked load"
+  },
+  {,
+    "EventCode": "0xC8A0",
+    "EventName": "PM_LSU1_FALSE_LHS",
+    "BriefDescription": "False LHS match detected"
+  },
+  {,
+    "EventCode": "0x48BC",
+    "EventName": "PM_THRD_PRIO_2_3_CYC",
+    "BriefDescription": "Cycles thread running at priority level 2 or 3"
+  },
+  {,
+    "EventCode": "0x10134",
+    "EventName": "PM_MRK_ST_DONE_L2",
+    "BriefDescription": "marked store completed in L2 ( RC machine done)"
+  },
+  {,
+    "EventCode": "0x368B2",
+    "EventName": "PM_L3_GRP_GUESS_WRONG_HIGH",
+    "BriefDescription": "Initial scope=group (GS or NNS) but data from local node. Prediction too high"
+  },
+  {,
+    "EventCode": "0xE8BC",
+    "EventName": "PM_LS1_PTE_TABLEWALK_CYC",
+    "BriefDescription": "Cycles when a tablewalk is pending on this thread on table 1"
+  },
+  {,
+    "EventCode": "0x1F152",
+    "EventName": "PM_MRK_FAB_RSP_BKILL_CYC",
+    "BriefDescription": "cycles L2 RC took for a bkill"
+  },
+  {,
+    "EventCode": "0x4C124",
+    "EventName": "PM_MRK_DATA_FROM_L3_NO_CONFLICT_CYC",
+    "BriefDescription": "Duration in cycles to reload from local core's L3 without conflict due to a marked load"
+  },
+  {,
+    "EventCode": "0x2F14A",
+    "EventName": "PM_MRK_DPTEG_FROM_RL4",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group ( Remote) due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x26888",
+    "EventName": "PM_L2_GRP_GUESS_WRONG",
+    "BriefDescription": "L2 guess grp (GS or NNS) and guess was not correct (ie data on-chip OR beyond-group)"
+  },
+  {,
+    "EventCode": "0x368AE",
+    "EventName": "PM_L3_P1_CO_RTY",
+    "BriefDescription": "L3 CO received retry port 1 (memory only), every retry counted"
+  },
+  {,
+    "EventCode": "0x468AE",
+    "EventName": "PM_L3_P1_CO_RTY",
+    "BriefDescription": "L3 CO received retry port 3 (memory only), every retry counted"
+  },
+  {,
+    "EventCode": "0xC0AC",
+    "EventName": "PM_LSU_FLUSH_EMSH",
+    "BriefDescription": "An ERAT miss was detected after a set-p hit. Erat tracker indicates fail due to tlbmiss and the instruction gets flushed because the instruction was working on the wrong address"
+  },
+  {,
+    "EventCode": "0x260B2",
+    "EventName": "PM_L3_SYS_GUESS_CORRECT",
+    "BriefDescription": "Initial scope=system (VGS or RNS) and data from outside group (far or rem)(pred successful)"
+  },
+  {,
+    "EventCode": "0x1D146",
+    "EventName": "PM_MRK_DATA_FROM_MEMORY_CYC",
+    "BriefDescription": "Duration in cycles to reload from a memory location including L4 from local remote or distant due to a marked load"
+  },
+  {,
+    "EventCode": "0xE094",
+    "EventName": "PM_LSU0_TM_L1_HIT",
+    "BriefDescription": "Load tm hit in L1"
+  },
+  {,
+    "EventCode": "0x46888",
+    "EventName": "PM_L2_GROUP_PUMP",
+    "BriefDescription": "RC requests that were on group (aka nodel) pump attempts"
+  },
+  {,
+    "EventCode": "0xF0B0",
+    "EventName": "PM_L3_LD_PREF",
+    "BriefDescription": "L3 load prefetch, sourced from a hardware or software stream, was sent to the nest"
+  },
+  {,
+    "EventCode": "0x16080",
+    "EventName": "PM_L2_LD",
+    "BriefDescription": "All successful D-side Load dispatches for this thread (L2 miss + L2 hits)"
+  },
+  {,
+    "EventCode": "0x4505C",
+    "EventName": "PM_MATH_FLOP_CMPL",
+    "BriefDescription": "Math flop instruction completed"
+  },
+  {,
+    "EventCode": "0x368B0",
+    "EventName": "PM_L3_P1_SYS_PUMP",
+    "BriefDescription": "L3 PF sent with sys scope port 1, counts even retried requests"
+  },
+  {,
+    "EventCode": "0x1F146",
+    "EventName": "PM_MRK_DPTEG_FROM_L31_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L3 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x2000C",
+    "EventName": "PM_THRD_ALL_RUN_CYC",
+    "BriefDescription": "Cycles in which all the threads have the run latch set"
+  },
+  {,
+    "EventCode": "0xC0BC",
+    "EventName": "PM_LSU_FLUSH_OTHER",
+    "BriefDescription": "Other LSU flushes including: Sync (sync ack from L2 caused search of LRQ for oldest snooped load, This will either signal a Precise Flush of the oldest snooped load or a Flush Next PPC); Data Valid Flush Next (several cases of this, one example is store and reload are lined up such that a store-hit-reload scenario exists and the CDF has already launched and has gotten bad/stale data); Bad Data Valid Flush Next (might be a few cases of this, one example is a larxa (D$ hit) return data and dval but can't allocate to LMQ (LMQ full or other reason). Already gave dval but can't watch it for snoop_hit_larx. Need to take the “bad dval” back and flush all younger ops)"
+  },
+  {,
+    "EventCode": "0x5094",
+    "EventName": "PM_IC_MISS_ICBI",
+    "BriefDescription": "threaded version, IC Misses where we got EA dir hit but no sector valids were on. ICBI took line out"
+  },
+  {,
+    "EventCode": "0xC8A8",
+    "EventName": "PM_LSU_FLUSH_ATOMIC",
+    "BriefDescription": "Quad-word loads (lq) are considered atomic because they always span at least 2 slices.  If a snoop or store from another thread changes the data the load is accessing between the 2 or 3 pieces of the lq instruction, the lq will be flushed"
+  },
+  {,
+    "EventCode": "0x1E04E",
+    "EventName": "PM_DPTEG_FROM_L2MISS",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from a location other than the local core's L2 due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4D05E",
+    "EventName": "PM_BR_CMPL",
+    "BriefDescription": "Any Branch instruction completed"
+  },
+  {,
+    "EventCode": "0x260B0",
+    "EventName": "PM_L3_P0_GRP_PUMP",
+    "BriefDescription": "L3 PF sent with grp scope port 0, counts even retried requests"
+  },
+  {,
+    "EventCode": "0x30132",
+    "EventName": "PM_MRK_VSU_FIN",
+    "BriefDescription": "VSU marked instr finish"
+  },
+  {,
+    "EventCode": "0x2D120",
+    "EventName": "PM_MRK_DATA_FROM_OFF_CHIP_CACHE",
+    "BriefDescription": "The processor's data cache was reloaded with either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked load"
+  },
+  {,
+    "EventCode": "0x1E048",
+    "EventName": "PM_DPTEG_FROM_ON_CHIP_CACHE",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with either shared or modified data from another core's L2/L3 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x16086",
+    "EventName": "PM_L2_SN_M_WR_DONE",
+    "BriefDescription": "SNP dispatched for a write and was M (true M); for DMA cacheinj this will pulse if rty/push is required (won't pulse if cacheinj is accepted)"
+  },
+  {,
+    "EventCode": "0x46886",
+    "EventName": "PM_L2_SN_M_WR_DONE",
+    "BriefDescription": "SNP dispatched for a write and was M (true M); for DMA cacheinj this will pulse if rty/push is required (won't pulse if cacheinj is accepted)"
+  },
+  {,
+    "EventCode": "0x489C",
+    "EventName": "PM_BR_CORECT_PRED_TAKEN_CMPL",
+    "BriefDescription": "Conditional Branch Completed in which the HW correctly predicted the direction as taken.  Counted at completion time"
+  },
+  {,
+    "EventCode": "0xF0B8",
+    "EventName": "PM_LS0_UNALIGNED_ST",
+    "BriefDescription": "Store instructions whose data crosses a double-word boundary, which causes it to require an additional slice than what normally would be required of the Store of that size.  If the Store wraps from slice 3 to slice 0, there is an additional 3-cycle penalty"
+  },
+  {,
+    "EventCode": "0x20132",
+    "EventName": "PM_MRK_DFU_FIN",
+    "BriefDescription": "Decimal Unit marked Instruction Finish"
+  },
+  {,
+    "EventCode": "0x160A6",
+    "EventName": "PM_TM_SC_CO",
+    "BriefDescription": "L3 castout TM SC line"
+  },
+  {,
+    "EventCode": "0xC8B0",
+    "EventName": "PM_LSU_FLUSH_LHS",
+    "BriefDescription": "Effective Address alias flush : no EA match but Real Address match.  If the data has not yet been returned for this load, the instruction will just be rejected, but if it has returned data, it will be flushed"
+  },
+  {,
+    "EventCode": "0x3F150",
+    "EventName": "PM_MRK_ST_DRAIN_TO_L2DISP_CYC",
+    "BriefDescription": "cycles to drain st from core to L2"
+  },
+  {,
+    "EventCode": "0x168A4",
+    "EventName": "PM_L3_MISS",
+    "BriefDescription": "L3 Misses (L2 miss also missing L3, including data/instrn/xlate)"
+  },
+  {,
+    "EventCode": "0xF080",
+    "EventName": "PM_LSU_STCX_FAIL",
+    "BriefDescription": ""
+  },
+  {,
+    "EventCode": "0x30038",
+    "EventName": "PM_CMPLU_STALL_DMISS_LMEM",
+    "BriefDescription": "Completion stall due to cache miss that resolves in local memory"
+  },
+  {,
+    "EventCode": "0x28A4",
+    "EventName": "PM_MRK_TEND_FAIL",
+    "BriefDescription": "Nested or not nested tend failed for a marked tend instruction"
+  },
+  {,
+    "EventCode": "0x100FC",
+    "EventName": "PM_LD_REF_L1",
+    "BriefDescription": "All L1 D cache load references counted at finish, gated by reject"
+  },
+  {,
+    "EventCode": "0xC0A0",
+    "EventName": "PM_LSU0_FALSE_LHS",
+    "BriefDescription": "False LHS match detected"
+  },
+  {,
+    "EventCode": "0x468A8",
+    "EventName": "PM_SN_MISS",
+    "BriefDescription": "Any port snooper L3 miss or collision.  Up to 4 can happen in a cycle but we only count 1"
+  },
+  {,
+    "EventCode": "0x36888",
+    "EventName": "PM_L2_SYS_GUESS_WRONG",
+    "BriefDescription": "L2 guess system (VGS or RNS) and guess was not correct (ie data ^beyond-group)"
+  },
+  {,
+    "EventCode": "0x2080",
+    "EventName": "PM_EE_OFF_EXT_INT",
+    "BriefDescription": "Cycles MSR[EE] is off and external interrupts are active"
+  },
+  {,
+    "EventCode": "0xE8B8",
+    "EventName": "PM_LS3_TM_DISALLOW",
+    "BriefDescription": "A TM-ineligible instruction tries to execute inside a transaction and the LSU disallows it"
+  },
+  {,
+    "EventCode": "0x2688E",
+    "EventName": "PM_TM_FAV_CAUSED_FAIL",
+    "BriefDescription": "TM Load (fav) caused another thread to fail"
+  },
+  {,
+    "EventCode": "0x16090",
+    "EventName": "PM_SN0_BUSY",
+    "BriefDescription": "SN mach 0 Busy. Used by PMU to sample ave SN lifetime (mach0 used as sample point)"
+  },
+  {,
+    "EventCode": "0x26090",
+    "EventName": "PM_SN0_BUSY",
+    "BriefDescription": "SN mach 0 Busy. Used by PMU to sample ave SN lifetime (mach0 used as sample point)"
+  },
+  {,
+    "EventCode": "0x360AE",
+    "EventName": "PM_L3_P0_CO_RTY",
+    "BriefDescription": "L3 CO received retry port 0 (memory only), every retry counted"
+  },
+  {,
+    "EventCode": "0x460AE",
+    "EventName": "PM_L3_P0_CO_RTY",
+    "BriefDescription": "L3 CO received retry port 0 (memory only), every retry counted"
+  },
+  {,
+    "EventCode": "0x168A8",
+    "EventName": "PM_L3_WI_USAGE",
+    "BriefDescription": "Lifetime, sample of Write Inject machine 0 valid"
+  },
+  {,
+    "EventCode": "0x468A2",
+    "EventName": "PM_L3_LAT_CI_MISS",
+    "BriefDescription": "L3 Lateral Castins Miss"
+  },
+  {,
+    "EventCode": "0x4090",
+    "EventName": "PM_IC_PREF_CANCEL_PAGE",
+    "BriefDescription": "Prefetch Canceled due to page boundary"
+  },
+  {,
+    "EventCode": "0xF09C",
+    "EventName": "PM_SLB_TABLEWALK_CYC",
+    "BriefDescription": "Cycles when a tablewalk is pending on this thread on the SLB table"
+  },
+  {,
+    "EventCode": "0x460AA",
+    "EventName": "PM_L3_P0_CO_L31",
+    "BriefDescription": "L3 CO to L3.1 (LCO) port 0 with or without data"
+  },
+  {,
+    "EventCode": "0x2880",
+    "EventName": "PM_FLUSH_DISP",
+    "BriefDescription": "Dispatch flush"
+  },
+  {,
+    "EventCode": "0x168AE",
+    "EventName": "PM_L3_P1_PF_RTY",
+    "BriefDescription": "L3 PF received retry port 1, every retry counted"
+  },
+  {,
+    "EventCode": "0x268AE",
+    "EventName": "PM_L3_P1_PF_RTY",
+    "BriefDescription": "L3 PF received retry port 3, every retry counted"
+  },
+  {,
+    "EventCode": "0x46082",
+    "EventName": "PM_L2_ST_DISP",
+    "BriefDescription": "All successful D-side store dispatches for this thread "
+  },
+  {,
+    "EventCode": "0x1689E",
+    "EventName": "PM_L2_ST_DISP",
+    "BriefDescription": "All successful D-side store dispatches for this thread (L2 miss + L2 hits)"
+  },
+  {,
+    "EventCode": "0x36880",
+    "EventName": "PM_L2_INST_MISS",
+    "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)"
+  },
+  {,
+    "EventCode": "0x4609E",
+    "EventName": "PM_L2_INST_MISS",
+    "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)"
+  },
+  {,
+    "EventCode": "0xE084",
+    "EventName": "PM_LS0_ERAT_MISS_PREF",
+    "BriefDescription": "LS0 Erat miss due to prefetch"
+  },
+  {,
+    "EventCode": "0x409C",
+    "EventName": "PM_BR_PRED",
+    "BriefDescription": "Conditional Branch Executed in which the HW predicted the Direction or Target.  Includes taken and not taken and is counted at execution time"
+  },
+  {,
+    "EventCode": "0x2D144",
+    "EventName": "PM_MRK_DATA_FROM_L31_MOD",
+    "BriefDescription": "The processor's data cache was reloaded with Modified (M) data from another core's L3 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x360A4",
+    "EventName": "PM_L3_CO_LCO",
+    "BriefDescription": "Total L3 COs occurred on LCO L3.1 (good cresp, may end up in mem on a retry)"
+  },
+  {,
+    "EventCode": "0x4890",
+    "EventName": "PM_IC_PREF_CANCEL_HIT",
+    "BriefDescription": "Prefetch Canceled due to icache hit"
+  },
+  {,
+    "EventCode": "0x268A8",
+    "EventName": "PM_RD_HIT_PF",
+    "BriefDescription": "RD machine hit L3 PF machine"
+  },
+  {,
+    "EventCode": "0x16880",
+    "EventName": "PM_L2_ST",
+    "BriefDescription": "All successful D-side store dispatches for this thread (L2 miss + L2 hits)"
+  },
+  {,
+    "EventCode": "0x4098",
+    "EventName": "PM_IC_DEMAND_L2_BHT_REDIRECT",
+    "BriefDescription": "L2 I cache demand request due to BHT redirect, branch redirect ( 2 bubbles 3 cycles)"
+  },
+  {,
+    "EventCode": "0xD0B4",
+    "EventName": "PM_LSU0_SRQ_S0_VALID_CYC",
+    "BriefDescription": "Slot 0 of SRQ valid"
+  },
+  {,
+    "EventCode": "0x160AA",
+    "EventName": "PM_L3_P0_LCO_NO_DATA",
+    "BriefDescription": "Dataless L3 LCO sent port 0"
+  },
+  {,
+    "EventCode": "0x208C",
+    "EventName": "PM_CLB_HELD",
+    "BriefDescription": "CLB (control logic block - indicates quadword fetch block) Hold: Any Reason"
+  },
+  {,
+    "EventCode": "0xF88C",
+    "EventName": "PM_LSU3_STORE_REJECT",
+    "BriefDescription": "All internal store rejects cause the instruction to go back to the SRQ and go to sleep until woken up to try again after the condition has been met"
+  },
+  {,
+    "EventCode": "0x200F2",
+    "EventName": "PM_INST_DISP",
+    "BriefDescription": "# PPC Dispatched"
+  },
+  {,
+    "EventCode": "0x300F2",
+    "EventName": "PM_INST_DISP",
+    "BriefDescription": "# PPC Dispatched"
+  },
+  {,
+    "EventCode": "0x4E05E",
+    "EventName": "PM_TM_OUTER_TBEGIN_DISP",
+    "BriefDescription": "Number of outer tbegin instructions dispatched. The dispatch unit determines whether the tbegin instruction is outer or nested. This is a speculative count, which includes flushed instructions"
+  },
+  {,
+    "EventCode": "0x2D018",
+    "EventName": "PM_CMPLU_STALL_EXEC_UNIT",
+    "BriefDescription": "Completion stall due to execution units (FXU/VSU/CRU)"
+  },
+  {,
+    "EventCode": "0x20B0",
+    "EventName": "PM_LSU_FLUSH_NEXT",
+    "BriefDescription": "LSU flush next reported at flush time.  Sometimes these also come with an exception"
+  },
+  {,
+    "EventCode": "0x3880",
+    "EventName": "PM_ISU2_ISS_HOLD_ALL",
+    "BriefDescription": "All ISU rejects"
+  },
+  {,
+    "EventCode": "0x46882",
+    "EventName": "PM_L2_ST_HIT",
+    "BriefDescription": "All successful D-side store dispatches for this thread that were L2 hits"
+  },
+  {,
+    "EventCode": "0x2689E",
+    "EventName": "PM_L2_ST_HIT",
+    "BriefDescription": "All successful D-side store dispatches that were L2 hits for this thread"
+  },
+  {,
+    "EventCode": "0x360A8",
+    "EventName": "PM_L3_CO",
+    "BriefDescription": "L3 castout occurring (does not include casthrough or log writes (cinj/dmaw))"
+  },
+  {,
+    "EventCode": "0x368A4",
+    "EventName": "PM_L3_CINJ",
+    "BriefDescription": "L3 castin of cache inject"
+  },
+  {,
+    "EventCode": "0xC890",
+    "EventName": "PM_LSU_NCST",
+    "BriefDescription": "Asserts when a i=1 store op is sent to the nest. No record of issue pipe (LS0/LS1) is maintained so this is for both pipes. Probably don't need separate LS0 and LS1"
+  },
+  {,
+    "EventCode": "0xD880",
+    "EventName": "PM_LSU1_SET_MPRED",
+    "BriefDescription": "Set prediction(set-p) miss.  The entry was not found in the Set prediction table"
+  },
+  {,
+    "EventCode": "0xD0B8",
+    "EventName": "PM_LSU_LMQ_FULL_CYC",
+    "BriefDescription": "Counts the number of cycles the LMQ is full"
+  },
+  {,
+    "EventCode": "0x168B2",
+    "EventName": "PM_L3_GRP_GUESS_CORRECT",
+    "BriefDescription": "Initial scope=group (GS or NNS) and data from same group (near) (pred successful)"
+  },
+  {,
+    "EventCode": "0x48A4",
+    "EventName": "PM_STOP_FETCH_PENDING_CYC",
+    "BriefDescription": "Fetching is stopped due to an incoming instruction that will result in a flush"
+  },
+  {,
+    "EventCode": "0x36884",
+    "EventName": "PM_L2_RCST_DISP_FAIL_ADDR",
+    "BriefDescription": "All D-side store dispatch attempts for this thread that failed due to address collision with RC/CO/SN/SQ"
+  },
+  {,
+    "EventCode": "0x260AC",
+    "EventName": "PM_L3_PF_USAGE",
+    "BriefDescription": "Rotating sample of 32 PF actives"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json b/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json
new file mode 100644
index 000000000000..47a82568a8df
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json
@@ -0,0 +1,557 @@
+[
+  {,
+    "EventCode": "0x4D04C",
+    "EventName": "PM_DFU_BUSY",
+    "BriefDescription": "Cycles in which all 4 Decimal Floating Point units are busy. The DFU is running at capacity"
+  },
+  {,
+    "EventCode": "0x100F6",
+    "EventName": "PM_IERAT_RELOAD",
+    "BriefDescription": "Number of I-ERAT reloads"
+  },
+  {,
+    "EventCode": "0x201E2",
+    "EventName": "PM_MRK_LD_MISS_L1",
+    "BriefDescription": "Marked DL1 Demand Miss counted at exec time. Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load."
+  },
+  {,
+    "EventCode": "0x40010",
+    "EventName": "PM_PMC3_OVERFLOW",
+    "BriefDescription": "Overflow from counter 3"
+  },
+  {,
+    "EventCode": "0x1005A",
+    "EventName": "PM_CMPLU_STALL_DFLONG",
+    "BriefDescription": "Finish stall because the NTF instruction was a multi-cycle instruction issued to the Decimal Floating Point execution pipe and waiting to finish. Includes decimal floating point instructions + 128 bit binary floating point instructions. Qualified by multicycle"
+  },
+  {,
+    "EventCode": "0x4D140",
+    "EventName": "PM_MRK_DATA_FROM_ON_CHIP_CACHE",
+    "BriefDescription": "The processor's data cache was reloaded with either shared or modified data from another core's L2/L3 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x3F14C",
+    "EventName": "PM_MRK_DPTEG_FROM_DL4",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x1E040",
+    "EventName": "PM_DPTEG_FROM_L2_NO_CONFLICT",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x24052",
+    "EventName": "PM_FXU_IDLE",
+    "BriefDescription": "Cycles in which FXU0, FXU1, FXU2, and FXU3 are all idle"
+  },
+  {,
+    "EventCode": "0x1E054",
+    "EventName": "PM_CMPLU_STALL",
+    "BriefDescription": "Nothing completed and ICT not empty"
+  },
+  {,
+    "EventCode": "0x2",
+    "EventName": "PM_INST_CMPL",
+    "BriefDescription": "Number of PowerPC Instructions that completed."
+  },
+  {,
+    "EventCode": "0x3D058",
+    "EventName": "PM_VSU_DP_FSQRT_FDIV",
+    "BriefDescription": "vector versions of fdiv,fsqrt"
+  },
+  {,
+    "EventCode": "0x10006",
+    "EventName": "PM_DISP_HELD",
+    "BriefDescription": "Dispatch Held"
+  },
+  {,
+    "EventCode": "0x3D154",
+    "EventName": "PM_MRK_DERAT_MISS_16M",
+    "BriefDescription": "Marked Data ERAT Miss (Data TLB Access) page size 16M"
+  },
+  {,
+    "EventCode": "0x200F8",
+    "EventName": "PM_EXT_INT",
+    "BriefDescription": "external interrupt"
+  },
+  {,
+    "EventCode": "0x20008",
+    "EventName": "PM_ICT_EMPTY_CYC",
+    "BriefDescription": "Cycles in which the ICT is completely empty. No itags are assigned to any thread"
+  },
+  {,
+    "EventCode": "0x4F146",
+    "EventName": "PM_MRK_DPTEG_FROM_L21_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x10056",
+    "EventName": "PM_MEM_READ",
+    "BriefDescription": "Reads from Memory from this thread (includes data/inst/xlate/l1prefetch/inst prefetch). Includes L4"
+  },
+  {,
+    "EventCode": "0x3C04C",
+    "EventName": "PM_DATA_FROM_DL4",
+    "BriefDescription": "The processor's data cache was reloaded from another chip's L4 on a different Node or Group (Distant) due to a demand load"
+  },
+  {,
+    "EventCode": "0x4E046",
+    "EventName": "PM_DPTEG_FROM_L21_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L2 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x2E016",
+    "EventName": "PM_NTC_ISSUE_HELD_ARB",
+    "BriefDescription": "The NTC instruction is being held at dispatch because it lost arbitration onto the issue pipe to another instruction (from the same thread or a different thread)"
+  },
+  {,
+    "EventCode": "0x15156",
+    "EventName": "PM_SYNC_MRK_FX_DIVIDE",
+    "BriefDescription": "Marked fixed point divide that can cause a synchronous interrupt"
+  },
+  {,
+    "EventCode": "0x1C056",
+    "EventName": "PM_DERAT_MISS_4K",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 4K"
+  },
+  {,
+    "EventCode": "0x2F142",
+    "EventName": "PM_MRK_DPTEG_FROM_L3_MEPF",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x10024",
+    "EventName": "PM_PMC5_OVERFLOW",
+    "BriefDescription": "Overflow from counter 5"
+  },
+  {,
+    "EventCode": "0x2C018",
+    "EventName": "PM_CMPLU_STALL_DMISS_L21_L31",
+    "BriefDescription": "Completion stall by Dcache miss which resolved on chip ( excluding local L2/L3)"
+  },
+  {,
+    "EventCode": "0x4006A",
+    "EventName": "PM_IERAT_RELOAD_16M",
+    "BriefDescription": "IERAT Reloaded (Miss) for a 16M page"
+  },
+  {,
+    "EventCode": "0x4E010",
+    "EventName": "PM_ICT_NOSLOT_IC_L3MISS",
+    "BriefDescription": "Ict empty for this thread due to icache misses that were sourced from beyond the local L3. The source could be local/remote/distant memory or another core's cache"
+  },
+  {,
+    "EventCode": "0x4D01C",
+    "EventName": "PM_ICT_NOSLOT_DISP_HELD_SYNC",
+    "BriefDescription": "Dispatch held due to a synchronizing instruction at dispatch"
+  },
+  {,
+    "EventCode": "0x2D01A",
+    "EventName": "PM_ICT_NOSLOT_IC_MISS",
+    "BriefDescription": "Ict empty for this thread due to Icache Miss"
+  },
+  {,
+    "EventCode": "0x3D152",
+    "EventName": "PM_MRK_DERAT_MISS_1G",
+    "BriefDescription": "Marked Data ERAT Miss (Data TLB Access) page size 1G. Implies radix translation"
+  },
+  {,
+    "EventCode": "0x4F14A",
+    "EventName": "PM_MRK_DPTEG_FROM_OFF_CHIP_CACHE",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x30058",
+    "EventName": "PM_TLBIE_FIN",
+    "BriefDescription": "tlbie finished"
+  },
+  {,
+    "EventCode": "0x100F8",
+    "EventName": "PM_ICT_NOSLOT_CYC",
+    "BriefDescription": "Number of cycles the ICT has no itags assigned to this thread"
+  },
+  {,
+    "EventCode": "0x3E042",
+    "EventName": "PM_DPTEG_FROM_L3_DISP_CONFLICT",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x1F140",
+    "EventName": "PM_MRK_DPTEG_FROM_L2_NO_CONFLICT",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 without conflict due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x2C05A",
+    "EventName": "PM_DERAT_MISS_1G",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 1G. Implies radix translation"
+  },
+  {,
+    "EventCode": "0x1F058",
+    "EventName": "PM_RADIX_PWC_L2_PTE_FROM_L2",
+    "BriefDescription": "A Page Table Entry was reloaded to a level 2 page walk cache from the core's L2 data cache. This implies that level 3 and level 4 PWC accesses were not necessary for this translation"
+  },
+  {,
+    "EventCode": "0x1D14A",
+    "EventName": "PM_MRK_DATA_FROM_RL2L3_MOD",
+    "BriefDescription": "The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x10050",
+    "EventName": "PM_CHIP_PUMP_CPRED",
+    "BriefDescription": "Initial and Final Pump Scope was chip pump (prediction=correct) for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate)"
+  },
+  {,
+    "EventCode": "0x45058",
+    "EventName": "PM_IC_MISS_CMPL",
+    "BriefDescription": "Non-speculative icache miss, counted at completion"
+  },
+  {,
+    "EventCode": "0x2D150",
+    "EventName": "PM_MRK_DERAT_MISS_4K",
+    "BriefDescription": "Marked Data ERAT Miss (Data TLB Access) page size 4K"
+  },
+  {,
+    "EventCode": "0x34058",
+    "EventName": "PM_ICT_NOSLOT_BR_MPRED_ICMISS",
+    "BriefDescription": "Ict empty for this thread due to Icache Miss and branch mispred"
+  },
+  {,
+    "EventCode": "0x10022",
+    "EventName": "PM_PMC2_SAVED",
+    "BriefDescription": "PMC2 Rewind Value saved"
+  },
+  {,
+    "EventCode": "0x2000A",
+    "EventName": "PM_HV_CYC",
+    "BriefDescription": "Cycles in which msr_hv is high. Note that this event does not take msr_pr into consideration"
+  },
+  {,
+    "EventCode": "0x1F144",
+    "EventName": "PM_MRK_DPTEG_FROM_L3_NO_CONFLICT",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 without conflict due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x300FC",
+    "EventName": "PM_DTLB_MISS",
+    "BriefDescription": "Data PTEG reload"
+  },
+  {,
+    "EventCode": "0x2D152",
+    "EventName": "PM_MRK_DERAT_MISS_2M",
+    "BriefDescription": "Marked Data ERAT Miss (Data TLB Access) page size 2M. Implies radix translation"
+  },
+  {,
+    "EventCode": "0x2C046",
+    "EventName": "PM_DATA_FROM_RL2L3_MOD",
+    "BriefDescription": "The processor's data cache was reloaded with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a demand load"
+  },
+  {,
+    "EventCode": "0x20052",
+    "EventName": "PM_GRP_PUMP_MPRED",
+    "BriefDescription": "Final Pump Scope (Group) ended up either larger or smaller than Initial Pump Scope for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate)"
+  },
+  {,
+    "EventCode": "0x3F05A",
+    "EventName": "PM_RADIX_PWC_L2_PDE_FROM_L3",
+    "BriefDescription": "A Page Directory Entry was reloaded to a level 2 page walk cache from the core's L3 data cache"
+  },
+  {,
+    "EventCode": "0x1E04A",
+    "EventName": "PM_DPTEG_FROM_RL2L3_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x10064",
+    "EventName": "PM_ICT_NOSLOT_DISP_HELD_TBEGIN",
+    "BriefDescription": "the NTC instruction is being held at dispatch because it is a tbegin instruction and there is an older tbegin in the pipeline that must complete before the younger tbegin can dispatch"
+  },
+  {,
+    "EventCode": "0x2E046",
+    "EventName": "PM_DPTEG_FROM_RL2L3_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4F14C",
+    "EventName": "PM_MRK_DPTEG_FROM_DMEM",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x2E042",
+    "EventName": "PM_DPTEG_FROM_L3_MEPF",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 without dispatch conflicts hit on Mepf state due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x2D012",
+    "EventName": "PM_CMPLU_STALL_DFU",
+    "BriefDescription": "Finish stall because the NTF instruction was issued to the Decimal Floating Point execution pipe and waiting to finish. Includes decimal floating point instructions + 128 bit binary floating point instructions. Not qualified by multicycle"
+  },
+  {,
+    "EventCode": "0x4C04C",
+    "EventName": "PM_DATA_FROM_DMEM",
+    "BriefDescription": "The processor's data cache was reloaded from another chip's memory on the same Node or Group (Distant) due to a demand load"
+  },
+  {,
+    "EventCode": "0x30022",
+    "EventName": "PM_PMC4_SAVED",
+    "BriefDescription": "PMC4 Rewind Value saved (matched condition)"
+  },
+  {,
+    "EventCode": "0x200F4",
+    "EventName": "PM_RUN_CYC",
+    "BriefDescription": "Run_cycles"
+  },
+  {,
+    "EventCode": "0x400F2",
+    "EventName": "PM_1PLUS_PPC_DISP",
+    "BriefDescription": "Cycles at least one Instr Dispatched"
+  },
+  {,
+    "EventCode": "0x3D148",
+    "EventName": "PM_MRK_DATA_FROM_L21_MOD_CYC",
+    "BriefDescription": "Duration in cycles to reload with Modified (M) data from another core's L2 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x2F146",
+    "EventName": "PM_MRK_DPTEG_FROM_RL2L3_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4E01A",
+    "EventName": "PM_ICT_NOSLOT_DISP_HELD",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any reason"
+  },
+  {,
+    "EventCode": "0x401EC",
+    "EventName": "PM_THRESH_EXC_2048",
+    "BriefDescription": "Threshold counter exceeded a value of 2048"
+  },
+  {,
+    "EventCode": "0x35150",
+    "EventName": "PM_MRK_DATA_FROM_RL2L3_SHR",
+    "BriefDescription": "The processor's data cache was reloaded with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x3E052",
+    "EventName": "PM_ICT_NOSLOT_IC_L3",
+    "BriefDescription": "Ict empty for this thread due to icache misses that were sourced from the local L3"
+  },
+  {,
+    "EventCode": "0x2405A",
+    "EventName": "PM_NTC_FIN",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline (NTC) finishes. This event is used to account for cycles in which work is being completed in the CPI stack"
+  },
+  {,
+    "EventCode": "0x40052",
+    "EventName": "PM_PUMP_MPRED",
+    "BriefDescription": "Pump misprediction. Counts across all types of pumps for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate)"
+  },
+  {,
+    "EventCode": "0x30056",
+    "EventName": "PM_TM_ABORTS",
+    "BriefDescription": "Number of TM transactions aborted"
+  },
+  {,
+    "EventCode": "0x2404C",
+    "EventName": "PM_INST_FROM_MEMORY",
+    "BriefDescription": "The processor's Instruction cache was reloaded from a memory location including L4 from local remote or distant due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x1C05A",
+    "EventName": "PM_DERAT_MISS_2M",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 2M. Implies radix translation"
+  },
+  {,
+    "EventCode": "0x30024",
+    "EventName": "PM_PMC6_OVERFLOW",
+    "BriefDescription": "Overflow from counter 6"
+  },
+  {,
+    "EventCode": "0x10068",
+    "EventName": "PM_BRU_FIN",
+    "BriefDescription": "Branch Instruction Finished"
+  },
+  {,
+    "EventCode": "0x30020",
+    "EventName": "PM_PMC2_REWIND",
+    "BriefDescription": "PMC2 Rewind Event (did not match condition)"
+  },
+  {,
+    "EventCode": "0x40064",
+    "EventName": "PM_DUMMY2_REMOVE_ME",
+    "BriefDescription": "Space holder for LS_PC_RELOAD_RA"
+  },
+  {,
+    "EventCode": "0x3F148",
+    "EventName": "PM_MRK_DPTEG_FROM_DL2L3_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4D01E",
+    "EventName": "PM_ICT_NOSLOT_BR_MPRED",
+    "BriefDescription": "Ict empty for this thread due to branch mispred"
+  },
+  {,
+    "EventCode": "0x3405E",
+    "EventName": "PM_IFETCH_THROTTLE",
+    "BriefDescription": "Cycles in which Instruction fetch throttle was active."
+  },
+  {,
+    "EventCode": "0x1F148",
+    "EventName": "PM_MRK_DPTEG_FROM_ON_CHIP_CACHE",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with either shared or modified data from another core's L2/L3 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x3E046",
+    "EventName": "PM_DPTEG_FROM_L21_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another core's L2 on the same chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x2F144",
+    "EventName": "PM_MRK_DPTEG_FROM_L31_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's L3 on the same chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4C15C",
+    "EventName": "PM_MRK_DERAT_MISS_16G",
+    "BriefDescription": "Marked Data ERAT Miss (Data TLB Access) page size 16G"
+  },
+  {,
+    "EventCode": "0x14052",
+    "EventName": "PM_INST_GRP_PUMP_MPRED_RTY",
+    "BriefDescription": "Final Pump Scope (Group) ended up larger than Initial Pump Scope (Chip) for an instruction fetch"
+  },
+  {,
+    "EventCode": "0x10016",
+    "EventName": "PM_DSLB_MISS",
+    "BriefDescription": "Data SLB Miss - Total of all segment sizes"
+  },
+  {,
+    "EventCode": "0xD0A8",
+    "EventName": "PM_DSLB_MISS",
+    "BriefDescription": "Data SLB Miss - Total of all segment sizes"
+  },
+  {,
+    "EventCode": "0x4C058",
+    "EventName": "PM_MEM_CO",
+    "BriefDescription": "Memory castouts from this thread"
+  },
+  {,
+    "EventCode": "0x40004",
+    "EventName": "PM_FXU_FIN",
+    "BriefDescription": "The fixed point unit finished an instruction. Instructions that finish may not necessarily complete."
+  },
+  {,
+    "EventCode": "0x2C054",
+    "EventName": "PM_DERAT_MISS_64K",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 64K"
+  },
+  {,
+    "EventCode": "0x10018",
+    "EventName": "PM_IC_DEMAND_CYC",
+    "BriefDescription": "Icache miss demand cycles"
+  },
+  {,
+    "EventCode": "0x3C054",
+    "EventName": "PM_DERAT_MISS_16M",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 16M"
+  },
+  {,
+    "EventCode": "0x2D14E",
+    "EventName": "PM_MRK_DATA_FROM_L21_SHR",
+    "BriefDescription": "The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x3405C",
+    "EventName": "PM_CMPLU_STALL_DPLONG",
+    "BriefDescription": "Finish stall because the NTF instruction was a scalar multi-cycle instruction issued to the Double Precision execution pipe and waiting to finish. Includes binary floating point instructions in 32 and 64 bit binary floating point format. Qualified by NOT vector AND multicycle"
+  },
+  {,
+    "EventCode": "0x4D052",
+    "EventName": "PM_2FLOP_CMPL",
+    "BriefDescription": "DP vector version of fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg"
+  },
+  {,
+    "EventCode": "0x1F142",
+    "EventName": "PM_MRK_DPTEG_FROM_L2",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x40062",
+    "EventName": "PM_DUMMY1_REMOVE_ME",
+    "BriefDescription": "Space holder for L2_PC_PM_MK_LDST_SCOPE_PRED_STATUS"
+  },
+  {,
+    "EventCode": "0x4C012",
+    "EventName": "PM_CMPLU_STALL_ERAT_MISS",
+    "BriefDescription": "Finish stall because the NTF instruction was a load or store that suffered a translation miss"
+  },
+  {,
+    "EventCode": "0x4D050",
+    "EventName": "PM_VSU_NON_FLOP_CMPL",
+    "BriefDescription": "Non FLOP operation completed"
+  },
+  {,
+    "EventCode": "0x2E012",
+    "EventName": "PM_TM_TX_PASS_RUN_CYC",
+    "BriefDescription": "cycles spent in successful transactions"
+  },
+  {,
+    "EventCode": "0x4D04E",
+    "EventName": "PM_VSU_FSQRT_FDIV",
+    "BriefDescription": "four flops operation (fdiv,fsqrt) Scalar Instructions only"
+  },
+  {,
+    "EventCode": "0x4C120",
+    "EventName": "PM_MRK_DATA_FROM_L2_MEPF",
+    "BriefDescription": "The processor's data cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state due to a marked load"
+  },
+  {,
+    "EventCode": "0x10062",
+    "EventName": "PM_LD_L3MISS_PEND_CYC",
+    "BriefDescription": "Cycles L3 miss was pending for this thread"
+  },
+  {,
+    "EventCode": "0x2F14C",
+    "EventName": "PM_MRK_DPTEG_FROM_MEMORY",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x14050",
+    "EventName": "PM_INST_CHIP_PUMP_CPRED",
+    "BriefDescription": "Initial and Final Pump Scope was chip pump (prediction=correct) for an instruction fetch"
+  },
+  {,
+    "EventCode": "0x2000E",
+    "EventName": "PM_FXU_BUSY",
+    "BriefDescription": "Cycles in which all 4 FXUs are busy. The FXU is running at capacity"
+  },
+  {,
+    "EventCode": "0x20066",
+    "EventName": "PM_TLB_MISS",
+    "BriefDescription": "TLB Miss (I + D)"
+  },
+  {,
+    "EventCode": "0x10054",
+    "EventName": "PM_PUMP_CPRED",
+    "BriefDescription": "Pump prediction correct. Counts across all types of pumps for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate)"
+  },
+  {,
+    "EventCode": "0x4D124",
+    "EventName": "PM_MRK_DATA_FROM_L31_SHR",
+    "BriefDescription": "The processor's data cache was reloaded with Shared (S) data from another core's L3 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x400F8",
+    "EventName": "PM_FLUSH",
+    "BriefDescription": "Flush (any type)"
+  },
+  {,
+    "EventCode": "0x30004",
+    "EventName": "PM_CMPLU_STALL_EMQ_FULL",
+    "BriefDescription": "Finish stall because the next to finish instruction suffered an ERAT miss and the EMQ was full"
+  },
+  {,
+    "EventCode": "0x1D154",
+    "EventName": "PM_MRK_DATA_FROM_L21_SHR_CYC",
+    "BriefDescription": "Duration in cycles to reload with Shared (S) data from another core's L2 on the same chip due to a marked load"
+  }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/pmc.json b/tools/perf/pmu-events/arch/powerpc/power9/pmc.json
new file mode 100644
index 000000000000..a2c95a99e168
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power9/pmc.json
@@ -0,0 +1,127 @@
+[
+  {,
+    "EventCode": "0x20036",
+    "EventName": "PM_BR_2PATH",
+    "BriefDescription": "Branches that are not strongly biased"
+  },
+  {,
+    "EventCode": "0x40036",
+    "EventName": "PM_BR_2PATH",
+    "BriefDescription": "Branches that are not strongly biased"
+  },
+  {,
+    "EventCode": "0x40056",
+    "EventName": "PM_MEM_LOC_THRESH_LSU_HIGH",
+    "BriefDescription": "Local memory above threshold for LSU medium"
+  },
+  {,
+    "EventCode": "0x2C056",
+    "EventName": "PM_DTLB_MISS_4K",
+    "BriefDescription": "Data TLB Miss page size 4k"
+  },
+  {,
+    "EventCode": "0x40118",
+    "EventName": "PM_MRK_DCACHE_RELOAD_INTV",
+    "BriefDescription": "Combined Intervention event"
+  },
+  {,
+    "EventCode": "0x4F148",
+    "EventName": "PM_MRK_DPTEG_FROM_DL2L3_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x301E8",
+    "EventName": "PM_THRESH_EXC_64",
+    "BriefDescription": "Threshold counter exceeded a value of 64"
+  },
+  {,
+    "EventCode": "0x4E04E",
+    "EventName": "PM_DPTEG_FROM_L3MISS",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from a location other than the local core's L3 due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x40050",
+    "EventName": "PM_SYS_PUMP_MPRED_RTY",
+    "BriefDescription": "Final Pump Scope (system) ended up larger than Initial Pump Scope (Chip/Group) for all data types excluding data prefetch (demand load,inst prefetch,inst fetch,xlate)"
+  },
+  {,
+    "EventCode": "0x1F14E",
+    "EventName": "PM_MRK_DPTEG_FROM_L2MISS",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from a location other than the local core's L2 due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4D018",
+    "EventName": "PM_CMPLU_STALL_BRU",
+    "BriefDescription": "Completion stall due to a Branch Unit"
+  },
+  {,
+    "EventCode": "0x45052",
+    "EventName": "PM_4FLOP_CMPL",
+    "BriefDescription": "4 FLOP instruction completed"
+  },
+  {,
+    "EventCode": "0x3D142",
+    "EventName": "PM_MRK_DATA_FROM_LMEM",
+    "BriefDescription": "The processor's data cache was reloaded from the local chip's Memory due to a marked load"
+  },
+  {,
+    "EventCode": "0x4C01E",
+    "EventName": "PM_CMPLU_STALL_CRYPTO",
+    "BriefDescription": "Finish stall because the NTF instruction was routed to the crypto execution pipe and was waiting to finish"
+  },
+  {,
+    "EventCode": "0x3000C",
+    "EventName": "PM_FREQ_DOWN",
+    "BriefDescription": "Power Management: Below Threshold B"
+  },
+  {,
+    "EventCode": "0x4D128",
+    "EventName": "PM_MRK_DATA_FROM_LMEM_CYC",
+    "BriefDescription": "Duration in cycles to reload from the local chip's Memory due to a marked load"
+  },
+  {,
+    "EventCode": "0x4D054",
+    "EventName": "PM_8FLOP_CMPL",
+    "BriefDescription": "8 FLOP instruction completed"
+  },
+  {,
+    "EventCode": "0x10026",
+    "EventName": "PM_TABLEWALK_CYC",
+    "BriefDescription": "Cycles when an instruction tablewalk is active"
+  },
+  {,
+    "EventCode": "0x2C012",
+    "EventName": "PM_CMPLU_STALL_DCACHE_MISS",
+    "BriefDescription": "Finish stall because the NTF instruction was a load that missed the L1 and was waiting for the data to return from the nest"
+  },
+  {,
+    "EventCode": "0x2E04C",
+    "EventName": "PM_DPTEG_FROM_MEMORY",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x3F142",
+    "EventName": "PM_MRK_DPTEG_FROM_L3_DISP_CONFLICT",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 with dispatch conflict due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x4F142",
+    "EventName": "PM_MRK_DPTEG_FROM_L3",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L3 due to a marked data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x10060",
+    "EventName": "PM_TM_TRANS_RUN_CYC",
+    "BriefDescription": "run cycles in transactional state"
+  },
+  {,
+    "EventCode": "0x1E04C",
+    "EventName": "PM_DPTEG_FROM_LL4",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from the local chip's L4 cache due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x45050",
+    "EventName": "PM_1FLOP_CMPL",
+    "BriefDescription": "one flop (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg) operation completed"
+  }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/translation.json b/tools/perf/pmu-events/arch/powerpc/power9/translation.json
new file mode 100644
index 000000000000..8c0f12024afa
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power9/translation.json
@@ -0,0 +1,232 @@
+[
+  {,
+    "EventCode": "0x1E",
+    "EventName": "PM_CYC",
+    "BriefDescription": "Processor cycles"
+  },
+  {,
+    "EventCode": "0x30010",
+    "EventName": "PM_PMC2_OVERFLOW",
+    "BriefDescription": "Overflow from counter 2"
+  },
+  {,
+    "EventCode": "0x3C046",
+    "EventName": "PM_DATA_FROM_L21_SHR",
+    "BriefDescription": "The processor's data cache was reloaded with Shared (S) data from another core's L2 on the same chip due to a demand load"
+  },
+  {,
+    "EventCode": "0x4D05C",
+    "EventName": "PM_DP_QP_FLOP_CMPL",
+    "BriefDescription": "Double-Precision or Quad-Precision instruction completed"
+  },
+  {,
+    "EventCode": "0x4E04C",
+    "EventName": "PM_DPTEG_FROM_DMEM",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Distant) due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x20016",
+    "EventName": "PM_ST_FIN",
+    "BriefDescription": "Store finish count. Includes speculative activity"
+  },
+  {,
+    "EventCode": "0x44042",
+    "EventName": "PM_INST_FROM_L3",
+    "BriefDescription": "The processor's Instruction cache was reloaded from local core's L3 due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x1504A",
+    "EventName": "PM_IPTEG_FROM_RL2L3_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x40132",
+    "EventName": "PM_MRK_LSU_FIN",
+    "BriefDescription": "lsu marked instr PPC finish"
+  },
+  {,
+    "EventCode": "0x3C05C",
+    "EventName": "PM_CMPLU_STALL_VFXU",
+    "BriefDescription": "Finish stall due to a vector fixed point instruction in the execution pipeline. These instructions get routed to the ALU, ALU2, and DIV pipes"
+  },
+  {,
+    "EventCode": "0x30066",
+    "EventName": "PM_LSU_FIN",
+    "BriefDescription": "LSU Finished a PPC instruction (up to 4 per cycle)"
+  },
+  {,
+    "EventCode": "0x2011C",
+    "EventName": "PM_MRK_NTC_CYC",
+    "BriefDescription": "Cycles during which the marked instruction is next to complete (completion is held up because the marked instruction hasn't completed yet)"
+  },
+  {,
+    "EventCode": "0x3E048",
+    "EventName": "PM_DPTEG_FROM_DL2L3_SHR",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Shared (S) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x2E018",
+    "EventName": "PM_CMPLU_STALL_VFXLONG",
+    "BriefDescription": "Completion stall due to a long latency vector fixed point instruction (division, square root)"
+  },
+  {,
+    "EventCode": "0x1C04E",
+    "EventName": "PM_DATA_FROM_L2MISS_MOD",
+    "BriefDescription": "The processor's data cache was reloaded from a location other than the local core's L2 due to a demand load"
+  },
+  {,
+    "EventCode": "0x15048",
+    "EventName": "PM_IPTEG_FROM_ON_CHIP_CACHE",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with either shared or modified data from another core's L2/L3 on the same chip due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x34046",
+    "EventName": "PM_INST_FROM_L21_SHR",
+    "BriefDescription": "The processor's Instruction cache was reloaded with Shared (S) data from another core's L2 on the same chip due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x1E058",
+    "EventName": "PM_STCX_FAIL",
+    "BriefDescription": "stcx failed"
+  },
+  {,
+    "EventCode": "0x20112",
+    "EventName": "PM_MRK_NTF_FIN",
+    "BriefDescription": "Marked next to finish instruction finished"
+  },
+  {,
+    "EventCode": "0x300F0",
+    "EventName": "PM_ST_MISS_L1",
+    "BriefDescription": "Store Missed L1"
+  },
+  {,
+    "EventCode": "0x4C046",
+    "EventName": "PM_DATA_FROM_L21_MOD",
+    "BriefDescription": "The processor's data cache was reloaded with Modified (M) data from another core's L2 on the same chip due to a demand load"
+  },
+  {,
+    "EventCode": "0x2504A",
+    "EventName": "PM_IPTEG_FROM_RL4",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's L4 on the same Node or Group (Remote) due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x2003E",
+    "EventName": "PM_LSU_LMQ_SRQ_EMPTY_CYC",
+    "BriefDescription": "Cycles in which the LSU is empty for all threads (lmq and srq are completely empty)"
+  },
+  {,
+    "EventCode": "0x201E6",
+    "EventName": "PM_THRESH_EXC_32",
+    "BriefDescription": "Threshold counter exceeded a value of 32"
+  },
+  {,
+    "EventCode": "0x4405C",
+    "EventName": "PM_CMPLU_STALL_VDP",
+    "BriefDescription": "Finish stall because the NTF instruction was a vector instruction issued to the Double Precision execution pipe and waiting to finish. Includes binary floating point instructions in 32 and 64 bit binary floating point format. Not qualified multicycle. Qualified by vector"
+  },
+  {,
+    "EventCode": "0x4D010",
+    "EventName": "PM_PMC1_SAVED",
+    "BriefDescription": "PMC1 Rewind Value saved"
+  },
+  {,
+    "EventCode": "0x200FE",
+    "EventName": "PM_DATA_FROM_L2MISS",
+    "BriefDescription": "Demand LD - L2 Miss (not L2 hit)"
+  },
+  {,
+    "EventCode": "0x2D14A",
+    "EventName": "PM_MRK_DATA_FROM_RL2L3_MOD_CYC",
+    "BriefDescription": "Duration in cycles to reload with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x10028",
+    "EventName": "PM_STALL_END_ICT_EMPTY",
+    "BriefDescription": "The number of times the core transitioned from a stall to ICT-empty for this thread"
+  },
+  {,
+    "EventCode": "0x2504C",
+    "EventName": "PM_IPTEG_FROM_MEMORY",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from a memory location including L4 from local remote or distant due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x4504A",
+    "EventName": "PM_IPTEG_FROM_OFF_CHIP_CACHE",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with either shared or modified data from another core's L2/L3 on a different chip (remote or distant) due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x1404E",
+    "EventName": "PM_INST_FROM_L2MISS",
+    "BriefDescription": "The processor's Instruction cache was reloaded from a location other than the local core's L2 due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x34042",
+    "EventName": "PM_INST_FROM_L3_DISP_CONFLICT",
+    "BriefDescription": "The processor's Instruction cache was reloaded from local core's L3 with dispatch conflict due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x4E048",
+    "EventName": "PM_DPTEG_FROM_DL2L3_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on a different Node or Group (Distant), as this chip due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x200F0",
+    "EventName": "PM_ST_CMPL",
+    "BriefDescription": "Stores completed from S2Q (2nd-level store queue)."
+  },
+  {,
+    "EventCode": "0x4E05C",
+    "EventName": "PM_LSU_REJECT_LHS",
+    "BriefDescription": "LSU Reject due to LHS (up to 4 per cycle)"
+  },
+  {,
+    "EventCode": "0x14044",
+    "EventName": "PM_INST_FROM_L3_NO_CONFLICT",
+    "BriefDescription": "The processor's Instruction cache was reloaded from local core's L3 without conflict due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x3E04C",
+    "EventName": "PM_DPTEG_FROM_DL4",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
+  },
+  {,
+    "EventCode": "0x1F15E",
+    "EventName": "PM_MRK_PROBE_NOP_CMPL",
+    "BriefDescription": "Marked probeNops completed"
+  },
+  {,
+    "EventCode": "0x20018",
+    "EventName": "PM_ST_FWD",
+    "BriefDescription": "Store forwards that finished"
+  },
+  {,
+    "EventCode": "0x1D142",
+    "EventName": "PM_MRK_DATA_FROM_L31_ECO_SHR_CYC",
+    "BriefDescription": "Duration in cycles to reload with Shared (S) data from another core's ECO L3 on the same chip due to a marked load"
+  },
+  {,
+    "EventCode": "0x24042",
+    "EventName": "PM_INST_FROM_L3_MEPF",
+    "BriefDescription": "The processor's Instruction cache was reloaded from local core's L3 without dispatch conflicts hit on Mepf state due to an instruction fetch (not prefetch)"
+  },
+  {,
+    "EventCode": "0x25046",
+    "EventName": "PM_IPTEG_FROM_RL2L3_MOD",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another chip's L2 or L3 on the same Node or Group (Remote), as this chip due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x3504A",
+    "EventName": "PM_IPTEG_FROM_RMEM",
+    "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's memory on the same Node or Group (Remote) due to an instruction side request"
+  },
+  {,
+    "EventCode": "0x3C05A",
+    "EventName": "PM_CMPLU_STALL_VDPLONG",
+    "BriefDescription": "Finish stall because the NTF instruction was a scalar multi-cycle instruction issued to the Double Precision execution pipe and waiting to finish. Includes binary floating point instructions in 32 and 64 bit binary floating point format. Qualified by NOT vector AND multicycle"
+  },
+  {,
+    "EventCode": "0x2E01C",
+    "EventName": "PM_CMPLU_STALL_TLBIE",
+    "BriefDescription": "Finish stall because the NTF instruction was a tlbie waiting for response from L2"
+  }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index d1a12e584c1b..4ea068366c3e 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -34,3 +34,4 @@ GenuineIntel-6-2C,v2,westmereep-dp,core
 GenuineIntel-6-2C,v2,westmereep-dp,core
 GenuineIntel-6-25,v2,westmereep-sp,core
 GenuineIntel-6-2F,v2,westmereex,core
+GenuineIntel-6-55,v1,skylakex,core
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/cache.json b/tools/perf/pmu-events/arch/x86/skylakex/cache.json
new file mode 100644
index 000000000000..b5bc742b6fbc
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/skylakex/cache.json
@@ -0,0 +1,1672 @@
+[
+    {
+        "EventCode": "0x24",
+        "UMask": "0x21",
+        "BriefDescription": "Demand Data Read miss L2, no rejects",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.DEMAND_DATA_RD_MISS",
+        "PublicDescription": "Counts the number of demand Data Read requests that miss L2 cache. Only not rejected loads are counted.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0x22",
+        "BriefDescription": "RFO requests that miss L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.RFO_MISS",
+        "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0x24",
+        "BriefDescription": "L2 cache misses when fetching instructions",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.CODE_RD_MISS",
+        "PublicDescription": "Counts L2 cache misses when fetching instructions.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0x27",
+        "BriefDescription": "Demand requests that miss L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.ALL_DEMAND_MISS",
+        "PublicDescription": "Demand requests that miss L2 cache.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0x38",
+        "BriefDescription": "Requests from the L1/L2/L3 hardware prefetchers or Load software prefetches that miss L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.PF_MISS",
+        "PublicDescription": "Counts requests from the L1/L2/L3 hardware prefetchers or Load software prefetches that miss L2 cache.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0x3f",
+        "BriefDescription": "All requests that miss L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.MISS",
+        "PublicDescription": "All requests that miss L2 cache.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0x41",
+        "BriefDescription": "Demand Data Read requests that hit L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT",
+        "PublicDescription": "Counts the number of demand Data Read requests that hit L2 cache. Only non rejected loads are counted.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0x42",
+        "BriefDescription": "RFO requests that hit L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.RFO_HIT",
+        "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0x44",
+        "BriefDescription": "L2 cache hits when fetching instructions, code reads.",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.CODE_RD_HIT",
+        "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0xd8",
+        "BriefDescription": "Requests from the L1/L2/L3 hardware prefetchers or Load software prefetches that hit L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.PF_HIT",
+        "PublicDescription": "Counts requests from the L1/L2/L3 hardware prefetchers or Load software prefetches that hit L2 cache.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0xe1",
+        "BriefDescription": "Demand Data Read requests",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.ALL_DEMAND_DATA_RD",
+        "PublicDescription": "Counts the number of demand Data Read requests (including requests from L1D hardware prefetchers). These loads may hit or miss L2 cache. Only non rejected loads are counted.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0xe2",
+        "BriefDescription": "RFO requests to L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.ALL_RFO",
+        "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0xe4",
+        "BriefDescription": "L2 code requests",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.ALL_CODE_RD",
+        "PublicDescription": "Counts the total number of L2 code requests.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0xe7",
+        "BriefDescription": "Demand requests to L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.ALL_DEMAND_REFERENCES",
+        "PublicDescription": "Demand requests to L2 cache.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0xf8",
+        "BriefDescription": "Requests from the L1/L2/L3 hardware prefetchers or Load software prefetches",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.ALL_PF",
+        "PublicDescription": "Counts the total number of requests from the L2 hardware prefetchers.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x24",
+        "UMask": "0xff",
+        "BriefDescription": "All L2 requests",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_RQSTS.REFERENCES",
+        "PublicDescription": "All L2 requests.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x2E",
+        "UMask": "0x41",
+        "BriefDescription": "Core-originated cacheable demand requests missed L3",
+        "Counter": "0,1,2,3",
+        "EventName": "LONGEST_LAT_CACHE.MISS",
+        "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches from L1 and L2. It does not include all misses to the L3.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x2E",
+        "UMask": "0x4f",
+        "BriefDescription": "Core-originated cacheable demand requests that refer to L3",
+        "Counter": "0,1,2,3",
+        "EventName": "LONGEST_LAT_CACHE.REFERENCE",
+        "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches from L1 and L2.  It does not include all accesses to the L3.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x48",
+        "UMask": "0x1",
+        "BriefDescription": "L1D miss outstandings duration in cycles",
+        "Counter": "0,1,2,3",
+        "EventName": "L1D_PEND_MISS.PENDING",
+        "PublicDescription": "Counts duration of L1D miss outstanding, that is each cycle number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x48",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles with L1D load Misses outstanding.",
+        "Counter": "0,1,2,3",
+        "EventName": "L1D_PEND_MISS.PENDING_CYCLES",
+        "CounterMask": "1",
+        "PublicDescription": "Counts duration of L1D miss outstanding in cycles.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x48",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles with L1D load Misses outstanding from any thread on physical core.",
+        "Counter": "0,1,2,3",
+        "EventName": "L1D_PEND_MISS.PENDING_CYCLES_ANY",
+        "AnyThread": "1",
+        "CounterMask": "1",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x48",
+        "UMask": "0x2",
+        "BriefDescription": "Number of times a request needed a FB entry but there was no entry available for it. That is the FB unavailability was dominant reason for blocking the request. A request includes cacheable/uncacheable demands that is load, store or SW prefetch.",
+        "Counter": "0,1,2,3",
+        "EventName": "L1D_PEND_MISS.FB_FULL",
+        "PublicDescription": "Number of times a request needed a FB (Fill Buffer) entry but there was no entry available for it. A request includes cacheable/uncacheable demands that are load, store or SW prefetch instructions.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x51",
+        "UMask": "0x1",
+        "BriefDescription": "L1D data line replacements",
+        "Counter": "0,1,2,3",
+        "EventName": "L1D.REPLACEMENT",
+        "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x1",
+        "BriefDescription": "Offcore outstanding Demand Data Read transactions in uncore queue.",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD",
+        "PublicDescription": "Counts the number of offcore outstanding Demand Data Read transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor. See the corresponding Umask under OFFCORE_REQUESTS. Note: A prefetch promoted to Demand is counted from the promotion point.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles when offcore outstanding Demand Data Read transactions are present in SuperQueue (SQ), queue to uncore",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
+        "CounterMask": "1",
+        "PublicDescription": "Counts cycles when offcore outstanding Demand Data Read transactions are present in the super queue (SQ). A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles with at least 6 offcore outstanding Demand Data Read transactions in uncore queue.",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD_GE_6",
+        "CounterMask": "6",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x2",
+        "BriefDescription": "Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle. ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD",
+        "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x2",
+        "BriefDescription": "Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore.",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD",
+        "CounterMask": "1",
+        "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x4",
+        "BriefDescription": "Offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore, every cycle",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO",
+        "PublicDescription": "Counts the number of offcore outstanding RFO (store) transactions in the super queue (SQ) every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x4",
+        "BriefDescription": "Cycles with offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore.",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO",
+        "CounterMask": "1",
+        "PublicDescription": "Counts the number of offcore outstanding demand rfo Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x8",
+        "BriefDescription": "Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD",
+        "PublicDescription": "Counts the number of offcore outstanding cacheable Core Data Read transactions in the super queue every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x8",
+        "BriefDescription": "Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore.",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
+        "CounterMask": "1",
+        "PublicDescription": "Counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB0",
+        "UMask": "0x1",
+        "BriefDescription": "Demand Data Read requests sent to uncore",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS.DEMAND_DATA_RD",
+        "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB0",
+        "UMask": "0x2",
+        "BriefDescription": "Cacheable and noncacheable code read requests",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS.DEMAND_CODE_RD",
+        "PublicDescription": "Counts both cacheable and non-cacheable code read requests.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB0",
+        "UMask": "0x4",
+        "BriefDescription": "Demand RFO requests including regular RFOs, locks, ItoM",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS.DEMAND_RFO",
+        "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB0",
+        "UMask": "0x8",
+        "BriefDescription": "Demand and prefetch data reads",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS.ALL_DATA_RD",
+        "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB0",
+        "UMask": "0x80",
+        "BriefDescription": "Any memory transaction that reached the SQ.",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS.ALL_REQUESTS",
+        "PublicDescription": "Counts memory transactions that reached the super queue including requests initiated by the core, all L3 prefetches, page walks, etc.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB2",
+        "UMask": "0x1",
+        "BriefDescription": "Offcore requests buffer cannot take more entries for this thread core.",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_BUFFER.SQ_FULL",
+        "PublicDescription": "Counts the number of cases when the offcore requests buffer cannot take more entries for the core. This can happen when the superqueue does not contain eligible entries, or when L1D writeback pending FIFO requests is full. Note: Writeback pending FIFO has six entries.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE",
+        "PublicDescription": "Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD0",
+        "UMask": "0x11",
+        "BriefDescription": "Retired load instructions that miss the STLB.",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_INST_RETIRED.STLB_MISS_LOADS",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD0",
+        "UMask": "0x12",
+        "BriefDescription": "Retired store instructions that miss the STLB.",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_INST_RETIRED.STLB_MISS_STORES",
+        "SampleAfterValue": "100003",
+        "L1_Hit_Indication": "1",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD0",
+        "UMask": "0x21",
+        "BriefDescription": "Retired load instructions with locked access.",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_INST_RETIRED.LOCK_LOADS",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD0",
+        "UMask": "0x41",
+        "BriefDescription": "Retired load instructions that split across a cacheline boundary.",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_INST_RETIRED.SPLIT_LOADS",
+        "PublicDescription": "Counts retired load instructions that split across a cacheline boundary.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD0",
+        "UMask": "0x42",
+        "BriefDescription": "Retired store instructions that split across a cacheline boundary.",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_INST_RETIRED.SPLIT_STORES",
+        "PublicDescription": "Counts retired store instructions that split across a cacheline boundary.",
+        "SampleAfterValue": "100003",
+        "L1_Hit_Indication": "1",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD0",
+        "UMask": "0x81",
+        "BriefDescription": "All retired load instructions.",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_INST_RETIRED.ALL_LOADS",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD0",
+        "UMask": "0x82",
+        "BriefDescription": "All retired store instructions.",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_INST_RETIRED.ALL_STORES",
+        "SampleAfterValue": "2000003",
+        "L1_Hit_Indication": "1",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD1",
+        "UMask": "0x1",
+        "BriefDescription": "Retired load instructions with L1 cache hits as data sources",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_RETIRED.L1_HIT",
+        "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD1",
+        "UMask": "0x2",
+        "BriefDescription": "Retired load instructions with L2 cache hits as data sources",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_RETIRED.L2_HIT",
+        "PublicDescription": "Retired load instructions with L2 cache hits as data sources.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD1",
+        "UMask": "0x4",
+        "BriefDescription": "Retired load instructions with L3 cache hits as data sources",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_RETIRED.L3_HIT",
+        "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L3 cache. ",
+        "SampleAfterValue": "50021",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD1",
+        "UMask": "0x8",
+        "BriefDescription": "Retired load instructions missed L1 cache as data sources",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_RETIRED.L1_MISS",
+        "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L1 cache.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD1",
+        "UMask": "0x10",
+        "BriefDescription": "Retired load instructions missed L2 cache as data sources",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_RETIRED.L2_MISS",
+        "PublicDescription": "Retired load instructions missed L2 cache as data sources.",
+        "SampleAfterValue": "50021",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD1",
+        "UMask": "0x20",
+        "BriefDescription": "Retired load instructions missed L3 cache as data sources",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_RETIRED.L3_MISS",
+        "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L3 cache. ",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD1",
+        "UMask": "0x40",
+        "BriefDescription": "Retired load instructions which data sources were load missed L1 but hit FB due to preceding miss to the same cache line with data not ready",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_RETIRED.FB_HIT",
+        "PublicDescription": "Counts retired load instructions with at least one uop was load missed in L1 but hit FB (Fill Buffers) due to preceding miss to the same cache line with data not ready. ",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD2",
+        "UMask": "0x1",
+        "BriefDescription": "Retired load instructions which data sources were L3 hit and cross-core snoop missed in on-pkg core cache.",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS",
+        "SampleAfterValue": "20011",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD2",
+        "UMask": "0x2",
+        "BriefDescription": "Retired load instructions which data sources were L3 and cross-core snoop hits in on-pkg core cache",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT",
+        "PublicDescription": "Retired load instructions which data sources were L3 and cross-core snoop hits in on-pkg core cache.",
+        "SampleAfterValue": "20011",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD2",
+        "UMask": "0x4",
+        "BriefDescription": "Retired load instructions which data sources were HitM responses from shared L3",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM",
+        "PublicDescription": "Retired load instructions which data sources were HitM responses from shared L3.",
+        "SampleAfterValue": "20011",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD2",
+        "UMask": "0x8",
+        "BriefDescription": "Retired load instructions which data sources were hits in L3 without snoops required",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_NONE",
+        "PublicDescription": "Retired load instructions which data sources were hits in L3 without snoops required.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD3",
+        "UMask": "0x1",
+        "BriefDescription": "Retired load instructions which data sources missed L3 but serviced from local dram",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM",
+        "PublicDescription": "Retired load instructions which data sources missed L3 but serviced from local DRAM.",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD3",
+        "UMask": "0x2",
+        "BriefDescription": "Retired load instructions which data sources missed L3 but serviced from remote dram",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD3",
+        "UMask": "0x4",
+        "BriefDescription": "Retired load instructions whose data sources was remote HITM",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM",
+        "PublicDescription": "Retired load instructions whose data sources was remote HITM.",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD3",
+        "UMask": "0x8",
+        "BriefDescription": "Retired load instructions whose data sources was forwarded from a remote cache",
+        "Data_LA": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD",
+        "PublicDescription": "Retired load instructions whose data sources was forwarded from a remote cache.",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xD4",
+        "UMask": "0x4",
+        "BriefDescription": "Retired instructions with at least 1 uncacheable load or lock.",
+        "Data_LA": "1",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_LOAD_MISC_RETIRED.UC",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xF0",
+        "UMask": "0x40",
+        "BriefDescription": "L2 writebacks that access L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_TRANS.L2_WB",
+        "PublicDescription": "Counts L2 writebacks that access L2 cache.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xF1",
+        "UMask": "0x1f",
+        "BriefDescription": "L2 cache lines filling L2",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_LINES_IN.ALL",
+        "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xF2",
+        "UMask": "0x1",
+        "BriefDescription": "Counts the number of lines that are silently dropped by L2 cache when triggered by an L2 cache fill. These lines are typically in Shared state. A non-threaded event.",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_LINES_OUT.SILENT",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xF2",
+        "UMask": "0x2",
+        "BriefDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines can be either in modified state or clean state. Modified lines may either be written back to L3 or directly written to memory and not allocated in L3.  Clean lines may either be allocated in L3 or dropped.",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_LINES_OUT.NON_SILENT",
+        "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines can be either in modified state or clean state. Modified lines may either be written back to L3 or directly written to memory and not allocated in L3.  Clean lines may either be allocated in L3 or dropped.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xF2",
+        "UMask": "0x4",
+        "BriefDescription": "Counts the number of lines that have been hardware prefetched but not used and now evicted by L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_LINES_OUT.USELESS_PREF",
+        "PublicDescription": "Counts the number of lines that have been hardware prefetched but not used and now evicted by L2 cache.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xF2",
+        "UMask": "0x4",
+        "BriefDescription": "Counts the number of lines that have been hardware prefetched but not used and now evicted by L2 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "L2_LINES_OUT.USELESS_HWPF",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xF4",
+        "UMask": "0x10",
+        "BriefDescription": "Number of cache line split locks sent to uncore.",
+        "Counter": "0,1,2,3",
+        "EventName": "SQ_MISC.SPLIT_LOCK",
+        "PublicDescription": "Counts the number of cache line split locks sent to the uncore.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts demand data reads that have any response type.",
+        "MSRValue": "0x0000010001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts demand data reads that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts demand data reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts demand data reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts demand data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x04003c0001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts demand data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "DEMAND_DATA_RD & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "DEMAND_DATA_RD & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts demand data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts demand data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts demand data reads that hit in the L3.",
+        "MSRValue": "0x3f803c0001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts demand data reads that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand data writes (RFOs) that have any response type.",
+        "MSRValue": "0x0000010002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand data writes (RFOs) that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand data writes (RFOs) that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand data writes (RFOs) that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x04003c0002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "DEMAND_RFO & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "DEMAND_RFO & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand data writes (RFOs) that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand data writes (RFOs) that hit in the L3.",
+        "MSRValue": "0x3f803c0002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand data writes (RFOs) that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand code reads that have any response type.",
+        "MSRValue": "0x0000010004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand code reads that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand code reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand code reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand code reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x04003c0004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand code reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "DEMAND_CODE_RD & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "DEMAND_CODE_RD & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand code reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand code reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand code reads that hit in the L3.",
+        "MSRValue": "0x3f803c0004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand code reads that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that have any response type.",
+        "MSRValue": "0x0000010010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x04003c0010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "PF_L2_DATA_RD & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "PF_L2_DATA_RD & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3.",
+        "MSRValue": "0x3f803c0010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that have any response type.",
+        "MSRValue": "0x0000010020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x04003c0020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "PF_L2_RFO & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "PF_L2_RFO & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3.",
+        "MSRValue": "0x3f803c0020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that have any response type.",
+        "MSRValue": "0x0000010080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x04003c0080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "PF_L3_DATA_RD & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "PF_L3_DATA_RD & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3.",
+        "MSRValue": "0x3f803c0080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that have any response type.",
+        "MSRValue": "0x0000010100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.",
+        "MSRValue": "0x04003c0100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "PF_L3_RFO & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "PF_L3_RFO & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3.",
+        "MSRValue": "0x3f803c0100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that have any response type.",
+        "MSRValue": "0x0000010400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.",
+        "MSRValue": "0x04003c0400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "PF_L1D_AND_SW & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "PF_L1D_AND_SW & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that hit in the L3.",
+        "MSRValue": "0x3f803c0400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch data reads that have any response type.",
+        "MSRValue": "0x0000010490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch data reads that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch data reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch data reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.",
+        "MSRValue": "0x04003c0490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "ALL_PF_DATA_RD & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "ALL_PF_DATA_RD & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch data reads that hit in the L3.",
+        "MSRValue": "0x3f803c0490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch data reads that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch RFOs that have any response type.",
+        "MSRValue": "0x0000010120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch RFOs that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch RFOs that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch RFOs that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.",
+        "MSRValue": "0x04003c0120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "ALL_PF_RFO & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "ALL_PF_RFO & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch RFOs that hit in the L3.",
+        "MSRValue": "0x3f803c0120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch RFOs that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch data reads that have any response type.",
+        "MSRValue": "0x0000010491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch data reads that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch data reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch data reads that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.",
+        "MSRValue": "0x04003c0491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "ALL_DATA_RD & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "ALL_DATA_RD & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch data reads that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch data reads that hit in the L3.",
+        "MSRValue": "0x3f803c0491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch data reads that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch RFOs that have any response type.",
+        "MSRValue": "0x0000010122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch RFOs that have any response type.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch RFOs that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.",
+        "MSRValue": "0x01003c0122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_HIT.NO_SNOOP_NEEDED",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch RFOs that hit in the L3 and sibling core snoops are not needed as either the core-valid bit is not set or the shared line is present in multiple cores.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.",
+        "MSRValue": "0x04003c0122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_HIT.HIT_OTHER_CORE_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line but the line is not forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "ALL_RFO & L3_HIT & SNOOP_HIT_WITH_FWD",
+        "MSRValue": "0x08003c0122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "ALL_RFO & L3_HIT & SNOOP_HIT_WITH_FWD; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.",
+        "MSRValue": "0x10003c0122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_HIT.HITM_OTHER_CORE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch RFOs that hit in the L3 and the snoop to one of the sibling cores hits the line in M state and the line is forwarded.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch RFOs that hit in the L3.",
+        "MSRValue": "0x3f803c0122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_HIT.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch RFOs that hit in the L3.; Offcore response can be programmed only with a specific pair of event select and counter MSR, and with specific event codes and predefine mask bit value in a dedicated MSR to specify attributes of the offcore transaction.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/floating-point.json b/tools/perf/pmu-events/arch/x86/skylakex/floating-point.json
new file mode 100644
index 000000000000..1c09a328df36
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/skylakex/floating-point.json
@@ -0,0 +1,88 @@
+[
+    {
+        "EventCode": "0xC7",
+        "UMask": "0x1",
+        "BriefDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired.  Each count represents 1 computation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "Counter": "0,1,2,3",
+        "EventName": "FP_ARITH_INST_RETIRED.SCALAR_DOUBLE",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC7",
+        "UMask": "0x2",
+        "BriefDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired.  Each count represents 1 computation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "Counter": "0,1,2,3",
+        "EventName": "FP_ARITH_INST_RETIRED.SCALAR_SINGLE",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC7",
+        "UMask": "0x4",
+        "BriefDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired.  Each count represents 2 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "Counter": "0,1,2,3",
+        "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC7",
+        "UMask": "0x8",
+        "BriefDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired.  Each count represents 4 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.  ",
+        "Counter": "0,1,2,3",
+        "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE",
+        "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired.  Each count represents 4 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC7",
+        "UMask": "0x10",
+        "BriefDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired.  Each count represents 4 computations. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "Counter": "0,1,2,3",
+        "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC7",
+        "UMask": "0x20",
+        "BriefDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired.  Each count represents 8 computations. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP RSQRT SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.",
+        "Counter": "0,1,2,3",
+        "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC7",
+        "UMask": "0x40",
+        "BriefDescription": "Number of Packed Double-Precision FP arithmetic instructions (Use operation multiplier of 8)",
+        "Counter": "0,1,2,3",
+        "EventName": "FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE",
+        "PublicDescription": "Number of Packed Double-Precision FP arithmetic instructions (Use operation multiplier of 8).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC7",
+        "UMask": "0x80",
+        "BriefDescription": "Number of Packed Single-Precision FP arithmetic instructions (Use operation multiplier of 16)",
+        "Counter": "0,1,2,3",
+        "EventName": "FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE",
+        "PublicDescription": "Number of Packed Single-Precision FP arithmetic instructions (Use operation multiplier of 16).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xCA",
+        "UMask": "0x1e",
+        "BriefDescription": "Cycles with any input/output SSE or FP assist",
+        "Counter": "0,1,2,3",
+        "EventName": "FP_ASSIST.ANY",
+        "CounterMask": "1",
+        "PublicDescription": "Counts cycles with any input and output SSE or x87 FP assist. If an input and output assist are detected on the same cycle the event increments by 1.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/frontend.json b/tools/perf/pmu-events/arch/x86/skylakex/frontend.json
new file mode 100644
index 000000000000..40abc0852cd6
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/skylakex/frontend.json
@@ -0,0 +1,482 @@
+[
+    {
+        "EventCode": "0x79",
+        "UMask": "0x4",
+        "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) from MITE path",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.MITE_UOPS",
+        "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may 'bypass' the IDQ. This also means that uops are not being delivered from the Decode Stream Buffer (DSB).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "UMask": "0x4",
+        "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from MITE path",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.MITE_CYCLES",
+        "CounterMask": "1",
+        "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the MITE path. Counting includes uops that may 'bypass' the IDQ.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "UMask": "0x8",
+        "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.DSB_UOPS",
+        "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may 'bypass' the IDQ.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "UMask": "0x8",
+        "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) from Decode Stream Buffer (DSB) path",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.DSB_CYCLES",
+        "CounterMask": "1",
+        "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Counting includes uops that may 'bypass' the IDQ.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "UMask": "0x10",
+        "BriefDescription": "Cycles when uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.MS_DSB_CYCLES",
+        "CounterMask": "1",
+        "PublicDescription": "Counts cycles during which uops initiated by Decode Stream Buffer (DSB) are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may 'bypass' the IDQ.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "UMask": "0x18",
+        "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering 4 Uops",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.ALL_DSB_CYCLES_4_UOPS",
+        "CounterMask": "4",
+        "PublicDescription": "Counts the number of cycles 4 uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "UMask": "0x18",
+        "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.ALL_DSB_CYCLES_ANY_UOPS",
+        "CounterMask": "1",
+        "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Count includes uops that may 'bypass' the IDQ.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "UMask": "0x20",
+        "BriefDescription": "Uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.MS_MITE_UOPS",
+        "PublicDescription": "Counts the number of uops initiated by MITE and delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may 'bypass' the IDQ.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "UMask": "0x24",
+        "BriefDescription": "Cycles MITE is delivering 4 Uops",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.ALL_MITE_CYCLES_4_UOPS",
+        "CounterMask": "4",
+        "PublicDescription": "Counts the number of cycles 4 uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. Counting includes uops that may 'bypass' the IDQ. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "UMask": "0x24",
+        "BriefDescription": "Cycles MITE is delivering any Uop",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.ALL_MITE_CYCLES_ANY_UOPS",
+        "CounterMask": "1",
+        "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. Counting includes uops that may 'bypass' the IDQ. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "UMask": "0x30",
+        "BriefDescription": "Cycles when uops are being delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.MS_CYCLES",
+        "CounterMask": "1",
+        "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may 'bypass' the IDQ. Uops may be initiated by Decode Stream Buffer (DSB) or MITE.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EdgeDetect": "1",
+        "EventCode": "0x79",
+        "UMask": "0x30",
+        "BriefDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.MS_SWITCHES",
+        "CounterMask": "1",
+        "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x79",
+        "UMask": "0x30",
+        "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ.MS_UOPS",
+        "PublicDescription": "Counts the total number of uops delivered by the Microcode Sequencer (MS). Any instruction over 4 uops will be delivered by the MS. Some instructions such as transcendentals may additionally generate uops from the MS.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x80",
+        "UMask": "0x4",
+        "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache miss.",
+        "Counter": "0,1,2,3",
+        "EventName": "ICACHE_16B.IFDATA_STALL",
+        "PublicDescription": "Cycles where a code line fetch is stalled due to an L1 instruction cache miss. The legacy decode pipeline works at a 16 Byte granularity.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x83",
+        "UMask": "0x1",
+        "BriefDescription": "Instruction fetch tag lookups that hit in the instruction cache (L1I). Counts at 64-byte cache-line granularity.",
+        "Counter": "0,1,2,3",
+        "EventName": "ICACHE_64B.IFTAG_HIT",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x83",
+        "UMask": "0x2",
+        "BriefDescription": "Instruction fetch tag lookups that miss in the instruction cache (L1I). Counts at 64-byte cache-line granularity.",
+        "Counter": "0,1,2,3",
+        "EventName": "ICACHE_64B.IFTAG_MISS",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x83",
+        "UMask": "0x4",
+        "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache tag miss.",
+        "Counter": "0,1,2,3",
+        "EventName": "ICACHE_64B.IFTAG_STALL",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x9C",
+        "UMask": "0x1",
+        "BriefDescription": "Uops not delivered to Resource Allocation Table (RAT) per thread when backend of the machine is not stalled",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ_UOPS_NOT_DELIVERED.CORE",
+        "PublicDescription": "Counts the number of uops not delivered to Resource Allocation Table (RAT) per thread adding \u201c4 \u2013 x\u201d when Resource Allocation Table (RAT) is not stalled and Instruction Decode Queue (IDQ) delivers x uops to Resource Allocation Table (RAT) (where x belongs to {0,1,2,3}). Counting does not cover cases when: a. IDQ-Resource Allocation Table (RAT) pipe serves the other thread. b. Resource Allocation Table (RAT) is stalled for the thread (including uop drops and clear BE conditions).  c. Instruction Decode Queue (IDQ) delivers four uops.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x9C",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles per thread when 4 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE",
+        "CounterMask": "4",
+        "PublicDescription": "Counts, on the per-thread basis, cycles when no uops are delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core =4.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x9C",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles per thread when 3 or more uops are not delivered to Resource Allocation Table (RAT) when backend of the machine is not stalled",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_1_UOP_DELIV.CORE",
+        "CounterMask": "3",
+        "PublicDescription": "Counts, on the per-thread basis, cycles when 1 uop or fewer is delivered to Resource Allocation Table (RAT). IDQ_Uops_Not_Delivered.core >= 3.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x9C",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles with less than 2 uops delivered by the front end.",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_2_UOP_DELIV.CORE",
+        "CounterMask": "2",
+        "PublicDescription": "Cycles with less than 2 uops delivered by the front-end.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x9C",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles with less than 3 uops delivered by the front end.",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_LE_3_UOP_DELIV.CORE",
+        "CounterMask": "1",
+        "PublicDescription": "Cycles with less than 3 uops delivered by the front-end.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "Invert": "1",
+        "EventCode": "0x9C",
+        "UMask": "0x1",
+        "BriefDescription": "Counts cycles FE delivered 4 uops or Resource Allocation Table (RAT) was stalling FE.",
+        "Counter": "0,1,2,3",
+        "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK",
+        "CounterMask": "1",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xAB",
+        "UMask": "0x2",
+        "BriefDescription": "Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles.",
+        "Counter": "0,1,2,3",
+        "EventName": "DSB2MITE_SWITCHES.PENALTY_CYCLES",
+        "PublicDescription": "Counts Decode Stream Buffer (DSB)-to-MITE switch true penalty cycles. These cycles do not include uops routed through because of the switch itself, for example, when Instruction Decode Queue (IDQ) pre-allocation is unavailable, or Instruction Decode Queue (IDQ) is full. DSB-to-MITE switch true penalty cycles happen after the merge mux (MM) receives Decode Stream Buffer (DSB) Sync-indication until receiving the first MITE uop. MM is placed before Instruction Decode Queue (IDQ) to merge uops being fed from the MITE and Decode Stream Buffer (DSB) paths. Decode Stream Buffer (DSB) inserts the Sync-indication whenever a Decode Stream Buffer (DSB)-to-MITE switch occurs. Penalty: A Decode Stream Buffer (DSB) hit followed by a Decode Stream Buffer (DSB) miss can cost up to six cycles in which no uops are delivered to the IDQ. Most often, such switches from the Decode Stream Buffer (DSB) to the legacy pipeline cost 0\u20132 cycles.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired Instructions that experienced decode stream buffer (DSB - the decoded instruction-cache) miss.",
+        "PEBS": "1",
+        "MSRValue": "0x11",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.DSB_MISS",
+        "MSRIndex": "0x3F7",
+        "PublicDescription": "Counts retired Instructions that experienced DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. ",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired Instructions that experienced Instruction L1 Cache true miss.",
+        "PEBS": "1",
+        "MSRValue": "0x12",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.L1I_MISS",
+        "MSRIndex": "0x3F7",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired Instructions that experienced Instruction L2 Cache true miss.",
+        "PEBS": "1",
+        "MSRValue": "0x13",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.L2_MISS",
+        "MSRIndex": "0x3F7",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired Instructions that experienced iTLB true miss.",
+        "PEBS": "1",
+        "MSRValue": "0x14",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.ITLB_MISS",
+        "MSRIndex": "0x3F7",
+        "PublicDescription": "Counts retired Instructions that experienced iTLB (Instruction TLB) true miss.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired Instructions that experienced STLB (2nd level TLB) true miss.",
+        "PEBS": "1",
+        "MSRValue": "0x15",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.STLB_MISS",
+        "MSRIndex": "0x3F7",
+        "PublicDescription": "Counts retired Instructions that experienced STLB (2nd level TLB) true miss. ",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 2 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x400206",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_2",
+        "MSRIndex": "0x3F7",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end had at least 2 bubble-slots for a period of 2 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x200206",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_2",
+        "MSRIndex": "0x3F7",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x400406",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_4",
+        "MSRIndex": "0x3F7",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 8 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x400806",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_8",
+        "MSRIndex": "0x3F7",
+        "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 8 cycles. During this period the front-end delivered no uops.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 16 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x401006",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_16",
+        "MSRIndex": "0x3F7",
+        "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 16 cycles. During this period the front-end delivered no uops.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 32 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x402006",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_32",
+        "MSRIndex": "0x3F7",
+        "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 32 cycles. During this period the front-end delivered no uops.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x404006",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_64",
+        "MSRIndex": "0x3F7",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x408006",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_128",
+        "MSRIndex": "0x3F7",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x410006",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_256",
+        "MSRIndex": "0x3F7",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x420006",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_512",
+        "MSRIndex": "0x3F7",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end had at least 1 bubble-slot for a period of 2 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x100206",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1",
+        "MSRIndex": "0x3F7",
+        "PublicDescription": "Counts retired instructions that are delivered to the back-end after the front-end had at least 1 bubble-slot for a period of 2 cycles. A bubble-slot is an empty issue-pipeline slot while there was no RAT stall.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC6",
+        "UMask": "0x1",
+        "BriefDescription": "Retired instructions that are fetched after an interval where the front-end had at least 3 bubble-slots for a period of 2 cycles which was not interrupted by a back-end stall.",
+        "PEBS": "1",
+        "MSRValue": "0x300206",
+        "Counter": "0,1,2,3",
+        "EventName": "FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_3",
+        "MSRIndex": "0x3F7",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/memory.json b/tools/perf/pmu-events/arch/x86/skylakex/memory.json
new file mode 100644
index 000000000000..ca22a22c1abd
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/skylakex/memory.json
@@ -0,0 +1,1396 @@
+[
+    {
+        "EventCode": "0x54",
+        "UMask": "0x1",
+        "BriefDescription": "Number of times a transactional abort was signaled due to a data conflict on a transactionally accessed address",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_MEM.ABORT_CONFLICT",
+        "PublicDescription": "Number of times a TSX line had a cache conflict.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x54",
+        "UMask": "0x2",
+        "BriefDescription": "Number of times a transactional abort was signaled due to a data capacity limitation for transactional reads or writes.",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_MEM.ABORT_CAPACITY",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x54",
+        "UMask": "0x4",
+        "BriefDescription": "Number of times a HLE transactional region aborted due to a non XRELEASE prefixed instruction writing to an elided lock in the elision buffer",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_MEM.ABORT_HLE_STORE_TO_ELIDED_LOCK",
+        "PublicDescription": "Number of times a TSX Abort was triggered due to a non-release/commit store to lock.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x54",
+        "UMask": "0x8",
+        "BriefDescription": "Number of times an HLE transactional execution aborted due to NoAllocatedElisionBuffer being non-zero.",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_MEM.ABORT_HLE_ELISION_BUFFER_NOT_EMPTY",
+        "PublicDescription": "Number of times a TSX Abort was triggered due to commit but Lock Buffer not empty.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x54",
+        "UMask": "0x10",
+        "BriefDescription": "Number of times an HLE transactional execution aborted due to XRELEASE lock not satisfying the address and value requirements in the elision buffer",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_MEM.ABORT_HLE_ELISION_BUFFER_MISMATCH",
+        "PublicDescription": "Number of times a TSX Abort was triggered due to release/commit but data and address mismatch.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x54",
+        "UMask": "0x20",
+        "BriefDescription": "Number of times an HLE transactional execution aborted due to an unsupported read alignment from the elision buffer.",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_MEM.ABORT_HLE_ELISION_BUFFER_UNSUPPORTED_ALIGNMENT",
+        "PublicDescription": "Number of times a TSX Abort was triggered due to attempting an unsupported alignment from Lock Buffer.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x54",
+        "UMask": "0x40",
+        "BriefDescription": "Number of times HLE lock could not be elided due to ElisionBufferAvailable being zero.",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_MEM.HLE_ELISION_BUFFER_FULL",
+        "PublicDescription": "Number of times we could not allocate Lock Buffer.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x5d",
+        "UMask": "0x1",
+        "BriefDescription": "Counts the number of times a class of instructions that may cause a transactional abort was executed. Since this is the count of execution, it may not always cause a transactional abort.",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_EXEC.MISC1",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x5d",
+        "UMask": "0x2",
+        "BriefDescription": "Counts the number of times a class of instructions (e.g., vzeroupper) that may cause a transactional abort was executed inside a transactional region",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_EXEC.MISC2",
+        "PublicDescription": "Unfriendly TSX abort triggered by a vzeroupper instruction.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x5d",
+        "UMask": "0x4",
+        "BriefDescription": "Counts the number of times an instruction execution caused the supported transactional nest count to be exceeded",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_EXEC.MISC3",
+        "PublicDescription": "Unfriendly TSX abort triggered by a nest count that is too deep.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x5d",
+        "UMask": "0x8",
+        "BriefDescription": "Counts the number of times an XBEGIN instruction was executed inside an HLE transactional region.",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_EXEC.MISC4",
+        "PublicDescription": "RTM region detected inside HLE.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x5d",
+        "UMask": "0x10",
+        "BriefDescription": "Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region",
+        "Counter": "0,1,2,3",
+        "EventName": "TX_EXEC.MISC5",
+        "PublicDescription": "Counts the number of times an HLE XACQUIRE instruction was executed inside an RTM transactional region.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x10",
+        "BriefDescription": "Counts number of Offcore outstanding Demand Data Read requests that miss L3 cache in the superQ every cycle.",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x10",
+        "BriefDescription": "Cycles with at least 1 Demand Data Read request that misses L3 cache in the superQ.",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_L3_MISS_DEMAND_DATA_RD",
+        "CounterMask": "1",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x60",
+        "UMask": "0x10",
+        "BriefDescription": "Cycles with at least 6 Demand Data Read requests that miss L3 cache in the superQ.",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD_GE_6",
+        "CounterMask": "6",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA3",
+        "UMask": "0x2",
+        "BriefDescription": "Cycles while L3 cache miss demand load is outstanding.",
+        "Counter": "0,1,2,3",
+        "EventName": "CYCLE_ACTIVITY.CYCLES_L3_MISS",
+        "CounterMask": "2",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA3",
+        "UMask": "0x6",
+        "BriefDescription": "Execution stalls while L3 cache miss demand load is outstanding.",
+        "Counter": "0,1,2,3",
+        "EventName": "CYCLE_ACTIVITY.STALLS_L3_MISS",
+        "CounterMask": "6",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB0",
+        "UMask": "0x10",
+        "BriefDescription": "Demand Data Read requests that miss L3 cache",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
+        "PublicDescription": "Demand Data Read requests that miss L3 cache.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC3",
+        "UMask": "0x2",
+        "BriefDescription": "Counts the number of machine clears due to memory order conflicts.",
+        "Counter": "0,1,2,3",
+        "EventName": "MACHINE_CLEARS.MEMORY_ORDERING",
+        "Errata": "SKL089",
+        "PublicDescription": "Counts the number of memory ordering Machine Clears detected. Memory Ordering Machine Clears can result from one of the following: a. memory disambiguation, b. external snoop, or c. cross SMT-HW-thread snoop (stores) hitting load buffer.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC8",
+        "UMask": "0x1",
+        "BriefDescription": "Number of times an HLE execution started.",
+        "Counter": "0,1,2,3",
+        "EventName": "HLE_RETIRED.START",
+        "PublicDescription": "Number of times we entered an HLE region. Does not count nested transactions.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC8",
+        "UMask": "0x2",
+        "BriefDescription": "Number of times an HLE execution successfully committed",
+        "Counter": "0,1,2,3",
+        "EventName": "HLE_RETIRED.COMMIT",
+        "PublicDescription": "Number of times HLE commit succeeded.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC8",
+        "UMask": "0x4",
+        "BriefDescription": "Number of times an HLE execution aborted due to any reasons (multiple categories may count as one). ",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "HLE_RETIRED.ABORTED",
+        "PublicDescription": "Number of times HLE abort was triggered.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC8",
+        "UMask": "0x8",
+        "BriefDescription": "Number of times an HLE execution aborted due to various memory events (e.g., read/write capacity and conflicts).",
+        "Counter": "0,1,2,3",
+        "EventName": "HLE_RETIRED.ABORTED_MEM",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC8",
+        "UMask": "0x10",
+        "BriefDescription": "Number of times an HLE execution aborted due to hardware timer expiration.",
+        "Counter": "0,1,2,3",
+        "EventName": "HLE_RETIRED.ABORTED_TIMER",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC8",
+        "UMask": "0x20",
+        "BriefDescription": "Number of times an HLE execution aborted due to HLE-unfriendly instructions and certain unfriendly events (such as AD assists etc.). ",
+        "Counter": "0,1,2,3",
+        "EventName": "HLE_RETIRED.ABORTED_UNFRIENDLY",
+        "PublicDescription": "Number of times an HLE execution aborted due to HLE-unfriendly instructions and certain unfriendly events (such as AD assists etc.).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC8",
+        "UMask": "0x40",
+        "BriefDescription": "Number of times an HLE execution aborted due to incompatible memory type",
+        "Counter": "0,1,2,3",
+        "EventName": "HLE_RETIRED.ABORTED_MEMTYPE",
+        "PublicDescription": "Number of times an HLE execution aborted due to incompatible memory type.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC8",
+        "UMask": "0x80",
+        "BriefDescription": "Number of times an HLE execution aborted due to unfriendly events (such as interrupts).",
+        "Counter": "0,1,2,3",
+        "EventName": "HLE_RETIRED.ABORTED_EVENTS",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC9",
+        "UMask": "0x1",
+        "BriefDescription": "Number of times an RTM execution started.",
+        "Counter": "0,1,2,3",
+        "EventName": "RTM_RETIRED.START",
+        "PublicDescription": "Number of times we entered an RTM region. Does not count nested transactions.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC9",
+        "UMask": "0x2",
+        "BriefDescription": "Number of times an RTM execution successfully committed",
+        "Counter": "0,1,2,3",
+        "EventName": "RTM_RETIRED.COMMIT",
+        "PublicDescription": "Number of times RTM commit succeeded.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC9",
+        "UMask": "0x4",
+        "BriefDescription": "Number of times an RTM execution aborted due to any reasons (multiple categories may count as one). ",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "RTM_RETIRED.ABORTED",
+        "PublicDescription": "Number of times RTM abort was triggered.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC9",
+        "UMask": "0x8",
+        "BriefDescription": "Number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts)",
+        "Counter": "0,1,2,3",
+        "EventName": "RTM_RETIRED.ABORTED_MEM",
+        "PublicDescription": "Number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC9",
+        "UMask": "0x10",
+        "BriefDescription": "Number of times an RTM execution aborted due to uncommon conditions.",
+        "Counter": "0,1,2,3",
+        "EventName": "RTM_RETIRED.ABORTED_TIMER",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC9",
+        "UMask": "0x20",
+        "BriefDescription": "Number of times an RTM execution aborted due to HLE-unfriendly instructions",
+        "Counter": "0,1,2,3",
+        "EventName": "RTM_RETIRED.ABORTED_UNFRIENDLY",
+        "PublicDescription": "Number of times an RTM execution aborted due to HLE-unfriendly instructions.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC9",
+        "UMask": "0x40",
+        "BriefDescription": "Number of times an RTM execution aborted due to incompatible memory type",
+        "Counter": "0,1,2,3",
+        "EventName": "RTM_RETIRED.ABORTED_MEMTYPE",
+        "PublicDescription": "Number of times an RTM execution aborted due to incompatible memory type.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC9",
+        "UMask": "0x80",
+        "BriefDescription": "Number of times an RTM execution aborted due to none of the previous 4 categories (e.g. interrupt)",
+        "Counter": "0,1,2,3",
+        "EventName": "RTM_RETIRED.ABORTED_EVENTS",
+        "PublicDescription": "Number of times an RTM execution aborted due to none of the previous 4 categories (e.g. interrupt).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xCD",
+        "UMask": "0x1",
+        "BriefDescription": "Counts loads when the latency from first dispatch to completion is greater than 4 cycles.",
+        "PEBS": "2",
+        "MSRValue": "0x4",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4",
+        "MSRIndex": "0x3F6",
+        "PublicDescription": "Counts loads when the latency from first dispatch to completion is greater than 4 cycles.  Reported latency may be longer than just the memory latency.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xCD",
+        "UMask": "0x1",
+        "BriefDescription": "Counts loads when the latency from first dispatch to completion is greater than 8 cycles.",
+        "PEBS": "2",
+        "MSRValue": "0x8",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8",
+        "MSRIndex": "0x3F6",
+        "PublicDescription": "Counts loads when the latency from first dispatch to completion is greater than 8 cycles.  Reported latency may be longer than just the memory latency.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "50021",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xCD",
+        "UMask": "0x1",
+        "BriefDescription": "Counts loads when the latency from first dispatch to completion is greater than 16 cycles.",
+        "PEBS": "2",
+        "MSRValue": "0x10",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16",
+        "MSRIndex": "0x3F6",
+        "PublicDescription": "Counts loads when the latency from first dispatch to completion is greater than 16 cycles.  Reported latency may be longer than just the memory latency.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "20011",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xCD",
+        "UMask": "0x1",
+        "BriefDescription": "Counts loads when the latency from first dispatch to completion is greater than 32 cycles.",
+        "PEBS": "2",
+        "MSRValue": "0x20",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32",
+        "MSRIndex": "0x3F6",
+        "PublicDescription": "Counts loads when the latency from first dispatch to completion is greater than 32 cycles.  Reported latency may be longer than just the memory latency.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xCD",
+        "UMask": "0x1",
+        "BriefDescription": "Counts loads when the latency from first dispatch to completion is greater than 64 cycles.",
+        "PEBS": "2",
+        "MSRValue": "0x40",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64",
+        "MSRIndex": "0x3F6",
+        "PublicDescription": "Counts loads when the latency from first dispatch to completion is greater than 64 cycles.  Reported latency may be longer than just the memory latency.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "2003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xCD",
+        "UMask": "0x1",
+        "BriefDescription": "Counts loads when the latency from first dispatch to completion is greater than 128 cycles.",
+        "PEBS": "2",
+        "MSRValue": "0x80",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128",
+        "MSRIndex": "0x3F6",
+        "PublicDescription": "Counts loads when the latency from first dispatch to completion is greater than 128 cycles.  Reported latency may be longer than just the memory latency.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "1009",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xCD",
+        "UMask": "0x1",
+        "BriefDescription": "Counts loads when the latency from first dispatch to completion is greater than 256 cycles.",
+        "PEBS": "2",
+        "MSRValue": "0x100",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256",
+        "MSRIndex": "0x3F6",
+        "PublicDescription": "Counts loads when the latency from first dispatch to completion is greater than 256 cycles.  Reported latency may be longer than just the memory latency.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "503",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xCD",
+        "UMask": "0x1",
+        "BriefDescription": "Counts loads when the latency from first dispatch to completion is greater than 512 cycles.",
+        "PEBS": "2",
+        "MSRValue": "0x200",
+        "Counter": "0,1,2,3",
+        "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512",
+        "MSRIndex": "0x3F6",
+        "PublicDescription": "Counts loads when the latency from first dispatch to completion is greater than 512 cycles.  Reported latency may be longer than just the memory latency.",
+        "TakenAlone": "1",
+        "SampleAfterValue": "101",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts demand data reads that miss in the L3.",
+        "MSRValue": "0x3fbc000001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts demand data reads that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts demand data reads that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts demand data reads that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts demand data reads that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts demand data reads that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts demand data reads that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts demand data reads that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts demand data reads that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts demand data reads that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts demand data reads that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000001 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_DATA_RD.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts demand data reads that miss the L3 and the data is returned from local dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand data writes (RFOs) that miss in the L3.",
+        "MSRValue": "0x3fbc000002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand data writes (RFOs) that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand data writes (RFOs) that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand data writes (RFOs) that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand data writes (RFOs) that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand data writes (RFOs) that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand data writes (RFOs) that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand data writes (RFOs) that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand data writes (RFOs) that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand data writes (RFOs) that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand data writes (RFOs) that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000002 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_RFO.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand data writes (RFOs) that miss the L3 and the data is returned from local dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand code reads that miss in the L3.",
+        "MSRValue": "0x3fbc000004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand code reads that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand code reads that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand code reads that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand code reads that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand code reads that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand code reads that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand code reads that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand code reads that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand code reads that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand code reads that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000004 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.DEMAND_CODE_RD.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand code reads that miss the L3 and the data is returned from local dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that miss in the L3.",
+        "MSRValue": "0x3fbc000010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch (that bring data to L2) data reads that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000010 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_DATA_RD.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch (that bring data to L2) data reads that miss the L3 and the data is returned from local dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that miss in the L3.",
+        "MSRValue": "0x3fbc000020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to L2) RFOs that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000020 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L2_RFO.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to L2) RFOs that miss the L3 and the data is returned from local dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss in the L3.",
+        "MSRValue": "0x3fbc000080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000080 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_DATA_RD.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) data reads that miss the L3 and the data is returned from local dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss in the L3.",
+        "MSRValue": "0x3fbc000100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000100 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L3_RFO.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch (that bring data to LLC only) RFOs that miss the L3 and the data is returned from local dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss in the L3.",
+        "MSRValue": "0x3fbc000400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000400 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.PF_L1D_AND_SW.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts L1 data cache hardware prefetch requests and software prefetch requests that miss the L3 and the data is returned from local dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch data reads that miss in the L3.",
+        "MSRValue": "0x3fbc000490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch data reads that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch data reads that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch data reads that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch data reads that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch data reads that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch data reads that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch data reads that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch data reads that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch data reads that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all prefetch data reads that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000490 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_DATA_RD.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all prefetch data reads that miss the L3 and the data is returned from local dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch RFOs that miss in the L3.",
+        "MSRValue": "0x3fbc000120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch RFOs that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch RFOs that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch RFOs that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch RFOs that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch RFOs that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch RFOs that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch RFOs that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch RFOs that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch RFOs that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts prefetch RFOs that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000120 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_PF_RFO.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts prefetch RFOs that miss the L3 and the data is returned from local dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch data reads that miss in the L3.",
+        "MSRValue": "0x3fbc000491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch data reads that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000491 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_DATA_RD.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch data reads that miss the L3 and the data is returned from local dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch RFOs that miss in the L3.",
+        "MSRValue": "0x3fbc000122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_MISS.ANY_SNOOP",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch RFOs that miss in the L3. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch RFOs that miss the L3 and clean or shared data is transferred from remote cache.",
+        "MSRValue": "0x083fc00122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_MISS.REMOTE_HIT_FORWARD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch RFOs that miss the L3 and clean or shared data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch RFOs that miss the L3 and the modified data is transferred from remote cache.",
+        "MSRValue": "0x103fc00122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_MISS.REMOTE_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch RFOs that miss the L3 and the modified data is transferred from remote cache. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from local or remote dram.",
+        "MSRValue": "0x063fc00122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_MISS.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from local or remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from remote dram.",
+        "MSRValue": "0x063b800122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_MISS_REMOTE_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from remote dram. ",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "Offcore": "1",
+        "EventCode": "0xB7, 0xBB",
+        "UMask": "0x1",
+        "BriefDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from local dram.",
+        "MSRValue": "0x0604000122 ",
+        "Counter": "0,1,2,3",
+        "EventName": "OFFCORE_RESPONSE.ALL_RFO.L3_MISS_LOCAL_DRAM.SNOOP_MISS_OR_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "PublicDescription": "Counts all demand & prefetch RFOs that miss the L3 and the data is returned from local dram.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/other.json b/tools/perf/pmu-events/arch/x86/skylakex/other.json
new file mode 100644
index 000000000000..70243b0b0586
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/skylakex/other.json
@@ -0,0 +1,72 @@
+[
+    {
+        "EventCode": "0x28",
+        "UMask": "0x7",
+        "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the Non-AVX turbo schedule.",
+        "Counter": "0,1,2,3",
+        "EventName": "CORE_POWER.LVL0_TURBO_LICENSE",
+        "PublicDescription": "Core cycles where the core was running with power-delivery for baseline license level 0.  This includes non-AVX codes, SSE, AVX 128-bit, and low-current AVX 256-bit codes.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x28",
+        "UMask": "0x18",
+        "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX2 turbo schedule.",
+        "Counter": "0,1,2,3",
+        "EventName": "CORE_POWER.LVL1_TURBO_LICENSE",
+        "PublicDescription": "Core cycles where the core was running with power-delivery for license level 1.  This includes high current AVX 256-bit instructions as well as low current AVX 512-bit instructions.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x28",
+        "UMask": "0x20",
+        "BriefDescription": "Core cycles where the core was running in a manner where Turbo may be clipped to the AVX512 turbo schedule.",
+        "Counter": "0,1,2,3",
+        "EventName": "CORE_POWER.LVL2_TURBO_LICENSE",
+        "PublicDescription": "Core cycles where the core was running with power-delivery for license level 2 (introduced in Skylake Server microarchitecture).  This includes high current AVX 512-bit instructions.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x28",
+        "UMask": "0x40",
+        "BriefDescription": "Core cycles the core was throttled due to a pending power level request.",
+        "Counter": "0,1,2,3",
+        "EventName": "CORE_POWER.THROTTLE",
+        "PublicDescription": "Core cycles the out-of-order engine was throttled due to a pending power level request.",
+        "SampleAfterValue": "200003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xCB",
+        "UMask": "0x1",
+        "BriefDescription": "Number of hardware interrupts received by the processor.",
+        "Counter": "0,1,2,3",
+        "EventName": "HW_INTERRUPTS.RECEIVED",
+        "PublicDescription": "Counts the number of hardware interruptions received by the processor.",
+        "SampleAfterValue": "203",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xFE",
+        "UMask": "0x2",
+        "BriefDescription": "Counts number of cache lines that are allocated and written back to L3 with the intention that they are more likely to be reused shortly",
+        "Counter": "0,1,2,3",
+        "EventName": "IDI_MISC.WB_UPGRADE",
+        "PublicDescription": "Counts number of cache lines that are allocated and written back to L3 with the intention that they are more likely to be reused shortly.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xFE",
+        "UMask": "0x4",
+        "BriefDescription": "Counts number of cache lines that are dropped and not written back to L3 as they are deemed to be less likely to be reused shortly",
+        "Counter": "0,1,2,3",
+        "EventName": "IDI_MISC.WB_DOWNGRADE",
+        "PublicDescription": "Counts number of cache lines that are dropped and not written back to L3 as they are deemed to be less likely to be reused shortly.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json b/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json
new file mode 100644
index 000000000000..0895d1e52a4a
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/skylakex/pipeline.json
@@ -0,0 +1,950 @@
+[
+    {
+        "EventCode": "0x00",
+        "UMask": "0x1",
+        "BriefDescription": "Instructions retired from execution.",
+        "Counter": "Fixed counter 1",
+        "EventName": "INST_RETIRED.ANY",
+        "PublicDescription": "Counts the number of instructions retired from execution. For instructions that consist of multiple micro-ops, counts the retirement of the last micro-op of the instruction. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. INST_RETIRED.ANY_P is counted by a programmable counter and it is an architectural performance event. Counting: Faulting executions of GETSEC/VM entry/VM Exit/MWait will not count as retired instructions.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "Fixed counter 1"
+    },
+    {
+        "EventCode": "0x00",
+        "UMask": "0x2",
+        "BriefDescription": "Core cycles when the thread is not in halt state",
+        "Counter": "Fixed counter 2",
+        "EventName": "CPU_CLK_UNHALTED.THREAD",
+        "PublicDescription": "Counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "Fixed counter 2"
+    },
+    {
+        "EventCode": "0x00",
+        "UMask": "0x2",
+        "BriefDescription": "Core cycles when at least one thread on the physical core is not in halt state.",
+        "Counter": "Fixed counter 2",
+        "EventName": "CPU_CLK_UNHALTED.THREAD_ANY",
+        "AnyThread": "1",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "Fixed counter 2"
+    },
+    {
+        "EventCode": "0x00",
+        "UMask": "0x3",
+        "BriefDescription": "Reference cycles when the core is not in halt state.",
+        "Counter": "Fixed counter 3",
+        "EventName": "CPU_CLK_UNHALTED.REF_TSC",
+        "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. This event has a constant ratio with the CPU_CLK_UNHALTED.REF_XCLK event. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'.  The counter update is done at a lower clock rate than the core clock; the overflow status bit for this counter may appear 'sticky'.  After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "Fixed counter 3"
+    },
+    {
+        "EventCode": "0x03",
+        "UMask": "0x2",
+        "BriefDescription": "Loads blocked by overlapping with store buffer that cannot be forwarded.",
+        "Counter": "0,1,2,3",
+        "EventName": "LD_BLOCKS.STORE_FORWARD",
+        "PublicDescription": "Counts how many times the load operation got the true Block-on-Store blocking code preventing store forwarding. This includes cases when: a. preceding store conflicts with the load (incomplete overlap), b. store forwarding is impossible due to u-arch limitations, c. preceding lock RMW operations are not forwarded, d. store has the no-forward bit set (uncacheable/page-split/masked stores), e. all-blocking stores are used (mostly, fences and port I/O), and others. The most common case is a load blocked due to its address range overlapping with a preceding smaller uncompleted store. Note: This event does not take into account cases of out-of-SW-control (for example, SbTailHit), unknown physical STA, and cases of blocking loads on store due to being non-WB memory type or a lock. These cases are covered by other events. See the table of not supported store forwards in the Optimization Guide.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x03",
+        "UMask": "0x8",
+        "BriefDescription": "The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use",
+        "Counter": "0,1,2,3",
+        "EventName": "LD_BLOCKS.NO_SR",
+        "PublicDescription": "The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x07",
+        "UMask": "0x1",
+        "BriefDescription": "False dependencies in MOB due to partial compare on address.",
+        "Counter": "0,1,2,3",
+        "EventName": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS",
+        "PublicDescription": "Counts false dependencies in MOB when the partial comparison upon loose net check and dependency was resolved by the Enhanced Loose net mechanism. This may not result in high performance penalties. Loose net checks can fail when loads and stores are 4k aliased.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x0D",
+        "UMask": "0x1",
+        "BriefDescription": "Core cycles the allocator was stalled due to recovery from earlier clear event for this thread (e.g. misprediction or memory nuke)",
+        "Counter": "0,1,2,3",
+        "EventName": "INT_MISC.RECOVERY_CYCLES",
+        "PublicDescription": "Core cycles the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x0D",
+        "UMask": "0x1",
+        "BriefDescription": "Core cycles the allocator was stalled due to recovery from earlier clear event for any thread running on the physical core (e.g. misprediction or memory nuke).",
+        "Counter": "0,1,2,3",
+        "EventName": "INT_MISC.RECOVERY_CYCLES_ANY",
+        "AnyThread": "1",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x0D",
+        "UMask": "0x80",
+        "BriefDescription": "Cycles the issue-stage is waiting for front-end to fetch from resteered path following branch misprediction or machine clear events.",
+        "Counter": "0,1,2,3",
+        "EventName": "INT_MISC.CLEAR_RESTEER_CYCLES",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x0E",
+        "UMask": "0x1",
+        "BriefDescription": "Uops that Resource Allocation Table (RAT) issues to Reservation Station (RS)",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_ISSUED.ANY",
+        "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "Invert": "1",
+        "EventCode": "0x0E",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles when Resource Allocation Table (RAT) does not issue Uops to Reservation Station (RS) for the thread",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_ISSUED.STALL_CYCLES",
+        "CounterMask": "1",
+        "PublicDescription": "Counts cycles during which the Resource Allocation Table (RAT) does not issue any Uops to the reservation station (RS) for the current thread.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x0E",
+        "UMask": "0x2",
+        "BriefDescription": "Uops inserted at issue-stage in order to preserve upper bits of vector registers.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_ISSUED.VECTOR_WIDTH_MISMATCH",
+        "PublicDescription": "Counts the number of Blend Uops issued by the Resource Allocation Table (RAT) to the reservation station (RS) in order to preserve upper bits of vector registers. Starting with the Skylake microarchitecture, these Blend uops are needed since every Intel SSE instruction executed in Dirty Upper State needs to preserve bits 128-255 of the destination register. For more information, refer to \u201cMixing Intel AVX and Intel SSE Code\u201d section of the Optimization Guide.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x0E",
+        "UMask": "0x20",
+        "BriefDescription": "Number of slow LEA uops being allocated. A uop is generally considered SlowLea if it has 3 sources (e.g. 2 sources + immediate) regardless if as a result of LEA instruction or not.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_ISSUED.SLOW_LEA",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x14",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations.",
+        "Counter": "0,1,2,3",
+        "EventName": "ARITH.DIVIDER_ACTIVE",
+        "CounterMask": "1",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x3C",
+        "UMask": "0x0",
+        "BriefDescription": "Thread cycles when thread is not in halt state",
+        "Counter": "0,1,2,3",
+        "EventName": "CPU_CLK_UNHALTED.THREAD_P",
+        "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x3C",
+        "UMask": "0x0",
+        "BriefDescription": "Core cycles when at least one thread on the physical core is not in halt state.",
+        "Counter": "0,1,2,3",
+        "EventName": "CPU_CLK_UNHALTED.THREAD_P_ANY",
+        "AnyThread": "1",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EdgeDetect": "1",
+        "EventCode": "0x3C",
+        "UMask": "0x0",
+        "BriefDescription": "Counts when there is a transition from ring 1, 2 or 3 to ring 0.",
+        "Counter": "0,1,2,3",
+        "EventName": "CPU_CLK_UNHALTED.RING0_TRANS",
+        "CounterMask": "1",
+        "PublicDescription": "Counts when the Current Privilege Level (CPL) transitions from ring 1, 2 or 3 to ring 0 (Kernel).",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x3C",
+        "UMask": "0x1",
+        "BriefDescription": "Core crystal clock cycles when the thread is unhalted.",
+        "Counter": "0,1,2,3",
+        "EventName": "CPU_CLK_THREAD_UNHALTED.REF_XCLK",
+        "SampleAfterValue": "2503",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x3C",
+        "UMask": "0x1",
+        "BriefDescription": "Core crystal clock cycles when at least one thread on the physical core is unhalted.",
+        "Counter": "0,1,2,3",
+        "EventName": "CPU_CLK_THREAD_UNHALTED.REF_XCLK_ANY",
+        "AnyThread": "1",
+        "SampleAfterValue": "2503",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x3C",
+        "UMask": "0x1",
+        "BriefDescription": "Core crystal clock cycles when the thread is unhalted.",
+        "Counter": "0,1,2,3",
+        "EventName": "CPU_CLK_UNHALTED.REF_XCLK",
+        "SampleAfterValue": "2503",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x3C",
+        "UMask": "0x1",
+        "BriefDescription": "Core crystal clock cycles when at least one thread on the physical core is unhalted.",
+        "Counter": "0,1,2,3",
+        "EventName": "CPU_CLK_UNHALTED.REF_XCLK_ANY",
+        "AnyThread": "1",
+        "SampleAfterValue": "2503",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x3C",
+        "UMask": "0x2",
+        "BriefDescription": "Core crystal clock cycles when this thread is unhalted and the other thread is halted.",
+        "Counter": "0,1,2,3",
+        "EventName": "CPU_CLK_THREAD_UNHALTED.ONE_THREAD_ACTIVE",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x3C",
+        "UMask": "0x2",
+        "BriefDescription": "Core crystal clock cycles when this thread is unhalted and the other thread is halted.",
+        "Counter": "0,1,2,3",
+        "EventName": "CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE",
+        "SampleAfterValue": "2503",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x4C",
+        "UMask": "0x1",
+        "BriefDescription": "Demand load dispatches that hit L1D fill buffer (FB) allocated for software prefetch.",
+        "Counter": "0,1,2,3",
+        "EventName": "LOAD_HIT_PRE.SW_PF",
+        "PublicDescription": "Counts all not software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x5E",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles when Reservation Station (RS) is empty for the thread",
+        "Counter": "0,1,2,3",
+        "EventName": "RS_EVENTS.EMPTY_CYCLES",
+        "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for the thread. Note: In ST-mode, not active thread should drive 0. This is usually caused by severely costly branch mispredictions, or allocator/FE issues.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EdgeDetect": "1",
+        "Invert": "1",
+        "EventCode": "0x5E",
+        "UMask": "0x1",
+        "BriefDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate Frontend Latency Bound issues.",
+        "Counter": "0,1,2,3",
+        "EventName": "RS_EVENTS.EMPTY_END",
+        "CounterMask": "1",
+        "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to precisely locate front-end Latency Bound issues.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x87",
+        "UMask": "0x1",
+        "BriefDescription": "Stalls caused by changing prefix length of the instruction.",
+        "Counter": "0,1,2,3",
+        "EventName": "ILD_STALL.LCP",
+        "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA1",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles per thread when uops are executed in port 0",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_DISPATCHED_PORT.PORT_0",
+        "PublicDescription": "Counts, on the per-thread basis, cycles during which at least one uop is dispatched from the Reservation Station (RS) to port 0.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA1",
+        "UMask": "0x2",
+        "BriefDescription": "Cycles per thread when uops are executed in port 1",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_DISPATCHED_PORT.PORT_1",
+        "PublicDescription": "Counts, on the per-thread basis, cycles during which at least one uop is dispatched from the Reservation Station (RS) to port 1.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA1",
+        "UMask": "0x4",
+        "BriefDescription": "Cycles per thread when uops are executed in port 2",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_DISPATCHED_PORT.PORT_2",
+        "PublicDescription": "Counts, on the per-thread basis, cycles during which at least one uop is dispatched from the Reservation Station (RS) to port 2.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA1",
+        "UMask": "0x8",
+        "BriefDescription": "Cycles per thread when uops are executed in port 3",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_DISPATCHED_PORT.PORT_3",
+        "PublicDescription": "Counts, on the per-thread basis, cycles during which at least one uop is dispatched from the Reservation Station (RS) to port 3.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA1",
+        "UMask": "0x10",
+        "BriefDescription": "Cycles per thread when uops are executed in port 4",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_DISPATCHED_PORT.PORT_4",
+        "PublicDescription": "Counts, on the per-thread basis, cycles during which at least one uop is dispatched from the Reservation Station (RS) to port 4.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA1",
+        "UMask": "0x20",
+        "BriefDescription": "Cycles per thread when uops are executed in port 5",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_DISPATCHED_PORT.PORT_5",
+        "PublicDescription": "Counts, on the per-thread basis, cycles during which at least one uop is dispatched from the Reservation Station (RS) to port 5.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA1",
+        "UMask": "0x40",
+        "BriefDescription": "Cycles per thread when uops are executed in port 6",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_DISPATCHED_PORT.PORT_6",
+        "PublicDescription": "Counts, on the per-thread basis, cycles during which at least one uop is dispatched from the Reservation Station (RS) to port 6.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA1",
+        "UMask": "0x80",
+        "BriefDescription": "Cycles per thread when uops are executed in port 7",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_DISPATCHED_PORT.PORT_7",
+        "PublicDescription": "Counts, on the per-thread basis, cycles during which at least one uop is dispatched from the Reservation Station (RS) to port 7.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA2",
+        "UMask": "0x1",
+        "BriefDescription": "Resource-related stall cycles",
+        "Counter": "0,1,2,3",
+        "EventName": "RESOURCE_STALLS.ANY",
+        "PublicDescription": "Counts resource-related stall cycles. Reasons for stalls can be as follows: a. *any* u-arch structure got full (LB, SB, RS, ROB, BOB, LM, Physical Register Reclaim Table (PRRT), or Physical History Table (PHT) slots). b. *any* u-arch structure got empty (like INT/SIMD FreeLists). c. FPU control word (FPCW), MXCSR; and others. This counts cycles that the pipeline back-end blocked uop delivery from the front-end.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA2",
+        "UMask": "0x8",
+        "BriefDescription": "Cycles stalled due to no store buffers available. (not including draining from sync).",
+        "Counter": "0,1,2,3",
+        "EventName": "RESOURCE_STALLS.SB",
+        "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA3",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles while L2 cache miss demand load is outstanding.",
+        "Counter": "0,1,2,3",
+        "EventName": "CYCLE_ACTIVITY.CYCLES_L2_MISS",
+        "CounterMask": "1",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA3",
+        "UMask": "0x4",
+        "BriefDescription": "Total execution stalls.",
+        "Counter": "0,1,2,3",
+        "EventName": "CYCLE_ACTIVITY.STALLS_TOTAL",
+        "CounterMask": "4",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA3",
+        "UMask": "0x5",
+        "BriefDescription": "Execution stalls while L2 cache miss demand load is outstanding.",
+        "Counter": "0,1,2,3",
+        "EventName": "CYCLE_ACTIVITY.STALLS_L2_MISS",
+        "CounterMask": "5",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA3",
+        "UMask": "0x8",
+        "BriefDescription": "Cycles while L1 cache miss demand load is outstanding.",
+        "Counter": "0,1,2,3",
+        "EventName": "CYCLE_ACTIVITY.CYCLES_L1D_MISS",
+        "CounterMask": "8",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA3",
+        "UMask": "0xc",
+        "BriefDescription": "Execution stalls while L1 cache miss demand load is outstanding.",
+        "Counter": "0,1,2,3",
+        "EventName": "CYCLE_ACTIVITY.STALLS_L1D_MISS",
+        "CounterMask": "12",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA3",
+        "UMask": "0x10",
+        "BriefDescription": "Cycles while memory subsystem has an outstanding load.",
+        "Counter": "0,1,2,3",
+        "EventName": "CYCLE_ACTIVITY.CYCLES_MEM_ANY",
+        "CounterMask": "16",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA3",
+        "UMask": "0x14",
+        "BriefDescription": "Execution stalls while memory subsystem has an outstanding load.",
+        "Counter": "0,1,2,3",
+        "EventName": "CYCLE_ACTIVITY.STALLS_MEM_ANY",
+        "CounterMask": "20",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xA6",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles where no uops were executed, the Reservation Station was not empty, the Store Buffer was full and there was no outstanding load.",
+        "Counter": "0,1,2,3",
+        "EventName": "EXE_ACTIVITY.EXE_BOUND_0_PORTS",
+        "PublicDescription": "Counts cycles during which no uops were executed on all ports and Reservation Station (RS) was not empty.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA6",
+        "UMask": "0x2",
+        "BriefDescription": "Cycles total of 1 uop is executed on all ports and Reservation Station was not empty.",
+        "Counter": "0,1,2,3",
+        "EventName": "EXE_ACTIVITY.1_PORTS_UTIL",
+        "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA6",
+        "UMask": "0x4",
+        "BriefDescription": "Cycles total of 2 uops are executed on all ports and Reservation Station was not empty.",
+        "Counter": "0,1,2,3",
+        "EventName": "EXE_ACTIVITY.2_PORTS_UTIL",
+        "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA6",
+        "UMask": "0x8",
+        "BriefDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station was not empty.",
+        "Counter": "0,1,2,3",
+        "EventName": "EXE_ACTIVITY.3_PORTS_UTIL",
+        "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA6",
+        "UMask": "0x10",
+        "BriefDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station was not empty.",
+        "Counter": "0,1,2,3",
+        "EventName": "EXE_ACTIVITY.4_PORTS_UTIL",
+        "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA6",
+        "UMask": "0x40",
+        "BriefDescription": "Cycles where the Store Buffer was full and no outstanding load.",
+        "Counter": "0,1,2,3",
+        "EventName": "EXE_ACTIVITY.BOUND_ON_STORES",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA8",
+        "UMask": "0x1",
+        "BriefDescription": "Number of Uops delivered by the LSD.",
+        "Counter": "0,1,2,3",
+        "EventName": "LSD.UOPS",
+        "PublicDescription": "Number of uops delivered to the back-end by the LSD(Loop Stream Detector).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA8",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles Uops delivered by the LSD, but didn't come from the decoder.",
+        "Counter": "0,1,2,3",
+        "EventName": "LSD.CYCLES_ACTIVE",
+        "CounterMask": "1",
+        "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xA8",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles 4 Uops delivered by the LSD, but didn't come from the decoder.",
+        "Counter": "0,1,2,3",
+        "EventName": "LSD.CYCLES_4_UOPS",
+        "CounterMask": "4",
+        "PublicDescription": "Counts the cycles when 4 uops are delivered by the LSD (Loop-stream detector).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB1",
+        "UMask": "0x1",
+        "BriefDescription": "Counts the number of uops to be executed per-thread each cycle.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.THREAD",
+        "PublicDescription": "Number of uops to be executed per-thread each cycle.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "Invert": "1",
+        "EventCode": "0xB1",
+        "UMask": "0x1",
+        "BriefDescription": "Counts number of cycles no uops were dispatched to be executed on this thread.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.STALL_CYCLES",
+        "CounterMask": "1",
+        "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB1",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles where at least 1 uop was executed per-thread",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC",
+        "CounterMask": "1",
+        "PublicDescription": "Cycles where at least 1 uop was executed per-thread.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB1",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles where at least 2 uops were executed per-thread",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC",
+        "CounterMask": "2",
+        "PublicDescription": "Cycles where at least 2 uops were executed per-thread.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB1",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles where at least 3 uops were executed per-thread",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC",
+        "CounterMask": "3",
+        "PublicDescription": "Cycles where at least 3 uops were executed per-thread.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB1",
+        "UMask": "0x1",
+        "BriefDescription": "Cycles where at least 4 uops were executed per-thread",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.CYCLES_GE_4_UOPS_EXEC",
+        "CounterMask": "4",
+        "PublicDescription": "Cycles where at least 4 uops were executed per-thread.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB1",
+        "UMask": "0x2",
+        "BriefDescription": "Number of uops executed on the core.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.CORE",
+        "PublicDescription": "Number of uops executed from any thread.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB1",
+        "UMask": "0x2",
+        "BriefDescription": "Cycles at least 1 micro-op is executed from any thread on physical core.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_1",
+        "CounterMask": "1",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB1",
+        "UMask": "0x2",
+        "BriefDescription": "Cycles at least 2 micro-ops are executed from any thread on physical core.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_2",
+        "CounterMask": "2",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB1",
+        "UMask": "0x2",
+        "BriefDescription": "Cycles at least 3 micro-ops are executed from any thread on physical core.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_3",
+        "CounterMask": "3",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB1",
+        "UMask": "0x2",
+        "BriefDescription": "Cycles at least 4 micro-ops are executed from any thread on physical core.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_4",
+        "CounterMask": "4",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "Invert": "1",
+        "EventCode": "0xB1",
+        "UMask": "0x2",
+        "BriefDescription": "Cycles with no micro-ops executed from any thread on physical core.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.CORE_CYCLES_NONE",
+        "CounterMask": "1",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xB1",
+        "UMask": "0x10",
+        "BriefDescription": "Counts the number of x87 uops dispatched.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_EXECUTED.X87",
+        "PublicDescription": "Counts the number of x87 uops executed.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC0",
+        "UMask": "0x0",
+        "BriefDescription": "Number of instructions retired. General Counter - architectural event",
+        "Counter": "0,1,2,3",
+        "EventName": "INST_RETIRED.ANY_P",
+        "Errata": "SKL091, SKL044",
+        "PublicDescription": "Counts the number of instructions (EOMs) retired. Counting covers macro-fused instructions individually (that is, increments by two).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC0",
+        "UMask": "0x1",
+        "BriefDescription": "Precise instruction retired event with HW to reduce effect of PEBS shadow in IP distribution",
+        "PEBS": "2",
+        "Counter": "1",
+        "EventName": "INST_RETIRED.PREC_DIST",
+        "Errata": "SKL091, SKL044",
+        "PublicDescription": "A version of INST_RETIRED that allows for a more unbiased distribution of samples across instructions retired. It utilizes the Precise Distribution of Instructions Retired (PDIR) feature to mitigate some bias in how retired instructions get sampled.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "1"
+    },
+    {
+        "Invert": "1",
+        "EventCode": "0xC0",
+        "UMask": "0x1",
+        "BriefDescription": "Number of cycles using always true condition applied to  PEBS instructions retired event.",
+        "PEBS": "2",
+        "Counter": "0,2,3",
+        "EventName": "INST_RETIRED.TOTAL_CYCLES_PS",
+        "CounterMask": "10",
+        "Errata": "SKL091, SKL044",
+        "PublicDescription": "Number of cycles using an always true condition applied to  PEBS instructions retired event. (inst_ret< 16)",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,2,3"
+    },
+    {
+        "EventCode": "0xC1",
+        "UMask": "0x3f",
+        "BriefDescription": "Number of times a microcode assist is invoked by HW other than FP-assist. Examples include AD (page Access Dirty) and AVX* related assists.",
+        "Counter": "0,1,2,3",
+        "EventName": "OTHER_ASSISTS.ANY",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC2",
+        "UMask": "0x2",
+        "BriefDescription": "Retirement slots used.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_RETIRED.RETIRE_SLOTS",
+        "PublicDescription": "Counts the retirement slots used.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "Invert": "1",
+        "EventCode": "0xC2",
+        "UMask": "0x2",
+        "BriefDescription": "Cycles without actually retired uops.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_RETIRED.STALL_CYCLES",
+        "CounterMask": "1",
+        "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts cycles without actually retired uops.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "Invert": "1",
+        "EventCode": "0xC2",
+        "UMask": "0x2",
+        "BriefDescription": "Cycles with less than 10 actually retired uops.",
+        "Counter": "0,1,2,3",
+        "EventName": "UOPS_RETIRED.TOTAL_CYCLES",
+        "CounterMask": "10",
+        "PublicDescription": "Number of cycles using always true condition (uops_ret < 16) applied to non PEBS uops retired event.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EdgeDetect": "1",
+        "EventCode": "0xC3",
+        "UMask": "0x1",
+        "BriefDescription": "Number of machine clears (nukes) of any type. ",
+        "Counter": "0,1,2,3",
+        "EventName": "MACHINE_CLEARS.COUNT",
+        "CounterMask": "1",
+        "PublicDescription": "Number of machine clears (nukes) of any type.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC3",
+        "UMask": "0x4",
+        "BriefDescription": "Self-modifying code (SMC) detected.",
+        "Counter": "0,1,2,3",
+        "EventName": "MACHINE_CLEARS.SMC",
+        "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC4",
+        "UMask": "0x0",
+        "BriefDescription": "All (macro) branch instructions retired.",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_INST_RETIRED.ALL_BRANCHES",
+        "Errata": "SKL091",
+        "PublicDescription": "Counts all (macro) branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC4",
+        "UMask": "0x1",
+        "BriefDescription": "Conditional branch instructions retired.",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_INST_RETIRED.CONDITIONAL",
+        "Errata": "SKL091",
+        "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts conditional branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC4",
+        "UMask": "0x2",
+        "BriefDescription": "Direct and indirect near call instructions retired.",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_INST_RETIRED.NEAR_CALL",
+        "Errata": "SKL091",
+        "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts both direct and indirect near call instructions retired.",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC4",
+        "UMask": "0x4",
+        "BriefDescription": "All (macro) branch instructions retired. ",
+        "PEBS": "2",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_INST_RETIRED.ALL_BRANCHES_PEBS",
+        "Errata": "SKL091",
+        "PublicDescription": "This is a precise version of BR_INST_RETIRED.ALL_BRANCHES that counts all (macro) branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC4",
+        "UMask": "0x8",
+        "BriefDescription": "Return instructions retired.",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_INST_RETIRED.NEAR_RETURN",
+        "Errata": "SKL091",
+        "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts return instructions retired.",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC4",
+        "UMask": "0x10",
+        "BriefDescription": "Not taken branch instructions retired.",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_INST_RETIRED.NOT_TAKEN",
+        "Errata": "SKL091",
+        "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts not taken branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC4",
+        "UMask": "0x20",
+        "BriefDescription": "Taken branch instructions retired.",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_INST_RETIRED.NEAR_TAKEN",
+        "Errata": "SKL091",
+        "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts taken branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC4",
+        "UMask": "0x40",
+        "BriefDescription": "Far branch instructions retired.",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_INST_RETIRED.FAR_BRANCH",
+        "Errata": "SKL091",
+        "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts far branch instructions retired.",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC5",
+        "UMask": "0x0",
+        "BriefDescription": "All mispredicted macro branch instructions retired.",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_MISP_RETIRED.ALL_BRANCHES",
+        "PublicDescription": "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch.  When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path.",
+        "SampleAfterValue": "400009",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC5",
+        "UMask": "0x1",
+        "BriefDescription": "Mispredicted conditional branch instructions retired.",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_MISP_RETIRED.CONDITIONAL",
+        "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted conditional branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC5",
+        "UMask": "0x2",
+        "BriefDescription": "Mispredicted direct and indirect near call instructions retired.",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_MISP_RETIRED.NEAR_CALL",
+        "PublicDescription": "Counts both taken and not taken retired mispredicted direct and indirect near calls, including both register and memory indirect.",
+        "SampleAfterValue": "400009",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xC5",
+        "UMask": "0x4",
+        "BriefDescription": "Mispredicted macro branch instructions retired. ",
+        "PEBS": "2",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_MISP_RETIRED.ALL_BRANCHES_PEBS",
+        "PublicDescription": "This is a precise version of BR_MISP_RETIRED.ALL_BRANCHES that counts all mispredicted macro branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "CounterHTOff": "0,1,2,3"
+    },
+    {
+        "EventCode": "0xC5",
+        "UMask": "0x20",
+        "BriefDescription": "Number of near branch instructions retired that were mispredicted and taken.",
+        "PEBS": "1",
+        "Counter": "0,1,2,3",
+        "EventName": "BR_MISP_RETIRED.NEAR_TAKEN",
+        "SampleAfterValue": "400009",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xCC",
+        "UMask": "0x20",
+        "BriefDescription": "Increments whenever there is an update to the LBR array.",
+        "Counter": "0,1,2,3",
+        "EventName": "ROB_MISC_EVENTS.LBR_INSERTS",
+        "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xE6",
+        "UMask": "0x1",
+        "BriefDescription": "Counts the total number when the front end is resteered, mainly when the BPU cannot provide a correct prediction and this is corrected by other branch handling mechanisms at the front end.",
+        "Counter": "0,1,2,3",
+        "EventName": "BACLEARS.ANY",
+        "PublicDescription": "Counts the number of times the front-end is resteered when it finds a branch instruction in a fetch line. This occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-memory.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-memory.json
new file mode 100644
index 000000000000..9c7e5f8beee2
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-memory.json
@@ -0,0 +1,172 @@
+[
+    {
+        "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x4",
+        "EventName": "LLC_MISSES.MEM_READ",
+        "PerPkg": "1",
+        "ScaleUnit": "64Bytes",
+        "UMask": "0x3",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x4",
+        "EventName": "LLC_MISSES.MEM_WRITE",
+        "PerPkg": "1",
+        "ScaleUnit": "64Bytes",
+        "UMask": "0xC",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Memory controller clock ticks",
+        "Counter": "0,1,2,3",
+        "EventName": "UNC_M_CLOCKTICKS",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x85",
+        "EventName": "UNC_M_POWER_CHANNEL_PPD",
+        "MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_channel_ppd %",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Cycles Memory is in self refresh power mode",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x43",
+        "EventName": "UNC_M_POWER_SELF_REFRESH",
+        "MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_CLOCKTICKS) * 100.",
+        "MetricName": "power_self_refresh %",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Pre-charges due to page misses",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2",
+        "EventName": "UNC_M_PRE_COUNT.PAGE_MISS",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Pre-charge for reads",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2",
+        "EventName": "UNC_M_PRE_COUNT.RD",
+        "PerPkg": "1",
+        "UMask": "0x4",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Pre-charge for writes",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2",
+        "EventName": "UNC_M_PRE_COUNT.WR",
+        "PerPkg": "1",
+        "UMask": "0x8",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "DRAM Page Activate commands sent due to a write request",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x1",
+        "EventName": "UNC_M_ACT_COUNT.WR",
+        "PerPkg": "1",
+        "PublicDescription": "Counts DRAM Page Activate commands sent on this channel due to a write request to the iMC (Memory Controller).  Activate commands are issued to open up a page on the DRAM devices so that it can be read or written to with a CAS (Column Access Select) command.",
+        "UMask": "0x2",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "All DRAM CAS Commands issued",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x4",
+        "EventName": "UNC_M_CAS_COUNT.ALL",
+        "PerPkg": "1",
+        "PublicDescription": "Counts all CAS (Column Address Select) commands issued to DRAM per memory channel.  CAS commands are issued to specify the address to read or write on DRAM, so this event increments for every read and write. This event counts whether AutoPrecharge (which closes the DRAM Page automatically after a read/write) is enabled or not.",
+        "UMask": "0xF",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x4",
+        "EventName": "LLC_MISSES.MEM_READ",
+        "PerPkg": "1",
+        "ScaleUnit": "64Bytes",
+        "UMask": "0x3",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "All DRAM Read CAS Commands issued (does not include underfills) ",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x4",
+        "EventName": "UNC_M_CAS_COUNT.RD_REG",
+        "PerPkg": "1",
+        "PublicDescription": "Counts CAS (Column Access Select) regular read commands issued to DRAM on a per channel basis.  CAS commands are issued to specify the address to read or write on DRAM, and this event increments for every regular read.  This event only counts regular reads and does not include underfill reads due to partial write requests.  This event counts whether AutoPrecharge (which closes the DRAM Page automatically after a read/write) is enabled or not.",
+        "UMask": "0x1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "DRAM Underfill Read CAS Commands issued",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x4",
+        "EventName": "UNC_M_CAS_COUNT.RD_UNDERFILL",
+        "PerPkg": "1",
+        "PublicDescription": "Counts CAS (Column Access Select) underfill read commands issued to DRAM due to a partial write, on a per channel basis.  CAS commands are issued to specify the address to read or write on DRAM, and this command counts underfill reads.  Partial writes must be completed by first reading in the underfill from DRAM and then merging in the partial write data before writing the full line back to DRAM. This event will generally count about the same as the number of partial writes, but may be slightly less because of partials hitting in the WPQ (due to a previous write request). ",
+        "UMask": "0x2",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x4",
+        "EventName": "LLC_MISSES.MEM_WRITE",
+        "PerPkg": "1",
+        "ScaleUnit": "64Bytes",
+        "UMask": "0xC",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue Allocations",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x10",
+        "EventName": "UNC_M_RPQ_INSERTS",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of read requests allocated into the Read Pending Queue (RPQ).  This queue is used to schedule reads out to the memory controller and to track the requests.  Requests allocate into the RPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the CHA to the iMC.  The requests deallocate after the read CAS command has been issued to DRAM.  This event counts both Isochronous and non-Isochronous requests which were issued to the RPQ.    ",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Read Pending Queue Occupancy",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x80",
+        "EventName": "UNC_M_RPQ_OCCUPANCY",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of entries in the Read Pending Queue (RPQ) at each cycle.  This can then be used to calculate both the average occupancy of the queue (in conjunction with the number of cycles not empty) and the average latency in the queue (in conjunction with the number of allocations).  The RPQ is used to schedule reads out to the memory controller and to track the requests.  Requests allocate into the RPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the CHA to the iMC. They deallocate from the RPQ after the CAS command has been issued to memory.",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue Allocations",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x20",
+        "EventName": "UNC_M_WPQ_INSERTS",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of write requests allocated into the Write Pending Queue (WPQ).  The WPQ is used to schedule writes out to the memory controller and to track the requests.  Requests allocate into the WPQ soon after they enter the memory controller, and need credits for an entry in this buffer before being sent from the CHA to the iMC (Memory Controller).  The write requests deallocate after being issued to DRAM.  Write requests themselves are able to complete (from the perspective of the rest of the system) as soon as they have 'posted' to the iMC.",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Write Pending Queue Occupancy",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x81",
+        "EventName": "UNC_M_WPQ_OCCUPANCY",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of entries in the Write Pending Queue (WPQ) at each cycle.  This can then be used to calculate both the average queue occupancy (in conjunction with the number of cycles not empty) and the average latency (in conjunction with the number of allocations).  The WPQ is used to schedule writes out to the memory controller and to track the requests.",
+        "Unit": "iMC"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/uncore-other.json b/tools/perf/pmu-events/arch/x86/skylakex/uncore-other.json
new file mode 100644
index 000000000000..de6e70e552e2
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/skylakex/uncore-other.json
@@ -0,0 +1,1156 @@
+[
+    {
+        "BriefDescription": "Uncore cache clock ticks",
+        "Counter": "0,1,2,3",
+        "EventName": "UNC_CHA_CLOCKTICKS",
+        "PerPkg": "1",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "LLC misses - Uncacheable reads (from cpu). Derived from unc_cha_tor_inserts.ia_miss",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x35",
+        "EventName": "LLC_MISSES.UNCACHEABLE",
+        "Filter": "config1=0x40e33",
+        "PerPkg": "1",
+        "UMask": "0x21",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "MMIO reads. Derived from unc_cha_tor_inserts.ia_miss",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x35",
+        "EventName": "LLC_MISSES.MMIO_READ",
+        "Filter": "config1=0x40040e33",
+        "PerPkg": "1",
+        "UMask": "0x21",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "MMIO writes. Derived from unc_cha_tor_inserts.ia_miss",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x35",
+        "EventName": "LLC_MISSES.MMIO_WRITE",
+        "Filter": "config1=0x40041e33",
+        "PerPkg": "1",
+        "UMask": "0x21",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Streaming stores (full cache line). Derived from unc_cha_tor_inserts.ia_miss",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x35",
+        "EventName": "LLC_REFERENCES.STREAMING_FULL",
+        "Filter": "config1=0x41833",
+        "PerPkg": "1",
+        "ScaleUnit": "64Bytes",
+        "UMask": "0x21",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Streaming stores (partial cache line). Derived from unc_cha_tor_inserts.ia_miss",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x35",
+        "EventName": "LLC_REFERENCES.STREAMING_PARTIAL",
+        "Filter": "config1=0x41a33",
+        "PerPkg": "1",
+        "ScaleUnit": "64Bytes",
+        "UMask": "0x21",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "read requests from home agent",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.READS",
+        "PerPkg": "1",
+        "UMask": "0x03",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "read requests from local home agent",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.READS_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "read requests from remote home agent",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.READS_REMOTE",
+        "PerPkg": "1",
+        "UMask": "0x02",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "write requests from home agent",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.WRITES",
+        "PerPkg": "1",
+        "UMask": "0x0C",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "write requests from local home agent",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.WRITES_LOCAL",
+        "PerPkg": "1",
+        "UMask": "0x04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "write requests from remote home agent",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.WRITES_REMOTE",
+        "PerPkg": "1",
+        "UMask": "0x08",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "UPI interconnect send bandwidth for payload. Derived from unc_upi_txl_flits.all_data",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2",
+        "EventName": "UPI_DATA_BANDWIDTH_TX",
+        "PerPkg": "1",
+        "ScaleUnit": "7.11E-06Bytes",
+        "UMask": "0x0F",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "PCI Express bandwidth reading at IIO. Derived from unc_iio_data_req_of_cpu.mem_read.part0",
+        "Counter": "0,1",
+        "EventCode": "0x83",
+        "EventName": "LLC_MISSES.PCIE_READ",
+        "FCMask": "0x07",
+        "Filter": "ch_mask=0x1f",
+        "MetricExpr": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3",
+        "MetricName": "LLC_MISSES.PCIE_READ",
+        "PerPkg": "1",
+        "PortMask": "0x01",
+        "ScaleUnit": "4Bytes",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCI Express bandwidth writing at IIO. Derived from unc_iio_data_req_of_cpu.mem_write.part0",
+        "Counter": "0,1",
+        "EventCode": "0x83",
+        "EventName": "LLC_MISSES.PCIE_WRITE",
+        "FCMask": "0x07",
+        "Filter": "ch_mask=0x1f",
+        "MetricExpr": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 +UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 +UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 +UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3",
+        "MetricName": "LLC_MISSES.PCIE_WRITE",
+        "PerPkg": "1",
+        "PortMask": "0x01",
+        "ScaleUnit": "4Bytes",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCI Express bandwidth writing at IIO, part 0",
+        "Counter": "0,1",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "MetricExpr": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART0 +UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1 +UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2 +UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3",
+        "MetricName": "LLC_MISSES.PCIE_WRITE",
+        "PerPkg": "1",
+        "PortMask": "0x01",
+        "ScaleUnit": "4Bytes",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCI Express bandwidth writing at IIO, part 1",
+        "Counter": "0,1",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "ScaleUnit": "4Bytes",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCI Express bandwidth writing at IIO, part 2",
+        "Counter": "0,1",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "ScaleUnit": "4Bytes",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCI Express bandwidth writing at IIO, part 3",
+        "Counter": "0,1",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "ScaleUnit": "4Bytes",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCI Express bandwidth reading at IIO, part 0",
+        "Counter": "0,1",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "MetricExpr": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART0 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2 + UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3",
+        "MetricName": "LLC_MISSES.PCIE_READ",
+        "PerPkg": "1",
+        "PortMask": "0x01",
+        "ScaleUnit": "4Bytes",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCI Express bandwidth reading at IIO, part 1",
+        "Counter": "0,1",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "ScaleUnit": "4Bytes",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCI Express bandwidth reading at IIO, part 2",
+        "Counter": "0,1",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "ScaleUnit": "4Bytes",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "PCI Express bandwidth reading at IIO, part 3",
+        "Counter": "0,1",
+        "EventCode": "0x83",
+        "EventName": "UNC_IIO_DATA_REQ_OF_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "ScaleUnit": "4Bytes",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Core Cross Snoops Issued; Multiple Core Requests",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x33",
+        "EventName": "UNC_CHA_CORE_SNP.CORE_GTONE",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of transactions that trigger a configurable number of cross snoops.  Cores are snooped if the transaction looks up the cache and determines that it is necessary based on the operation type and what CoreValid bits are set.  For example, if 2 CV bits are set on a data read, the cores must have the data in S state so it is not necessary to snoop them.  However, if only 1 CV bit is set the core may have modified the data.  If the transaction was an RFO, it would need to invalidate the lines.  This event can be filtered based on who triggered the initial snoop(s).",
+        "UMask": "0x42",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Core Cross Snoops Issued; Multiple Eviction",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x33",
+        "EventName": "UNC_CHA_CORE_SNP.EVICT_GTONE",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the number of transactions that trigger a configurable number of cross snoops.  Cores are snooped if the transaction looks up the cache and determines that it is necessary based on the operation type and what CoreValid bits are set.  For example, if 2 CV bits are set on a data read, the cores must have the data in S state so it is not necessary to snoop them.  However, if only 1 CV bit is set the core may have modified the data.  If the transaction was an RFO, it would need to invalidate the lines.  This event can be filtered based on who triggered the initial snoop(s).",
+        "UMask": "0x82",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory state lookups; Snoop Not Needed",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x53",
+        "EventName": "UNC_CHA_DIR_LOOKUP.NO_SNP",
+        "PerPkg": "1",
+        "PublicDescription": "Counts transactions that looked into the multi-socket cacheline Directory state, and therefore did not send a snoop because the Directory indicated it was not needed",
+        "UMask": "0x02",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory state lookups; Snoop Needed",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x53",
+        "EventName": "UNC_CHA_DIR_LOOKUP.SNP",
+        "PerPkg": "1",
+        "PublicDescription": "Counts transactions that looked into the multi-socket cacheline Directory state, and sent one or more snoops, because the Directory indicated it was needed",
+        "UMask": "0x01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory state updates; Directory Updated memory write from the HA pipe",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x54",
+        "EventName": "UNC_CHA_DIR_UPDATE.HA",
+        "PerPkg": "1",
+        "PublicDescription": "Counts only multi-socket cacheline Directory state updates due to memory writes issued from the HA pipe. This does not include memory write requests which are for I (Invalid) or E (Exclusive) cachelines.",
+        "UMask": "0x01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory state updates; Directory Updated memory write from TOR pipe",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x54",
+        "EventName": "UNC_CHA_DIR_UPDATE.TOR",
+        "PerPkg": "1",
+        "PublicDescription": "Counts only multi-socket cacheline Directory state updates due to memory writes issued from the TOR pipe which are the result of remote transaction hitting the SF/LLC and returning data Core2Core. This does not include memory write requests which are for I (Invalid) or E (Exclusive) cachelines.",
+        "UMask": "0x02",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Read request from a remote socket which hit in the HitMe Cache to a line In the E state",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x5F",
+        "EventName": "UNC_CHA_HITME_HIT.EX_RDS",
+        "PerPkg": "1",
+        "PublicDescription": "Counts read requests from a remote socket which hit in the HitME cache (used to cache the multi-socket Directory state) to a line in the E(Exclusive) state.  This includes the following read opcodes (RdCode, RdData, RdDataMigratory, RdCur, RdInv*, Inv*)",
+        "UMask": "0x01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Normal priority reads issued to the memory controller from the CHA",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x59",
+        "EventName": "UNC_CHA_IMC_READS_COUNT.NORMAL",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when a normal (Non-Isochronous) read is issued to any of the memory controller channels from the CHA.",
+        "UMask": "0x01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "CHA to iMC Full Line Writes Issued; Full Line Non-ISOCH",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x5B",
+        "EventName": "UNC_CHA_IMC_WRITES_COUNT.FULL",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when a normal (Non-Isochronous) full line write is issued from the CHA to any of the memory controller channels.",
+        "UMask": "0x01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Number of times that an RFO hit in S state.",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x39",
+        "EventName": "UNC_CHA_MISC.RFO_HIT_S",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when a RFO (the Read for Ownership issued before a  write) request hit a cacheline in the S (Shared) state.",
+        "UMask": "0x08",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Local requests for exclusive ownership of a cache line  without receiving data",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.INVITOE_LOCAL",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the total number of requests coming from a unit on this socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.",
+        "UMask": "0x10",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Remote requests for exclusive ownership of a cache line without receiving data",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x50",
+        "EventName": "UNC_CHA_REQUESTS.INVITOE_REMOTE",
+        "PerPkg": "1",
+        "PublicDescription": "Counts the total number of requests coming from a remote socket for exclusive ownership of a cache line without receiving data (INVITOE) to the CHA.",
+        "UMask": "0x20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "RspCnflct* Snoop Responses Received",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x5C",
+        "EventName": "UNC_CHA_SNOOP_RESP.RSPCNFLCTS",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when a transaction with the opcode type RspCnflct* Snoop Response was received. This is returned when a snoop finds an existing outstanding transaction in a remote caching agent. This triggers conflict resolution hardware. This covers both the opcode RspCnflct and RspCnflctWbI.",
+        "UMask": "0x40",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "RspI Snoop Responses Received",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x5C",
+        "EventName": "UNC_CHA_SNOOP_RESP.RSPI",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when a transaction with the opcode type RspI Snoop Response was received which indicates the remote cache does not have the data, or when the remote cache silently evicts data (such as when an RFO: the Read for Ownership issued before a write hits non-modified data).",
+        "UMask": "0x01",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "RspIFwd Snoop Responses Received",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x5C",
+        "EventName": "UNC_CHA_SNOOP_RESP.RSPIFWD",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when a transaction with the opcode type RspIFwd Snoop Response was received which indicates a remote caching agent forwarded the data and the requesting agent is able to acquire the data in E (Exclusive) or M (modified) states.  This is commonly returned with RFO (the Read for Ownership issued before a write) transactions.  The snoop could have either been to a cacheline in the M,E,F (Modified, Exclusive or Forward) states.",
+        "UMask": "0x04",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "RspSFwd Snoop Responses Received",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x5C",
+        "EventName": "UNC_CHA_SNOOP_RESP.RSPSFWD",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when a transaction with the opcode type RspSFwd Snoop Response was received which indicates a remote caching agent forwarded the data but held on to its current copy.  This is common for data and code reads that hit in a remote socket in E (Exclusive) or F (Forward) state.",
+        "UMask": "0x08",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Rsp*Fwd*WB Snoop Responses Received",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x5C",
+        "EventName": "UNC_CHA_SNOOP_RESP.RSP_FWD_WB",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when a transaction with the opcode type Rsp*Fwd*WB Snoop Response was received which indicates the data was written back to its home socket, and the cacheline was forwarded to the requestor socket.  This snoop response is only used in >= 4 socket systems.  It is used when a snoop HITM's in a remote caching agent and it directly forwards data to a requestor, and simultaneously returns data to its home socket to be written back to memory.",
+        "UMask": "0x20",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Rsp*WB Snoop Responses Received",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x5C",
+        "EventName": "UNC_CHA_SNOOP_RESP.RSP_WBWB",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when a transaction with the opcode type Rsp*WB Snoop Response was received which indicates the data was written back to its home.  This is returned when a non-RFO request hits a cacheline in the Modified state. The Cache can either downgrade the cacheline to a S (Shared) or I (Invalid) state depending on how the system has been configured.  This response will also be sent when a cache requests E (Exclusive) ownership of a cache line without receiving data, because the cache must acquire ownership.",
+        "UMask": "0x10",
+        "Unit": "CHA"
+    },
+    {
+        "BriefDescription": "Clockticks of the IIO Traffic Controller",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x1",
+        "EventName": "UNC_IIO_CLOCKTICKS",
+        "PerPkg": "1",
+        "PublicDescription": "Counts clockticks of the 1GHz traffic controller clock in the IIO unit.",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for 4 bytes made by the CPU to IIO Part0",
+        "Counter": "2,3",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x01",
+        "PublicDescription": "Counts every read request for 4 bytes of data made by a unit on the main die (generally a core) to the MMIO space of a card on IIO Part0. In the general case, Part0 refers to a standard PCIe card of any size (x16,x8,x4) that is plugged directly into one of the PCIe slots. Part0 could also refer to any device plugged into the first slot of a PCIe riser card or to a device attached to the IIO unit which starts its use of the bus using lane 0 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for 4 bytes made by the CPU to IIO Part1",
+        "Counter": "2,3",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "PublicDescription": "Counts every read request for 4 bytes of data made by a unit on the main die (generally a core) to the MMIO space of a card on IIO Part1. In the general case, Part1 refers to a x4 PCIe card plugged into the second slot of a PCIe riser card, but it could refer to any x4 device attached to the IIO unit using lanes starting at lane 4 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for 4 bytes made by the CPU to IIO Part2",
+        "Counter": "2,3",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "PublicDescription": "Counts every read request for 4 bytes of data made by a unit on the main die (generally a core) to the MMIO space of a card on IIO Part2. In the general case, Part2 refers to a x4 or x8 PCIe card plugged into the third slot of a PCIe riser card, but it could refer to any x4 or x8 device attached to the IIO unit and using lanes starting at lane 8 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for 4 bytes made by the CPU to IIO Part3",
+        "Counter": "2,3",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "PublicDescription": "Counts every read request for 4 bytes of data made by a unit on the main die (generally a core) to the MMIO space of a card on IIO Part3. In the general case, Part3 refers to a x4 PCIe card plugged into the fourth slot of a PCIe riser card, but it could refer to any device attached to the IIO unit using the lanes starting at lane 12 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of 4 bytes made to IIO Part0 by the CPU",
+        "Counter": "2,3",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x01",
+        "PublicDescription": "Counts every write request of 4 bytes of data made to the MMIO space of a card on IIO Part0 by a unit on the main die (generally a core). In the general case, Part0 refers to a standard PCIe card of any size (x16,x8,x4) that is plugged directly into one of the PCIe slots. Part0 could also refer to any device plugged into the first slot of a PCIe riser card or to a device attached to the IIO unit which starts its use of the bus using lane 0 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of 4 bytes made to IIO Part1 by the CPU",
+        "Counter": "2,3",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "PublicDescription": "Counts every write request of 4 bytes of data made to the MMIO space of a card on IIO Part1 by a unit on the main die (generally a core). In the general case, Part1 refers to a x4 PCIe card plugged into the second slot of a PCIe riser card, but it could refer to any x4 device attached to the IIO unit using lanes starting at lane 4 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of 4 bytes made to IIO Part2 by the CPU ",
+        "Counter": "2,3",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "PublicDescription": "Counts every write request of 4 bytes of data made to the MMIO space of a card on IIO Part2 by a unit on the main die (generally a core). In the general case, Part2 refers to a x4 or x8 PCIe card plugged into the third slot of a PCIe riser card, but it could refer to any x4 or x8 device attached to the IIO unit and using lanes starting at lane 8 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of 4 bytes made to IIO Part3 by the CPU ",
+        "Counter": "2,3",
+        "EventCode": "0xC0",
+        "EventName": "UNC_IIO_DATA_REQ_BY_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "PublicDescription": "Counts every write request of 4 bytes of data made to the MMIO space of a card on IIO Part3 by a unit on the main die (generally a core). In the general case, Part3 refers to a x4 PCIe card plugged into the fourth slot of a PCIe riser card, but it could refer to any device attached to the IIO unit using the lanes starting at lane 12 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for up to a 64 byte transaction is made by the CPU to IIO Part0",
+        "Counter": "0,1,2,3",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x01",
+        "PublicDescription": "Counts every read request for up to a 64 byte transaction of data made by a unit on the main die (generally a core) to the MMIO space of a card on IIO Part0. In the general case, part0 refers to a standard PCIe card of any size (x16,x8,x4) that is plugged directly into one of the PCIe slots. Part0 could also refer to any device plugged into the first slot of a PCIe riser card or to a device attached to the IIO unit which starts its use of the bus using lane 0 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for up to a 64 byte transaction is made by the CPU to IIO Part1",
+        "Counter": "0,1,2,3",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "PublicDescription": "Counts every read request for up to a 64 byte transaction of data made by a unit on the main die (generally a core) to the MMIO space of a card on IIO Part1. In the general case, Part1 refers to a x4 PCIe card plugged into the second slot of a PCIe riser card, but it could refer to any x4 device attached to the IIO unit using lanes starting at lane 4 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for up to a 64 byte transaction is made by the CPU to IIO Part2",
+        "Counter": "0,1,2,3",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "PublicDescription": "Counts every read request for up to a 64 byte transaction of data made by a unit on the main die (generally a core) to the MMIO space of a card on IIO Part2. In the general case, Part2 refers to a x4 or x8 PCIe card plugged into the third slot of a PCIe riser card, but it could refer to any x4 or x8 device attached to the IIO unit and using lanes starting at lane 8 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for up to a 64 byte transaction is made by the CPU to IIO Part3",
+        "Counter": "0,1,2,3",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "PublicDescription": "Counts every read request for up to a 64 byte transaction of data made by a unit on the main die (generally a core) to the MMIO space of a card on IIO Part3. In the general case, Part3 refers to a x4 PCIe card plugged into the fourth slot of a PCIe riser card, but it could refer to any device attached to the IIO unit using the lanes starting at lane 12 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of up to a 64 byte transaction is made to IIO Part0 by the CPU",
+        "Counter": "0,1,2,3",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x01",
+        "PublicDescription": "Counts every write request of up to a 64 byte transaction of data made to the MMIO space of a card on IIO Part0 by a unit on the main die (generally a core). In the general case, Part0 refers to a standard PCIe card of any size (x16,x8,x4) that is plugged directly into one of the PCIe slots. Part0 could also refer to any device plugged into the first slot of a PCIe riser card or to a device attached to the IIO unit which starts its use of the bus using lane 0 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of up to a 64 byte transaction is made to IIO Part1 by the CPU",
+        "Counter": "0,1,2,3",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "PublicDescription": "Counts every write request of up to a 64 byte transaction of data made to the MMIO space of a card on IIO Part1 by a unit on the main die (generally a core). In the general case, Part1 refers to a x4 PCIe card plugged into the second slot of a PCIe riser card, but it could refer to any x4 device attached to the IIO unit using lanes starting at lane 4 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of up to a 64 byte transaction is made to IIO Part2 by the CPU ",
+        "Counter": "0,1,2,3",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "PublicDescription": "Counts every write request of up to a 64 byte transaction of data made to the MMIO space of a card on IIO Part2 by a unit on the main die (generally a core). In the general case, Part2 refers to a x4 or x8 PCIe card plugged into the third slot of a PCIe riser card, but it could refer to any x4 or x8 device attached to the IIO unit and using lanes starting at lane 8 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of up to a 64 byte transaction is made to IIO Part3 by the CPU ",
+        "Counter": "0,1,2,3",
+        "EventCode": "0xC1",
+        "EventName": "UNC_IIO_TXN_REQ_BY_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "PublicDescription": "Counts every write request of up to a 64 byte transaction of data made to the MMIO space of a card on IIO Part3 by a unit on the main die (generally a core). In the general case, Part3 refers to a x4 PCIe card plugged into the fourth slot of a PCIe riser card, but it could refer to any device attached to the IIO unit using the lanes starting at lane 12 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for up to a 64 byte transaction is made by IIO Part0 to Memory",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x01",
+        "PublicDescription": "Counts every read request for up to a 64 byte transaction of data made by IIO Part0 to a unit on the main die (generally memory). In the general case, Part0 refers to a standard PCIe card of any size (x16,x8,x4) that is plugged directly into one of the PCIe slots. Part0 could also refer to any device plugged into the first slot of a PCIe riser card or to a device attached to the IIO unit which starts its use of the bus using lane 0 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for up to a 64 byte transaction is  made by IIO Part1 to Memory",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "PublicDescription": "Counts every read request for up to a 64 byte transaction of data made by IIO Part1 to a unit on the main die (generally memory). In the general case, Part1 refers to a x4 PCIe card plugged into the second slot of a PCIe riser card, but it could refer to any x4 device attached to the IIO unit using lanes starting at lane 4 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for up to a 64 byte transaction is made by IIO Part2 to Memory",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "PublicDescription": "Counts every read request for up to a 64 byte transaction of data made by IIO Part2 to a unit on the main die (generally memory). In the general case, Part2 refers to a x4 or x8 PCIe card plugged into the third slot of a PCIe riser card, but it could refer to any x4 or x8 device attached to the IIO unit and using lanes starting at lane 8 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Read request for up to a 64 byte transaction is made by IIO Part3 to Memory",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_READ.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "PublicDescription": "Counts every read request for up to a 64 byte transaction of data made by IIO Part3 to a unit on the main die (generally memory). In the general case, Part3 refers to a x4 PCIe card plugged into the fourth slot of a PCIe riser card, but it could refer to any device attached to the IIO unit using the lanes starting at lane 12 of the 16 lanes supported by the bus.",
+        "UMask": "0x04",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of up to a 64 byte transaction is made by IIO Part0 to Memory",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART0",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x01",
+        "PublicDescription": "Counts every write request of up to a 64 byte transaction of data made by IIO Part0 to a unit on the main die (generally memory). In the general case, Part0 refers to a standard PCIe card of any size (x16,x8,x4) that is plugged directly into one of the PCIe slots. Part0 could also refer to any device plugged into the first slot of a PCIe riser card or to a device attached to the IIO unit which starts its use of the bus using lane 0 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of up to a 64 byte transaction is made by IIO Part1 to Memory",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART1",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x02",
+        "PublicDescription": "Counts every write request of up to a 64 byte transaction of data made by IIO Part1 to a unit on the main die (generally memory). In the general case, Part1 refers to a x4 PCIe card plugged into the second slot of a PCIe riser card, but it could refer to any x4 device attached to the IIO unit using lanes starting at lane 4 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of up to a 64 byte transaction is made by IIO Part2 to Memory",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART2",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x04",
+        "PublicDescription": "Counts every write request of up to a 64 byte transaction of data made by IIO Part2 to a unit on the main die (generally memory). In the general case, Part2 refers to a x4 or x8 PCIe card plugged into the third slot of a PCIe riser card, but it could refer to any x4 or x8 device attached to the IIO unit and using lanes starting at lane 8 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Write request of up to a 64 byte transaction is made by IIO Part3 to Memory",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x84",
+        "EventName": "UNC_IIO_TXN_REQ_OF_CPU.MEM_WRITE.PART3",
+        "FCMask": "0x07",
+        "PerPkg": "1",
+        "PortMask": "0x08",
+        "PublicDescription": "Counts every write request of up to a 64 byte transaction of data made by IIO Part3 to a unit on the main die (generally memory). In the general case, Part3 refers to a x4 PCIe card plugged into the fourth slot of a PCIe riser card, but it could refer to any device attached to the IIO unit using the lanes starting at lane 12 of the 16 lanes supported by the bus.",
+        "UMask": "0x01",
+        "Unit": "IIO"
+    },
+    {
+        "BriefDescription": "Traffic in which the M2M to iMC Bypass was not taken",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x22",
+        "EventName": "UNC_M2M_BYPASS_M2M_Egress.NOT_TAKEN",
+        "PerPkg": "1",
+        "PublicDescription": "Counts traffic in which the M2M (Mesh to Memory) to iMC (Memory Controller) bypass was not taken",
+        "UMask": "0x2",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Cycles when direct to core mode (which bypasses the CHA) was disabled",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x24",
+        "EventName": "UNC_M2M_DIRECT2CORE_NOT_TAKEN_DIRSTATE",
+        "PerPkg": "1",
+        "PublicDescription": "Counts cycles when direct to core mode (which bypasses the CHA) was disabled",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Messages sent direct to core (bypassing the CHA)",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x23",
+        "EventName": "UNC_M2M_DIRECT2CORE_TAKEN",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when messages were sent direct to core (bypassing the CHA)",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Number of reads in which direct to core transactions were overridden",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x25",
+        "EventName": "UNC_M2M_DIRECT2CORE_TXN_OVERRIDE",
+        "PerPkg": "1",
+        "PublicDescription": "Counts reads in which direct to core transactions (which would have bypassed the CHA) were overridden",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Number of reads in which direct to Intel UPI transactions were overridden",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x28",
+        "EventName": "UNC_M2M_DIRECT2UPI_NOT_TAKEN_CREDITS",
+        "PerPkg": "1",
+        "PublicDescription": "Counts reads in which direct to Intel Ultra Path Interconnect (UPI) transactions (which would have bypassed the CHA) were overridden",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Cycles when direct to Intel UPI was disabled",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x27",
+        "EventName": "UNC_M2M_DIRECT2UPI_NOT_TAKEN_DIRSTATE",
+        "PerPkg": "1",
+        "PublicDescription": "Counts cycles when the ability to send messages direct to the Intel Ultra Path Interconnect (bypassing the CHA) was disabled",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Messages sent direct to the Intel UPI",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x26",
+        "EventName": "UNC_M2M_DIRECT2UPI_TAKEN",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when messages were sent direct to the Intel Ultra Path Interconnect (bypassing the CHA)",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Number of reads in which a message sent direct to Intel UPI was overridden",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x29",
+        "EventName": "UNC_M2M_DIRECT2UPI_TXN_OVERRIDE",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when a read message that was sent direct to the Intel Ultra Path Interconnect (bypassing the CHA) was overridden",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory lookups (any state found)",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2D",
+        "EventName": "UNC_M2M_DIRECTORY_LOOKUP.ANY",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) looks into the multi-socket cacheline Directory state, and found the cacheline marked in Any State (A, I, S or unused)",
+        "UMask": "0x1",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory lookups (cacheline found in A state) ",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2D",
+        "EventName": "UNC_M2M_DIRECTORY_LOOKUP.STATE_A",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) looks into the multi-socket cacheline Directory state, and found the cacheline marked in the A (SnoopAll) state, indicating the cacheline is stored in another socket in any state, and we must snoop the other sockets to make sure we get the latest data.  The data may be stored in any state in the local socket.",
+        "UMask": "0x8",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory lookup (cacheline found in I state) ",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2D",
+        "EventName": "UNC_M2M_DIRECTORY_LOOKUP.STATE_I",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) looks into the multi-socket cacheline Directory state, and found the cacheline marked in the I (Invalid) state indicating the cacheline is not stored in another socket, and so there is no need to snoop the other sockets for the latest data.  The data may be stored in any state in the local socket.",
+        "UMask": "0x2",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory lookup (cacheline found in S state) ",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2D",
+        "EventName": "UNC_M2M_DIRECTORY_LOOKUP.STATE_S",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) looks into the multi-socket cacheline Directory state, and found the cacheline marked in the S (Shared) state indicating the cacheline is stored in another socket in the S (Shared) state, and so there is no need to snoop the other sockets for the latest data.  The data may be stored in any state in the local socket.",
+        "UMask": "0x4",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory update from A to I",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2E",
+        "EventName": "UNC_M2M_DIRECTORY_UPDATE.A2I",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) updates the multi-socket cacheline Directory state from A (SnoopAll) to I (Invalid)",
+        "UMask": "0x20",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory update from A to S",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2E",
+        "EventName": "UNC_M2M_DIRECTORY_UPDATE.A2S",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) updates the multi-socket cacheline Directory state from A (SnoopAll) to S (Shared)",
+        "UMask": "0x40",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory update from/to Any state ",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2E",
+        "EventName": "UNC_M2M_DIRECTORY_UPDATE.ANY",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) updates the multi-socket cacheline Directory to a new state",
+        "UMask": "0x1",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory update from I to A",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2E",
+        "EventName": "UNC_M2M_DIRECTORY_UPDATE.I2A",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) updates the multi-socket cacheline Directory state from I (Invalid) to A (SnoopAll)",
+        "UMask": "0x4",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory update from I to S",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2E",
+        "EventName": "UNC_M2M_DIRECTORY_UPDATE.I2S",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) updates the multi-socket cacheline Directory state from I (Invalid) to S (Shared)",
+        "UMask": "0x2",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory update from S to A",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2E",
+        "EventName": "UNC_M2M_DIRECTORY_UPDATE.S2A",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) updates the multi-socket cacheline Directory state from S (Shared) to A (SnoopAll)",
+        "UMask": "0x10",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Multi-socket cacheline Directory update from S to I",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2E",
+        "EventName": "UNC_M2M_DIRECTORY_UPDATE.S2I",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) updates the multi-socket cacheline Directory state from S (Shared) to I (Invalid)",
+        "UMask": "0x8",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Reads to iMC issued",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x37",
+        "EventName": "UNC_M2M_IMC_READS.ALL",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) issues reads to the iMC (Memory Controller). ",
+        "UMask": "0x4",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Reads to iMC issued at Normal Priority (Non-Isochronous)",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x37",
+        "EventName": "UNC_M2M_IMC_READS.NORMAL",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) issues reads to the iMC (Memory Controller).  It only counts  normal priority non-isochronous reads.",
+        "UMask": "0x1",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Writes to iMC issued",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x38",
+        "EventName": "UNC_M2M_IMC_WRITES.ALL",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) issues writes to the iMC (Memory Controller).",
+        "UMask": "0x10",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Partial Non-Isochronous writes to the iMC",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x38",
+        "EventName": "UNC_M2M_IMC_WRITES.PARTIAL",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) issues partial writes to the iMC (Memory Controller).  It only counts normal priority non-isochronous writes.",
+        "UMask": "0x2",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Prefetch requests that got turned into a demand request",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x56",
+        "EventName": "UNC_M2M_PREFCAM_DEMAND_PROMOTIONS",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) promotes an outstanding request in the prefetch queue due to a subsequent demand read request that entered the M2M with the same address.  Explanatory Side Note: The Prefetch queue is made of CAM (Content Addressable Memory)",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Inserts into the Memory Controller Prefetch Queue",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x57",
+        "EventName": "UNC_M2M_PREFCAM_INSERTS",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the M2M (Mesh to Memory) receives a prefetch request and inserts it into its outstanding prefetch queue.  Explanatory Side Note: the prefetch queue is made from CAM: Content Addressable Memory",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "AD Ingress (from CMS) Queue Inserts",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x1",
+        "EventName": "UNC_M2M_RxC_AD_INSERTS",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when a new entry is Received(RxC) and then added to the AD (Address Ring) Ingress Queue from the CMS (Common Mesh Stop).  This is generally used for reads.",
+        "Unit": "M2M"
+    },
+    {
+        "BriefDescription": "Prefetches generated by the flow control queue of the M3UPI unit.",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x29",
+        "EventName": "UNC_M3UPI_UPI_PREFETCH_SPAWN",
+        "PerPkg": "1",
+        "PublicDescription": "Count cases where flow control queue that sits between the Intel Ultra Path Interconnect (UPI) and the mesh spawns a prefetch to the iMC (Memory Controller)",
+        "Unit": "M3UPI"
+    },
+    {
+        "BriefDescription": "Clocks of the Intel Ultra Path Interconnect (UPI)",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x1",
+        "EventName": "UNC_UPI_CLOCKTICKS",
+        "PerPkg": "1",
+        "PublicDescription": "Counts clockticks of the fixed frequency clock controlling the Intel Ultra Path Interconnect (UPI).  This clock runs at 1/8th the 'GT/s' speed of the UPI link.  For example, a 9.6GT/s link will have a fixed Frequency of 1.2 GHz.",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "Data Response packets that go direct to core",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x12",
+        "EventName": "UNC_UPI_DIRECT_ATTEMPTS.D2C",
+        "PerPkg": "1",
+        "PublicDescription": "Counts Data Response (DRS) packets that attempted to go direct to core bypassing the CHA.",
+        "UMask": "0x1",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "Data Response packets that go direct to Intel UPI",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x12",
+        "EventName": "UNC_UPI_DIRECT_ATTEMPTS.D2U",
+        "PerPkg": "1",
+        "PublicDescription": "Counts Data Response (DRS) packets that attempted to go direct to Intel Ultra Path Interconnect (UPI) bypassing the CHA.",
+        "UMask": "0x2",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "Cycles Intel UPI is in L1 power mode (shutdown)",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x21",
+        "EventName": "UNC_UPI_L1_POWER_CYCLES",
+        "PerPkg": "1",
+        "PublicDescription": "Counts cycles when the Intel Ultra Path Interconnect (UPI) is in L1 power mode.  L1 is a mode that totally shuts down the UPI link.  Link power states are per link and per direction, so for example the Tx direction could be in one state while Rx was in another, this event only counts when both links are shutdown.",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "Cycles the Rx of the Intel UPI is in L0p power mode",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x25",
+        "EventName": "UNC_UPI_RxL0P_POWER_CYCLES",
+        "PerPkg": "1",
+        "PublicDescription": "Counts cycles when the receive side (Rx) of the Intel Ultra Path Interconnect(UPI) is in L0p power mode. L0p is a mode where we disable 60% of the UPI lanes, decreasing our bandwidth in order to save power.",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "FLITs received which bypassed the Slot0 Receive Buffer",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x31",
+        "EventName": "UNC_UPI_RxL_BYPASSED.SLOT0",
+        "PerPkg": "1",
+        "PublicDescription": "Counts incoming FLITs (FLow control unITs) which bypassed the slot0 RxQ buffer (Receive Queue) and passed directly to the Egress.  This is a latency optimization, and should generally be the common case.  If this value is less than the number of FLITs transferred, it implies that there was queueing getting onto the ring, and thus the transactions saw higher latency.",
+        "UMask": "0x1",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "FLITs received which bypassed the Slot1 Receive Buffer",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x31",
+        "EventName": "UNC_UPI_RxL_BYPASSED.SLOT1",
+        "PerPkg": "1",
+        "PublicDescription": "Counts incoming FLITs (FLow control unITs) which bypassed the slot1 RxQ buffer  (Receive Queue) and passed directly across the BGF and into the Egress.  This is a latency optimization, and should generally be the common case.  If this value is less than the number of FLITs transferred, it implies that there was queueing getting onto the ring, and thus the transactions saw higher latency.",
+        "UMask": "0x2",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "FLITs received which bypassed the Slot2 Receive Buffer",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x31",
+        "EventName": "UNC_UPI_RxL_BYPASSED.SLOT2",
+        "PerPkg": "1",
+        "PublicDescription": "Counts incoming FLITs (FLow control unITs) which bypassed the slot2 RxQ buffer (Receive Queue)  and passed directly to the Egress.  This is a latency optimization, and should generally be the common case.  If this value is less than the number of FLITs transferred, it implies that there was queueing getting onto the ring, and thus the transactions saw higher latency.",
+        "UMask": "0x4",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "Valid data FLITs received from any slot",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x3",
+        "EventName": "UNC_UPI_RxL_FLITS.ALL_DATA",
+        "PerPkg": "1",
+        "PublicDescription": "Counts valid data FLITs  (80 bit FLow control unITs: 64bits of data) received from any of the 3 Intel Ultra Path Interconnect (UPI) Receive Queue slots on this UPI unit.",
+        "UMask": "0x0F",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "Null FLITs received from any slot",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x3",
+        "EventName": "UNC_UPI_RxL_FLITS.ALL_NULL",
+        "PerPkg": "1",
+        "PublicDescription": "Counts null FLITs (80 bit FLow control unITs) received from any of the 3 Intel Ultra Path Interconnect (UPI) Receive Queue slots on this UPI unit.",
+        "UMask": "0x27",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "Protocol header and credit FLITs received from any slot",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x3",
+        "EventName": "UNC_UPI_RxL_FLITS.NON_DATA",
+        "PerPkg": "1",
+        "PublicDescription": "Counts protocol header and credit FLITs  (80 bit FLow control unITs) received from any of the 3 UPI slots on this UPI unit.",
+        "UMask": "0x97",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "Cycles in which the Tx of the Intel Ultra Path Interconnect (UPI) is in L0p power mode",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x27",
+        "EventName": "UNC_UPI_TxL0P_POWER_CYCLES",
+        "PerPkg": "1",
+        "PublicDescription": "Counts cycles when the transmit side (Tx) of the Intel Ultra Path Interconnect(UPI) is in L0p power mode. L0p is a mode where we disable 60% of the UPI lanes, decreasing our bandwidth in order to save power.",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "FLITs that bypassed the TxL Buffer",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x41",
+        "EventName": "UNC_UPI_TxL_BYPASSED",
+        "PerPkg": "1",
+        "PublicDescription": "Counts incoming FLITs (FLow control unITs) which bypassed the TxL(transmit) FLIT buffer and pass directly out the UPI Link. Generally, when data is transmitted across the Intel Ultra Path Interconnect (UPI), it will bypass the TxQ and pass directly to the link.  However, the TxQ will be used in L0p (Low Power) mode and (Link Layer Retry) LLR  mode, increasing latency to transfer out to the link.",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "UPI interconnect send bandwidth for payload. Derived from unc_upi_txl_flits.all_data",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2",
+        "EventName": "UPI_DATA_BANDWIDTH_TX",
+        "PerPkg": "1",
+        "ScaleUnit": "7.11E-06Bytes",
+        "UMask": "0x0F",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "Null FLITs transmitted from any slot",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2",
+        "EventName": "UNC_UPI_TxL_FLITS.ALL_NULL",
+        "PerPkg": "1",
+        "PublicDescription": "Counts null FLITs (80 bit FLow control unITs) transmitted via any of the 3 Intel Ultra Path Interconnect (UPI) slots on this UPI unit.",
+        "UMask": "0x27",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "Idle FLITs transmitted",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2",
+        "EventName": "UNC_UPI_TxL_FLITS.IDLE",
+        "PerPkg": "1",
+        "PublicDescription": "Counts when the Intel Ultra Path Interconnect(UPI) transmits an idle FLIT(80 bit FLow control unITs).  Every UPI cycle must be sending either data FLITs, protocol/credit FLITs or idle FLITs.",
+        "UMask": "0x47",
+        "Unit": "UPI LL"
+    },
+    {
+        "BriefDescription": "Protocol header and credit FLITs transmitted across any slot",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x2",
+        "EventName": "UNC_UPI_TxL_FLITS.NON_DATA",
+        "PerPkg": "1",
+        "PublicDescription": "Counts protocol header and credit FLITs (80 bit FLow control unITs) transmitted across any of the 3 UPI (Ultra Path Interconnect) slots on this UPI unit.",
+        "UMask": "0x97",
+        "Unit": "UPI LL"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/virtual-memory.json b/tools/perf/pmu-events/arch/x86/skylakex/virtual-memory.json
new file mode 100644
index 000000000000..70750dab7ead
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/skylakex/virtual-memory.json
@@ -0,0 +1,284 @@
+[
+    {
+        "EventCode": "0x08",
+        "UMask": "0x1",
+        "BriefDescription": "Load misses in all DTLB levels that cause page walks",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK",
+        "PublicDescription": "Counts demand data loads that caused a page walk of any page size (4K/2M/4M/1G). This implies it missed in all TLB levels, but the walk need not have completed.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x08",
+        "UMask": "0x2",
+        "BriefDescription": "Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes (4K).",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts demand data loads that caused a completed page walk (4K page size). This implies it missed in all TLB levels. The page walk can end with or without a fault.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x08",
+        "UMask": "0x4",
+        "BriefDescription": "Demand load Miss in all translation lookaside buffer (TLB) levels causes a page walk that completes (2M/4M).",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M",
+        "PublicDescription": "Counts demand data loads that caused a completed page walk (2M and 4M page sizes). This implies it missed in all TLB levels. The page walk can end with or without a fault.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x08",
+        "UMask": "0x8",
+        "BriefDescription": "Load miss in all TLB levels causes a page walk that completes. (1G)",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_1G",
+        "PublicDescription": "Counts load misses in all DTLB levels that cause a completed page walk (1G page size). The page walk can end with or without a fault.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x08",
+        "UMask": "0xe",
+        "BriefDescription": "Load miss in all TLB levels causes a page walk that completes. (All page sizes)",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED",
+        "PublicDescription": "Counts demand data loads that caused a completed page walk of any page size (4K/2M/4M/1G). This implies it missed in all TLB levels. The page walk can end with or without a fault.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x08",
+        "UMask": "0x10",
+        "BriefDescription": "Counts 1 per cycle for each PMH that is busy with a page walk for a load. EPT page walk durations are excluded in Skylake.",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_LOAD_MISSES.WALK_PENDING",
+        "PublicDescription": "Counts 1 per cycle for each PMH that is busy with a page walk for a load. EPT page walk durations are excluded in Skylake microarchitecture.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x08",
+        "UMask": "0x10",
+        "BriefDescription": "Cycles when at least one PMH is busy with a page walk for a load. EPT page walk durations are excluded in Skylake.",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_LOAD_MISSES.WALK_ACTIVE",
+        "CounterMask": "1",
+        "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a load.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x08",
+        "UMask": "0x20",
+        "BriefDescription": "Loads that miss the DTLB and hit the STLB.",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_LOAD_MISSES.STLB_HIT",
+        "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB).",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x49",
+        "UMask": "0x1",
+        "BriefDescription": "Store misses in all DTLB levels that cause page walks",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_STORE_MISSES.MISS_CAUSES_A_WALK",
+        "PublicDescription": "Counts demand data stores that caused a page walk of any page size (4K/2M/4M/1G). This implies it missed in all TLB levels, but the walk need not have completed.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x49",
+        "UMask": "0x2",
+        "BriefDescription": "Store miss in all TLB levels causes a page walk that completes. (4K)",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts demand data stores that caused a completed page walk (4K page size). This implies it missed in all TLB levels. The page walk can end with or without a fault.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x49",
+        "UMask": "0x4",
+        "BriefDescription": "Store misses in all DTLB levels that cause completed page walks (2M/4M)",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M",
+        "PublicDescription": "Counts demand data stores that caused a completed page walk (2M and 4M page sizes). This implies it missed in all TLB levels. The page walk can end with or without a fault.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x49",
+        "UMask": "0x8",
+        "BriefDescription": "Store misses in all DTLB levels that cause completed page walks (1G)",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_1G",
+        "PublicDescription": "Counts store misses in all DTLB levels that cause a completed page walk (1G page size). The page walk can end with or without a fault.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x49",
+        "UMask": "0xe",
+        "BriefDescription": "Store miss in all TLB levels causes a page walk that completes. (All page sizes)",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED",
+        "PublicDescription": "Counts demand data stores that caused a completed page walk of any page size (4K/2M/4M/1G). This implies it missed in all TLB levels. The page walk can end with or without a fault.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x49",
+        "UMask": "0x10",
+        "BriefDescription": "Counts 1 per cycle for each PMH that is busy with a page walk for a store. EPT page walk durations are excluded in Skylake.",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_STORE_MISSES.WALK_PENDING",
+        "PublicDescription": "Counts 1 per cycle for each PMH that is busy with a page walk for a store. EPT page walk durations are excluded in Skylake microarchitecture.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x49",
+        "UMask": "0x10",
+        "BriefDescription": "Cycles when at least one PMH is busy with a page walk for a store. EPT page walk durations are excluded in Skylake.",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_STORE_MISSES.WALK_ACTIVE",
+        "CounterMask": "1",
+        "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x49",
+        "UMask": "0x20",
+        "BriefDescription": "Stores that miss the DTLB and hit the STLB.",
+        "Counter": "0,1,2,3",
+        "EventName": "DTLB_STORE_MISSES.STLB_HIT",
+        "PublicDescription": "Stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB).",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x4F",
+        "UMask": "0x10",
+        "BriefDescription": "Counts 1 per cycle for each PMH that is busy with an EPT (Extended Page Table) walk for any request type.",
+        "Counter": "0,1,2,3",
+        "EventName": "EPT.WALK_PENDING",
+        "PublicDescription": "Counts cycles for each PMH (Page Miss Handler) that is busy with an EPT (Extended Page Table) walk for any request type.",
+        "SampleAfterValue": "2000003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x85",
+        "UMask": "0x1",
+        "BriefDescription": "Misses at all ITLB levels that cause page walks",
+        "Counter": "0,1,2,3",
+        "EventName": "ITLB_MISSES.MISS_CAUSES_A_WALK",
+        "PublicDescription": "Counts page walks of any page size (4K/2M/4M/1G) caused by a code fetch. This implies it missed in the ITLB and further levels of TLB, but the walk need not have completed.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x85",
+        "UMask": "0x2",
+        "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (4K)",
+        "Counter": "0,1,2,3",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts completed page walks (4K page size) caused by a code fetch. This implies it missed in the ITLB and further levels of TLB. The page walk can end with or without a fault.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x85",
+        "UMask": "0x4",
+        "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (2M/4M)",
+        "Counter": "0,1,2,3",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M",
+        "PublicDescription": "Counts completed page walks (2M and 4M page sizes) caused by a code fetch. This implies it missed in the ITLB and further levels of TLB. The page walk can end with or without a fault.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x85",
+        "UMask": "0x8",
+        "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (1G)",
+        "Counter": "0,1,2,3",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED_1G",
+        "PublicDescription": "Counts completed page walks (1G page size) caused by a code fetch. This implies it missed in the ITLB and further levels of TLB. The page walk can end with or without a fault.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x85",
+        "UMask": "0xe",
+        "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (All page sizes)",
+        "Counter": "0,1,2,3",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED",
+        "PublicDescription": "Counts completed page walks of any page size (4K/2M/4M/1G) caused by a code fetch. This implies it missed in the ITLB and further levels of TLB. The page walk can end with or without a fault.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x85",
+        "UMask": "0x10",
+        "BriefDescription": "Counts 1 per cycle for each PMH that is busy with a page walk for an instruction fetch request. EPT page walk durations are excluded in Skylake.",
+        "Counter": "0,1,2,3",
+        "EventName": "ITLB_MISSES.WALK_PENDING",
+        "PublicDescription": "Counts 1 per cycle for each PMH (Page Miss Handler) that is busy with a page walk for an instruction fetch request. EPT page walk durations are excluded in Skylake microarchitecture.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x85",
+        "UMask": "0x10",
+        "BriefDescription": "Cycles when at least one PMH is busy with a page walk for a code (instruction fetch) request. EPT page walk durations are excluded in Skylake.",
+        "Counter": "0,1,2,3",
+        "EventName": "ITLB_MISSES.WALK_ACTIVE",
+        "CounterMask": "1",
+        "PublicDescription": "Cycles when at least one PMH is busy with a page walk for a code (instruction fetch) request. EPT page walk durations are excluded in Skylake microarchitecture.",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0x85",
+        "UMask": "0x20",
+        "BriefDescription": "Instruction fetch requests that miss the ITLB and hit the STLB.",
+        "Counter": "0,1,2,3",
+        "EventName": "ITLB_MISSES.STLB_HIT",
+        "SampleAfterValue": "100003",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xAE",
+        "UMask": "0x1",
+        "BriefDescription": "Flushing of the Instruction TLB (ITLB) pages, includes 4k/2M/4M pages.",
+        "Counter": "0,1,2,3",
+        "EventName": "ITLB.ITLB_FLUSH",
+        "PublicDescription": "Counts the number of flushes of the big or small ITLB pages. Counting includes both TLB Flush (covering all sets) and TLB Set Clear (set-specific).",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xBD",
+        "UMask": "0x1",
+        "BriefDescription": "DTLB flush attempts of the thread-specific entries",
+        "Counter": "0,1,2,3",
+        "EventName": "TLB_FLUSH.DTLB_THREAD",
+        "PublicDescription": "Counts the number of DTLB flush attempts of the thread-specific entries.",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    },
+    {
+        "EventCode": "0xBD",
+        "UMask": "0x20",
+        "BriefDescription": "STLB flush attempts",
+        "Counter": "0,1,2,3",
+        "EventName": "TLB_FLUSH.STLB_ANY",
+        "PublicDescription": "Counts the number of any STLB flush attempts (such as entire, VPID, PCID, InvPage, CR3 write, etc.).",
+        "SampleAfterValue": "100007",
+        "CounterHTOff": "0,1,2,3,4,5,6,7"
+    }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index bd0aabb2bd0f..d51dc9ca8861 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -137,6 +137,8 @@ static struct field {
 	{ "AnyThread",	"any=" },
 	{ "EdgeDetect",	"edge=" },
 	{ "SampleAfterValue", "period=" },
+	{ "FCMask",	"fc_mask=" },
+	{ "PortMask",	"ch_mask=" },
 	{ NULL, NULL }
 };
 
@@ -822,10 +824,6 @@ static int process_one_file(const char *fpath, const struct stat *sb,
  * PMU event tables (see struct pmu_events_map).
  *
  * Write out the PMU events tables and the mapping table to pmu-event.c.
- *
- * If unable to process the JSON or arch files, create an empty mapping
- * table so we can continue to build/use  perf even if we cannot use the
- * PMU event aliases.
  */
 int main(int argc, char *argv[])
 {
@@ -836,6 +834,7 @@ int main(int argc, char *argv[])
 	const char *arch;
 	const char *output_file;
 	const char *start_dirname;
+	struct stat stbuf;
 
 	prog = basename(argv[0]);
 	if (argc < 4) {
@@ -857,11 +856,17 @@ int main(int argc, char *argv[])
 		return 2;
 	}
 
+	sprintf(ldirname, "%s/%s", start_dirname, arch);
+
+	/* If architecture does not have any event lists, bail out */
+	if (stat(ldirname, &stbuf) < 0) {
+		pr_info("%s: Arch %s has no PMU event lists\n", prog, arch);
+		goto empty_map;
+	}
+
 	/* Include pmu-events.h first */
 	fprintf(eventsfp, "#include \"../../pmu-events/pmu-events.h\"\n");
 
-	sprintf(ldirname, "%s/%s", start_dirname, arch);
-
 	/*
 	 * The mapfile allows multiple CPUids to point to the same JSON file,
 	 * so, not sure if there is a need for symlinks within the pmu-events
@@ -878,6 +883,9 @@ int main(int argc, char *argv[])
 	if (rc && verbose) {
 		pr_info("%s: Error walking file tree %s\n", prog, ldirname);
 		goto empty_map;
+	} else if (rc < 0) {
+		/* Make build fail */
+		return 1;
 	} else if (rc) {
 		goto empty_map;
 	}
@@ -892,7 +900,8 @@ int main(int argc, char *argv[])
 
 	if (process_mapfile(eventsfp, mapfile)) {
 		pr_info("%s: Error processing mapfile %s\n", prog, mapfile);
-		goto empty_map;
+		/* Make build fail */
+		return 1;
 	}
 
 	return 0;
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
index 1d95009592eb..f6c84966e4f8 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
+++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
@@ -57,6 +57,7 @@ try:
 		'ia64'	: audit.MACH_IA64,
 		'ppc'	: audit.MACH_PPC,
 		'ppc64'	: audit.MACH_PPC64,
+		'ppc64le' : audit.MACH_PPC64LE,
 		's390'	: audit.MACH_S390,
 		's390x'	: audit.MACH_S390X,
 		'i386'	: audit.MACH_X86,
diff --git a/tools/perf/scripts/python/bin/export-to-sqlite-record b/tools/perf/scripts/python/bin/export-to-sqlite-record
new file mode 100644
index 000000000000..070204fd6d00
--- /dev/null
+++ b/tools/perf/scripts/python/bin/export-to-sqlite-record
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+#
+# export perf data to a sqlite3 database. Can cover
+# perf ip samples (excluding the tracepoints). No special
+# record requirements, just record what you want to export.
+#
+perf record $@
diff --git a/tools/perf/scripts/python/bin/export-to-sqlite-report b/tools/perf/scripts/python/bin/export-to-sqlite-report
new file mode 100644
index 000000000000..5ff6033e70ba
--- /dev/null
+++ b/tools/perf/scripts/python/bin/export-to-sqlite-report
@@ -0,0 +1,29 @@
+#!/bin/bash
+# description: export perf data to a sqlite3 database
+# args: [database name] [columns] [calls]
+n_args=0
+for i in "$@"
+do
+    if expr match "$i" "-" > /dev/null ; then
+	break
+    fi
+    n_args=$(( $n_args + 1 ))
+done
+if [ "$n_args" -gt 3 ] ; then
+    echo "usage: export-to-sqlite-report [database name] [columns] [calls]"
+    exit
+fi
+if [ "$n_args" -gt 2 ] ; then
+    dbname=$1
+    columns=$2
+    calls=$3
+    shift 3
+elif [ "$n_args" -gt 1 ] ; then
+    dbname=$1
+    columns=$2
+    shift 2
+elif [ "$n_args" -gt 0 ] ; then
+    dbname=$1
+    shift
+fi
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/export-to-sqlite.py $dbname $columns $calls
diff --git a/tools/perf/scripts/python/call-graph-from-postgresql.py b/tools/perf/scripts/python/call-graph-from-sql.py
index e78fdc2a5a9d..b494a67a1c67 100644
--- a/tools/perf/scripts/python/call-graph-from-postgresql.py
+++ b/tools/perf/scripts/python/call-graph-from-sql.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python2
-# call-graph-from-postgresql.py: create call-graph from postgresql database
-# Copyright (c) 2014, Intel Corporation.
+# call-graph-from-sql.py: create call-graph from sql database
+# Copyright (c) 2014-2017, Intel Corporation.
 #
 # This program is free software; you can redistribute it and/or modify it
 # under the terms and conditions of the GNU General Public License,
@@ -11,18 +11,19 @@
 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 # more details.
 
-# To use this script you will need to have exported data using the
-# export-to-postgresql.py script.  Refer to that script for details.
+# To use this script you will need to have exported data using either the
+# export-to-sqlite.py or the export-to-postgresql.py script.  Refer to those
+# scripts for details.
 #
-# Following on from the example in the export-to-postgresql.py script, a
+# Following on from the example in the export scripts, a
 # call-graph can be displayed for the pt_example database like this:
 #
-#	python tools/perf/scripts/python/call-graph-from-postgresql.py pt_example
+#	python tools/perf/scripts/python/call-graph-from-sql.py pt_example
 #
-# Note this script supports connecting to remote databases by setting hostname,
-# port, username, password, and dbname e.g.
+# Note that for PostgreSQL, this script supports connecting to remote databases
+# by setting hostname, port, username, password, and dbname e.g.
 #
-#	python tools/perf/scripts/python/call-graph-from-postgresql.py "hostname=myhost username=myuser password=mypassword dbname=pt_example"
+#	python tools/perf/scripts/python/call-graph-from-sql.py "hostname=myhost username=myuser password=mypassword dbname=pt_example"
 #
 # The result is a GUI window with a tree representing a context-sensitive
 # call-graph.  Expanding a couple of levels of the tree and adjusting column
@@ -160,7 +161,7 @@ class TreeItem():
 				  '( SELECT short_name FROM dsos WHERE id = ( SELECT dso_id FROM symbols WHERE id = ( SELECT symbol_id FROM call_paths WHERE id = call_path_id ) ) ), '
 				  '( SELECT ip FROM call_paths where id = call_path_id ) '
 				  'FROM calls WHERE parent_call_path_id = ' + str(self.call_path_id) + ' AND comm_id = ' + str(self.comm_id) + ' AND thread_id = ' + str(self.thread_id) +
-				  'ORDER BY call_path_id')
+				  ' ORDER BY call_path_id')
 		if not ret:
 			raise Exception("Query failed: " + query.lastError().text())
 		last_call_path_id = 0
@@ -291,29 +292,40 @@ class MainWindow(QMainWindow):
 
 if __name__ == '__main__':
 	if (len(sys.argv) < 2):
-		print >> sys.stderr, "Usage is: call-graph-from-postgresql.py <database name>"
+		print >> sys.stderr, "Usage is: call-graph-from-sql.py <database name>"
 		raise Exception("Too few arguments")
 
 	dbname = sys.argv[1]
 
-	db = QSqlDatabase.addDatabase('QPSQL')
-
-	opts = dbname.split()
-	for opt in opts:
-		if '=' in opt:
-			opt = opt.split('=')
-			if opt[0] == 'hostname':
-				db.setHostName(opt[1])
-			elif opt[0] == 'port':
-				db.setPort(int(opt[1]))
-			elif opt[0] == 'username':
-				db.setUserName(opt[1])
-			elif opt[0] == 'password':
-				db.setPassword(opt[1])
-			elif opt[0] == 'dbname':
-				dbname = opt[1]
-		else:
-			dbname = opt
+	is_sqlite3 = False
+	try:
+		f = open(dbname)
+		if f.read(15) == "SQLite format 3":
+			is_sqlite3 = True
+		f.close()
+	except:
+		pass
+
+	if is_sqlite3:
+		db = QSqlDatabase.addDatabase('QSQLITE')
+	else:
+		db = QSqlDatabase.addDatabase('QPSQL')
+		opts = dbname.split()
+		for opt in opts:
+			if '=' in opt:
+				opt = opt.split('=')
+				if opt[0] == 'hostname':
+					db.setHostName(opt[1])
+				elif opt[0] == 'port':
+					db.setPort(int(opt[1]))
+				elif opt[0] == 'username':
+					db.setUserName(opt[1])
+				elif opt[0] == 'password':
+					db.setPassword(opt[1])
+				elif opt[0] == 'dbname':
+					dbname = opt[1]
+			else:
+				dbname = opt
 
 	db.setDatabaseName(dbname)
 	if not db.open():
diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py
index 7656ff8aa066..efcaf6cac2eb 100644
--- a/tools/perf/scripts/python/export-to-postgresql.py
+++ b/tools/perf/scripts/python/export-to-postgresql.py
@@ -59,7 +59,7 @@ import datetime
 #	pt_example=# \q
 #
 # An example of using the database is provided by the script
-# call-graph-from-postgresql.py.  Refer to that script for details.
+# call-graph-from-sql.py.  Refer to that script for details.
 #
 # Tables:
 #
@@ -340,7 +340,8 @@ if branches:
 		'to_sym_offset	bigint,'
 		'to_ip		bigint,'
 		'branch_type	integer,'
-		'in_tx		boolean)')
+		'in_tx		boolean,'
+		'call_path_id	bigint)')
 else:
 	do_query(query, 'CREATE TABLE samples ('
 		'id		bigint		NOT NULL,'
diff --git a/tools/perf/scripts/python/export-to-sqlite.py b/tools/perf/scripts/python/export-to-sqlite.py
new file mode 100644
index 000000000000..f827bf77e9d2
--- /dev/null
+++ b/tools/perf/scripts/python/export-to-sqlite.py
@@ -0,0 +1,451 @@
+# export-to-sqlite.py: export perf data to a sqlite3 database
+# Copyright (c) 2017, Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms and conditions of the GNU General Public License,
+# version 2, as published by the Free Software Foundation.
+#
+# This program is distributed in the hope it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+# more details.
+
+import os
+import sys
+import struct
+import datetime
+
+# To use this script you will need to have installed package python-pyside which
+# provides LGPL-licensed Python bindings for Qt.  You will also need the package
+# libqt4-sql-sqlite for Qt sqlite3 support.
+#
+# An example of using this script with Intel PT:
+#
+#	$ perf record -e intel_pt//u ls
+#	$ perf script -s ~/libexec/perf-core/scripts/python/export-to-sqlite.py pt_example branches calls
+#	2017-07-31 14:26:07.326913 Creating database...
+#	2017-07-31 14:26:07.538097 Writing records...
+#	2017-07-31 14:26:09.889292 Adding indexes
+#	2017-07-31 14:26:09.958746 Done
+#
+# To browse the database, sqlite3 can be used e.g.
+#
+#	$ sqlite3 pt_example
+#	sqlite> .header on
+#	sqlite> select * from samples_view where id < 10;
+#	sqlite> .mode column
+#	sqlite> select * from samples_view where id < 10;
+#	sqlite> .tables
+#	sqlite> .schema samples_view
+#	sqlite> .quit
+#
+# An example of using the database is provided by the script
+# call-graph-from-sql.py.  Refer to that script for details.
+#
+# The database structure is practically the same as created by the script
+# export-to-postgresql.py. Refer to that script for details.  A notable
+# difference is the 'transaction' column of the 'samples' table which is
+# renamed 'transaction_' in sqlite because 'transaction' is a reserved word.
+
+from PySide.QtSql import *
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+# These perf imports are not used at present
+#from perf_trace_context import *
+#from Core import *
+
+perf_db_export_mode = True
+perf_db_export_calls = False
+perf_db_export_callchains = False
+
+def usage():
+	print >> sys.stderr, "Usage is: export-to-sqlite.py <database name> [<columns>] [<calls>] [<callchains>]"
+	print >> sys.stderr, "where:	columns		'all' or 'branches'"
+	print >> sys.stderr, "		calls		'calls' => create calls and call_paths table"
+	print >> sys.stderr, "		callchains	'callchains' => create call_paths table"
+	raise Exception("Too few arguments")
+
+if (len(sys.argv) < 2):
+	usage()
+
+dbname = sys.argv[1]
+
+if (len(sys.argv) >= 3):
+	columns = sys.argv[2]
+else:
+	columns = "all"
+
+if columns not in ("all", "branches"):
+	usage()
+
+branches = (columns == "branches")
+
+for i in range(3,len(sys.argv)):
+	if (sys.argv[i] == "calls"):
+		perf_db_export_calls = True
+	elif (sys.argv[i] == "callchains"):
+		perf_db_export_callchains = True
+	else:
+		usage()
+
+def do_query(q, s):
+	if (q.exec_(s)):
+		return
+	raise Exception("Query failed: " + q.lastError().text())
+
+def do_query_(q):
+	if (q.exec_()):
+		return
+	raise Exception("Query failed: " + q.lastError().text())
+
+print datetime.datetime.today(), "Creating database..."
+
+db_exists = False
+try:
+	f = open(dbname)
+	f.close()
+	db_exists = True
+except:
+	pass
+
+if db_exists:
+	raise Exception(dbname + " already exists")
+
+db = QSqlDatabase.addDatabase('QSQLITE')
+db.setDatabaseName(dbname)
+db.open()
+
+query = QSqlQuery(db)
+
+do_query(query, 'PRAGMA journal_mode = OFF')
+do_query(query, 'BEGIN TRANSACTION')
+
+do_query(query, 'CREATE TABLE selected_events ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'name		varchar(80))')
+do_query(query, 'CREATE TABLE machines ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'pid		integer,'
+		'root_dir 	varchar(4096))')
+do_query(query, 'CREATE TABLE threads ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'machine_id	bigint,'
+		'process_id	bigint,'
+		'pid		integer,'
+		'tid		integer)')
+do_query(query, 'CREATE TABLE comms ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'comm		varchar(16))')
+do_query(query, 'CREATE TABLE comm_threads ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'comm_id	bigint,'
+		'thread_id	bigint)')
+do_query(query, 'CREATE TABLE dsos ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'machine_id	bigint,'
+		'short_name	varchar(256),'
+		'long_name	varchar(4096),'
+		'build_id	varchar(64))')
+do_query(query, 'CREATE TABLE symbols ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'dso_id		bigint,'
+		'sym_start	bigint,'
+		'sym_end	bigint,'
+		'binding	integer,'
+		'name		varchar(2048))')
+do_query(query, 'CREATE TABLE branch_types ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'name		varchar(80))')
+
+if branches:
+	do_query(query, 'CREATE TABLE samples ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'evsel_id	bigint,'
+		'machine_id	bigint,'
+		'thread_id	bigint,'
+		'comm_id	bigint,'
+		'dso_id		bigint,'
+		'symbol_id	bigint,'
+		'sym_offset	bigint,'
+		'ip		bigint,'
+		'time		bigint,'
+		'cpu		integer,'
+		'to_dso_id	bigint,'
+		'to_symbol_id	bigint,'
+		'to_sym_offset	bigint,'
+		'to_ip		bigint,'
+		'branch_type	integer,'
+		'in_tx		boolean,'
+		'call_path_id	bigint)')
+else:
+	do_query(query, 'CREATE TABLE samples ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'evsel_id	bigint,'
+		'machine_id	bigint,'
+		'thread_id	bigint,'
+		'comm_id	bigint,'
+		'dso_id		bigint,'
+		'symbol_id	bigint,'
+		'sym_offset	bigint,'
+		'ip		bigint,'
+		'time		bigint,'
+		'cpu		integer,'
+		'to_dso_id	bigint,'
+		'to_symbol_id	bigint,'
+		'to_sym_offset	bigint,'
+		'to_ip		bigint,'
+		'period		bigint,'
+		'weight		bigint,'
+		'transaction_	bigint,'
+		'data_src	bigint,'
+		'branch_type	integer,'
+		'in_tx		boolean,'
+		'call_path_id	bigint)')
+
+if perf_db_export_calls or perf_db_export_callchains:
+	do_query(query, 'CREATE TABLE call_paths ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'parent_id	bigint,'
+		'symbol_id	bigint,'
+		'ip		bigint)')
+if perf_db_export_calls:
+	do_query(query, 'CREATE TABLE calls ('
+		'id		integer		NOT NULL	PRIMARY KEY,'
+		'thread_id	bigint,'
+		'comm_id	bigint,'
+		'call_path_id	bigint,'
+		'call_time	bigint,'
+		'return_time	bigint,'
+		'branch_count	bigint,'
+		'call_id	bigint,'
+		'return_id	bigint,'
+		'parent_call_path_id	bigint,'
+		'flags		integer)')
+
+# printf was added to sqlite in version 3.8.3
+sqlite_has_printf = False
+try:
+	do_query(query, 'SELECT printf("") FROM machines')
+	sqlite_has_printf = True
+except:
+	pass
+
+def emit_to_hex(x):
+	if sqlite_has_printf:
+		return 'printf("%x", ' + x + ')'
+	else:
+		return x
+
+do_query(query, 'CREATE VIEW machines_view AS '
+	'SELECT '
+		'id,'
+		'pid,'
+		'root_dir,'
+		'CASE WHEN id=0 THEN \'unknown\' WHEN pid=-1 THEN \'host\' ELSE \'guest\' END AS host_or_guest'
+	' FROM machines')
+
+do_query(query, 'CREATE VIEW dsos_view AS '
+	'SELECT '
+		'id,'
+		'machine_id,'
+		'(SELECT host_or_guest FROM machines_view WHERE id = machine_id) AS host_or_guest,'
+		'short_name,'
+		'long_name,'
+		'build_id'
+	' FROM dsos')
+
+do_query(query, 'CREATE VIEW symbols_view AS '
+	'SELECT '
+		'id,'
+		'name,'
+		'(SELECT short_name FROM dsos WHERE id=dso_id) AS dso,'
+		'dso_id,'
+		'sym_start,'
+		'sym_end,'
+		'CASE WHEN binding=0 THEN \'local\' WHEN binding=1 THEN \'global\' ELSE \'weak\' END AS binding'
+	' FROM symbols')
+
+do_query(query, 'CREATE VIEW threads_view AS '
+	'SELECT '
+		'id,'
+		'machine_id,'
+		'(SELECT host_or_guest FROM machines_view WHERE id = machine_id) AS host_or_guest,'
+		'process_id,'
+		'pid,'
+		'tid'
+	' FROM threads')
+
+do_query(query, 'CREATE VIEW comm_threads_view AS '
+	'SELECT '
+		'comm_id,'
+		'(SELECT comm FROM comms WHERE id = comm_id) AS command,'
+		'thread_id,'
+		'(SELECT pid FROM threads WHERE id = thread_id) AS pid,'
+		'(SELECT tid FROM threads WHERE id = thread_id) AS tid'
+	' FROM comm_threads')
+
+if perf_db_export_calls or perf_db_export_callchains:
+	do_query(query, 'CREATE VIEW call_paths_view AS '
+		'SELECT '
+			'c.id,'
+			+ emit_to_hex('c.ip') + ' AS ip,'
+			'c.symbol_id,'
+			'(SELECT name FROM symbols WHERE id = c.symbol_id) AS symbol,'
+			'(SELECT dso_id FROM symbols WHERE id = c.symbol_id) AS dso_id,'
+			'(SELECT dso FROM symbols_view  WHERE id = c.symbol_id) AS dso_short_name,'
+			'c.parent_id,'
+			+ emit_to_hex('p.ip') + ' AS parent_ip,'
+			'p.symbol_id AS parent_symbol_id,'
+			'(SELECT name FROM symbols WHERE id = p.symbol_id) AS parent_symbol,'
+			'(SELECT dso_id FROM symbols WHERE id = p.symbol_id) AS parent_dso_id,'
+			'(SELECT dso FROM symbols_view  WHERE id = p.symbol_id) AS parent_dso_short_name'
+		' FROM call_paths c INNER JOIN call_paths p ON p.id = c.parent_id')
+if perf_db_export_calls:
+	do_query(query, 'CREATE VIEW calls_view AS '
+		'SELECT '
+			'calls.id,'
+			'thread_id,'
+			'(SELECT pid FROM threads WHERE id = thread_id) AS pid,'
+			'(SELECT tid FROM threads WHERE id = thread_id) AS tid,'
+			'(SELECT comm FROM comms WHERE id = comm_id) AS command,'
+			'call_path_id,'
+			+ emit_to_hex('ip') + ' AS ip,'
+			'symbol_id,'
+			'(SELECT name FROM symbols WHERE id = symbol_id) AS symbol,'
+			'call_time,'
+			'return_time,'
+			'return_time - call_time AS elapsed_time,'
+			'branch_count,'
+			'call_id,'
+			'return_id,'
+			'CASE WHEN flags=1 THEN \'no call\' WHEN flags=2 THEN \'no return\' WHEN flags=3 THEN \'no call/return\' ELSE \'\' END AS flags,'
+			'parent_call_path_id'
+		' FROM calls INNER JOIN call_paths ON call_paths.id = call_path_id')
+
+do_query(query, 'CREATE VIEW samples_view AS '
+	'SELECT '
+		'id,'
+		'time,'
+		'cpu,'
+		'(SELECT pid FROM threads WHERE id = thread_id) AS pid,'
+		'(SELECT tid FROM threads WHERE id = thread_id) AS tid,'
+		'(SELECT comm FROM comms WHERE id = comm_id) AS command,'
+		'(SELECT name FROM selected_events WHERE id = evsel_id) AS event,'
+		+ emit_to_hex('ip') + ' AS ip_hex,'
+		'(SELECT name FROM symbols WHERE id = symbol_id) AS symbol,'
+		'sym_offset,'
+		'(SELECT short_name FROM dsos WHERE id = dso_id) AS dso_short_name,'
+		+ emit_to_hex('to_ip') + ' AS to_ip_hex,'
+		'(SELECT name FROM symbols WHERE id = to_symbol_id) AS to_symbol,'
+		'to_sym_offset,'
+		'(SELECT short_name FROM dsos WHERE id = to_dso_id) AS to_dso_short_name,'
+		'(SELECT name FROM branch_types WHERE id = branch_type) AS branch_type_name,'
+		'in_tx'
+	' FROM samples')
+
+do_query(query, 'END TRANSACTION')
+
+evsel_query = QSqlQuery(db)
+evsel_query.prepare("INSERT INTO selected_events VALUES (?, ?)")
+machine_query = QSqlQuery(db)
+machine_query.prepare("INSERT INTO machines VALUES (?, ?, ?)")
+thread_query = QSqlQuery(db)
+thread_query.prepare("INSERT INTO threads VALUES (?, ?, ?, ?, ?)")
+comm_query = QSqlQuery(db)
+comm_query.prepare("INSERT INTO comms VALUES (?, ?)")
+comm_thread_query = QSqlQuery(db)
+comm_thread_query.prepare("INSERT INTO comm_threads VALUES (?, ?, ?)")
+dso_query = QSqlQuery(db)
+dso_query.prepare("INSERT INTO dsos VALUES (?, ?, ?, ?, ?)")
+symbol_query = QSqlQuery(db)
+symbol_query.prepare("INSERT INTO symbols VALUES (?, ?, ?, ?, ?, ?)")
+branch_type_query = QSqlQuery(db)
+branch_type_query.prepare("INSERT INTO branch_types VALUES (?, ?)")
+sample_query = QSqlQuery(db)
+if branches:
+	sample_query.prepare("INSERT INTO samples VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
+else:
+	sample_query.prepare("INSERT INTO samples VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
+if perf_db_export_calls or perf_db_export_callchains:
+	call_path_query = QSqlQuery(db)
+	call_path_query.prepare("INSERT INTO call_paths VALUES (?, ?, ?, ?)")
+if perf_db_export_calls:
+	call_query = QSqlQuery(db)
+	call_query.prepare("INSERT INTO calls VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
+
+def trace_begin():
+	print datetime.datetime.today(), "Writing records..."
+	do_query(query, 'BEGIN TRANSACTION')
+	# id == 0 means unknown.  It is easier to create records for them than replace the zeroes with NULLs
+	evsel_table(0, "unknown")
+	machine_table(0, 0, "unknown")
+	thread_table(0, 0, 0, -1, -1)
+	comm_table(0, "unknown")
+	dso_table(0, 0, "unknown", "unknown", "")
+	symbol_table(0, 0, 0, 0, 0, "unknown")
+	sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+	if perf_db_export_calls or perf_db_export_callchains:
+		call_path_table(0, 0, 0, 0)
+
+unhandled_count = 0
+
+def trace_end():
+	do_query(query, 'END TRANSACTION')
+
+	print datetime.datetime.today(), "Adding indexes"
+	if perf_db_export_calls:
+		do_query(query, 'CREATE INDEX pcpid_idx ON calls (parent_call_path_id)')
+
+	if (unhandled_count):
+		print datetime.datetime.today(), "Warning: ", unhandled_count, " unhandled events"
+	print datetime.datetime.today(), "Done"
+
+def trace_unhandled(event_name, context, event_fields_dict):
+	global unhandled_count
+	unhandled_count += 1
+
+def sched__sched_switch(*x):
+	pass
+
+def bind_exec(q, n, x):	# bind the first n items of x (as strings), then call do_query_(q)
+	for value in x[:n]:
+		q.addBindValue(str(value))
+	do_query_(q)
+
+def evsel_table(*x):
+	bind_exec(evsel_query, 2, x)
+
+def machine_table(*x):
+	bind_exec(machine_query, 3, x)
+
+def thread_table(*x):
+	bind_exec(thread_query, 5, x)
+
+def comm_table(*x):
+	bind_exec(comm_query, 2, x)
+
+def comm_thread_table(*x):
+	bind_exec(comm_thread_query, 3, x)
+
+def dso_table(*x):
+	bind_exec(dso_query, 5, x)
+
+def symbol_table(*x):
+	bind_exec(symbol_query, 6, x)
+
+def branch_type_table(*x):
+	bind_exec(branch_type_query, 2, x)
+
+def sample_table(*x):
+	# Bind 18 values when 'branches' is set and 22 otherwise, matching the
+	# number of placeholders in the INSERT prepared on sample_query.
+	nr_values = 18 if branches else 22
+	bind_exec(sample_query, nr_values, x)
+
+def call_path_table(*x):
+	bind_exec(call_path_query, 4, x)
+
+def call_return_table(*x):
+	bind_exec(call_query, 11, x)
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index 84222bdb8689..87bf3edb037c 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -34,6 +34,7 @@ perf-y += thread-map.o
 perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o llvm-src-relocation.o
 perf-y += bpf.o
 perf-y += topology.o
+perf-y += mem.o
 perf-y += cpumap.o
 perf-y += stat.o
 perf-y += event_update.o
diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c
index 0e77b2cf61ec..c9aafed7da15 100644
--- a/tools/perf/tests/attr.c
+++ b/tools/perf/tests/attr.c
@@ -36,6 +36,7 @@
 #define ENV "PERF_TEST_ATTR"
 
 static char *dir;
+static bool ready;
 
 void test_attr__init(void)
 {
@@ -67,6 +68,9 @@ static int store_event(struct perf_event_attr *attr, pid_t pid, int cpu,
 	FILE *file;
 	char path[PATH_MAX];
 
+	if (!ready)
+		return 0;
+
 	snprintf(path, PATH_MAX, "%s/event-%d-%llu-%d", dir,
 		 attr->type, attr->config, fd);
 
@@ -136,7 +140,7 @@ void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu,
 {
 	int errno_saved = errno;
 
-	if (store_event(attr, pid, cpu, fd, group_fd, flags)) {
+	if ((fd != -1) && store_event(attr, pid, cpu, fd, group_fd, flags)) {
 		pr_err("test attr FAILED");
 		exit(128);
 	}
@@ -144,6 +148,12 @@ void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu,
 	errno = errno_saved;
 }
 
+void test_attr__ready(void)
+{
+	if (unlikely(test_attr__enabled) && !ready)
+		ready = true;	/* store_event() returns 0 without storing until this is set */
+}
+
 static int run_dir(const char *d, const char *perf)
 {
 	char v[] = "-vvvvv";
@@ -159,7 +169,7 @@ static int run_dir(const char *d, const char *perf)
 	return system(cmd);
 }
 
-int test__attr(int subtest __maybe_unused)
+int test__attr(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct stat st;
 	char path_perf[PATH_MAX];
diff --git a/tools/perf/tests/attr.py b/tools/perf/tests/attr.py
index cdf21a9d0c35..6bb50e82a3e3 100644
--- a/tools/perf/tests/attr.py
+++ b/tools/perf/tests/attr.py
@@ -9,6 +9,20 @@ import logging
 import shutil
 import ConfigParser
 
+def data_equal(a, b):
+    """Tell whether two '|'-separated lists of alternatives share a value.
+
+    A '*' alternative on either side matches anything.
+    """
+    lhs_alternatives = a.split('|')
+    rhs_alternatives = b.split('|')
+    for lhs in lhs_alternatives:
+        for rhs in rhs_alternatives:
+            if lhs == rhs or '*' in (lhs, rhs):
+                return True
+
+    return False
+
 class Fail(Exception):
     def __init__(self, test, msg):
         self.msg = msg
@@ -82,34 +96,25 @@ class Event(dict):
         self.add(base)
         self.add(data)
 
-    def compare_data(self, a, b):
-        # Allow multiple values in assignment separated by '|'
-        a_list = a.split('|')
-        b_list = b.split('|')
-
-        for a_item in a_list:
-            for b_item in b_list:
-                if (a_item == b_item):
-                    return True
-                elif (a_item == '*') or (b_item == '*'):
-                    return True
-
-        return False
-
     def equal(self, other):
         for t in Event.terms:
             log.debug("      [%s] %s %s" % (t, self[t], other[t]));
             if not self.has_key(t) or not other.has_key(t):
                 return False
-            if not self.compare_data(self[t], other[t]):
+            if not data_equal(self[t], other[t]):
                 return False
         return True
 
+    def optional(self):
+        # An event counts as optional only when it carries optional=1.
+        flagged = self.has_key('optional') and self['optional'] == '1'
+        return flagged
+
     def diff(self, other):
         for t in Event.terms:
             if not self.has_key(t) or not other.has_key(t):
                 continue
-            if not self.compare_data(self[t], other[t]):
+            if not data_equal(self[t], other[t]):
 		log.warning("expected %s=%s, got %s" % (t, self[t], other[t]))
 
 # Test file description needs to have following sections:
@@ -218,9 +223,9 @@ class Test(object):
               self.perf, self.command, tempdir, self.args)
         ret = os.WEXITSTATUS(os.system(cmd))
 
-        log.info("  '%s' ret %d " % (cmd, ret))
+        log.info("  '%s' ret '%s', expected '%s'" % (cmd, str(ret), str(self.ret)))
 
-        if ret != int(self.ret):
+        if not data_equal(str(ret), str(self.ret)):
             raise Unsup(self)
 
     def compare(self, expect, result):
@@ -244,9 +249,12 @@ class Test(object):
             log.debug("    match: [%s] matches %s" % (exp_name, str(exp_list)))
 
             # we did not any matching event - fail
-            if (not exp_list):
-		exp_event.diff(res_event)
-                raise Fail(self, 'match failure');
+            if not exp_list:
+                if exp_event.optional():
+                    log.debug("    %s does not match, but is optional" % exp_name)
+                else:
+                    exp_event.diff(res_event)
+                    raise Fail(self, 'match failure');
 
             match[exp_name] = exp_list
 
diff --git a/tools/perf/tests/attr/base-record b/tools/perf/tests/attr/base-record
index 7e6d74946e04..31e0b1da830b 100644
--- a/tools/perf/tests/attr/base-record
+++ b/tools/perf/tests/attr/base-record
@@ -7,7 +7,7 @@ cpu=*
 type=0|1
 size=112
 config=0
-sample_period=4000
+sample_period=*
 sample_type=263
 read_format=0
 disabled=1
@@ -15,7 +15,7 @@ inherit=1
 pinned=0
 exclusive=0
 exclude_user=0
-exclude_kernel=0
+exclude_kernel=0|1
 exclude_hv=0
 exclude_idle=0
 mmap=1
@@ -25,7 +25,7 @@ inherit_stat=0
 enable_on_exec=1
 task=0
 watermark=0
-precise_ip=0
+precise_ip=0|1|2|3
 mmap_data=0
 sample_id_all=1
 exclude_host=0|1
diff --git a/tools/perf/tests/attr/base-stat b/tools/perf/tests/attr/base-stat
index f4cf148f14cb..4d0c2e42b64e 100644
--- a/tools/perf/tests/attr/base-stat
+++ b/tools/perf/tests/attr/base-stat
@@ -8,14 +8,14 @@ type=0
 size=112
 config=0
 sample_period=0
-sample_type=0
+sample_type=65536
 read_format=3
 disabled=1
 inherit=1
 pinned=0
 exclusive=0
 exclude_user=0
-exclude_kernel=0
+exclude_kernel=0|1
 exclude_hv=0
 exclude_idle=0
 mmap=0
diff --git a/tools/perf/tests/attr/test-record-C0 b/tools/perf/tests/attr/test-record-C0
index d6a7e43f61b3..cb0a3138fa54 100644
--- a/tools/perf/tests/attr/test-record-C0
+++ b/tools/perf/tests/attr/test-record-C0
@@ -1,6 +1,7 @@
 [config]
 command = record
 args    = -C 0 kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
 cpu=0
diff --git a/tools/perf/tests/attr/test-record-basic b/tools/perf/tests/attr/test-record-basic
index 55c0428370ca..85a23cf35ba1 100644
--- a/tools/perf/tests/attr/test-record-basic
+++ b/tools/perf/tests/attr/test-record-basic
@@ -1,5 +1,6 @@
 [config]
 command = record
 args    = kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
diff --git a/tools/perf/tests/attr/test-record-branch-any b/tools/perf/tests/attr/test-record-branch-any
index 1421960ed4e9..81f839e2fad0 100644
--- a/tools/perf/tests/attr/test-record-branch-any
+++ b/tools/perf/tests/attr/test-record-branch-any
@@ -1,8 +1,8 @@
 [config]
 command = record
 args    = -b kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_period=4000
 sample_type=2311
 branch_sample_type=8
diff --git a/tools/perf/tests/attr/test-record-branch-filter-any b/tools/perf/tests/attr/test-record-branch-filter-any
index 915c4df0e0c2..357421f4dfce 100644
--- a/tools/perf/tests/attr/test-record-branch-filter-any
+++ b/tools/perf/tests/attr/test-record-branch-filter-any
@@ -1,8 +1,8 @@
 [config]
 command = record
 args    = -j any kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_period=4000
 sample_type=2311
 branch_sample_type=8
diff --git a/tools/perf/tests/attr/test-record-branch-filter-any_call b/tools/perf/tests/attr/test-record-branch-filter-any_call
index 8708dbd4f373..dbc55f2ab845 100644
--- a/tools/perf/tests/attr/test-record-branch-filter-any_call
+++ b/tools/perf/tests/attr/test-record-branch-filter-any_call
@@ -1,8 +1,8 @@
 [config]
 command = record
 args    = -j any_call kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_period=4000
 sample_type=2311
 branch_sample_type=16
diff --git a/tools/perf/tests/attr/test-record-branch-filter-any_ret b/tools/perf/tests/attr/test-record-branch-filter-any_ret
index 0d3607a6dcbe..a0824ff8e131 100644
--- a/tools/perf/tests/attr/test-record-branch-filter-any_ret
+++ b/tools/perf/tests/attr/test-record-branch-filter-any_ret
@@ -1,8 +1,8 @@
 [config]
 command = record
 args    = -j any_ret kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_period=4000
 sample_type=2311
 branch_sample_type=32
diff --git a/tools/perf/tests/attr/test-record-branch-filter-hv b/tools/perf/tests/attr/test-record-branch-filter-hv
index f25526740cec..f34d6f120181 100644
--- a/tools/perf/tests/attr/test-record-branch-filter-hv
+++ b/tools/perf/tests/attr/test-record-branch-filter-hv
@@ -1,8 +1,8 @@
 [config]
 command = record
 args    = -j hv kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_period=4000
 sample_type=2311
 branch_sample_type=8
diff --git a/tools/perf/tests/attr/test-record-branch-filter-ind_call b/tools/perf/tests/attr/test-record-branch-filter-ind_call
index e862dd179128..b86a35232248 100644
--- a/tools/perf/tests/attr/test-record-branch-filter-ind_call
+++ b/tools/perf/tests/attr/test-record-branch-filter-ind_call
@@ -1,8 +1,8 @@
 [config]
 command = record
 args    = -j ind_call kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_period=4000
 sample_type=2311
 branch_sample_type=64
diff --git a/tools/perf/tests/attr/test-record-branch-filter-k b/tools/perf/tests/attr/test-record-branch-filter-k
index 182971e898f5..d3fbc5e1858a 100644
--- a/tools/perf/tests/attr/test-record-branch-filter-k
+++ b/tools/perf/tests/attr/test-record-branch-filter-k
@@ -1,8 +1,8 @@
 [config]
 command = record
 args    = -j k kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_period=4000
 sample_type=2311
 branch_sample_type=8
diff --git a/tools/perf/tests/attr/test-record-branch-filter-u b/tools/perf/tests/attr/test-record-branch-filter-u
index 83449ef9e687..a318f0dda173 100644
--- a/tools/perf/tests/attr/test-record-branch-filter-u
+++ b/tools/perf/tests/attr/test-record-branch-filter-u
@@ -1,8 +1,8 @@
 [config]
 command = record
 args    = -j u kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_period=4000
 sample_type=2311
 branch_sample_type=8
diff --git a/tools/perf/tests/attr/test-record-count b/tools/perf/tests/attr/test-record-count
index 2f841de56f6b..34f6cc577263 100644
--- a/tools/perf/tests/attr/test-record-count
+++ b/tools/perf/tests/attr/test-record-count
@@ -1,6 +1,7 @@
 [config]
 command = record
 args    = -c 123 kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
 sample_period=123
diff --git a/tools/perf/tests/attr/test-record-data b/tools/perf/tests/attr/test-record-data
index 716e143b5291..a9cf2233b0ce 100644
--- a/tools/perf/tests/attr/test-record-data
+++ b/tools/perf/tests/attr/test-record-data
@@ -1,10 +1,9 @@
 [config]
 command = record
 args    = -d kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_period=4000
-
 # sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME |
 # PERF_SAMPLE_ADDR | PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC
 sample_type=33039
diff --git a/tools/perf/tests/attr/test-record-freq b/tools/perf/tests/attr/test-record-freq
index 600d0f8f2583..bf4cb459f0d5 100644
--- a/tools/perf/tests/attr/test-record-freq
+++ b/tools/perf/tests/attr/test-record-freq
@@ -1,6 +1,7 @@
 [config]
 command = record
 args    = -F 100 kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
 sample_period=100
diff --git a/tools/perf/tests/attr/test-record-graph-default b/tools/perf/tests/attr/test-record-graph-default
index 853597a9a8f6..0b216e69760c 100644
--- a/tools/perf/tests/attr/test-record-graph-default
+++ b/tools/perf/tests/attr/test-record-graph-default
@@ -1,6 +1,7 @@
 [config]
 command = record
 args    = -g kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
 sample_type=295
diff --git a/tools/perf/tests/attr/test-record-graph-dwarf b/tools/perf/tests/attr/test-record-graph-dwarf
index d6f324ea578c..da2fa73bd0a2 100644
--- a/tools/perf/tests/attr/test-record-graph-dwarf
+++ b/tools/perf/tests/attr/test-record-graph-dwarf
@@ -1,10 +1,12 @@
 [config]
 command = record
 args    = --call-graph dwarf -- kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_type=12583
+sample_type=45359
 exclude_callchain_user=1
 sample_stack_user=8192
 # TODO different for each arch, no support for that now
 sample_regs_user=*
+mmap_data=1
diff --git a/tools/perf/tests/attr/test-record-graph-fp b/tools/perf/tests/attr/test-record-graph-fp
index 055e3bee7993..625d190bb798 100644
--- a/tools/perf/tests/attr/test-record-graph-fp
+++ b/tools/perf/tests/attr/test-record-graph-fp
@@ -1,6 +1,7 @@
 [config]
 command = record
 args    = --call-graph fp kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
 sample_type=295
diff --git a/tools/perf/tests/attr/test-record-group b/tools/perf/tests/attr/test-record-group
index 57739cacdb2a..6e7961f6f7a5 100644
--- a/tools/perf/tests/attr/test-record-group
+++ b/tools/perf/tests/attr/test-record-group
@@ -1,6 +1,7 @@
 [config]
 command = record
 args    = --group -e cycles,instructions kill >/dev/null 2>&1
+ret     = 1
 
 [event-1:base-record]
 fd=1
diff --git a/tools/perf/tests/attr/test-record-group-sampling b/tools/perf/tests/attr/test-record-group-sampling
index 658f5d60c873..ef59afd6d635 100644
--- a/tools/perf/tests/attr/test-record-group-sampling
+++ b/tools/perf/tests/attr/test-record-group-sampling
@@ -1,6 +1,7 @@
 [config]
 command = record
 args    = -e '{cycles,cache-misses}:S' kill >/dev/null 2>&1
+ret     = 1
 
 [event-1:base-record]
 fd=1
diff --git a/tools/perf/tests/attr/test-record-group1 b/tools/perf/tests/attr/test-record-group1
index c5548d054aff..87a222d014d8 100644
--- a/tools/perf/tests/attr/test-record-group1
+++ b/tools/perf/tests/attr/test-record-group1
@@ -1,6 +1,7 @@
 [config]
 command = record
 args    = -e '{cycles,instructions}' kill >/dev/null 2>&1
+ret     = 1
 
 [event-1:base-record]
 fd=1
diff --git a/tools/perf/tests/attr/test-record-no-delay b/tools/perf/tests/attr/test-record-no-buffering
index f253b78cdbf2..aa3956d8fe20 100644
--- a/tools/perf/tests/attr/test-record-no-delay
+++ b/tools/perf/tests/attr/test-record-no-buffering
@@ -1,9 +1,9 @@
 [config]
 command = record
-args    = -D kill >/dev/null 2>&1
+args    = --no-buffering kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_period=4000
 sample_type=263
 watermark=0
 wakeup_events=1
diff --git a/tools/perf/tests/attr/test-record-no-inherit b/tools/perf/tests/attr/test-record-no-inherit
index 44edcb2edcd5..560943decb87 100644
--- a/tools/perf/tests/attr/test-record-no-inherit
+++ b/tools/perf/tests/attr/test-record-no-inherit
@@ -1,6 +1,7 @@
 [config]
 command = record
 args    = -i kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
 sample_type=263
diff --git a/tools/perf/tests/attr/test-record-no-samples b/tools/perf/tests/attr/test-record-no-samples
index d0141b2418b5..8eb73ab639e0 100644
--- a/tools/perf/tests/attr/test-record-no-samples
+++ b/tools/perf/tests/attr/test-record-no-samples
@@ -1,6 +1,7 @@
 [config]
 command = record
 args    = -n kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
 sample_period=0
diff --git a/tools/perf/tests/attr/test-record-period b/tools/perf/tests/attr/test-record-period
index 8abc5314fc52..69bc748f0f27 100644
--- a/tools/perf/tests/attr/test-record-period
+++ b/tools/perf/tests/attr/test-record-period
@@ -1,6 +1,7 @@
 [config]
 command = record
 args    = -c 100 -P kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
 sample_period=100
diff --git a/tools/perf/tests/attr/test-record-raw b/tools/perf/tests/attr/test-record-raw
index 4a8ef25b5f49..a188a614a44c 100644
--- a/tools/perf/tests/attr/test-record-raw
+++ b/tools/perf/tests/attr/test-record-raw
@@ -1,7 +1,7 @@
 [config]
 command = record
 args    = -R kill >/dev/null 2>&1
+ret     = 1
 
 [event:base-record]
-sample_period=4000
 sample_type=1415
diff --git a/tools/perf/tests/attr/test-stat-C0 b/tools/perf/tests/attr/test-stat-C0
index aa835950751f..67717fe6a65d 100644
--- a/tools/perf/tests/attr/test-stat-C0
+++ b/tools/perf/tests/attr/test-stat-C0
@@ -4,6 +4,6 @@ args    = -e cycles -C 0 kill >/dev/null 2>&1
 ret     = 1
 
 [event:base-stat]
-# events are enabled by default when attached to cpu
-disabled=0
+# events are disabled by default when attached to cpu
+disabled=1
 enable_on_exec=0
diff --git a/tools/perf/tests/attr/test-stat-default b/tools/perf/tests/attr/test-stat-default
index 19270f54c96e..e911dbd4eb47 100644
--- a/tools/perf/tests/attr/test-stat-default
+++ b/tools/perf/tests/attr/test-stat-default
@@ -38,12 +38,14 @@ config=0
 fd=6
 type=0
 config=7
+optional=1
 
 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_BACKEND
 [event7:base-stat]
 fd=7
 type=0
 config=8
+optional=1
 
 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_INSTRUCTIONS
 [event8:base-stat]
diff --git a/tools/perf/tests/attr/test-stat-detailed-1 b/tools/perf/tests/attr/test-stat-detailed-1
index 51426b87153b..b39270a08e74 100644
--- a/tools/perf/tests/attr/test-stat-detailed-1
+++ b/tools/perf/tests/attr/test-stat-detailed-1
@@ -39,12 +39,14 @@ config=0
 fd=6
 type=0
 config=7
+optional=1
 
 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_BACKEND
 [event7:base-stat]
 fd=7
 type=0
 config=8
+optional=1
 
 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_INSTRUCTIONS
 [event8:base-stat]
diff --git a/tools/perf/tests/attr/test-stat-detailed-2 b/tools/perf/tests/attr/test-stat-detailed-2
index 8de5acc31c27..45f8e6ea34f8 100644
--- a/tools/perf/tests/attr/test-stat-detailed-2
+++ b/tools/perf/tests/attr/test-stat-detailed-2
@@ -39,12 +39,14 @@ config=0
 fd=6
 type=0
 config=7
+optional=1
 
 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_BACKEND
 [event7:base-stat]
 fd=7
 type=0
 config=8
+optional=1
 
 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_INSTRUCTIONS
 [event8:base-stat]
@@ -108,6 +110,7 @@ config=65538
 fd=15
 type=3
 config=1
+optional=1
 
 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_L1I                <<  0  |
diff --git a/tools/perf/tests/attr/test-stat-detailed-3 b/tools/perf/tests/attr/test-stat-detailed-3
index 0a1f45bf7d79..30ae0fb7a3fd 100644
--- a/tools/perf/tests/attr/test-stat-detailed-3
+++ b/tools/perf/tests/attr/test-stat-detailed-3
@@ -39,12 +39,14 @@ config=0
 fd=6
 type=0
 config=7
+optional=1
 
 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_BACKEND
 [event7:base-stat]
 fd=7
 type=0
 config=8
+optional=1
 
 # PERF_TYPE_HARDWARE / PERF_COUNT_HW_INSTRUCTIONS
 [event8:base-stat]
@@ -108,6 +110,7 @@ config=65538
 fd=15
 type=3
 config=1
+optional=1
 
 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_L1I                <<  0  |
@@ -162,6 +165,7 @@ config=65540
 fd=21
 type=3
 config=512
+optional=1
 
 # PERF_TYPE_HW_CACHE,
 #  PERF_COUNT_HW_CACHE_L1D                <<  0  |
@@ -171,3 +175,4 @@ config=512
 fd=22
 type=3
 config=66048
+optional=1
diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c
index 50f6d7afee58..d233ad336463 100644
--- a/tools/perf/tests/backward-ring-buffer.c
+++ b/tools/perf/tests/backward-ring-buffer.c
@@ -75,7 +75,7 @@ static int do_test(struct perf_evlist *evlist, int mmap_pages,
 }
 
 
-int test__backward_ring_buffer(int subtest __maybe_unused)
+int test__backward_ring_buffer(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int ret = TEST_SKIP, err, sample_count = 0, comm_count = 0;
 	char pid[16], sbuf[STRERR_BUFSIZE];
diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c
index 9abe6c13090f..0d7c06584905 100644
--- a/tools/perf/tests/bitmap.c
+++ b/tools/perf/tests/bitmap.c
@@ -40,7 +40,7 @@ static int test_bitmap(const char *str)
 	return ret;
 }
 
-int test__bitmap_print(int subtest __maybe_unused)
+int test__bitmap_print(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	TEST_ASSERT_VAL("failed to convert map", test_bitmap("1"));
 	TEST_ASSERT_VAL("failed to convert map", test_bitmap("1,5"));
diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c
index 39bbb97cd30a..97937e1bc53a 100644
--- a/tools/perf/tests/bp_signal.c
+++ b/tools/perf/tests/bp_signal.c
@@ -164,7 +164,7 @@ static long long bp_count(int fd)
 	return count;
 }
 
-int test__bp_signal(int subtest __maybe_unused)
+int test__bp_signal(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct sigaction sa;
 	long long count1, count2, count3;
diff --git a/tools/perf/tests/bp_signal_overflow.c b/tools/perf/tests/bp_signal_overflow.c
index 3b1ac6f31b15..61ecd8021f49 100644
--- a/tools/perf/tests/bp_signal_overflow.c
+++ b/tools/perf/tests/bp_signal_overflow.c
@@ -57,7 +57,7 @@ static long long bp_count(int fd)
 #define EXECUTIONS 10000
 #define THRESHOLD  100
 
-int test__bp_signal_overflow(int subtest __maybe_unused)
+int test__bp_signal_overflow(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct perf_event_attr pe;
 	struct sigaction sa;
diff --git a/tools/perf/tests/bpf-script-test-prologue.c b/tools/perf/tests/bpf-script-test-prologue.c
index b4ebc75e25ae..43f1e16486f4 100644
--- a/tools/perf/tests/bpf-script-test-prologue.c
+++ b/tools/perf/tests/bpf-script-test-prologue.c
@@ -26,9 +26,11 @@ static void (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
 	(void *) 6;
 
 SEC("func=null_lseek file->f_mode offset orig")
-int bpf_func__null_lseek(void *ctx, int err, unsigned long f_mode,
+int bpf_func__null_lseek(void *ctx, int err, unsigned long _f_mode,
 			 unsigned long offset, unsigned long orig)
 {
+	fmode_t f_mode = (fmode_t)_f_mode;
+
 	if (err)
 		return 0;
 	if (f_mode & FMODE_WRITE)
diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c
index 5876da126b58..34c22cdf4d5d 100644
--- a/tools/perf/tests/bpf.c
+++ b/tools/perf/tests/bpf.c
@@ -124,16 +124,16 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
 	struct perf_evlist *evlist;
 	int i, ret = TEST_FAIL, err = 0, count = 0;
 
-	struct parse_events_evlist parse_evlist;
+	struct parse_events_state parse_state;
 	struct parse_events_error parse_error;
 
 	bzero(&parse_error, sizeof(parse_error));
-	bzero(&parse_evlist, sizeof(parse_evlist));
-	parse_evlist.error = &parse_error;
-	INIT_LIST_HEAD(&parse_evlist.list);
+	bzero(&parse_state, sizeof(parse_state));
+	parse_state.error = &parse_error;
+	INIT_LIST_HEAD(&parse_state.list);
 
-	err = parse_events_load_bpf_obj(&parse_evlist, &parse_evlist.list, obj, NULL);
-	if (err || list_empty(&parse_evlist.list)) {
+	err = parse_events_load_bpf_obj(&parse_state, &parse_state.list, obj, NULL);
+	if (err || list_empty(&parse_state.list)) {
 		pr_debug("Failed to add events selected by BPF\n");
 		return TEST_FAIL;
 	}
@@ -155,8 +155,8 @@ static int do_test(struct bpf_object *obj, int (*func)(void),
 		goto out_delete_evlist;
 	}
 
-	perf_evlist__splice_list_tail(evlist, &parse_evlist.list);
-	evlist->nr_groups = parse_evlist.nr_groups;
+	perf_evlist__splice_list_tail(evlist, &parse_state.list);
+	evlist->nr_groups = parse_state.nr_groups;
 
 	perf_evlist__config(evlist, &opts, NULL);
 
@@ -321,7 +321,7 @@ static int check_env(void)
 	return 0;
 }
 
-int test__bpf(int i)
+int test__bpf(struct test *test __maybe_unused, int i)
 {
 	int err;
 
@@ -351,7 +351,7 @@ const char *test__bpf_subtest_get_desc(int i __maybe_unused)
 	return NULL;
 }
 
-int test__bpf(int i __maybe_unused)
+int test__bpf(struct test *test __maybe_unused, int i __maybe_unused)
 {
 	pr_debug("Skip BPF test because BPF support is not compiled\n");
 	return TEST_SKIP;
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 3ccfd58a8c3c..377bea009163 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -6,7 +6,10 @@
 #include <errno.h>
 #include <unistd.h>
 #include <string.h>
+#include <sys/types.h>
+#include <dirent.h>
 #include <sys/wait.h>
+#include <sys/stat.h>
 #include "builtin.h"
 #include "hist.h"
 #include "intlist.h"
@@ -14,8 +17,10 @@
 #include "debug.h"
 #include "color.h"
 #include <subcmd/parse-options.h>
+#include "string2.h"
 #include "symbol.h"
 #include <linux/kernel.h>
+#include <subcmd/exec-cmd.h>
 
 static bool dont_fork;
 
@@ -43,6 +48,10 @@ static struct test generic_tests[] = {
 		.func = test__basic_mmap,
 	},
 	{
+		.desc = "Test data source output",
+		.func = test__mem,
+	},
+	{
 		.desc = "Parse event definition strings",
 		.func = test__parse_events,
 	},
@@ -179,7 +188,7 @@ static struct test generic_tests[] = {
 	},
 	{
 		.desc = "Session topology",
-		.func = test_session_topology,
+		.func = test__session_topology,
 	},
 	{
 		.desc = "BPF filter",
@@ -325,7 +334,7 @@ static int run_test(struct test *test, int subtest)
 			}
 		}
 
-		err = test->func(subtest);
+		err = test->func(test, subtest);
 		if (!dont_fork)
 			exit(err);
 	}
@@ -383,12 +392,143 @@ static int test_and_print(struct test *t, bool force_skip, int subtest)
 	return err;
 }
 
+static const char *shell_test__description(char *description, size_t size,
+					   const char *path, const char *name)
+{
+	FILE *fp;
+	char filename[PATH_MAX];
+
+	path__join(filename, sizeof(filename), path, name);
+	fp = fopen(filename, "r");
+	if (!fp)
+		return NULL;
+	/* A shell test is described by the first line of its script; NULL if unreadable. */
+	description = fgets(description, size, fp);
+	fclose(fp);
+	/* "+ 1" steps over that line's first character (the scripts start with "# <description>"). */
+	return description ? trim(description + 1) : NULL;
+}
+
+#define for_each_shell_test(dir, ent)		\
+	while ((ent = readdir(dir)) != NULL)	\
+		if (ent->d_type == DT_REG && ent->d_name[0] != '.')
+
+static const char *shell_tests__dir(char *path, size_t size)
+{
+	const char *devel_dirs[] = { "./tools/perf/tests", "./tests", };
+	char *exec_path;
+	unsigned int i;
+	/* Fill @path with the shell tests directory, trying the development dirs first. */
+	for (i = 0; i < ARRAY_SIZE(devel_dirs); ++i) {
+		struct stat st;
+		if (!lstat(devel_dirs[i], &st)) {
+			scnprintf(path, size, "%s/shell", devel_dirs[i]);
+			if (!lstat(path, &st)) /* the shell/ subdir itself must exist */
+				return path;
+		}
+	}
+
+	/* Then installed path. */
+	exec_path = get_argv_exec_path();
+	scnprintf(path, size, "%s/tests/shell", exec_path);
+	free(exec_path);
+	return path;
+}
+
+static int shell_tests__max_desc_width(void)
+{
+	DIR *dir;
+	struct dirent *ent;
+	char path_dir[PATH_MAX];
+	const char *path = shell_tests__dir(path_dir, sizeof(path_dir));
+	int width = 0;
+	/* Returns the longest shell test description length, or -1 if the dir can't be opened. */
+	if (path == NULL)
+		return -1;
+
+	dir = opendir(path);
+	if (!dir)
+		return -1;
+
+	for_each_shell_test(dir, ent) {
+		char bf[256];
+		const char *desc = shell_test__description(bf, sizeof(bf), path, ent->d_name);
+
+		if (desc) {
+			int len = strlen(desc);
+
+			if (width < len)
+				width = len;
+		}
+	}
+
+	closedir(dir);
+	return width;
+}
+
+struct shell_test {
+	const char *dir;	/* directory the script is joined with before running */
+	const char *file;	/* script file name inside @dir */
+};
+
+static int shell_test__run(struct test *test, int subdir __maybe_unused)
+{
+	int err;
+	char script[PATH_MAX];
+	struct shell_test *st = test->priv;
+
+	path__join(script, sizeof(script), st->dir, st->file);
+
+	err = system(script);
+	if (!err)
+		return TEST_OK;
+	/* A script exit status of 2 is reported as a skip, any other failure as a fail. */
+	return WEXITSTATUS(err) == 2 ? TEST_SKIP : TEST_FAIL;
+}
+
+static int run_shell_tests(int argc, const char *argv[], int i, int width)
+{
+	DIR *dir;
+	struct dirent *ent;
+	char path_dir[PATH_MAX];
+	struct shell_test st = {
+		.dir = shell_tests__dir(path_dir, sizeof(path_dir)),
+	};
+	/* Run each matching script of the shell tests dir; they are numbered from @i + 1. */
+	if (st.dir == NULL)
+		return -1;
+
+	dir = opendir(st.dir);
+	if (!dir)
+		return -1;
+
+	for_each_shell_test(dir, ent) {
+		int curr = i++;
+		char desc[256];
+		struct test test = {
+			.desc = shell_test__description(desc, sizeof(desc), st.dir, ent->d_name),
+			.func = shell_test__run,
+			.priv = &st,
+		};
+		/* No description (unreadable or empty file): nothing to match on or print. */
+		if (test.desc == NULL || !perf_test__matches(&test, curr, argc, argv))
+			continue;
+
+		st.file = ent->d_name;
+		pr_info("%2d: %-*s:", i, width, test.desc);
+		test_and_print(&test, false, -1);
+	}
+
+	closedir(dir);
+	return 0;
+}
+
 static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist)
 {
 	struct test *t;
 	unsigned int j;
 	int i = 0;
-	int width = 0;
+	int width = shell_tests__max_desc_width();
 
 	for_each_test(j, t) {
 		int len = strlen(t->desc);
@@ -455,6 +595,37 @@ static int __cmd_test(int argc, const char *argv[], struct intlist *skiplist)
 		}
 	}
 
+	return run_shell_tests(argc, argv, i, width);
+}
+
+static int perf_test__list_shell(int argc, const char **argv, int i)
+{
+	DIR *dir;
+	struct dirent *ent;
+	char path_dir[PATH_MAX];
+	const char *path = shell_tests__dir(path_dir, sizeof(path_dir));
+
+	if (path == NULL)
+		return -1;
+
+	dir = opendir(path);
+	if (!dir)
+		return -1;
+	/* List the matching shell tests, numbering them from @i + 1. */
+	for_each_shell_test(dir, ent) {
+		int curr = i++;
+		char bf[256];
+		struct test t = {
+			.desc = shell_test__description(bf, sizeof(bf), path, ent->d_name),
+		};
+		/* No description (unreadable or empty file): nothing to match on or print. */
+		if (t.desc == NULL || !perf_test__matches(&t, curr, argc, argv))
+			continue;
+
+		pr_info("%2d: %s\n", i, t.desc);
+	}
+
+	closedir(dir);
 	return 0;
 }
 
@@ -465,12 +636,17 @@ static int perf_test__list(int argc, const char **argv)
 	int i = 0;
 
 	for_each_test(j, t) {
-		if (argc > 1 && !strstr(t->desc, argv[1]))
+		int curr = i++;
+
+		if (!perf_test__matches(t, curr, argc, argv) ||
+		    (t->is_supported && !t->is_supported()))
 			continue;
 
-		pr_info("%2d: %s\n", ++i, t->desc);
+		pr_info("%2d: %s\n", i, t->desc);
 	}
 
+	perf_test__list_shell(argc, argv, i);
+
 	return 0;
 }
 
@@ -498,7 +674,7 @@ int cmd_test(int argc, const char **argv)
 
 	argc = parse_options_subcommand(argc, argv, test_options, test_subcommands, test_usage, 0);
 	if (argc >= 1 && !strcmp(argv[0], "list"))
-		return perf_test__list(argc, argv);
+		return perf_test__list(argc - 1, argv + 1);
 
 	symbol_conf.priv_size = sizeof(int);
 	symbol_conf.sort_by_name = true;
diff --git a/tools/perf/tests/clang.c b/tools/perf/tests/clang.c
index c5bb2203f5a9..c60ec916f0f2 100644
--- a/tools/perf/tests/clang.c
+++ b/tools/perf/tests/clang.c
@@ -33,12 +33,12 @@ const char *test__clang_subtest_get_desc(int i)
 }
 
 #ifndef HAVE_LIBCLANGLLVM_SUPPORT
-int test__clang(int i __maybe_unused)
+int test__clang(struct test *test __maybe_unused, int i __maybe_unused)
 {
 	return TEST_SKIP;
 }
 #else
-int test__clang(int i)
+int test__clang(struct test *test __maybe_unused, int i)
 {
 	if (i < 0 || i >= (int)ARRAY_SIZE(clang_testcase_table))
 		return TEST_FAIL;
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index 94b7c7b02bde..761c5a448c56 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -673,7 +673,7 @@ out_err:
 	return err;
 }
 
-int test__code_reading(int subtest __maybe_unused)
+int test__code_reading(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int ret;
 
diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c
index 4478773cdb97..199702252270 100644
--- a/tools/perf/tests/cpumap.c
+++ b/tools/perf/tests/cpumap.c
@@ -72,7 +72,7 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused,
 }
 
 
-int test__cpu_map_synthesize(int subtest __maybe_unused)
+int test__cpu_map_synthesize(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct cpu_map *cpus;
 
@@ -106,7 +106,7 @@ static int cpu_map_print(const char *str)
 	return !strcmp(buf, str);
 }
 
-int test__cpu_map_print(int subtest __maybe_unused)
+int test__cpu_map_print(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	TEST_ASSERT_VAL("failed to convert map", cpu_map_print("1"));
 	TEST_ASSERT_VAL("failed to convert map", cpu_map_print("1,5"));
diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c
index 8f08df5861cb..30aead42d136 100644
--- a/tools/perf/tests/dso-data.c
+++ b/tools/perf/tests/dso-data.c
@@ -112,7 +112,7 @@ static int dso__data_fd(struct dso *dso, struct machine *machine)
 	return fd;
 }
 
-int test__dso_data(int subtest __maybe_unused)
+int test__dso_data(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct machine machine;
 	struct dso *dso;
@@ -247,7 +247,7 @@ static int set_fd_limit(int n)
 	return setrlimit(RLIMIT_NOFILE, &rlim);
 }
 
-int test__dso_data_cache(int subtest __maybe_unused)
+int test__dso_data_cache(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct machine machine;
 	long nr_end, nr = open_files_cnt();
@@ -307,7 +307,7 @@ int test__dso_data_cache(int subtest __maybe_unused)
 	return 0;
 }
 
-int test__dso_data_reopen(int subtest __maybe_unused)
+int test__dso_data_reopen(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct machine machine;
 	long nr_end, nr = open_files_cnt();
diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c
index 3e56d08f7995..2a7b9b47bbcb 100644
--- a/tools/perf/tests/dwarf-unwind.c
+++ b/tools/perf/tests/dwarf-unwind.c
@@ -154,7 +154,7 @@ static noinline int krava_1(struct thread *thread)
 	return krava_2(thread);
 }
 
-int test__dwarf_unwind(int subtest __maybe_unused)
+int test__dwarf_unwind(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct machine *machine;
 	struct thread *thread;
diff --git a/tools/perf/tests/event-times.c b/tools/perf/tests/event-times.c
index 634f20c631d8..b82b981c3259 100644
--- a/tools/perf/tests/event-times.c
+++ b/tools/perf/tests/event-times.c
@@ -213,7 +213,7 @@ out_err:
  * and checks that enabled and running times
  * match.
  */
-int test__event_times(int subtest __maybe_unused)
+int test__event_times(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err, ret = 0;
 
diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c
index 63ecf21750eb..9484da2ec6b4 100644
--- a/tools/perf/tests/event_update.c
+++ b/tools/perf/tests/event_update.c
@@ -76,7 +76,7 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused,
 	return 0;
 }
 
-int test__event_update(int subtest __maybe_unused)
+int test__event_update(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct perf_evlist *evlist;
 	struct perf_evsel *evsel;
diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c
index d2bea6f780f8..d32759b6e38a 100644
--- a/tools/perf/tests/evsel-roundtrip-name.c
+++ b/tools/perf/tests/evsel-roundtrip-name.c
@@ -97,7 +97,7 @@ out_delete_evlist:
 #define perf_evsel__name_array_test(names) \
 	__perf_evsel__name_array_test(names, ARRAY_SIZE(names))
 
-int test__perf_evsel__roundtrip_name_test(int subtest __maybe_unused)
+int test__perf_evsel__roundtrip_name_test(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = 0, ret = 0;
 
diff --git a/tools/perf/tests/evsel-tp-sched.c b/tools/perf/tests/evsel-tp-sched.c
index 1984b3bbfe15..5fc906d26c5c 100644
--- a/tools/perf/tests/evsel-tp-sched.c
+++ b/tools/perf/tests/evsel-tp-sched.c
@@ -32,7 +32,7 @@ static int perf_evsel__test_field(struct perf_evsel *evsel, const char *name,
 	return ret;
 }
 
-int test__perf_evsel__tp_sched_test(int subtest __maybe_unused)
+int test__perf_evsel__tp_sched_test(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct perf_evsel *evsel = perf_evsel__newtp("sched", "sched_switch");
 	int ret = 0;
diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c
index 6c6a3749aaf6..cb251bf523e7 100644
--- a/tools/perf/tests/expr.c
+++ b/tools/perf/tests/expr.c
@@ -13,7 +13,7 @@ static int test(struct parse_ctx *ctx, const char *e, double val2)
 	return 0;
 }
 
-int test__expr(int subtest __maybe_unused)
+int test__expr(struct test *t __maybe_unused, int subtest __maybe_unused)
 {
 	const char *p;
 	const char **other;
@@ -31,6 +31,11 @@ int test__expr(int subtest __maybe_unused)
 	ret |= test(&ctx, "(BAR/2)%2", 1);
 	ret |= test(&ctx, "1 - -4",  5);
 	ret |= test(&ctx, "(FOO-1)*2 + (BAR/2)%2 - -4",  5);
+	ret |= test(&ctx, "1-1 | 1", 1);
+	ret |= test(&ctx, "1-1 & 1", 0);
+	ret |= test(&ctx, "min(1,2) + 1", 2);
+	ret |= test(&ctx, "max(1,2) + 1", 3);
+	ret |= test(&ctx, "1+1 if 3*4 else 0", 2);
 
 	if (ret)
 		return ret;
diff --git a/tools/perf/tests/fdarray.c b/tools/perf/tests/fdarray.c
index bc5982f42dc3..7d3a9e2ff897 100644
--- a/tools/perf/tests/fdarray.c
+++ b/tools/perf/tests/fdarray.c
@@ -26,7 +26,7 @@ static int fdarray__fprintf_prefix(struct fdarray *fda, const char *prefix, FILE
 	return printed + fdarray__fprintf(fda, fp);
 }
 
-int test__fdarray__filter(int subtest __maybe_unused)
+int test__fdarray__filter(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int nr_fds, expected_fd[2], fd, err = TEST_FAIL;
 	struct fdarray *fda = fdarray__new(5, 5);
@@ -104,7 +104,7 @@ out:
 	return err;
 }
 
-int test__fdarray__add(int subtest __maybe_unused)
+int test__fdarray__add(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = TEST_FAIL;
 	struct fdarray *fda = fdarray__new(2, 2);
diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c
index d549a9f2c41b..8d19c0200cb7 100644
--- a/tools/perf/tests/hists_cumulate.c
+++ b/tools/perf/tests/hists_cumulate.c
@@ -687,7 +687,7 @@ out:
 	return err;
 }
 
-int test__hists_cumulate(int subtest __maybe_unused)
+int test__hists_cumulate(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = TEST_FAIL;
 	struct machines machines;
diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c
index df9c91f49af1..755ca551b810 100644
--- a/tools/perf/tests/hists_filter.c
+++ b/tools/perf/tests/hists_filter.c
@@ -101,7 +101,7 @@ out:
 	return TEST_FAIL;
 }
 
-int test__hists_filter(int subtest __maybe_unused)
+int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = TEST_FAIL;
 	struct machines machines;
diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c
index a26cbb79e988..073c9c2856bc 100644
--- a/tools/perf/tests/hists_link.c
+++ b/tools/perf/tests/hists_link.c
@@ -264,7 +264,7 @@ static int validate_link(struct hists *leader, struct hists *other)
 	return __validate_link(leader, 0) || __validate_link(other, 1);
 }
 
-int test__hists_link(int subtest __maybe_unused)
+int test__hists_link(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = -1;
 	struct hists *hists, *first_hists;
diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c
index 06e5080182d3..282d62eaebe2 100644
--- a/tools/perf/tests/hists_output.c
+++ b/tools/perf/tests/hists_output.c
@@ -573,7 +573,7 @@ out:
 	return err;
 }
 
-int test__hists_output(int subtest __maybe_unused)
+int test__hists_output(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = TEST_FAIL;
 	struct machines machines;
diff --git a/tools/perf/tests/is_printable_array.c b/tools/perf/tests/is_printable_array.c
index a5192f6a20d7..38f765767587 100644
--- a/tools/perf/tests/is_printable_array.c
+++ b/tools/perf/tests/is_printable_array.c
@@ -4,7 +4,7 @@
 #include "debug.h"
 #include "print_binary.h"
 
-int test__is_printable_array(int subtest __maybe_unused)
+int test__is_printable_array(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	char buf1[] = { 'k', 'r', 4, 'v', 'a', 0 };
 	char buf2[] = { 'k', 'r', 'a', 'v', 4, 0 };
diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c
index 614e45a3c603..739428603b71 100644
--- a/tools/perf/tests/keep-tracking.c
+++ b/tools/perf/tests/keep-tracking.c
@@ -49,7 +49,7 @@ static int find_comm(struct perf_evlist *evlist, const char *comm)
  * when an event is disabled but a dummy software event is not disabled.  If the
  * test passes %0 is returned, otherwise %-1 is returned.
  */
-int test__keep_tracking(int subtest __maybe_unused)
+int test__keep_tracking(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct record_opts opts = {
 		.mmap_pages	     = UINT_MAX,
diff --git a/tools/perf/tests/kmod-path.c b/tools/perf/tests/kmod-path.c
index 6cd9e5107f77..8b9d4ba06c0e 100644
--- a/tools/perf/tests/kmod-path.c
+++ b/tools/perf/tests/kmod-path.c
@@ -50,7 +50,7 @@ static int test_is_kernel_module(const char *path, int cpumode, bool expect)
 #define M(path, c, e) \
 	TEST_ASSERT_VAL("failed", !test_is_kernel_module(path, c, e))
 
-int test__kmod_path__parse(int subtest __maybe_unused)
+int test__kmod_path__parse(struct test *t __maybe_unused, int subtest __maybe_unused)
 {
 	/* path                alloc_name  alloc_ext   kmod  comp   name     ext */
 	T("/xxxx/xxxx/x-x.ko", true      , true      , true, false, "[x_x]", NULL);
diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c
index 482b5365e68d..5187b50dbafe 100644
--- a/tools/perf/tests/llvm.c
+++ b/tools/perf/tests/llvm.c
@@ -132,7 +132,7 @@ out:
 	return ret;
 }
 
-int test__llvm(int subtest)
+int test__llvm(struct test *test __maybe_unused, int subtest)
 {
 	int ret;
 	void *obj_buf = NULL;
diff --git a/tools/perf/tests/mem.c b/tools/perf/tests/mem.c
new file mode 100644
index 000000000000..21952e1e6e6d
--- /dev/null
+++ b/tools/perf/tests/mem.c
@@ -0,0 +1,56 @@
+#include "util/mem-events.h"
+#include "util/symbol.h"
+#include "linux/perf_event.h"
+#include "util/debug.h"
+#include "tests.h"
+#include <string.h>
+
+static int check(union perf_mem_data_src data_src,
+		  const char *string)
+{
+	char out[100];
+	char failure[100];
+	struct mem_info mi = { .data_src = data_src };
+	/* @out gets the snoop text immediately followed by the level text, then is compared to @string. */
+	int n;
+
+	n = perf_mem__snp_scnprintf(out, sizeof out, &mi);
+	n += perf_mem__lvl_scnprintf(out + n, sizeof out - n, &mi);
+	snprintf(failure, sizeof failure, "unexpected %s", out);
+	TEST_ASSERT_VAL(failure, !strcmp(string, out));
+	return 0;
+}
+
+int test__mem(struct test *test __maybe_unused, int subtest __maybe_unused)
+{
+	int ret = 0;
+	union perf_mem_data_src src;
+
+	memset(&src, 0, sizeof(src));
+	/* Results are OR-ed into @ret so every combination below is still exercised. */
+	src.mem_lvl = PERF_MEM_LVL_HIT;
+	src.mem_lvl_num = 4;
+
+	ret |= check(src, "N/AL4 hit");
+
+	src.mem_remote = 1;
+
+	ret |= check(src, "N/ARemote L4 hit");
+
+	src.mem_lvl = PERF_MEM_LVL_MISS;
+	src.mem_lvl_num = PERF_MEM_LVLNUM_PMEM;
+	src.mem_remote = 0;
+
+	ret |= check(src, "N/APMEM miss");
+
+	src.mem_remote = 1;
+
+	ret |= check(src, "N/ARemote PMEM miss");
+
+	src.mem_snoopx = PERF_MEM_SNOOPX_FWD;
+	src.mem_lvl_num = PERF_MEM_LVLNUM_RAM;
+
+	ret |= check(src, "FwdRemote RAM miss");
+
+	return ret;
+}
diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index 15c770856aac..bc8a70ee46d8 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -22,7 +22,7 @@
  * Then it checks if the number of syscalls reported as perf events by
  * the kernel corresponds to the number of syscalls made.
  */
-int test__basic_mmap(int subtest __maybe_unused)
+int test__basic_mmap(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = -1;
 	union perf_event *event;
diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c
index 6ea4d8a5d26b..f94a4196e7c9 100644
--- a/tools/perf/tests/mmap-thread-lookup.c
+++ b/tools/perf/tests/mmap-thread-lookup.c
@@ -221,7 +221,7 @@ static int mmap_events(synth_cb synth)
  *
  * by using all thread objects.
  */
-int test__mmap_thread_lookup(int subtest __maybe_unused)
+int test__mmap_thread_lookup(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	/* perf_event__synthesize_threads synthesize */
 	TEST_ASSERT_VAL("failed with sythesizing all",
diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c
index 1a74dd9fd067..9cf1c35f2ad0 100644
--- a/tools/perf/tests/openat-syscall-all-cpus.c
+++ b/tools/perf/tests/openat-syscall-all-cpus.c
@@ -16,7 +16,7 @@
 #include "debug.h"
 #include "stat.h"
 
-int test__openat_syscall_event_on_all_cpus(int subtest __maybe_unused)
+int test__openat_syscall_event_on_all_cpus(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = -1, fd, cpu;
 	struct cpu_map *cpus;
@@ -115,7 +115,7 @@ int test__openat_syscall_event_on_all_cpus(int subtest __maybe_unused)
 
 	perf_evsel__free_counts(evsel);
 out_close_fd:
-	perf_evsel__close_fd(evsel, 1, threads->nr);
+	perf_evsel__close_fd(evsel);
 out_evsel_delete:
 	perf_evsel__delete(evsel);
 out_thread_map_delete:
diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c
index 9788fac91095..b6ee1c41f45d 100644
--- a/tools/perf/tests/openat-syscall-tp-fields.c
+++ b/tools/perf/tests/openat-syscall-tp-fields.c
@@ -14,7 +14,7 @@
 #define AT_FDCWD       -100
 #endif
 
-int test__syscall_openat_tp_fields(int subtest __maybe_unused)
+int test__syscall_openat_tp_fields(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct record_opts opts = {
 		.target = {
diff --git a/tools/perf/tests/openat-syscall.c b/tools/perf/tests/openat-syscall.c
index e44506e21ee7..9dc5c5d37553 100644
--- a/tools/perf/tests/openat-syscall.c
+++ b/tools/perf/tests/openat-syscall.c
@@ -10,7 +10,7 @@
 #include "debug.h"
 #include "tests.h"
 
-int test__openat_syscall_event(int subtest __maybe_unused)
+int test__openat_syscall_event(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = -1, fd;
 	struct perf_evsel *evsel;
@@ -56,7 +56,7 @@ int test__openat_syscall_event(int subtest __maybe_unused)
 
 	err = 0;
 out_close_fd:
-	perf_evsel__close_fd(evsel, 1, threads->nr);
+	perf_evsel__close_fd(evsel);
 out_evsel_delete:
 	perf_evsel__delete(evsel);
 out_thread_map_delete:
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 812a053d1941..0f0b025faa4b 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -1810,7 +1810,7 @@ static int test_pmu_events(void)
 	return ret;
 }
 
-int test__parse_events(int subtest __maybe_unused)
+int test__parse_events(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int ret1, ret2 = 0;
 
diff --git a/tools/perf/tests/parse-no-sample-id-all.c b/tools/perf/tests/parse-no-sample-id-all.c
index c6207db09f12..91867dcc39f0 100644
--- a/tools/perf/tests/parse-no-sample-id-all.c
+++ b/tools/perf/tests/parse-no-sample-id-all.c
@@ -68,7 +68,7 @@ struct test_attr_event {
  *
  * Return: %0 on success, %-1 if the test fails.
  */
-int test__parse_no_sample_id_all(int subtest __maybe_unused)
+int test__parse_no_sample_id_all(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err;
 
diff --git a/tools/perf/tests/perf-hooks.c b/tools/perf/tests/perf-hooks.c
index 665ecc19671c..bf2517d6de70 100644
--- a/tools/perf/tests/perf-hooks.c
+++ b/tools/perf/tests/perf-hooks.c
@@ -27,7 +27,7 @@ static void the_hook(void *_hook_flags)
 	*p = 0;
 }
 
-int test__perf_hooks(int subtest __maybe_unused)
+int test__perf_hooks(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int hook_flags = 0;
 
diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c
index d37cd9588cc0..19b650064b70 100644
--- a/tools/perf/tests/perf-record.c
+++ b/tools/perf/tests/perf-record.c
@@ -37,7 +37,7 @@ realloc:
 	return cpu;
 }
 
-int test__PERF_RECORD(int subtest __maybe_unused)
+int test__PERF_RECORD(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct record_opts opts = {
 		.target = {
diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c
index a6d7aef30030..9f7f589f9c54 100644
--- a/tools/perf/tests/pmu.c
+++ b/tools/perf/tests/pmu.c
@@ -135,7 +135,7 @@ static struct list_head *test_terms_list(void)
 	return &terms;
 }
 
-int test__pmu(int subtest __maybe_unused)
+int test__pmu(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	char *format = test_format_dir_get();
 	LIST_HEAD(formats);
diff --git a/tools/perf/tests/python-use.c b/tools/perf/tests/python-use.c
index fa79509da535..598a7e058ad4 100644
--- a/tools/perf/tests/python-use.c
+++ b/tools/perf/tests/python-use.c
@@ -9,7 +9,7 @@
 
 extern int verbose;
 
-int test__python_use(int subtest __maybe_unused)
+int test__python_use(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	char *cmd;
 	int ret;
diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c
index bac5c3885b3b..6d028f42b3cf 100644
--- a/tools/perf/tests/sample-parsing.c
+++ b/tools/perf/tests/sample-parsing.c
@@ -292,7 +292,7 @@ out_free:
  * checks sample format bits separately and together.  If the test passes %0 is
  * returned, otherwise %-1 is returned.
  */
-int test__sample_parsing(int subtest __maybe_unused)
+int test__sample_parsing(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	const u64 rf[] = {4, 5, 6, 7, 12, 13, 14, 15};
 	u64 sample_type;
diff --git a/tools/perf/tests/sdt.c b/tools/perf/tests/sdt.c
index 06eda675ae2c..a9903d9b8bc2 100644
--- a/tools/perf/tests/sdt.c
+++ b/tools/perf/tests/sdt.c
@@ -33,7 +33,7 @@ static int build_id_cache__add_file(const char *filename)
 	}
 
 	build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
-	err = build_id_cache__add_s(sbuild_id, filename, false, false);
+	err = build_id_cache__add_s(sbuild_id, filename, NULL, false, false);
 	if (err < 0)
 		pr_debug("Failed to add build id cache of %s\n", filename);
 	return err;
@@ -54,7 +54,7 @@ static char *get_self_path(void)
 static int search_cached_probe(const char *target,
 			       const char *group, const char *event)
 {
-	struct probe_cache *cache = probe_cache__new(target);
+	struct probe_cache *cache = probe_cache__new(target, NULL);
 	int ret = 0;
 
 	if (!cache) {
@@ -71,7 +71,7 @@ static int search_cached_probe(const char *target,
 	return ret;
 }
 
-int test__sdt_event(int subtests __maybe_unused)
+int test__sdt_event(struct test *test __maybe_unused, int subtests __maybe_unused)
 {
 	int ret = TEST_FAIL;
 	char __tempdir[] = "./test-buildid-XXXXXX";
@@ -83,6 +83,8 @@ int test__sdt_event(int subtests __maybe_unused)
 	}
 	/* Note that buildid_dir must be an absolute path */
 	tempdir = realpath(__tempdir, NULL);
+	if (tempdir == NULL)
+		goto error_rmdir;
 
 	/* At first, scan itself */
 	set_buildid_dir(tempdir);
@@ -100,14 +102,14 @@ int test__sdt_event(int subtests __maybe_unused)
 
 error_rmdir:
 	/* Cleanup temporary buildid dir */
-	rm_rf(tempdir);
+	rm_rf(__tempdir);
 error:
 	free(tempdir);
 	free(myself);
 	return ret;
 }
 #else
-int test__sdt_event(int subtests __maybe_unused)
+int test__sdt_event(struct test *test __maybe_unused, int subtests __maybe_unused)
 {
 	pr_debug("Skip SDT event test because SDT support is not compiled\n");
 	return TEST_SKIP;
diff --git a/tools/perf/tests/shell/lib/probe.sh b/tools/perf/tests/shell/lib/probe.sh
new file mode 100644
index 000000000000..6293cc660947
--- /dev/null
+++ b/tools/perf/tests/shell/lib/probe.sh
@@ -0,0 +1,6 @@
+# Arnaldo Carvalho de Melo <acme@kernel.org>, 2017
+
+skip_if_no_perf_probe() {
+	perf probe 2>&1 | grep -q 'is not a perf-command' && return 2	# 2: 'perf test' reports the script as skipped
+	return 0
+}
diff --git a/tools/perf/tests/shell/lib/probe_vfs_getname.sh b/tools/perf/tests/shell/lib/probe_vfs_getname.sh
new file mode 100644
index 000000000000..30a950c9d407
--- /dev/null
+++ b/tools/perf/tests/shell/lib/probe_vfs_getname.sh
@@ -0,0 +1,23 @@
+# Arnaldo Carvalho de Melo <acme@kernel.org>, 2017
+
+perf probe -l 2>&1 | grep -q probe:vfs_getname
+had_vfs_getname=$?
+
+cleanup_probe_vfs_getname() {
+	if [ $had_vfs_getname -eq 1 ] ; then	# 1: grep found no probe:vfs_getname when this file was sourced
+		perf probe -q -d probe:vfs_getname
+	fi
+}
+
+add_probe_vfs_getname() {
+	local verbose=$1	# optional flag handed straight to 'perf probe' (callers here pass -v or nothing)
+	if [ $had_vfs_getname -eq 1 ] ; then	# only add the probe when it was not already listed
+		line=$(perf probe -L getname_flags 2>&1 | egrep 'result.*=.*filename;' | sed -r 's/[[:space:]]+([[:digit:]]+)[[:space:]]+result->uptr.*/\1/')
+		perf probe $verbose "vfs_getname=getname_flags:${line} pathname=result->name:string"
+	fi
+}
+
+skip_if_no_debuginfo() {
+	add_probe_vfs_getname -v 2>&1 | egrep -q "^(Failed to find the path for kernel|Debuginfo-analysis is not supported)" && return 2	# one of the two messages matched: skip
+	return 1	# any other outcome is returned as a failure
+}
diff --git a/tools/perf/tests/shell/probe_vfs_getname.sh b/tools/perf/tests/shell/probe_vfs_getname.sh
new file mode 100755
index 000000000000..9b7635184dc2
--- /dev/null
+++ b/tools/perf/tests/shell/probe_vfs_getname.sh
@@ -0,0 +1,14 @@
+# Add vfs_getname probe to get syscall args filenames
+#
+# Arnaldo Carvalho de Melo <acme@kernel.org>, 2017
+
+. $(dirname $0)/lib/probe.sh
+
+skip_if_no_perf_probe || exit 2
+
+. $(dirname $0)/lib/probe_vfs_getname.sh
+
+add_probe_vfs_getname || skip_if_no_debuginfo
+err=$?
+cleanup_probe_vfs_getname
+exit $err
diff --git a/tools/perf/tests/shell/record+script_probe_vfs_getname.sh b/tools/perf/tests/shell/record+script_probe_vfs_getname.sh
new file mode 100755
index 000000000000..ba29535b8580
--- /dev/null
+++ b/tools/perf/tests/shell/record+script_probe_vfs_getname.sh
@@ -0,0 +1,41 @@
+# Use vfs_getname probe to get syscall args filenames
+
+# Uses the 'perf test shell' library to add probe:vfs_getname to the system
+# then use it with 'perf record' using 'touch' to write to a temp file, then
+# checks that that was captured by the vfs_getname probe in the generated
+# perf.data file, with the temp file name as the pathname argument.
+
+# Arnaldo Carvalho de Melo <acme@kernel.org>, 2017
+
+. $(dirname $0)/lib/probe.sh
+
+skip_if_no_perf_probe || exit 2
+
+. $(dirname $0)/lib/probe_vfs_getname.sh
+
+perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
+file=$(mktemp /tmp/temporary_file.XXXXX)
+
+record_open_file() {
+	echo "Recording open file:"
+	perf record -o ${perfdata} -e probe:vfs_getname touch $file
+}
+
+perf_script_filenames() {
+	echo "Looking at perf.data file for vfs_getname records for the file we touched:"
+	perf script -i ${perfdata} | \
+	egrep " +touch +[0-9]+ +\[[0-9]+\] +[0-9]+\.[0-9]+: +probe:vfs_getname: +\([[:xdigit:]]+\) +pathname=\"${file}\""
+}
+
+add_probe_vfs_getname || skip_if_no_debuginfo
+err=$?
+if [ $err -eq 0 ] ; then
+	record_open_file && perf_script_filenames
+	err=$?
+fi
+
+# Single exit path: the mktemp files are removed even when the probe could not be added.
+rm -f ${perfdata}
+rm -f ${file}
+cleanup_probe_vfs_getname
+exit $err
diff --git a/tools/perf/tests/shell/trace+probe_libc_inet_pton.sh b/tools/perf/tests/shell/trace+probe_libc_inet_pton.sh
new file mode 100755
index 000000000000..462fc755092e
--- /dev/null
+++ b/tools/perf/tests/shell/trace+probe_libc_inet_pton.sh
@@ -0,0 +1,43 @@
+# probe libc's inet_pton & backtrace it with ping
+
+# Installs a probe on libc's inet_pton function, that will use uprobes,
+# then use 'perf trace' on a ping to localhost asking for just one packet
+# with the a backtrace 3 levels deep, check that it is what we expect.
+# This needs no debuginfo package, all is done using the libc ELF symtab
+# and the CFI info in the binaries.
+
+# Arnaldo Carvalho de Melo <acme@kernel.org>, 2017
+
+. $(dirname $0)/lib/probe.sh
+
+trace_libc_inet_pton_backtrace() {
+	idx=0
+	expected[0]="PING.*bytes"
+	expected[1]="64 bytes from ::1.*"
+	expected[2]=".*ping statistics.*"
+	expected[3]=".*packets transmitted.*"
+	expected[4]="rtt min.*"
+	expected[5]="[0-9]+\.[0-9]+[[:space:]]+probe_libc:inet_pton:\([[:xdigit:]]+\)"
+	expected[6]=".*inet_pton[[:space:]]\(/usr/lib.*/libc-[0-9]+\.[0-9]+\.so\)$"
+	expected[7]="getaddrinfo[[:space:]]\(/usr/lib.*/libc-[0-9]+\.[0-9]+\.so\)$"
+	expected[8]=".*\(.*/bin/ping.*\)$"
+
+	perf trace --no-syscalls -e probe_libc:inet_pton/max-stack=3/ ping -6 -c 1 ::1 2>&1 | grep -v ^$ | while read line ; do
+		echo $line
+		echo "$line" | egrep -q "${expected[$idx]}"
+		if [ $? -ne 0 ] ; then
+			printf "FAIL: expected backtrace entry %d \"%s\" got \"%s\"\n" $idx "${expected[$idx]}" "$line"
+			exit 1
+		fi
+		let idx+=1
+		[ $idx -eq 9 ] && break
+	done
+}
+
+skip_if_no_perf_probe && \
+perf probe -q /lib64/libc-*.so inet_pton && \
+trace_libc_inet_pton_backtrace
+err=$?
+# This script creates no temp file; only the probe is removed, whatever the result above.
+perf probe -q -d probe_libc:inet_pton
+exit $err
diff --git a/tools/perf/tests/shell/trace+probe_vfs_getname.sh b/tools/perf/tests/shell/trace+probe_vfs_getname.sh
new file mode 100755
index 000000000000..2e68c5f120da
--- /dev/null
+++ b/tools/perf/tests/shell/trace+probe_vfs_getname.sh
@@ -0,0 +1,35 @@
+# Check open filename arg using perf trace + vfs_getname
+
+# Uses the 'perf test shell' library to add probe:vfs_getname to the system
+# then use it with 'perf trace' using 'touch' to write to a temp file, then
+# checks that that was captured by the vfs_getname was used by 'perf trace',
+# that already handles "probe:vfs_getname" if present, and used in the
+# "open" syscall "filename" argument beautifier.
+
+# Arnaldo Carvalho de Melo <acme@kernel.org>, 2017
+
+. $(dirname $0)/lib/probe.sh
+
+skip_if_no_perf_probe || exit 2
+
+. $(dirname $0)/lib/probe_vfs_getname.sh
+
+file=$(mktemp /tmp/temporary_file.XXXXX)
+
+trace_open_vfs_getname() {
+	perf trace -e open touch $file 2>&1 | \
+	egrep " +[0-9]+\.[0-9]+ +\( +[0-9]+\.[0-9]+ ms\): +touch\/[0-9]+ open\(filename: +${file}, +flags: CREAT\|NOCTTY\|NONBLOCK\|WRONLY, +mode: +IRUGO\|IWUGO\) += +[0-9]+$"
+}
+
+
+add_probe_vfs_getname || skip_if_no_debuginfo
+err=$?
+if [ $err -eq 0 ] ; then
+	trace_open_vfs_getname
+	err=$?
+fi
+
+# Single exit path: the mktemp file is removed even when the probe could not be added.
+rm -f ${file}
+cleanup_probe_vfs_getname
+exit $err
diff --git a/tools/perf/tests/stat.c b/tools/perf/tests/stat.c
index 6a20ff2326bb..7f988a939036 100644
--- a/tools/perf/tests/stat.c
+++ b/tools/perf/tests/stat.c
@@ -45,7 +45,7 @@ static int process_stat_config_event(struct perf_tool *tool __maybe_unused,
 	return 0;
 }
 
-int test__synthesize_stat_config(int subtest __maybe_unused)
+int test__synthesize_stat_config(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct perf_stat_config stat_config = {
 		.aggr_mode	= AGGR_CORE,
@@ -75,7 +75,7 @@ static int process_stat_event(struct perf_tool *tool __maybe_unused,
 	return 0;
 }
 
-int test__synthesize_stat(int subtest __maybe_unused)
+int test__synthesize_stat(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct perf_counts_values count;
 
@@ -101,7 +101,7 @@ static int process_stat_round_event(struct perf_tool *tool __maybe_unused,
 	return 0;
 }
 
-int test__synthesize_stat_round(int subtest __maybe_unused)
+int test__synthesize_stat_round(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	TEST_ASSERT_VAL("failed to synthesize stat_config",
 		!perf_event__synthesize_stat_round(NULL, 0xdeadbeef, PERF_STAT_ROUND_TYPE__INTERVAL,
diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c
index 828494db4a19..d88511f6072c 100644
--- a/tools/perf/tests/sw-clock.c
+++ b/tools/perf/tests/sw-clock.c
@@ -124,7 +124,7 @@ out_delete_evlist:
 	return err;
 }
 
-int test__sw_clock_freq(int subtest __maybe_unused)
+int test__sw_clock_freq(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int ret;
 
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c
index 65474fd80da7..2acd78555192 100644
--- a/tools/perf/tests/switch-tracking.c
+++ b/tools/perf/tests/switch-tracking.c
@@ -306,7 +306,7 @@ out_free_nodes:
  * evsel->system_wide and evsel->tracking flags (respectively) with other events
  * sometimes enabled or disabled.
  */
-int test__switch_tracking(int subtest __maybe_unused)
+int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	const char *sched_switch = "sched:sched_switch";
 	struct switch_tracking switch_tracking = { .tids = NULL, };
diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c
index cf00ebad2ef5..f0881d0dd9c9 100644
--- a/tools/perf/tests/task-exit.c
+++ b/tools/perf/tests/task-exit.c
@@ -32,7 +32,7 @@ static void workload_exec_failed_signal(int signo __maybe_unused,
  * if the number of exit event reported by the kernel is 1 or not
  * in order to check the kernel returns correct number of event.
  */
-int test__task_exit(int subtest __maybe_unused)
+int test__task_exit(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = -1;
 	union perf_event *event;
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index 577363809c9b..921412a6a880 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -28,77 +28,79 @@ enum {
 
 struct test {
 	const char *desc;
-	int (*func)(int subtest);
+	int (*func)(struct test *test, int subtest);
 	struct {
 		bool skip_if_fail;
 		int (*get_nr)(void);
 		const char *(*get_desc)(int subtest);
 	} subtest;
 	bool (*is_supported)(void);
+	void *priv;
 };
 
 /* Tests */
-int test__vmlinux_matches_kallsyms(int subtest);
-int test__openat_syscall_event(int subtest);
-int test__openat_syscall_event_on_all_cpus(int subtest);
-int test__basic_mmap(int subtest);
-int test__PERF_RECORD(int subtest);
-int test__perf_evsel__roundtrip_name_test(int subtest);
-int test__perf_evsel__tp_sched_test(int subtest);
-int test__syscall_openat_tp_fields(int subtest);
-int test__pmu(int subtest);
-int test__attr(int subtest);
-int test__dso_data(int subtest);
-int test__dso_data_cache(int subtest);
-int test__dso_data_reopen(int subtest);
-int test__parse_events(int subtest);
-int test__hists_link(int subtest);
-int test__python_use(int subtest);
-int test__bp_signal(int subtest);
-int test__bp_signal_overflow(int subtest);
-int test__task_exit(int subtest);
-int test__sw_clock_freq(int subtest);
-int test__code_reading(int subtest);
-int test__sample_parsing(int subtest);
-int test__keep_tracking(int subtest);
-int test__parse_no_sample_id_all(int subtest);
-int test__dwarf_unwind(int subtest);
-int test__expr(int subtest);
-int test__hists_filter(int subtest);
-int test__mmap_thread_lookup(int subtest);
-int test__thread_mg_share(int subtest);
-int test__hists_output(int subtest);
-int test__hists_cumulate(int subtest);
-int test__switch_tracking(int subtest);
-int test__fdarray__filter(int subtest);
-int test__fdarray__add(int subtest);
-int test__kmod_path__parse(int subtest);
-int test__thread_map(int subtest);
-int test__llvm(int subtest);
+int test__vmlinux_matches_kallsyms(struct test *test, int subtest);
+int test__openat_syscall_event(struct test *test, int subtest);
+int test__openat_syscall_event_on_all_cpus(struct test *test, int subtest);
+int test__basic_mmap(struct test *test, int subtest);
+int test__PERF_RECORD(struct test *test, int subtest);
+int test__perf_evsel__roundtrip_name_test(struct test *test, int subtest);
+int test__perf_evsel__tp_sched_test(struct test *test, int subtest);
+int test__syscall_openat_tp_fields(struct test *test, int subtest);
+int test__pmu(struct test *test, int subtest);
+int test__attr(struct test *test, int subtest);
+int test__dso_data(struct test *test, int subtest);
+int test__dso_data_cache(struct test *test, int subtest);
+int test__dso_data_reopen(struct test *test, int subtest);
+int test__parse_events(struct test *test, int subtest);
+int test__hists_link(struct test *test, int subtest);
+int test__python_use(struct test *test, int subtest);
+int test__bp_signal(struct test *test, int subtest);
+int test__bp_signal_overflow(struct test *test, int subtest);
+int test__task_exit(struct test *test, int subtest);
+int test__mem(struct test *test, int subtest);
+int test__sw_clock_freq(struct test *test, int subtest);
+int test__code_reading(struct test *test, int subtest);
+int test__sample_parsing(struct test *test, int subtest);
+int test__keep_tracking(struct test *test, int subtest);
+int test__parse_no_sample_id_all(struct test *test, int subtest);
+int test__dwarf_unwind(struct test *test, int subtest);
+int test__expr(struct test *test, int subtest);
+int test__hists_filter(struct test *test, int subtest);
+int test__mmap_thread_lookup(struct test *test, int subtest);
+int test__thread_mg_share(struct test *test, int subtest);
+int test__hists_output(struct test *test, int subtest);
+int test__hists_cumulate(struct test *test, int subtest);
+int test__switch_tracking(struct test *test, int subtest);
+int test__fdarray__filter(struct test *test, int subtest);
+int test__fdarray__add(struct test *test, int subtest);
+int test__kmod_path__parse(struct test *test, int subtest);
+int test__thread_map(struct test *test, int subtest);
+int test__llvm(struct test *test, int subtest);
 const char *test__llvm_subtest_get_desc(int subtest);
 int test__llvm_subtest_get_nr(void);
-int test__bpf(int subtest);
+int test__bpf(struct test *test, int subtest);
 const char *test__bpf_subtest_get_desc(int subtest);
 int test__bpf_subtest_get_nr(void);
-int test_session_topology(int subtest);
-int test__thread_map_synthesize(int subtest);
-int test__thread_map_remove(int subtest);
-int test__cpu_map_synthesize(int subtest);
-int test__synthesize_stat_config(int subtest);
-int test__synthesize_stat(int subtest);
-int test__synthesize_stat_round(int subtest);
-int test__event_update(int subtest);
-int test__event_times(int subtest);
-int test__backward_ring_buffer(int subtest);
-int test__cpu_map_print(int subtest);
-int test__sdt_event(int subtest);
-int test__is_printable_array(int subtest);
-int test__bitmap_print(int subtest);
-int test__perf_hooks(int subtest);
-int test__clang(int subtest);
+int test__session_topology(struct test *test, int subtest);
+int test__thread_map_synthesize(struct test *test, int subtest);
+int test__thread_map_remove(struct test *test, int subtest);
+int test__cpu_map_synthesize(struct test *test, int subtest);
+int test__synthesize_stat_config(struct test *test, int subtest);
+int test__synthesize_stat(struct test *test, int subtest);
+int test__synthesize_stat_round(struct test *test, int subtest);
+int test__event_update(struct test *test, int subtest);
+int test__event_times(struct test *test, int subtest);
+int test__backward_ring_buffer(struct test *test, int subtest);
+int test__cpu_map_print(struct test *test, int subtest);
+int test__sdt_event(struct test *test, int subtest);
+int test__is_printable_array(struct test *test, int subtest);
+int test__bitmap_print(struct test *test, int subtest);
+int test__perf_hooks(struct test *test, int subtest);
+int test__clang(struct test *test, int subtest);
 const char *test__clang_subtest_get_desc(int subtest);
 int test__clang_subtest_get_nr(void);
-int test__unit_number__scnprint(int subtest);
+int test__unit_number__scnprint(struct test *test, int subtest);
 
 bool test__bp_signal_is_supported(void);
 
diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c
index a63d6945807b..b3423c744f46 100644
--- a/tools/perf/tests/thread-map.c
+++ b/tools/perf/tests/thread-map.c
@@ -9,7 +9,7 @@
 #define NAME	(const char *) "perf"
 #define NAMEUL	(unsigned long) NAME
 
-int test__thread_map(int subtest __maybe_unused)
+int test__thread_map(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct thread_map *map;
 
@@ -76,7 +76,7 @@ static int process_event(struct perf_tool *tool __maybe_unused,
 	return 0;
 }
 
-int test__thread_map_synthesize(int subtest __maybe_unused)
+int test__thread_map_synthesize(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct thread_map *threads;
 
@@ -95,7 +95,7 @@ int test__thread_map_synthesize(int subtest __maybe_unused)
 	return 0;
 }
 
-int test__thread_map_remove(int subtest __maybe_unused)
+int test__thread_map_remove(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct thread_map *threads;
 	char *str;
diff --git a/tools/perf/tests/thread-mg-share.c b/tools/perf/tests/thread-mg-share.c
index 76686dd6f5ec..b9c7f58db6c4 100644
--- a/tools/perf/tests/thread-mg-share.c
+++ b/tools/perf/tests/thread-mg-share.c
@@ -4,7 +4,7 @@
 #include "map.h"
 #include "debug.h"
 
-int test__thread_mg_share(int subtest __maybe_unused)
+int test__thread_mg_share(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	struct machines machines;
 	struct machine *machine;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 803f893550d6..19b0561fd6f6 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -86,7 +86,7 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
 	return 0;
 }
 
-int test_session_topology(int subtest __maybe_unused)
+int test__session_topology(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	char path[PATH_MAX];
 	struct cpu_map *map;
diff --git a/tools/perf/tests/unit_number__scnprintf.c b/tools/perf/tests/unit_number__scnprintf.c
index 44589de084b8..15cd1cf8c129 100644
--- a/tools/perf/tests/unit_number__scnprintf.c
+++ b/tools/perf/tests/unit_number__scnprintf.c
@@ -5,7 +5,7 @@
 #include "units.h"
 #include "debug.h"
 
-int test__unit_number__scnprint(int subtest __maybe_unused)
+int test__unit_number__scnprint(struct test *t __maybe_unused, int subtest __maybe_unused)
 {
 	struct {
 		u64		 n;
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c
index 8456175fc234..86cb8868f67f 100644
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -11,7 +11,7 @@
 
 #define UM(x) kallsyms_map->unmap_ip(kallsyms_map, (x))
 
-int test__vmlinux_matches_kallsyms(int subtest __maybe_unused)
+int test__vmlinux_matches_kallsyms(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = -1;
 	struct rb_node *nd;
diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build
index be95ac6ce845..175d633c6b49 100644
--- a/tools/perf/trace/beauty/Build
+++ b/tools/perf/trace/beauty/Build
@@ -1 +1,7 @@
+libperf-y += clone.o
+libperf-y += fcntl.o
+ifeq ($(SRCARCH),$(filter $(SRCARCH),x86))
+libperf-y += ioctl.o
+endif
+libperf-y += pkey_alloc.o
 libperf-y += statx.o
diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h
index cf50be3f17a4..4b58581a6053 100644
--- a/tools/perf/trace/beauty/beauty.h
+++ b/tools/perf/trace/beauty/beauty.h
@@ -1,13 +1,44 @@
 #ifndef _PERF_TRACE_BEAUTY_H
 #define _PERF_TRACE_BEAUTY_H
 
+#include <linux/kernel.h>
 #include <linux/types.h>
 
+struct strarray {
+	int	    offset;
+	int	    nr_entries;
+	const char **entries;
+};
+
+#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
+	.nr_entries = ARRAY_SIZE(array), \
+	.entries = array, \
+}
+
+#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
+	.offset	    = off, \
+	.nr_entries = ARRAY_SIZE(array), \
+	.entries = array, \
+}
+
+size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val);
+
 struct trace;
 struct thread;
 
+/**
+ * @val: value of syscall argument being formatted
+ * @args: All the args, use syscall_args__val(arg, nth) to access one
+ * @thread: tid state (maps, pid, tid, etc)
+ * @trace: 'perf trace' internals: all threads, etc
+ * @parm: private area, may be an strarray, for instance
+ * @idx: syscall arg idx (is this the first?)
+ * @mask: a syscall arg may mask another arg, see syscall_arg__scnprintf_futex_op
+ */
+
 struct syscall_arg {
 	unsigned long val;
+	unsigned char *args;
 	struct thread *thread;
 	struct trace  *trace;
 	void	      *parm;
@@ -15,10 +46,53 @@ struct syscall_arg {
 	u8	      mask;
 };
 
+unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx);
+
+size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_STRARRAYS syscall_arg__scnprintf_strarrays
+
+size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_FD syscall_arg__scnprintf_fd
+
+size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_HEX syscall_arg__scnprintf_hex
+
+size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_INT syscall_arg__scnprintf_int
+
+size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_LONG syscall_arg__scnprintf_long
+
+size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_PID syscall_arg__scnprintf_pid
+
+size_t syscall_arg__scnprintf_clone_flags(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_CLONE_FLAGS syscall_arg__scnprintf_clone_flags
+
+size_t syscall_arg__scnprintf_fcntl_cmd(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_FCNTL_CMD syscall_arg__scnprintf_fcntl_cmd
+
+size_t syscall_arg__scnprintf_fcntl_arg(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_FCNTL_ARG syscall_arg__scnprintf_fcntl_arg
+
+size_t syscall_arg__scnprintf_ioctl_cmd(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_IOCTL_CMD syscall_arg__scnprintf_ioctl_cmd
+
+size_t syscall_arg__scnprintf_pkey_alloc_access_rights(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_PKEY_ALLOC_ACCESS_RIGHTS syscall_arg__scnprintf_pkey_alloc_access_rights
+
+size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
+
 size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg);
 #define SCA_STATX_FLAGS syscall_arg__scnprintf_statx_flags
 
 size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_arg *arg);
 #define SCA_STATX_MASK syscall_arg__scnprintf_statx_mask
 
+size_t open__scnprintf_flags(unsigned long flags, char *bf, size_t size);
+
+void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
+				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg));
+
 #endif /* _PERF_TRACE_BEAUTY_H */
diff --git a/tools/perf/trace/beauty/clone.c b/tools/perf/trace/beauty/clone.c
new file mode 100644
index 000000000000..d64d049ab991
--- /dev/null
+++ b/tools/perf/trace/beauty/clone.c
@@ -0,0 +1,75 @@
+/*
+ * trace/beauty/clone.c
+ *
+ *  Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include "trace/beauty/beauty.h"
+#include <linux/kernel.h>
+#include <sys/types.h>
+#include <uapi/linux/sched.h>
+
+static size_t clone__scnprintf_flags(unsigned long flags, char *bf, size_t size)
+{
+	int printed = 0;
+	/* Name each known CLONE_* bit, '|'-separated, clearing it from flags as we go */
+#define	P_FLAG(n) \
+	if (flags & CLONE_##n) { \
+		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+		flags &= ~CLONE_##n; \
+	}
+
+	P_FLAG(VM);
+	P_FLAG(FS);
+	P_FLAG(FILES);
+	P_FLAG(SIGHAND);
+	P_FLAG(PTRACE);
+	P_FLAG(VFORK);
+	P_FLAG(PARENT);
+	P_FLAG(THREAD);
+	P_FLAG(NEWNS);
+	P_FLAG(SYSVSEM);
+	P_FLAG(SETTLS);
+	P_FLAG(PARENT_SETTID);
+	P_FLAG(CHILD_CLEARTID);
+	P_FLAG(DETACHED);
+	P_FLAG(UNTRACED);
+	P_FLAG(CHILD_SETTID);
+	P_FLAG(NEWCGROUP);
+	P_FLAG(NEWUTS);
+	P_FLAG(NEWIPC);
+	P_FLAG(NEWUSER);
+	P_FLAG(NEWPID);
+	P_FLAG(NEWNET);
+	P_FLAG(IO);
+#undef P_FLAG
+	/* Bits still set were not named above: print them in hex ('flags' is a long) */
+	if (flags)
+		printed += scnprintf(bf + printed, size - printed, "%s%#lx", printed ? "|" : "", flags);
+
+	return printed;
+}
+
+size_t syscall_arg__scnprintf_clone_flags(char *bf, size_t size, struct syscall_arg *arg)
+{
+	unsigned long flags = arg->val;
+	enum syscall_clone_args {
+		SCC_FLAGS	  = (1 << 0),
+		SCC_CHILD_STACK	  = (1 << 1),
+		SCC_PARENT_TIDPTR = (1 << 2),
+		SCC_CHILD_TIDPTR  = (1 << 3),
+		SCC_TLS		  = (1 << 4),
+	};
+	if (!(flags & CLONE_PARENT_SETTID))
+		arg->mask |= SCC_PARENT_TIDPTR;
+
+	if (!(flags & (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)))
+		arg->mask |= SCC_CHILD_TIDPTR;
+
+	if (!(flags & CLONE_SETTLS))
+		arg->mask |= SCC_TLS;
+
+	return clone__scnprintf_flags(flags, bf, size);
+}
diff --git a/tools/perf/trace/beauty/drm_ioctl.sh b/tools/perf/trace/beauty/drm_ioctl.sh
new file mode 100755
index 000000000000..2149d3a98e42
--- /dev/null
+++ b/tools/perf/trace/beauty/drm_ioctl.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+drm_header_dir=$1
+printf "#ifndef DRM_COMMAND_BASE\n"
+grep "#define DRM_COMMAND_BASE" $drm_header_dir/drm.h
+printf "#endif\n"
+
+printf "static const char *drm_ioctl_cmds[] = {\n"
+grep "^#define DRM_IOCTL.*DRM_IO" $drm_header_dir/drm.h | \
+	sed -r 's/^#define +DRM_IOCTL_([A-Z0-9_]+)[	 ]+DRM_IO[A-Z]* *\( *(0x[[:xdigit:]]+),*.*/	[\2] = "\1",/g'
+grep "^#define DRM_I915_[A-Z_0-9]\+[	 ]\+0x" $drm_header_dir/i915_drm.h | \
+	sed -r 's/^#define +DRM_I915_([A-Z0-9_]+)[	 ]+(0x[[:xdigit:]]+)/\t[DRM_COMMAND_BASE + \2] = "I915_\1",/g'
+printf "};\n"
diff --git a/tools/perf/trace/beauty/fcntl.c b/tools/perf/trace/beauty/fcntl.c
new file mode 100644
index 000000000000..9e8900c13cb1
--- /dev/null
+++ b/tools/perf/trace/beauty/fcntl.c
@@ -0,0 +1,100 @@
+/*
+ * trace/beauty/fcntl.c
+ *
+ *  Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include "trace/beauty/beauty.h"
+#include <linux/kernel.h>
+#include <uapi/linux/fcntl.h>
+
+static size_t fcntl__scnprintf_getfd(unsigned long val, char *bf, size_t size)
+{
+	return scnprintf(bf, size, "%s", val ? "CLOEXEC" : "0");
+}
+
+static size_t syscall_arg__scnprintf_fcntl_getfd(char *bf, size_t size, struct syscall_arg *arg)
+{
+	return fcntl__scnprintf_getfd(arg->val, bf, size);
+}
+
+static size_t fcntl__scnprintf_getlease(unsigned long val, char *bf, size_t size)
+{
+	static const char *fcntl_setlease[] = { "RDLCK", "WRLCK", "UNLCK", };
+	static DEFINE_STRARRAY(fcntl_setlease);
+
+	return strarray__scnprintf(&strarray__fcntl_setlease, bf, size, "%x", val);
+}
+
+static size_t syscall_arg__scnprintf_fcntl_getlease(char *bf, size_t size, struct syscall_arg *arg)
+{
+	return fcntl__scnprintf_getlease(arg->val, bf, size);
+}
+
+size_t syscall_arg__scnprintf_fcntl_cmd(char *bf, size_t size, struct syscall_arg *arg)
+{
+	if (arg->val == F_GETFL) {
+		syscall_arg__set_ret_scnprintf(arg, syscall_arg__scnprintf_open_flags);
+		goto mask_arg;
+	}
+	if (arg->val == F_GETFD) {
+		syscall_arg__set_ret_scnprintf(arg, syscall_arg__scnprintf_fcntl_getfd);
+		goto mask_arg;
+	}
+	if (arg->val == F_DUPFD_CLOEXEC || arg->val == F_DUPFD) {
+		syscall_arg__set_ret_scnprintf(arg, syscall_arg__scnprintf_fd);
+		goto out;
+	}
+	if (arg->val == F_GETOWN) {
+		syscall_arg__set_ret_scnprintf(arg, syscall_arg__scnprintf_pid);
+		goto mask_arg;
+	}
+	if (arg->val == F_GETLEASE) {
+		syscall_arg__set_ret_scnprintf(arg, syscall_arg__scnprintf_fcntl_getlease);
+		goto mask_arg;
+	}
+	/*
+	 * Some commands ignore the third fcntl argument, "arg", so mask it
+	 */
+	if (arg->val == F_GET_SEALS ||
+	    arg->val == F_GETSIG) {
+mask_arg:
+		arg->mask |= (1 << 2);
+	}
+out:
+	return syscall_arg__scnprintf_strarrays(bf, size, arg);
+}
+
+size_t syscall_arg__scnprintf_fcntl_arg(char *bf, size_t size, struct syscall_arg *arg)
+{
+	int cmd = syscall_arg__val(arg, 1);
+
+	if (cmd == F_DUPFD)
+		return syscall_arg__scnprintf_fd(bf, size, arg);
+
+	if (cmd == F_SETFD)
+		return fcntl__scnprintf_getfd(arg->val, bf, size);
+
+	if (cmd == F_SETFL)
+		return open__scnprintf_flags(arg->val, bf, size);
+
+	if (cmd == F_SETOWN)
+		return syscall_arg__scnprintf_pid(bf, size, arg);
+
+	if (cmd == F_SETLEASE)
+		return fcntl__scnprintf_getlease(arg->val, bf, size);
+	/*
+	 * We still don't grab the contents of pointers on entry or exit,
+	 * so just print them as hex numbers
+	 */
+	if (cmd == F_SETLK || cmd == F_SETLKW || cmd == F_GETLK ||
+	    cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW || cmd == F_OFD_GETLK ||
+	    cmd == F_GETOWN_EX || cmd == F_SETOWN_EX ||
+	    cmd == F_GET_RW_HINT || cmd == F_SET_RW_HINT ||
+	    cmd == F_GET_FILE_RW_HINT || cmd == F_SET_FILE_RW_HINT)
+		return syscall_arg__scnprintf_hex(bf, size, arg);
+
+	return syscall_arg__scnprintf_long(bf, size, arg);
+}
diff --git a/tools/perf/trace/beauty/ioctl.c b/tools/perf/trace/beauty/ioctl.c
new file mode 100644
index 000000000000..1be3b4cf0827
--- /dev/null
+++ b/tools/perf/trace/beauty/ioctl.c
@@ -0,0 +1,162 @@
+/*
+ * trace/beauty/ioctl.c
+ *
+ *  Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include "trace/beauty/beauty.h"
+#include <linux/kernel.h>
+
+/*
+ * FIXME: to support all arches we have to improve this, for
+ * now, to build on older systems without things like TIOCGEXCL,
+ * get it directly from our copy.
+ *
+ * Right now only x86 is being supported for beautifying ioctl args
+ * in 'perf trace', see tools/perf/trace/beauty/Build and builtin-trace.c
+ */
+#include <uapi/asm-generic/ioctls.h>
+
+static size_t ioctl__scnprintf_tty_cmd(int nr, int dir, char *bf, size_t size)
+{
+	static const char *ioctl_tty_cmd[] = {
+	[_IOC_NR(TCGETS)] = "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
+	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL", "TIOCSCTTY",
+	"TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI", "TIOCGWINSZ", "TIOCSWINSZ",
+	"TIOCMGET", "TIOCMBIS", "TIOCMBIC", "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR",
+	"FIONREAD", "TIOCLINUX", "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT",
+	"FIONBIO", "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP",
+	[_IOC_NR(TIOCSBRK)] = "TIOCSBRK", "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2",
+	"TCSETSW2", "TCSETSF2", "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
+	"TIOCGDEV", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG", "TIOCVHANGUP", "TIOCGPKT",
+	"TIOCGPTLCK", [_IOC_NR(TIOCGEXCL)] = "TIOCGEXCL", "TIOCGPTPEER",
+	[_IOC_NR(FIONCLEX)] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
+	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
+	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
+	"TIOCMIWAIT", "TIOCGICOUNT", };
+	static DEFINE_STRARRAY(ioctl_tty_cmd);
+	/* Indexed by _IOC_NR(): TCGETS is nr 1, not 0, hence its designator; holes are NULL */
+	if (nr < strarray__ioctl_tty_cmd.nr_entries && strarray__ioctl_tty_cmd.entries[nr] != NULL)
+		return scnprintf(bf, size, "%s", strarray__ioctl_tty_cmd.entries[nr]);
+
+	return scnprintf(bf, size, "(%#x, %#x, %#x)", 'T', nr, dir);
+}
+
+static size_t ioctl__scnprintf_drm_cmd(int nr, int dir, char *bf, size_t size)
+{
+#include "trace/beauty/generated/ioctl/drm_ioctl_array.c"
+	static DEFINE_STRARRAY(drm_ioctl_cmds);
+
+	if (nr < strarray__drm_ioctl_cmds.nr_entries && strarray__drm_ioctl_cmds.entries[nr] != NULL)
+		return scnprintf(bf, size, "DRM_%s", strarray__drm_ioctl_cmds.entries[nr]);
+
+	return scnprintf(bf, size, "(%#x, %#x, %#x)", 'd', nr, dir);
+}
+
+static size_t ioctl__scnprintf_sndrv_pcm_cmd(int nr, int dir, char *bf, size_t size)
+{
+#include "trace/beauty/generated/ioctl/sndrv_pcm_ioctl_array.c"
+	static DEFINE_STRARRAY(sndrv_pcm_ioctl_cmds);
+
+	if (nr < strarray__sndrv_pcm_ioctl_cmds.nr_entries && strarray__sndrv_pcm_ioctl_cmds.entries[nr] != NULL)
+		return scnprintf(bf, size, "SNDRV_PCM_%s", strarray__sndrv_pcm_ioctl_cmds.entries[nr]);
+
+	return scnprintf(bf, size, "(%#x, %#x, %#x)", 'A', nr, dir);
+}
+
+static size_t ioctl__scnprintf_sndrv_ctl_cmd(int nr, int dir, char *bf, size_t size)
+{
+#include "trace/beauty/generated/ioctl/sndrv_ctl_ioctl_array.c"
+	static DEFINE_STRARRAY(sndrv_ctl_ioctl_cmds);
+
+	if (nr < strarray__sndrv_ctl_ioctl_cmds.nr_entries && strarray__sndrv_ctl_ioctl_cmds.entries[nr] != NULL)
+		return scnprintf(bf, size, "SNDRV_CTL_%s", strarray__sndrv_ctl_ioctl_cmds.entries[nr]);
+
+	return scnprintf(bf, size, "(%#x, %#x, %#x)", 'U', nr, dir);
+}
+
+static size_t ioctl__scnprintf_kvm_cmd(int nr, int dir, char *bf, size_t size)
+{
+#include "trace/beauty/generated/ioctl/kvm_ioctl_array.c"
+	static DEFINE_STRARRAY(kvm_ioctl_cmds);
+
+	if (nr < strarray__kvm_ioctl_cmds.nr_entries && strarray__kvm_ioctl_cmds.entries[nr] != NULL)
+		return scnprintf(bf, size, "KVM_%s", strarray__kvm_ioctl_cmds.entries[nr]);
+
+	return scnprintf(bf, size, "(%#x, %#x, %#x)", 0xAE, nr, dir);
+}
+
+static size_t ioctl__scnprintf_vhost_virtio_cmd(int nr, int dir, char *bf, size_t size)
+{
+#include "trace/beauty/generated/ioctl/vhost_virtio_ioctl_array.c"
+	static DEFINE_STRARRAY(vhost_virtio_ioctl_cmds);
+	static DEFINE_STRARRAY(vhost_virtio_ioctl_read_cmds);
+	struct strarray *s = (dir & _IOC_READ) ? &strarray__vhost_virtio_ioctl_read_cmds : &strarray__vhost_virtio_ioctl_cmds;
+
+	if (nr < s->nr_entries && s->entries[nr] != NULL)
+		return scnprintf(bf, size, "VHOST_%s", s->entries[nr]);
+
+	return scnprintf(bf, size, "(%#x, %#x, %#x)", 0xAF, nr, dir);
+}
+
+static size_t ioctl__scnprintf_perf_cmd(int nr, int dir, char *bf, size_t size)
+{
+#include "trace/beauty/generated/ioctl/perf_ioctl_array.c"
+	static DEFINE_STRARRAY(perf_ioctl_cmds);
+
+	if (nr < strarray__perf_ioctl_cmds.nr_entries && strarray__perf_ioctl_cmds.entries[nr] != NULL)
+		return scnprintf(bf, size, "PERF_%s", strarray__perf_ioctl_cmds.entries[nr]);
+	/* Unnamed nr: print the raw (type, nr, dir); this handler is registered for type '$' */
+	return scnprintf(bf, size, "(%#x, %#x, %#x)", '$', nr, dir);
+}
+
+static size_t ioctl__scnprintf_cmd(unsigned long cmd, char *bf, size_t size)
+{
+	int dir	 = _IOC_DIR(cmd),
+	    type = _IOC_TYPE(cmd),
+	    nr	 = _IOC_NR(cmd),
+	    sz	 = _IOC_SIZE(cmd);
+	int printed = 0;
+	static const struct ioctl_type {
+		int	type;
+		size_t	(*scnprintf)(int nr, int dir, char *bf, size_t size);
+	} ioctl_types[] = { /* Must be ordered by type */
+			      { .type	= '$', .scnprintf = ioctl__scnprintf_perf_cmd, },
+		['A' - '$'] = { .type	= 'A', .scnprintf = ioctl__scnprintf_sndrv_pcm_cmd, },
+		['T' - '$'] = { .type	= 'T', .scnprintf = ioctl__scnprintf_tty_cmd, },
+		['U' - '$'] = { .type	= 'U', .scnprintf = ioctl__scnprintf_sndrv_ctl_cmd, },
+		['d' - '$'] = { .type	= 'd', .scnprintf = ioctl__scnprintf_drm_cmd, },
+		[0xAE - '$'] = { .type	= 0xAE, .scnprintf = ioctl__scnprintf_kvm_cmd, },
+		[0xAF - '$'] = { .type	= 0xAF, .scnprintf = ioctl__scnprintf_vhost_virtio_cmd, },
+	};
+	const int nr_types = ARRAY_SIZE(ioctl_types);
+
+	if (type >= ioctl_types[0].type && type <= ioctl_types[nr_types - 1].type) {
+		const int index = type - ioctl_types[0].type;
+
+		if (ioctl_types[index].scnprintf != NULL)
+			return ioctl_types[index].scnprintf(nr, dir, bf, size);
+	}
+
+	printed += scnprintf(bf + printed, size - printed, "%c", '(');
+
+	if (dir == _IOC_NONE) {
+		printed += scnprintf(bf + printed, size - printed, "%s", "NONE");
+	} else {
+		if (dir & _IOC_READ)
+			printed += scnprintf(bf + printed, size - printed, "%s", "READ");
+		if (dir & _IOC_WRITE)
+			printed += scnprintf(bf + printed, size - printed, "%s%s", dir & _IOC_READ ? "|" : "", "WRITE");
+	}
+
+	return printed + scnprintf(bf + printed, size - printed, ", %#x, %#x, %#x)", type, nr, sz);
+}
+
+size_t syscall_arg__scnprintf_ioctl_cmd(char *bf, size_t size, struct syscall_arg *arg)
+{
+	unsigned long cmd = arg->val;
+
+	return ioctl__scnprintf_cmd(cmd, bf, size);
+}
diff --git a/tools/perf/trace/beauty/kvm_ioctl.sh b/tools/perf/trace/beauty/kvm_ioctl.sh
new file mode 100755
index 000000000000..bd28817afced
--- /dev/null
+++ b/tools/perf/trace/beauty/kvm_ioctl.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+# Generate the kvm_ioctl_cmds[] string table from the KVM_* ioctl defines in $1/kvm.h
+kvm_header_dir=$1
+
+printf "static const char *kvm_ioctl_cmds[] = {\n"
+regex='^#[[:space:]]*define[[:space:]]+KVM_(\w+)[[:space:]]+_IO[RW]*\([[:space:]]*KVMIO[[:space:]]*,[[:space:]]*(0x[[:xdigit:]]+).*'
+egrep "$regex" "${kvm_header_dir}/kvm.h"	| \
+	sed -r "s/$regex/\2 \1/g"	| \
+	egrep -v " ((ARM|PPC|S390)_|[GS]ET_(DEBUGREGS|PIT2|XSAVE|TSC_KHZ)|CREATE_SPAPR_TCE_64)" | \
+	sort | xargs printf "\t[%s] = \"%s\",\n"
+printf "};\n"
diff --git a/tools/perf/trace/beauty/mmap.c b/tools/perf/trace/beauty/mmap.c
index af1cfde6b97b..754558f9009d 100644
--- a/tools/perf/trace/beauty/mmap.c
+++ b/tools/perf/trace/beauty/mmap.c
@@ -34,6 +34,9 @@ static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
 {
 	int printed = 0, flags = arg->val;
 
+	if (flags & MAP_ANONYMOUS)
+		arg->mask |= (1 << 4) | (1 << 5); /* Mask 4th ('fd') and 5th ('offset') args, ignored */
+
 #define	P_MMAP_FLAG(n) \
 	if (flags & MAP_##n) { \
 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
diff --git a/tools/perf/trace/beauty/open_flags.c b/tools/perf/trace/beauty/open_flags.c
index f55a4597fc38..e359e041dc0e 100644
--- a/tools/perf/trace/beauty/open_flags.c
+++ b/tools/perf/trace/beauty/open_flags.c
@@ -14,13 +14,16 @@
 #define O_NOATIME	01000000
 #endif
 
-static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
-					       struct syscall_arg *arg)
-{
-	int printed = 0, flags = arg->val;
+#ifndef O_TMPFILE
+#define O_TMPFILE	020000000
+#endif
 
-	if (!(flags & O_CREAT))
-		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
+#undef O_LARGEFILE
+#define O_LARGEFILE	00100000
+
+size_t open__scnprintf_flags(unsigned long flags, char *bf, size_t size)
+{
+	int printed = 0;
 
 	if (flags == 0)
 		return scnprintf(bf, size, "RDONLY");
@@ -30,6 +33,7 @@ static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
 		flags &= ~O_##n; \
 	}
 
+	P_FLAG(RDWR);
 	P_FLAG(APPEND);
 	P_FLAG(ASYNC);
 	P_FLAG(CLOEXEC);
@@ -38,6 +42,8 @@ static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
 	P_FLAG(DIRECTORY);
 	P_FLAG(EXCL);
 	P_FLAG(LARGEFILE);
+	P_FLAG(NOFOLLOW);
+	P_FLAG(TMPFILE);
 	P_FLAG(NOATIME);
 	P_FLAG(NOCTTY);
 #ifdef O_NONBLOCK
@@ -48,7 +54,6 @@ static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
 #ifdef O_PATH
 	P_FLAG(PATH);
 #endif
-	P_FLAG(RDWR);
 #ifdef O_DSYNC
 	if ((flags & O_SYNC) == O_SYNC)
 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
@@ -68,4 +73,12 @@ static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
 	return printed;
 }
 
-#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
+/* Print the open-style flags in arg->val; without O_CREAT, also mask out the next ('mode') arg. */
+size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size, struct syscall_arg *arg)
+{
+	const int open_flags = arg->val;
+
+	if ((open_flags & O_CREAT) == 0)
+		arg->mask |= 1 << (arg->idx + 1);
+	return open__scnprintf_flags(open_flags, bf, size);
+}
diff --git a/tools/perf/trace/beauty/perf_ioctl.sh b/tools/perf/trace/beauty/perf_ioctl.sh
new file mode 100755
index 000000000000..faea4237c793
--- /dev/null
+++ b/tools/perf/trace/beauty/perf_ioctl.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+# Emit perf_ioctl_cmds[] (ioctl number -> name) from the PERF_EVENT_IOC_* defines in $1/perf_event.h
+header_dir=$1
+
+printf "static const char *perf_ioctl_cmds[] = {\n"
+regex='^#[[:space:]]*define[[:space:]]+PERF_EVENT_IOC_(\w+)[[:space:]]+_IO[RW]*[[:space:]]*\([[:space:]]*.\$.[[:space:]]*,[[:space:]]*([[:digit:]]+).*'
+egrep "$regex" "${header_dir}/perf_event.h"	| \
+	sed -r "s/$regex/\2 \1/g"	| \
+	sort | xargs printf "\t[%s] = \"%s\",\n"
+printf "};\n"
diff --git a/tools/perf/trace/beauty/pid.c b/tools/perf/trace/beauty/pid.c
index 07486ea65ae3..b6d419e16dcf 100644
--- a/tools/perf/trace/beauty/pid.c
+++ b/tools/perf/trace/beauty/pid.c
@@ -1,4 +1,4 @@
-static size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_arg *arg)
+size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_arg *arg)
 {
 	int pid = arg->val;
 	struct trace *trace = arg->trace;
@@ -17,5 +17,3 @@ static size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_a
 
 	return printed;
 }
-
-#define SCA_PID syscall_arg__scnprintf_pid
diff --git a/tools/perf/trace/beauty/pkey_alloc.c b/tools/perf/trace/beauty/pkey_alloc.c
new file mode 100644
index 000000000000..2ba784a3734a
--- /dev/null
+++ b/tools/perf/trace/beauty/pkey_alloc.c
@@ -0,0 +1,64 @@
+/*
+ * trace/beauty/pkey_alloc.c
+ *
+ *  Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include "trace/beauty/beauty.h"
+#include <linux/kernel.h>
+#include <linux/log2.h>
+
+/*
+ * Format a pkey_alloc() access_rights bitmask into bf as '|'-separated names.
+ *
+ * Names come from the generated pkey_alloc_access_rights[] table: entry 0
+ * names the value 0 ("0" is printed if it is unset) and entry i (i >= 1)
+ * names bit (i - 1).  A set bit whose table slot is NULL is printed in hex;
+ * bits past the end of the table are not printed.
+ *
+ * Returns the number of characters written, as accumulated from scnprintf().
+ */
+static size_t pkey_alloc__scnprintf_access_rights(int access_rights, char *bf, size_t size)
+{
+	int i, printed = 0;
+
+#include "trace/beauty/generated/pkey_alloc_access_rights_array.c"
+	static DEFINE_STRARRAY(pkey_alloc_access_rights);
+
+	if (access_rights == 0) {
+		const char *s = strarray__pkey_alloc_access_rights.entries[0];
+		if (s)
+			return scnprintf(bf, size, "%s", s);
+		return scnprintf(bf, size, "%d", 0);
+	}
+
+	for (i = 1; i < strarray__pkey_alloc_access_rights.nr_entries; ++i) {
+		int bit = 1 << (i - 1);
+
+		if (!(access_rights & bit))
+			continue;
+
+		if (printed != 0)
+			printed += scnprintf(bf + printed, size - printed, "|");
+
+		if (strarray__pkey_alloc_access_rights.entries[i] != NULL)
+			printed += scnprintf(bf + printed, size - printed, "%s", strarray__pkey_alloc_access_rights.entries[i]);
+		else /* unnamed bit: "%#x" emits the 0x prefix itself; "0x%#" was an incomplete conversion */
+			printed += scnprintf(bf + printed, size - printed, "%#x", bit);
+	}
+
+	return printed;
+}
+
+/*
+ * syscall_arg beautifier for pkey_alloc()'s access_rights argument: formats
+ * arg->val with pkey_alloc__scnprintf_access_rights().
+ */
+size_t syscall_arg__scnprintf_pkey_alloc_access_rights(char *bf, size_t size, struct syscall_arg *arg)
+{
+	unsigned long access_rights = arg->val;
+
+	return pkey_alloc__scnprintf_access_rights(access_rights, bf, size);
+}
diff --git a/tools/perf/trace/beauty/pkey_alloc_access_rights.sh b/tools/perf/trace/beauty/pkey_alloc_access_rights.sh
new file mode 100755
index 000000000000..62e51a02b839
--- /dev/null
+++ b/tools/perf/trace/beauty/pkey_alloc_access_rights.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+# Emit pkey_alloc_access_rights[] from the hex PKEY_* defines in $1/mman-common.h: 0 -> [0], else [ilog2(value) + 1]
+header_dir=$1
+
+printf "static const char *pkey_alloc_access_rights[] = {\n"
+regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+PKEY_([[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*'
+egrep "$regex" "${header_dir}/mman-common.h"	| \
+	sed -r "s/$regex/\2 \2 \1/g"	| \
+	sort | xargs printf "\t[%s ? (ilog2(%s) + 1) : 0] = \"%s\",\n"
+printf "};\n"
diff --git a/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh b/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh
new file mode 100755
index 000000000000..aad5ab130539
--- /dev/null
+++ b/tools/perf/trace/beauty/sndrv_ctl_ioctl.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+# Emit sndrv_ctl_ioctl_cmds[] (ioctl number -> name) from the SNDRV_CTL_IOCTL_* defines in $1/asound.h
+sound_header_dir=$1
+
+printf "static const char *sndrv_ctl_ioctl_cmds[] = {\n"
+grep "^#define[[:space:]]\+SNDRV_CTL_IOCTL_" "$sound_header_dir/asound.h" | \
+	sed -r 's/^#define[[:space:]]+SNDRV_CTL_IOCTL_([A-Z0-9_]+)[[:space:]]+_IO[RW]*\( *.U., *(0x[[:xdigit:]]+),?.*/\t[\2] = \"\1\",/g'
+printf "};\n"
diff --git a/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh b/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh
new file mode 100755
index 000000000000..b7e9ef6b2f55
--- /dev/null
+++ b/tools/perf/trace/beauty/sndrv_pcm_ioctl.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+# Emit sndrv_pcm_ioctl_cmds[] (ioctl number -> name) from the SNDRV_PCM_IOCTL_* defines in $1/asound.h
+sound_header_dir=$1
+
+printf "static const char *sndrv_pcm_ioctl_cmds[] = {\n"
+grep "^#define[[:space:]]\+SNDRV_PCM_IOCTL_" "$sound_header_dir/asound.h" | \
+	sed -r 's/^#define[[:space:]]+SNDRV_PCM_IOCTL_([A-Z0-9_]+)[[:space:]]+_IO[RW]*\( *.A., *(0x[[:xdigit:]]+),?.*/\t[\2] = \"\1\",/g'
+printf "};\n"
diff --git a/tools/perf/trace/beauty/vhost_virtio_ioctl.sh b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh
new file mode 100755
index 000000000000..76f1de697787
--- /dev/null
+++ b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+# Emit vhost_virtio_ioctl_cmds[] (_IO/_IOW) and vhost_virtio_ioctl_read_cmds[] (_IOR/_IOWR) from the VHOST_* defines in $1/vhost.h
+vhost_virtio_header_dir=$1
+
+printf "static const char *vhost_virtio_ioctl_cmds[] = {\n"
+regex='^#[[:space:]]*define[[:space:]]+VHOST_(\w+)[[:space:]]+_IOW?\([[:space:]]*VHOST_VIRTIO[[:space:]]*,[[:space:]]*(0x[[:xdigit:]]+).*'
+egrep "$regex" "${vhost_virtio_header_dir}/vhost.h" | \
+	sed -r "s/$regex/\2 \1/g"	| \
+	sort | xargs printf "\t[%s] = \"%s\",\n"
+printf "};\n"
+
+printf "static const char *vhost_virtio_ioctl_read_cmds[] = {\n"
+regex='^#[[:space:]]*define[[:space:]]+VHOST_(\w+)[[:space:]]+_IOW?R\([[:space:]]*VHOST_VIRTIO[[:space:]]*,[[:space:]]*(0x[[:xdigit:]]+).*'
+egrep "$regex" "${vhost_virtio_header_dir}/vhost.h" | \
+	sed -r "s/$regex/\2 \1/g"	| \
+	sort | xargs printf "\t[%s] = \"%s\",\n"
+printf "};\n"
diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c
index 83874b0e266c..d0c2007c307b 100644
--- a/tools/perf/ui/browser.c
+++ b/tools/perf/ui/browser.c
@@ -8,6 +8,7 @@
 #include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/rbtree.h>
+#include <linux/string.h>
 #include <stdlib.h>
 #include <sys/ttydefaults.h>
 #include "browser.h"
@@ -563,7 +564,7 @@ static int ui_browser__color_config(const char *var, const char *value,
 	int i;
 
 	/* same dir for all commands */
-	if (prefixcmp(var, "colors.") != 0)
+	if (!strstarts(var, "colors.") != 0)
 		return 0;
 
 	for (i = 0; ui_browser__colorsets[i].name != NULL; ++i) {
@@ -738,6 +739,35 @@ void __ui_browser__line_arrow(struct ui_browser *browser, unsigned int column,
 		__ui_browser__line_arrow_down(browser, column, start, end);
 }
 
+/*
+ * Draw the fused-instruction marker at @row (ignored when @row is above
+ * browser->top_idx): a corner (arrow_down) or tee at @column - 1, a two-cell
+ * horizontal line from @column and, for arrow_down, a tee on the next row.
+ */
+void ui_browser__mark_fused(struct ui_browser *browser, unsigned int column,
+			    unsigned int row, bool arrow_down)
+{
+	unsigned int y;
+
+	if (row < browser->top_idx)
+		return;
+	y = row - browser->top_idx;
+
+	SLsmg_set_char_set(1);
+
+	ui_browser__gotorc(browser, y, column - 1);
+	SLsmg_write_char(arrow_down ? SLSMG_ULCORN_CHAR : SLSMG_LTEE_CHAR);
+	ui_browser__gotorc(browser, y, column);
+	SLsmg_draw_hline(2);
+
+	if (arrow_down) {
+		ui_browser__gotorc(browser, y + 1, column - 1);
+		SLsmg_write_char(SLSMG_LTEE_CHAR);
+	}
+
+	SLsmg_set_char_set(0);
+}
+
 void ui_browser__init(void)
 {
 	int i = 0;
diff --git a/tools/perf/ui/browser.h b/tools/perf/ui/browser.h
index be3b70eb5fca..a12eff75638b 100644
--- a/tools/perf/ui/browser.h
+++ b/tools/perf/ui/browser.h
@@ -43,6 +43,8 @@ void ui_browser__printf(struct ui_browser *browser, const char *fmt, ...);
 void ui_browser__write_graph(struct ui_browser *browser, int graph);
 void __ui_browser__line_arrow(struct ui_browser *browser, unsigned int column,
 			      u64 start, u64 end);
+void ui_browser__mark_fused(struct ui_browser *browser, unsigned int column,
+			    unsigned int row, bool arrow_down);
 void __ui_browser__show_title(struct ui_browser *browser, const char *title);
 void ui_browser__show_title(struct ui_browser *browser, const char *title);
 int ui_browser__show(struct ui_browser *browser, const char *title,
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 27f41f28dcb4..ba0aee576a2b 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -9,14 +9,16 @@
 #include "../../util/symbol.h"
 #include "../../util/evsel.h"
 #include "../../util/config.h"
+#include "../../util/evlist.h"
 #include <inttypes.h>
 #include <pthread.h>
 #include <linux/kernel.h>
+#include <linux/string.h>
 #include <sys/ttydefaults.h>
 
 struct disasm_line_samples {
-	double		percent;
-	u64		nr;
+	double		      percent;
+	struct sym_hist_entry he;
 };
 
 #define IPC_WIDTH 6
@@ -40,6 +42,7 @@ static struct annotate_browser_opt {
 	     jump_arrows,
 	     show_linenr,
 	     show_nr_jumps,
+	     show_nr_samples,
 	     show_total_period;
 } annotate_browser__opts = {
 	.use_offset	= true,
@@ -108,11 +111,12 @@ static int annotate_browser__set_jumps_percent_color(struct annotate_browser *br
 
 static int annotate_browser__pcnt_width(struct annotate_browser *ab)
 {
-	int w = 7 * ab->nr_events;
+	return (annotate_browser__opts.show_total_period ? 12 : 7) * ab->nr_events;
+}
 
-	if (ab->have_cycles)
-		w += IPC_WIDTH + CYCLES_WIDTH;
-	return w;
+static int annotate_browser__cycles_width(struct annotate_browser *ab)
+{
+	return ab->have_cycles ? IPC_WIDTH + CYCLES_WIDTH : 0;
 }
 
 static void annotate_browser__write(struct ui_browser *browser, void *entry, int row)
@@ -125,7 +129,8 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
 			     (!current_entry || (browser->use_navkeypressed &&
 					         !browser->navkeypressed)));
 	int width = browser->width, printed;
-	int i, pcnt_width = annotate_browser__pcnt_width(ab);
+	int i, pcnt_width = annotate_browser__pcnt_width(ab),
+	       cycles_width = annotate_browser__cycles_width(ab);
 	double percent_max = 0.0;
 	char bf[256];
 	bool show_title = false;
@@ -149,8 +154,11 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
 						bdl->samples[i].percent,
 						current_entry);
 			if (annotate_browser__opts.show_total_period) {
+				ui_browser__printf(browser, "%11" PRIu64 " ",
+						   bdl->samples[i].he.period);
+			} else if (annotate_browser__opts.show_nr_samples) {
 				ui_browser__printf(browser, "%6" PRIu64 " ",
-						   bdl->samples[i].nr);
+						   bdl->samples[i].he.nr_samples);
 			} else {
 				ui_browser__printf(browser, "%6.2f ",
 						   bdl->samples[i].percent);
@@ -160,9 +168,12 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
 		ui_browser__set_percent_color(browser, 0, current_entry);
 
 		if (!show_title)
-			ui_browser__write_nstring(browser, " ", 7 * ab->nr_events);
-		else
-			ui_browser__printf(browser, "%*s", 7, "Percent");
+			ui_browser__write_nstring(browser, " ", pcnt_width);
+		else {
+			ui_browser__printf(browser, "%*s", pcnt_width,
+					   annotate_browser__opts.show_total_period ? "Period" :
+					   annotate_browser__opts.show_nr_samples ? "Samples" : "Percent");
+		}
 	}
 	if (ab->have_cycles) {
 		if (dl->ipc)
@@ -188,7 +199,7 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
 		width += 1;
 
 	if (!*dl->line)
-		ui_browser__write_nstring(browser, " ", width - pcnt_width);
+		ui_browser__write_nstring(browser, " ", width - pcnt_width - cycles_width);
 	else if (dl->offset == -1) {
 		if (dl->line_nr && annotate_browser__opts.show_linenr)
 			printed = scnprintf(bf, sizeof(bf), "%-*d ",
@@ -197,7 +208,7 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
 			printed = scnprintf(bf, sizeof(bf), "%*s  ",
 				    ab->addr_width, " ");
 		ui_browser__write_nstring(browser, bf, printed);
-		ui_browser__write_nstring(browser, dl->line, width - printed - pcnt_width + 1);
+		ui_browser__write_nstring(browser, dl->line, width - printed - pcnt_width - cycles_width + 1);
 	} else {
 		u64 addr = dl->offset;
 		int color = -1;
@@ -254,7 +265,7 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int
 		}
 
 		disasm_line__scnprintf(dl, bf, sizeof(bf), !annotate_browser__opts.use_offset);
-		ui_browser__write_nstring(browser, bf, width - pcnt_width - 3 - printed);
+		ui_browser__write_nstring(browser, bf, width - pcnt_width - cycles_width - 3 - printed);
 	}
 
 	if (current_entry)
@@ -272,6 +283,25 @@ static bool disasm_line__is_valid_jump(struct disasm_line *dl, struct symbol *sy
 	return true;
 }
 
+static bool is_fused(struct annotate_browser *ab, struct disasm_line *cursor)
+{
+	struct disasm_line *pos = list_prev_entry(cursor, node); /* line just before the cursor */
+	const char *name;
+
+	if (!pos) /* NOTE(review): list_prev_entry() is presumably container_of()-based and never NULL; for the first line pos would alias the list head - confirm */
+		return false;
+
+	if (ins__is_lock(&pos->ins)) /* lock instruction: compare using the name stored in ops.locked */
+		name = pos->ops.locked.ins.name;
+	else
+		name = pos->ins.name;
+
+	if (!name || !cursor->ins.name) /* either line has no instruction name: nothing to compare */
+		return false;
+
+	return ins__is_fused(ab->arch, name, cursor->ins.name); /* result comes from ins__is_fused() for ab->arch */
+}
+
 static void annotate_browser__draw_current_jump(struct ui_browser *browser)
 {
 	struct annotate_browser *ab = container_of(browser, struct annotate_browser, b);
@@ -307,6 +337,13 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser)
 	ui_browser__set_color(browser, HE_COLORSET_JUMP_ARROWS);
 	__ui_browser__line_arrow(browser, pcnt_width + 2 + ab->addr_width,
 				 from, to);
+
+	if (is_fused(ab, cursor)) {
+		ui_browser__mark_fused(browser,
+				       pcnt_width + 3 + ab->addr_width,
+				       from - 1,
+				       to > from ? true : false);
+	}
 }
 
 static unsigned int annotate_browser__refresh(struct ui_browser *browser)
@@ -422,14 +459,14 @@ static void annotate_browser__calc_percent(struct annotate_browser *browser,
 		next = disasm__get_next_ip_line(&notes->src->source, pos);
 
 		for (i = 0; i < browser->nr_events; i++) {
-			u64 nr_samples;
+			struct sym_hist_entry sample;
 
 			bpos->samples[i].percent = disasm__calc_percent(notes,
 						evsel->idx + i,
 						pos->offset,
 						next ? next->offset : len,
-						&path, &nr_samples);
-			bpos->samples[i].nr = nr_samples;
+						&path, &sample);
+			bpos->samples[i].he = sample;
 
 			if (max_percent < bpos->samples[i].percent)
 				max_percent = bpos->samples[i].percent;
@@ -798,7 +835,7 @@ static int annotate_browser__run(struct annotate_browser *browser,
 		"n             Search next string\n"
 		"o             Toggle disassembler output/simplified view\n"
 		"s             Toggle source code view\n"
-		"t             Toggle total period view\n"
+		"t             Circulate percent, total period, samples view\n"
 		"/             Search string\n"
 		"k             Toggle line numbers\n"
 		"r             Run available scripts\n"
@@ -875,8 +912,13 @@ show_sup_ins:
 			}
 			continue;
 		case 't':
-			annotate_browser__opts.show_total_period =
-			  !annotate_browser__opts.show_total_period;
+			if (annotate_browser__opts.show_total_period) {
+				annotate_browser__opts.show_total_period = false;
+				annotate_browser__opts.show_nr_samples = true;
+			} else if (annotate_browser__opts.show_nr_samples)
+				annotate_browser__opts.show_nr_samples = false;
+			else
+				annotate_browser__opts.show_total_period = true;
 			annotate_browser__update_addr_width(browser);
 			continue;
 		case K_LEFT:
@@ -899,9 +941,11 @@ out:
 int map_symbol__tui_annotate(struct map_symbol *ms, struct perf_evsel *evsel,
 			     struct hist_browser_timer *hbt)
 {
-	/* Set default value for show_total_period.  */
+	/* Set default value for show_total_period and show_nr_samples  */
 	annotate_browser__opts.show_total_period =
-	  symbol_conf.show_total_period;
+		symbol_conf.show_total_period;
+	annotate_browser__opts.show_nr_samples =
+		symbol_conf.show_nr_samples;
 
 	return symbol__tui_annotate(ms->sym, ms->map, evsel, hbt);
 }
@@ -1074,7 +1118,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map,
 	}
 
 	err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel),
-				  sizeof_bdl, &browser.arch);
+				  sizeof_bdl, &browser.arch,
+				  perf_evsel__env_cpuid(evsel));
 	if (err) {
 		char msg[BUFSIZ];
 		symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg));
@@ -1151,6 +1196,7 @@ static struct annotate_config {
 	ANNOTATE_CFG(jump_arrows),
 	ANNOTATE_CFG(show_linenr),
 	ANNOTATE_CFG(show_nr_jumps),
+	ANNOTATE_CFG(show_nr_samples),
 	ANNOTATE_CFG(show_total_period),
 	ANNOTATE_CFG(use_offset),
 };
@@ -1170,7 +1216,7 @@ static int annotate__config(const char *var, const char *value,
 	struct annotate_config *cfg;
 	const char *name;
 
-	if (prefixcmp(var, "annotate.") != 0)
+	if (!strstarts(var, "annotate."))
 		return 0;
 
 	name = var + 9;
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 69f4570bd4f9..f4bc2462bc2c 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -166,9 +166,6 @@ static struct inline_node *inline_node__create(struct map *map, u64 ip)
 	if (dso == NULL)
 		return NULL;
 
-	if (dso->kernel != DSO_TYPE_USER)
-		return NULL;
-
 	node = dso__parse_addr_inlines(dso,
 				       map__rip_2objdump(map, ip));
 
diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c
index d903fd493416..02176193f427 100644
--- a/tools/perf/ui/gtk/annotate.c
+++ b/tools/perf/ui/gtk/annotate.c
@@ -34,10 +34,10 @@ static int perf_gtk__get_percent(char *buf, size_t size, struct symbol *sym,
 		return 0;
 
 	symhist = annotation__histogram(symbol__annotation(sym), evidx);
-	if (!symbol_conf.event_group && !symhist->addr[dl->offset])
+	if (!symbol_conf.event_group && !symhist->addr[dl->offset].nr_samples)
 		return 0;
 
-	percent = 100.0 * symhist->addr[dl->offset] / symhist->sum;
+	percent = 100.0 * symhist->addr[dl->offset].nr_samples / symhist->nr_samples;
 
 	markup = perf_gtk__get_percent_color(percent);
 	if (markup)
@@ -169,7 +169,7 @@ static int symbol__gtk_annotate(struct symbol *sym, struct map *map,
 		return -1;
 
 	err = symbol__disassemble(sym, map, perf_evsel__env_arch(evsel),
-				  0, NULL);
+				  0, NULL, NULL);
 	if (err) {
 		char msg[BUFSIZ];
 		symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg));
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c
index 42e432bd2eb4..5c95b8301c67 100644
--- a/tools/perf/ui/stdio/hist.c
+++ b/tools/perf/ui/stdio/hist.c
@@ -1,4 +1,5 @@
 #include <stdio.h>
+#include <linux/string.h>
 
 #include "../../util/util.h"
 #include "../../util/hist.h"
@@ -35,9 +36,6 @@ static size_t inline__fprintf(struct map *map, u64 ip, int left_margin,
 	if (dso == NULL)
 		return 0;
 
-	if (dso->kernel != DSO_TYPE_USER)
-		return 0;
-
 	node = dso__parse_addr_inlines(dso,
 				       map__rip_2objdump(map, ip));
 	if (node == NULL)
@@ -295,7 +293,7 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
 			 * displayed twice.
 			 */
 			if (!i++ && field_order == NULL &&
-			    sort_order && !prefixcmp(sort_order, "sym"))
+			    sort_order && strstarts(sort_order, "sym"))
 				continue;
 
 			if (!printed) {
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 79dea95a7f68..94518c1bf8b6 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -22,6 +22,7 @@ libperf-y += rbtree.o
 libperf-y += libstring.o
 libperf-y += bitmap.o
 libperf-y += hweight.o
+libperf-y += smt.o
 libperf-y += quote.o
 libperf-y += strbuf.o
 libperf-y += string.o
@@ -93,6 +94,7 @@ libperf-y += drv_configs.o
 libperf-y += units.o
 libperf-y += time-utils.o
 libperf-y += expr-bison.o
+libperf-y += branch.o
 
 libperf-$(CONFIG_LIBBPF) += bpf-loader.o
 libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
@@ -104,6 +106,10 @@ ifndef CONFIG_LIBELF
 libperf-y += symbol-minimal.o
 endif
 
+ifndef CONFIG_SETNS
+libperf-y += setns.o
+endif
+
 libperf-$(CONFIG_DWARF) += probe-finder.o
 libperf-$(CONFIG_DWARF) += dwarf-aux.o
 libperf-$(CONFIG_DWARF) += dwarf-regs.o
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index be1caabb9290..4397a8b6e6cd 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -47,7 +47,12 @@ struct arch {
 	bool		sorted_instructions;
 	bool		initialized;
 	void		*priv;
+	unsigned int	model;
+	unsigned int	family;
 	int		(*init)(struct arch *arch);
+	bool		(*ins_is_fused)(struct arch *arch, const char *ins1,
+					const char *ins2);
+	int		(*cpuid_parse)(struct arch *arch, char *cpuid);
 	struct		{
 		char comment_char;
 		char skip_functions_char;
@@ -129,6 +134,8 @@ static struct arch architectures[] = {
 		.name = "x86",
 		.instructions = x86__instructions,
 		.nr_instructions = ARRAY_SIZE(x86__instructions),
+		.ins_is_fused = x86__ins_is_fused,
+		.cpuid_parse = x86__cpuid_parse,
 		.objdump =  {
 			.comment_char = '#',
 		},
@@ -171,6 +178,14 @@ int ins__scnprintf(struct ins *ins, char *bf, size_t size,
 	return ins__raw_scnprintf(ins, bf, size, ops);
 }
 
+/* Delegate to the arch's ins_is_fused() hook; false when arch or the hook is missing. */
+bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2)
+{
+	bool has_hook = arch != NULL && arch->ins_is_fused != NULL;
+
+	return has_hook ? arch->ins_is_fused(arch, ins1, ins2) : false;
+}
+
 static int call__parse(struct arch *arch, struct ins_operands *ops, struct map *map)
 {
 	char *endptr, *tok, *name;
@@ -502,6 +517,11 @@ bool ins__is_ret(const struct ins *ins)
 	return ins->ops == &ret_ops;
 }
 
+bool ins__is_lock(const struct ins *ins)
+{
+	return ins->ops == &lock_ops; /* true iff the instruction's ops table is lock_ops */
+}
+
 static int ins__key_cmp(const void *name, const void *insp)
 {
 	const struct ins *ins = insp;
@@ -590,10 +610,10 @@ int symbol__alloc_hist(struct symbol *sym)
 	size_t sizeof_sym_hist;
 
 	/* Check for overflow when calculating sizeof_sym_hist */
-	if (size > (SIZE_MAX - sizeof(struct sym_hist)) / sizeof(u64))
+	if (size > (SIZE_MAX - sizeof(struct sym_hist)) / sizeof(struct sym_hist_entry))
 		return -1;
 
-	sizeof_sym_hist = (sizeof(struct sym_hist) + size * sizeof(u64));
+	sizeof_sym_hist = (sizeof(struct sym_hist) + size * sizeof(struct sym_hist_entry));
 
 	/* Check for overflow in zalloc argument */
 	if (sizeof_sym_hist > (SIZE_MAX - sizeof(*notes->src))
@@ -677,7 +697,8 @@ static int __symbol__account_cycles(struct annotation *notes,
 }
 
 static int __symbol__inc_addr_samples(struct symbol *sym, struct map *map,
-				      struct annotation *notes, int evidx, u64 addr)
+				      struct annotation *notes, int evidx, u64 addr,
+				      struct perf_sample *sample)
 {
 	unsigned offset;
 	struct sym_hist *h;
@@ -693,12 +714,15 @@ static int __symbol__inc_addr_samples(struct symbol *sym, struct map *map,
 
 	offset = addr - sym->start;
 	h = annotation__histogram(notes, evidx);
-	h->sum++;
-	h->addr[offset]++;
+	h->nr_samples++;
+	h->addr[offset].nr_samples++;
+	h->period += sample->period;
+	h->addr[offset].period += sample->period;
 
 	pr_debug3("%#" PRIx64 " %s: period++ [addr: %#" PRIx64 ", %#" PRIx64
-		  ", evidx=%d] => %" PRIu64 "\n", sym->start, sym->name,
-		  addr, addr - sym->start, evidx, h->addr[offset]);
+		  ", evidx=%d] => nr_samples: %" PRIu64 ", period: %" PRIu64 "\n",
+		  sym->start, sym->name, addr, addr - sym->start, evidx,
+		  h->addr[offset].nr_samples, h->addr[offset].period);
 	return 0;
 }
 
@@ -718,7 +742,8 @@ static struct annotation *symbol__get_annotation(struct symbol *sym, bool cycles
 }
 
 static int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
-				    int evidx, u64 addr)
+				    int evidx, u64 addr,
+				    struct perf_sample *sample)
 {
 	struct annotation *notes;
 
@@ -727,7 +752,7 @@ static int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
 	notes = symbol__get_annotation(sym, false);
 	if (notes == NULL)
 		return -ENOMEM;
-	return __symbol__inc_addr_samples(sym, map, notes, evidx, addr);
+	return __symbol__inc_addr_samples(sym, map, notes, evidx, addr, sample);
 }
 
 static int symbol__account_cycles(u64 addr, u64 start,
@@ -791,14 +816,16 @@ int addr_map_symbol__account_cycles(struct addr_map_symbol *ams,
 	return err;
 }
 
-int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, int evidx)
+int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, struct perf_sample *sample,
+				 int evidx)
 {
-	return symbol__inc_addr_samples(ams->sym, ams->map, evidx, ams->al_addr);
+	return symbol__inc_addr_samples(ams->sym, ams->map, evidx, ams->al_addr, sample);
 }
 
-int hist_entry__inc_addr_samples(struct hist_entry *he, int evidx, u64 ip)
+int hist_entry__inc_addr_samples(struct hist_entry *he, struct perf_sample *sample,
+				 int evidx, u64 ip)
 {
-	return symbol__inc_addr_samples(he->ms.sym, he->ms.map, evidx, ip);
+	return symbol__inc_addr_samples(he->ms.sym, he->ms.map, evidx, ip, sample);
 }
 
 static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map *map)
@@ -908,11 +935,12 @@ struct disasm_line *disasm__get_next_ip_line(struct list_head *head, struct disa
 }
 
 double disasm__calc_percent(struct annotation *notes, int evidx, s64 offset,
-			    s64 end, const char **path, u64 *nr_samples)
+			    s64 end, const char **path, struct sym_hist_entry *sample)
 {
 	struct source_line *src_line = notes->src->lines;
 	double percent = 0.0;
-	*nr_samples = 0;
+
+	sample->nr_samples = sample->period = 0;
 
 	if (src_line) {
 		size_t sizeof_src_line = sizeof(*src_line) +
@@ -926,19 +954,24 @@ double disasm__calc_percent(struct annotation *notes, int evidx, s64 offset,
 				*path = src_line->path;
 
 			percent += src_line->samples[evidx].percent;
-			*nr_samples += src_line->samples[evidx].nr;
+			sample->nr_samples += src_line->samples[evidx].nr;
 			offset++;
 		}
 	} else {
 		struct sym_hist *h = annotation__histogram(notes, evidx);
 		unsigned int hits = 0;
+		u64 period = 0;
 
-		while (offset < end)
-			hits += h->addr[offset++];
+		while (offset < end) {
+			hits   += h->addr[offset].nr_samples;
+			period += h->addr[offset].period;
+			++offset;
+		}
 
-		if (h->sum) {
-			*nr_samples = hits;
-			percent = 100.0 * hits / h->sum;
+		if (h->nr_samples) {
+			sample->period	   = period;
+			sample->nr_samples = hits;
+			percent = 100.0 * hits / h->nr_samples;
 		}
 	}
 
@@ -1037,10 +1070,10 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
 
 	if (dl->offset != -1) {
 		const char *path = NULL;
-		u64 nr_samples;
 		double percent, max_percent = 0.0;
 		double *ppercents = &percent;
-		u64 *psamples = &nr_samples;
+		struct sym_hist_entry sample;
+		struct sym_hist_entry *psamples = &sample;
 		int i, nr_percent = 1;
 		const char *color;
 		struct annotation *notes = symbol__annotation(sym);
@@ -1054,7 +1087,7 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
 		if (perf_evsel__is_group_event(evsel)) {
 			nr_percent = evsel->nr_members;
 			ppercents = calloc(nr_percent, sizeof(double));
-			psamples = calloc(nr_percent, sizeof(u64));
+			psamples = calloc(nr_percent, sizeof(struct sym_hist_entry));
 			if (ppercents == NULL || psamples == NULL) {
 				return -1;
 			}
@@ -1065,10 +1098,10 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
 					notes->src->lines ? i : evsel->idx + i,
 					offset,
 					next ? next->offset : (s64) len,
-					&path, &nr_samples);
+					&path, &sample);
 
 			ppercents[i] = percent;
-			psamples[i] = nr_samples;
+			psamples[i] = sample;
 			if (percent > max_percent)
 				max_percent = percent;
 		}
@@ -1106,12 +1139,15 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
 
 		for (i = 0; i < nr_percent; i++) {
 			percent = ppercents[i];
-			nr_samples = psamples[i];
+			sample = psamples[i];
 			color = get_percent_color(percent);
 
 			if (symbol_conf.show_total_period)
+				color_fprintf(stdout, color, " %11" PRIu64,
+					      sample.period);
+			else if (symbol_conf.show_nr_samples)
 				color_fprintf(stdout, color, " %7" PRIu64,
-					      nr_samples);
+					      sample.nr_samples);
 			else
 				color_fprintf(stdout, color, " %7.2f", percent);
 		}
@@ -1127,13 +1163,13 @@ static int disasm_line__print(struct disasm_line *dl, struct symbol *sym, u64 st
 		if (ppercents != &percent)
 			free(ppercents);
 
-		if (psamples != &nr_samples)
+		if (psamples != &sample)
 			free(psamples);
 
 	} else if (max_lines && printed >= max_lines)
 		return 1;
 	else {
-		int width = 8;
+		int width = symbol_conf.show_total_period ? 12 : 8;
 
 		if (queue)
 			return -1;
@@ -1327,7 +1363,7 @@ static int dso__disassemble_filename(struct dso *dso, char *filename, size_t fil
 	    !dso__is_kcore(dso))
 		return SYMBOL_ANNOTATE_ERRNO__NO_VMLINUX;
 
-	build_id_filename = dso__build_id_filename(dso, NULL, 0);
+	build_id_filename = dso__build_id_filename(dso, NULL, 0, false);
 	if (build_id_filename) {
 		__symbol__join_symfs(filename, filename_size, build_id_filename);
 		free(build_id_filename);
@@ -1381,7 +1417,7 @@ static const char *annotate__norm_arch(const char *arch_name)
 
 int symbol__disassemble(struct symbol *sym, struct map *map,
 			const char *arch_name, size_t privsize,
-			struct arch **parch)
+			struct arch **parch, char *cpuid)
 {
 	struct dso *dso = map->dso;
 	char command[PATH_MAX * 2];
@@ -1418,6 +1454,9 @@ int symbol__disassemble(struct symbol *sym, struct map *map,
 		}
 	}
 
+	if (arch->cpuid_parse && cpuid)
+		arch->cpuid_parse(arch, cpuid);
+
 	pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__,
 		 symfs_filename, sym->name, map->unmap_ip(map, sym->start),
 		 map->unmap_ip(map, sym->end));
@@ -1648,19 +1687,19 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map,
 	struct sym_hist *h = annotation__histogram(notes, evidx);
 	struct rb_root tmp_root = RB_ROOT;
 	int nr_pcnt = 1;
-	u64 h_sum = h->sum;
+	u64 nr_samples = h->nr_samples;
 	size_t sizeof_src_line = sizeof(struct source_line);
 
 	if (perf_evsel__is_group_event(evsel)) {
 		for (i = 1; i < evsel->nr_members; i++) {
 			h = annotation__histogram(notes, evidx + i);
-			h_sum += h->sum;
+			nr_samples += h->nr_samples;
 		}
 		nr_pcnt = evsel->nr_members;
 		sizeof_src_line += (nr_pcnt - 1) * sizeof(src_line->samples);
 	}
 
-	if (!h_sum)
+	if (!nr_samples)
 		return 0;
 
 	src_line = notes->src->lines = calloc(len, sizeof_src_line);
@@ -1670,7 +1709,7 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map,
 	start = map__rip_2objdump(map, sym->start);
 
 	for (i = 0; i < len; i++) {
-		u64 offset, nr_samples;
+		u64 offset;
 		double percent_max = 0.0;
 
 		src_line->nr_pcnt = nr_pcnt;
@@ -1679,9 +1718,9 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map,
 			double percent = 0.0;
 
 			h = annotation__histogram(notes, evidx + k);
-			nr_samples = h->addr[i];
-			if (h->sum)
-				percent = 100.0 * nr_samples / h->sum;
+			nr_samples = h->addr[i].nr_samples;
+			if (h->nr_samples)
+				percent = 100.0 * nr_samples / h->nr_samples;
 
 			if (percent > percent_max)
 				percent_max = percent;
@@ -1750,10 +1789,10 @@ static void symbol__annotate_hits(struct symbol *sym, struct perf_evsel *evsel)
 	u64 len = symbol__size(sym), offset;
 
 	for (offset = 0; offset < len; ++offset)
-		if (h->addr[offset] != 0)
+		if (h->addr[offset].nr_samples != 0)
 			printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2,
-			       sym->start + offset, h->addr[offset]);
-	printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum);
+			       sym->start + offset, h->addr[offset].nr_samples);
+	printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->nr_samples", h->nr_samples);
 }
 
 int symbol__annotate_printf(struct symbol *sym, struct map *map,
@@ -1771,7 +1810,7 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map,
 	int printed = 2, queue_len = 0;
 	int more = 0;
 	u64 len;
-	int width = 8;
+	int width = symbol_conf.show_total_period ? 12 : 8;
 	int graph_dotted_len;
 
 	filename = strdup(dso->long_name);
@@ -1789,7 +1828,9 @@ int symbol__annotate_printf(struct symbol *sym, struct map *map,
 		width *= evsel->nr_members;
 
 	graph_dotted_len = printf(" %-*.*s|	Source code & Disassembly of %s for %s (%" PRIu64 " samples)\n",
-	       width, width, "Percent", d_filename, evsel_name, h->sum);
+				  width, width, symbol_conf.show_total_period ? "Period" :
+				  symbol_conf.show_nr_samples ? "Samples" : "Percent",
+				  d_filename, evsel_name, h->nr_samples);
 
 	printf("%-*.*s----\n",
 	       graph_dotted_len, graph_dotted_len, graph_dotted_line);
@@ -1853,10 +1894,10 @@ void symbol__annotate_decay_histogram(struct symbol *sym, int evidx)
 	struct sym_hist *h = annotation__histogram(notes, evidx);
 	int len = symbol__size(sym), offset;
 
-	h->sum = 0;
+	h->nr_samples = 0;
 	for (offset = 0; offset < len; ++offset) {
-		h->addr[offset] = h->addr[offset] * 7 / 8;
-		h->sum += h->addr[offset];
+		h->addr[offset].nr_samples = h->addr[offset].nr_samples * 7 / 8;
+		h->nr_samples += h->addr[offset].nr_samples;
 	}
 }
 
@@ -1907,7 +1948,7 @@ int symbol__tty_annotate(struct symbol *sym, struct map *map,
 	u64 len;
 
 	if (symbol__disassemble(sym, map, perf_evsel__env_arch(evsel),
-				0, NULL) < 0)
+				0, NULL, NULL) < 0)
 		return -1;
 
 	len = symbol__size(sym);
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 21055034aedd..9ce575c25fd9 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -52,7 +52,9 @@ struct ins_ops {
 bool ins__is_jump(const struct ins *ins);
 bool ins__is_call(const struct ins *ins);
 bool ins__is_ret(const struct ins *ins);
+bool ins__is_lock(const struct ins *ins);
 int ins__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops);
+bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2);
 
 struct annotation;
 
@@ -72,16 +74,22 @@ static inline bool disasm_line__has_offset(const struct disasm_line *dl)
 	return dl->ops.target.offset_avail;
 }
 
+struct sym_hist_entry {
+	u64		nr_samples;	/* samples recorded for this entry */
+	u64		period;		/* NOTE(review): presumably the summed sample period - confirm */
+};
+
 void disasm_line__free(struct disasm_line *dl);
 struct disasm_line *disasm__get_next_ip_line(struct list_head *head, struct disasm_line *pos);
 int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, bool raw);
 size_t disasm__fprintf(struct list_head *head, FILE *fp);
 double disasm__calc_percent(struct annotation *notes, int evidx, s64 offset,
-			    s64 end, const char **path, u64 *nr_samples);
+			    s64 end, const char **path, struct sym_hist_entry *sample);
 
 struct sym_hist {
-	u64		sum;
-	u64		addr[0];
+	u64		      nr_samples;
+	u64		      period;
+	struct sym_hist_entry addr[0];
 };
 
 struct cyc_hist {
@@ -147,20 +155,22 @@ static inline struct annotation *symbol__annotation(struct symbol *sym)
 	return (void *)sym - symbol_conf.priv_size;
 }
 
-int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, int evidx);
+int addr_map_symbol__inc_samples(struct addr_map_symbol *ams, struct perf_sample *sample,
+				 int evidx);
 
 int addr_map_symbol__account_cycles(struct addr_map_symbol *ams,
 				    struct addr_map_symbol *start,
 				    unsigned cycles);
 
-int hist_entry__inc_addr_samples(struct hist_entry *he, int evidx, u64 addr);
+int hist_entry__inc_addr_samples(struct hist_entry *he, struct perf_sample *sample,
+				 int evidx, u64 addr);
 
 int symbol__alloc_hist(struct symbol *sym);
 void symbol__annotate_zero_histograms(struct symbol *sym);
 
 int symbol__disassemble(struct symbol *sym, struct map *map,
 			const char *arch_name, size_t privsize,
-			struct arch **parch);
+			struct arch **parch, char *cpuid);
 
 enum symbol_disassemble_errno {
 	SYMBOL_ANNOTATE_ERRNO__SUCCESS		= 0,
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index 4bd2d1d882af..4a1264c66101 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -1246,7 +1246,7 @@ int bpf__config_obj(struct bpf_object *obj,
 	if (!obj || !term || !term->config)
 		return -EINVAL;
 
-	if (!prefixcmp(term->config, "map:")) {
+	if (strstarts(term->config, "map:")) {
 		key_scan_pos = sizeof("map:") - 1;
 		err = bpf__obj_config_map(obj, term, evlist, &key_scan_pos);
 		goto out;
diff --git a/tools/perf/util/bpf-prologue.c b/tools/perf/util/bpf-prologue.c
index 1356220a9f1b..827f9140f3b8 100644
--- a/tools/perf/util/bpf-prologue.c
+++ b/tools/perf/util/bpf-prologue.c
@@ -58,6 +58,46 @@ check_pos(struct bpf_insn_pos *pos)
 	return 0;
 }
 
+/*
+ * Convert type string (u8/u16/u32/u64/s8/s16/s32/s64 ..., see
+ * Documentation/trace/kprobetrace.txt) to size field of BPF_LDX_MEM
+ * instruction (BPF_{B,H,W,DW}); NULL, "" or unknown widths give BPF_DW.
+ */
+static int
+argtype_to_ldx_size(const char *type)
+{
+	int arg_size = (type && type[0]) ? atoi(&type[1]) : 64;
+
+	switch (arg_size) {
+	case 8:
+		return BPF_B;
+	case 16:
+		return BPF_H;
+	case 32:
+		return BPF_W;
+	case 64:
+	default:
+		return BPF_DW;
+	}
+}
+
+static const char *
+insn_sz_to_str(int insn_sz)
+{
+	/* Map a BPF_LDX_MEM size field to its macro name for debug output. */
+	if (insn_sz == BPF_B)
+		return "BPF_B";
+	if (insn_sz == BPF_H)
+		return "BPF_H";
+	if (insn_sz == BPF_W)
+		return "BPF_W";
+	if (insn_sz == BPF_DW)
+		return "BPF_DW";
+
+	/* Not one of the four load sizes. */
+	return "UNKNOWN";
+}
+
 /* Give it a shorter name */
 #define ins(i, p) append_insn((i), (p))
 
@@ -258,9 +298,14 @@ gen_prologue_slowpath(struct bpf_insn_pos *pos,
 	}
 
 	/* Final pass: read to registers */
-	for (i = 0; i < nargs; i++)
-		ins(BPF_LDX_MEM(BPF_DW, BPF_PROLOGUE_START_ARG_REG + i,
+	for (i = 0; i < nargs; i++) {
+		int insn_sz = (args[i].ref) ? argtype_to_ldx_size(args[i].type) : BPF_DW;
+
+		pr_debug("prologue: load arg %d, insn_sz is %s\n",
+			 i, insn_sz_to_str(insn_sz));
+		ins(BPF_LDX_MEM(insn_sz, BPF_PROLOGUE_START_ARG_REG + i,
 				BPF_REG_FP, -BPF_REG_SIZE * (i + 1)), pos);
+	}
 
 	ins(BPF_JMP_IMM(BPF_JA, BPF_REG_0, 0, JMP_TO_SUCCESS_CODE), pos);
 
diff --git a/tools/perf/util/branch.c b/tools/perf/util/branch.c
new file mode 100644
index 000000000000..a4fce2729e50
--- /dev/null
+++ b/tools/perf/util/branch.c
@@ -0,0 +1,147 @@
+#include "perf.h"
+#include "util/util.h"
+#include "util/debug.h"
+#include "util/branch.h"
+
+static bool cross_area(u64 addr1, u64 addr2, int size)
+{
+	/*
+	 * Do addr1/addr2 fall in different size-aligned areas (size = 2^n)?
+	 */
+	u64 area_mask = ~(size - 1);
+
+	return ((addr1 ^ addr2) & area_mask) != 0;
+}
+
+#define AREA_4K		4096
+#define AREA_2M		(2 * 1024 * 1024)
+
+void branch_type_count(struct branch_type_stat *st, struct branch_flags *flags,
+		       u64 from, u64 to)
+{
+	/* Ignore untyped branches and types that would overrun counts[]. */
+	if (flags->type == PERF_BR_UNKNOWN || flags->type >= PERF_BR_MAX ||
+	    from == 0)
+		return;
+
+	st->counts[flags->type]++;
+
+	if (flags->type == PERF_BR_COND && to > from)
+		st->cond_fwd++;
+	else if (flags->type == PERF_BR_COND)
+		st->cond_bwd++;
+
+	if (cross_area(from, to, AREA_2M))
+		st->cross_2m++;
+	else if (cross_area(from, to, AREA_4K))
+		st->cross_4k++;
+}
+
+const char *branch_type_name(int type)
+{
+	static const char * const branch_names[PERF_BR_MAX] = {
+		"N/A",
+		"COND",
+		"UNCOND",
+		"IND",
+		"CALL",
+		"IND_CALL",
+		"RET",
+		"SYSCALL",
+		"SYSRET",
+		"COND_CALL",
+		"COND_RET"
+	};
+
+	/* Indexed by branch type; NULL for anything outside the table. */
+	if (type < 0 || type >= PERF_BR_MAX)
+		return NULL;
+	return branch_names[type];
+}
+
+/*
+ * Print the "Branch Statistics" section to fp: one "<name>: <pct>%" line
+ * for every non-zero counter in st, as a share of the sum of the per-type
+ * counts.  Nothing at all is printed when that sum is zero.
+ */
+void branch_type_stat_display(FILE *fp, struct branch_type_stat *st)
+{
+	/* Derived counters, in the order they are reported. */
+	const struct {
+		const char	*name;
+		u64		nr;
+	} derived[] = {
+		{ "COND_FWD", st->cond_fwd },
+		{ "COND_BWD", st->cond_bwd },
+		{ "CROSS_4K", st->cross_4k },
+		{ "CROSS_2M", st->cross_2m },
+	};
+	const int nr_derived = sizeof(derived) / sizeof(derived[0]);
+	u64 total = 0;
+	int i;
+
+	/* All percentages below are relative to this sum. */
+	for (i = 0; i < PERF_BR_MAX; i++)
+		total += st->counts[i];
+
+	if (!total)
+		return;
+
+	fprintf(fp, "\n#");
+	fprintf(fp, "\n# Branch Statistics:");
+	fprintf(fp, "\n#");
+
+	for (i = 0; i < nr_derived; i++) {
+		if (!derived[i].nr)
+			continue;
+		fprintf(fp, "\n%8s: %5.1f%%", derived[i].name,
+			100.0 * (double)derived[i].nr / (double)total);
+	}
+
+	/* Then the per-type counters, in index order. */
+	for (i = 0; i < PERF_BR_MAX; i++) {
+		if (!st->counts[i])
+			continue;
+		fprintf(fp, "\n%8s: %5.1f%%", branch_type_name(i),
+			100.0 * (double)st->counts[i] / (double)total);
+	}
+}
+
+static int count_str_scnprintf(int idx, const char *str, char *bf, int size)
+{
+	return scnprintf(bf, size, "%s%s", idx ? " " : " (", str);
+}
+
+/*
+ * Write the names of all branch kinds seen in st into bf as " (A B C" (the
+ * caller closes the parenthesis).  Returns the length written, 0 if none.
+ */
+int branch_type_str(struct branch_type_stat *st, char *bf, int size)
+{
+	int type, nr = 0, len = 0;
+	u64 total = 0;
+
+	for (type = 0; type < PERF_BR_MAX; type++)
+		total += st->counts[type];
+	if (!total)
+		return 0;
+
+	if (st->cond_fwd)
+		len += count_str_scnprintf(nr++, "COND_FWD", bf + len, size - len);
+	if (st->cond_bwd)
+		len += count_str_scnprintf(nr++, "COND_BWD", bf + len, size - len);
+
+	/* PERF_BR_COND is already reported as COND_FWD/COND_BWD above. */
+	for (type = 0; type < PERF_BR_MAX; type++) {
+		if (type == PERF_BR_COND || !st->counts[type])
+			continue;
+		len += count_str_scnprintf(nr++, branch_type_name(type), bf + len, size - len);
+	}
+
+	if (st->cross_4k)
+		len += count_str_scnprintf(nr++, "CROSS_4K", bf + len, size - len);
+	if (st->cross_2m)
+		len += count_str_scnprintf(nr++, "CROSS_2M", bf + len, size - len);
+
+	return len;
+}
diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h
new file mode 100644
index 000000000000..1e3c7c5cdc63
--- /dev/null
+++ b/tools/perf/util/branch.h
@@ -0,0 +1,25 @@
+#ifndef _PERF_BRANCH_H
+#define _PERF_BRANCH_H 1
+
+#include <stdint.h>
+#include "../perf.h"
+
+struct branch_type_stat {
+	bool	branch_to;		/* set when the entry is the "to" side of a branch */
+	u64	counts[PERF_BR_MAX];	/* per-type totals, indexed by branch type */
+	u64	cond_fwd;		/* PERF_BR_COND branches with to > from */
+	u64	cond_bwd;		/* PERF_BR_COND branches with to <= from */
+	u64	cross_4k;		/* from/to in different 4K areas (same 2M area) */
+	u64	cross_2m;		/* from/to in different 2M areas */
+};
+
+struct branch_flags;
+
+void branch_type_count(struct branch_type_stat *st, struct branch_flags *flags,
+		       u64 from, u64 to);
+
+const char *branch_type_name(int type);
+void branch_type_stat_display(FILE *fp, struct branch_type_stat *st);
+int branch_type_str(struct branch_type_stat *st, char *bf, int bfsize);
+
+#endif /* _PERF_BRANCH_H */
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index e0148b081bdf..c1a06fcd7e70 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -243,12 +243,15 @@ static bool build_id_cache__valid_id(char *sbuild_id)
 	return result;
 }
 
-static const char *build_id_cache__basename(bool is_kallsyms, bool is_vdso)
+static const char *build_id_cache__basename(bool is_kallsyms, bool is_vdso,
+					    bool is_debug)
 {
-	return is_kallsyms ? "kallsyms" : (is_vdso ? "vdso" : "elf");
+	return is_kallsyms ? "kallsyms" : (is_vdso ? "vdso" : (is_debug ?
+	    "debug" : "elf"));
 }
 
-char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
+char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size,
+			     bool is_debug)
 {
 	bool is_kallsyms = dso__is_kallsyms((struct dso *)dso);
 	bool is_vdso = dso__is_vdso((struct dso *)dso);
@@ -270,7 +273,8 @@ char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
 		ret = asnprintf(&bf, size, "%s", linkname);
 	else
 		ret = asnprintf(&bf, size, "%s/%s", linkname,
-			 build_id_cache__basename(is_kallsyms, is_vdso));
+			 build_id_cache__basename(is_kallsyms, is_vdso,
+						  is_debug));
 	if (ret < 0 || (!alloc && size < (unsigned int)ret))
 		bf = NULL;
 	free(linkname);
@@ -285,7 +289,7 @@ char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
 		else
 
 static int write_buildid(const char *name, size_t name_len, u8 *build_id,
-			 pid_t pid, u16 misc, int fd)
+			 pid_t pid, u16 misc, struct feat_fd *fd)
 {
 	int err;
 	struct build_id_event b;
@@ -300,14 +304,15 @@ static int write_buildid(const char *name, size_t name_len, u8 *build_id,
 	b.header.misc = misc;
 	b.header.size = sizeof(b) + len;
 
-	err = writen(fd, &b, sizeof(b));
+	err = do_write(fd, &b, sizeof(b));
 	if (err < 0)
 		return err;
 
 	return write_padded(fd, name, name_len + 1, len);
 }
 
-static int machine__write_buildid_table(struct machine *machine, int fd)
+static int machine__write_buildid_table(struct machine *machine,
+					struct feat_fd *fd)
 {
 	int err = 0;
 	char nm[PATH_MAX];
@@ -352,7 +357,8 @@ static int machine__write_buildid_table(struct machine *machine, int fd)
 	return err;
 }
 
-int perf_session__write_buildid_table(struct perf_session *session, int fd)
+int perf_session__write_buildid_table(struct perf_session *session,
+				      struct feat_fd *fd)
 {
 	struct rb_node *nd;
 	int err = machine__write_buildid_table(&session->machines.host, fd);
@@ -534,13 +540,14 @@ char *build_id_cache__complement(const char *incomplete_sbuild_id)
 }
 
 char *build_id_cache__cachedir(const char *sbuild_id, const char *name,
-			       bool is_kallsyms, bool is_vdso)
+			       struct nsinfo *nsi, bool is_kallsyms,
+			       bool is_vdso)
 {
 	char *realname = (char *)name, *filename;
 	bool slash = is_kallsyms || is_vdso;
 
 	if (!slash) {
-		realname = realpath(name, NULL);
+		realname = nsinfo__realpath(name, nsi);
 		if (!realname)
 			return NULL;
 	}
@@ -556,13 +563,13 @@ char *build_id_cache__cachedir(const char *sbuild_id, const char *name,
 	return filename;
 }
 
-int build_id_cache__list_build_ids(const char *pathname,
+int build_id_cache__list_build_ids(const char *pathname, struct nsinfo *nsi,
 				   struct strlist **result)
 {
 	char *dir_name;
 	int ret = 0;
 
-	dir_name = build_id_cache__cachedir(NULL, pathname, false, false);
+	dir_name = build_id_cache__cachedir(NULL, pathname, nsi, false, false);
 	if (!dir_name)
 		return -ENOMEM;
 
@@ -576,16 +583,20 @@ int build_id_cache__list_build_ids(const char *pathname,
 
 #if defined(HAVE_LIBELF_SUPPORT) && defined(HAVE_GELF_GETNOTE_SUPPORT)
 static int build_id_cache__add_sdt_cache(const char *sbuild_id,
-					  const char *realname)
+					  const char *realname,
+					  struct nsinfo *nsi)
 {
 	struct probe_cache *cache;
 	int ret;
+	struct nscookie nsc;
 
-	cache = probe_cache__new(sbuild_id);
+	cache = probe_cache__new(sbuild_id, nsi);
 	if (!cache)
 		return -1;
 
+	nsinfo__mountns_enter(nsi, &nsc);
 	ret = probe_cache__scan_sdt(cache, realname);
+	nsinfo__mountns_exit(&nsc);
 	if (ret >= 0) {
 		pr_debug4("Found %d SDTs in %s\n", ret, realname);
 		if (probe_cache__commit(cache) < 0)
@@ -595,25 +606,56 @@ static int build_id_cache__add_sdt_cache(const char *sbuild_id,
 	return ret;
 }
 #else
-#define build_id_cache__add_sdt_cache(sbuild_id, realname) (0)
+#define build_id_cache__add_sdt_cache(sbuild_id, realname, nsi) (0)
 #endif
 
+/*
+ * Find sbuild_id's .debug file; returns its realpath() (caller frees) or NULL.
+ */
+static char *build_id_cache__find_debug(const char *sbuild_id,
+					struct nsinfo *nsi)
+{
+	char *realname, *debugfile = calloc(1, PATH_MAX);
+	struct nscookie nsc;
+	size_t len;
+
+	if (debugfile == NULL)
+		return NULL;
+
+	len = __symbol__join_symfs(debugfile, PATH_MAX,
+				   "/usr/lib/debug/.build-id/");
+	snprintf(debugfile + len, PATH_MAX - len, "%.2s/%s.debug", sbuild_id,
+		 sbuild_id + 2);
+
+	nsinfo__mountns_enter(nsi, &nsc);
+	realname = realpath(debugfile, NULL);
+	if (realname != NULL && access(realname, R_OK) != 0)
+		zfree(&realname);
+	nsinfo__mountns_exit(&nsc);
+	free(debugfile);
+	return realname;
+}
+
 int build_id_cache__add_s(const char *sbuild_id, const char *name,
-			  bool is_kallsyms, bool is_vdso)
+			  struct nsinfo *nsi, bool is_kallsyms, bool is_vdso)
 {
 	const size_t size = PATH_MAX;
 	char *realname = NULL, *filename = NULL, *dir_name = NULL,
 	     *linkname = zalloc(size), *tmp;
+	char *debugfile = NULL;
 	int err = -1;
 
 	if (!is_kallsyms) {
-		realname = realpath(name, NULL);
+		if (!is_vdso)
+			realname = nsinfo__realpath(name, nsi);
+		else
+			realname = realpath(name, NULL);
 		if (!realname)
 			goto out_free;
 	}
 
-	dir_name = build_id_cache__cachedir(sbuild_id, name,
-					    is_kallsyms, is_vdso);
+	dir_name = build_id_cache__cachedir(sbuild_id, name, nsi, is_kallsyms,
+					    is_vdso);
 	if (!dir_name)
 		goto out_free;
 
@@ -627,20 +669,52 @@ int build_id_cache__add_s(const char *sbuild_id, const char *name,
 
 	/* Save the allocated buildid dirname */
 	if (asprintf(&filename, "%s/%s", dir_name,
-		     build_id_cache__basename(is_kallsyms, is_vdso)) < 0) {
+		     build_id_cache__basename(is_kallsyms, is_vdso,
+		     false)) < 0) {
 		filename = NULL;
 		goto out_free;
 	}
 
 	if (access(filename, F_OK)) {
 		if (is_kallsyms) {
-			 if (copyfile("/proc/kallsyms", filename))
+			if (copyfile("/proc/kallsyms", filename))
+				goto out_free;
+		} else if (nsi && nsi->need_setns) {
+			if (copyfile_ns(name, filename, nsi))
 				goto out_free;
 		} else if (link(realname, filename) && errno != EEXIST &&
 				copyfile(name, filename))
 			goto out_free;
 	}
 
+	/* Some binaries are stripped, but have .debug files with their symbol
+	 * table.  Check to see if we can locate one of those, since the elf
+	 * file itself may not be very useful to users of our tools without a
+	 * symtab.
+	 */
+	if (!is_kallsyms && !is_vdso &&
+	    strncmp(".ko", name + strlen(name) - 3, 3)) {
+		debugfile = build_id_cache__find_debug(sbuild_id, nsi);
+		if (debugfile) {
+			zfree(&filename);
+			if (asprintf(&filename, "%s/%s", dir_name,
+			    build_id_cache__basename(false, false, true)) < 0) {
+				filename = NULL;
+				goto out_free;
+			}
+			if (access(filename, F_OK)) {
+				if (nsi && nsi->need_setns) {
+					if (copyfile_ns(debugfile, filename,
+							nsi))
+						goto out_free;
+				} else if (link(debugfile, filename) &&
+						errno != EEXIST &&
+						copyfile(debugfile, filename))
+					goto out_free;
+			}
+		}
+	}
+
 	if (!build_id_cache__linkname(sbuild_id, linkname, size))
 		goto out_free;
 	tmp = strrchr(linkname, '/');
@@ -657,27 +731,30 @@ int build_id_cache__add_s(const char *sbuild_id, const char *name,
 		err = 0;
 
 	/* Update SDT cache : error is just warned */
-	if (realname && build_id_cache__add_sdt_cache(sbuild_id, realname) < 0)
+	if (realname &&
+	    build_id_cache__add_sdt_cache(sbuild_id, realname, nsi) < 0)
 		pr_debug4("Failed to update/scan SDT cache for %s\n", realname);
 
 out_free:
 	if (!is_kallsyms)
 		free(realname);
 	free(filename);
+	free(debugfile);
 	free(dir_name);
 	free(linkname);
 	return err;
 }
 
 static int build_id_cache__add_b(const u8 *build_id, size_t build_id_size,
-				 const char *name, bool is_kallsyms,
-				 bool is_vdso)
+				 const char *name, struct nsinfo *nsi,
+				 bool is_kallsyms, bool is_vdso)
 {
 	char sbuild_id[SBUILD_ID_SIZE];
 
 	build_id__sprintf(build_id, build_id_size, sbuild_id);
 
-	return build_id_cache__add_s(sbuild_id, name, is_kallsyms, is_vdso);
+	return build_id_cache__add_s(sbuild_id, name, nsi, is_kallsyms,
+				     is_vdso);
 }
 
 bool build_id_cache__cached(const char *sbuild_id)
@@ -743,7 +820,7 @@ static int dso__cache_build_id(struct dso *dso, struct machine *machine)
 		name = nm;
 	}
 	return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id), name,
-				     is_kallsyms, is_vdso);
+				     dso->nsinfo, is_kallsyms, is_vdso);
 }
 
 static int __dsos__cache_build_ids(struct list_head *head,
diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h
index 96690a55c62c..c94b0dcbfd74 100644
--- a/tools/perf/util/build-id.h
+++ b/tools/perf/util/build-id.h
@@ -5,10 +5,12 @@
 #define SBUILD_ID_SIZE	(BUILD_ID_SIZE * 2 + 1)
 
 #include "tool.h"
+#include "namespaces.h"
 #include <linux/types.h>
 
 extern struct perf_tool build_id__mark_dso_hit_ops;
 struct dso;
+struct feat_fd;
 
 int build_id__sprintf(const u8 *build_id, int len, char *bf);
 int sysfs__sprintf_build_id(const char *root_dir, char *sbuild_id);
@@ -16,7 +18,8 @@ int filename__sprintf_build_id(const char *pathname, char *sbuild_id);
 char *build_id_cache__kallsyms_path(const char *sbuild_id, char *bf,
 				    size_t size);
 
-char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size);
+char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size,
+			     bool is_debug);
 
 int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event,
 			   struct perf_sample *sample, struct perf_evsel *evsel,
@@ -25,23 +28,26 @@ int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event,
 int dsos__hit_all(struct perf_session *session);
 
 bool perf_session__read_build_ids(struct perf_session *session, bool with_hits);
-int perf_session__write_buildid_table(struct perf_session *session, int fd);
+int perf_session__write_buildid_table(struct perf_session *session,
+				      struct feat_fd *fd);
 int perf_session__cache_build_ids(struct perf_session *session);
 
 char *build_id_cache__origname(const char *sbuild_id);
 char *build_id_cache__linkname(const char *sbuild_id, char *bf, size_t size);
 char *build_id_cache__cachedir(const char *sbuild_id, const char *name,
-			       bool is_kallsyms, bool is_vdso);
+			       struct nsinfo *nsi, bool is_kallsyms,
+			       bool is_vdso);
 
 struct strlist;
 
 struct strlist *build_id_cache__list_all(bool validonly);
 char *build_id_cache__complement(const char *incomplete_sbuild_id);
-int build_id_cache__list_build_ids(const char *pathname,
+int build_id_cache__list_build_ids(const char *pathname, struct nsinfo *nsi,
 				   struct strlist **result);
 bool build_id_cache__cached(const char *sbuild_id);
 int build_id_cache__add_s(const char *sbuild_id,
-			  const char *name, bool is_kallsyms, bool is_vdso);
+			  const char *name, struct nsinfo *nsi,
+			  bool is_kallsyms, bool is_vdso);
 int build_id_cache__remove_s(const char *sbuild_id);
 
 extern char buildid_dir[];
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index b4204b43ed58..f320b0777e0d 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -23,6 +23,7 @@
 #include "sort.h"
 #include "machine.h"
 #include "callchain.h"
+#include "branch.h"
 
 #define CALLCHAIN_PARAM_DEFAULT			\
 	.mode		= CHAIN_GRAPH_ABS,	\
@@ -303,7 +304,7 @@ int perf_callchain_config(const char *var, const char *value)
 {
 	char *endptr;
 
-	if (prefixcmp(var, "call-graph."))
+	if (!strstarts(var, "call-graph."))
 		return 0;
 	var += sizeof("call-graph.") - 1;
 
@@ -562,15 +563,33 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
 		if (cursor_node->branch) {
 			call->branch_count = 1;
 
-			if (cursor_node->branch_flags.predicted)
-				call->predicted_count = 1;
-
-			if (cursor_node->branch_flags.abort)
-				call->abort_count = 1;
-
-			call->cycles_count = cursor_node->branch_flags.cycles;
-			call->iter_count = cursor_node->nr_loop_iter;
-			call->samples_count = cursor_node->samples;
+			if (cursor_node->branch_from) {
+				/*
+				 * branch_from is set with value somewhere else
+				 * to imply it's "to" of a branch.
+				 */
+				call->brtype_stat.branch_to = true;
+
+				if (cursor_node->branch_flags.predicted)
+					call->predicted_count = 1;
+
+				if (cursor_node->branch_flags.abort)
+					call->abort_count = 1;
+
+				branch_type_count(&call->brtype_stat,
+						  &cursor_node->branch_flags,
+						  cursor_node->branch_from,
+						  cursor_node->ip);
+			} else {
+				/*
+				 * It's "from" of a branch
+				 */
+				call->brtype_stat.branch_to = false;
+				call->cycles_count =
+					cursor_node->branch_flags.cycles;
+				call->iter_count = cursor_node->nr_loop_iter;
+				call->samples_count = cursor_node->samples;
+			}
 		}
 
 		list_add_tail(&call->list, &node->val);
@@ -679,15 +698,32 @@ static enum match_result match_chain(struct callchain_cursor_node *node,
 		if (node->branch) {
 			cnode->branch_count++;
 
-			if (node->branch_flags.predicted)
-				cnode->predicted_count++;
-
-			if (node->branch_flags.abort)
-				cnode->abort_count++;
-
-			cnode->cycles_count += node->branch_flags.cycles;
-			cnode->iter_count += node->nr_loop_iter;
-			cnode->samples_count += node->samples;
+			if (node->branch_from) {
+				/*
+				 * It's "to" of a branch
+				 */
+				cnode->brtype_stat.branch_to = true;
+
+				if (node->branch_flags.predicted)
+					cnode->predicted_count++;
+
+				if (node->branch_flags.abort)
+					cnode->abort_count++;
+
+				branch_type_count(&cnode->brtype_stat,
+						  &node->branch_flags,
+						  node->branch_from,
+						  node->ip);
+			} else {
+				/*
+				 * It's "from" of a branch
+				 */
+				cnode->brtype_stat.branch_to = false;
+				cnode->cycles_count +=
+					node->branch_flags.cycles;
+				cnode->iter_count += node->nr_loop_iter;
+				cnode->samples_count += node->samples;
+			}
 		}
 
 		return MATCH_EQ;
@@ -922,7 +958,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
 	list_for_each_entry_safe(list, next_list, &src->val, list) {
 		callchain_cursor_append(cursor, list->ip,
 					list->ms.map, list->ms.sym,
-					false, NULL, 0, 0);
+					false, NULL, 0, 0, 0);
 		list_del(&list->list);
 		map__zput(list->ms.map);
 		free(list);
@@ -962,7 +998,7 @@ int callchain_merge(struct callchain_cursor *cursor,
 int callchain_cursor_append(struct callchain_cursor *cursor,
 			    u64 ip, struct map *map, struct symbol *sym,
 			    bool branch, struct branch_flags *flags,
-			    int nr_loop_iter, int samples)
+			    int nr_loop_iter, int samples, u64 branch_from)
 {
 	struct callchain_cursor_node *node = *cursor->last;
 
@@ -986,6 +1022,7 @@ int callchain_cursor_append(struct callchain_cursor *cursor,
 		memcpy(&node->branch_flags, flags,
 			sizeof(struct branch_flags));
 
+	node->branch_from = branch_from;
 	cursor->nr++;
 
 	cursor->last = &node->next;
@@ -998,11 +1035,11 @@ int sample__resolve_callchain(struct perf_sample *sample,
 			      struct perf_evsel *evsel, struct addr_location *al,
 			      int max_stack)
 {
-	if (sample->callchain == NULL)
+	if (sample->callchain == NULL && !symbol_conf.show_branchflag_count)
 		return 0;
 
 	if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain ||
-	    perf_hpp_list.parent) {
+	    perf_hpp_list.parent || symbol_conf.show_branchflag_count) {
 		return thread__resolve_callchain(al->thread, cursor, evsel, sample,
 						 parent, al, max_stack);
 	}
@@ -1011,7 +1048,8 @@ int sample__resolve_callchain(struct perf_sample *sample,
 
 int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *sample)
 {
-	if (!symbol_conf.use_callchain || sample->callchain == NULL)
+	if ((!symbol_conf.use_callchain || sample->callchain == NULL) &&
+		!symbol_conf.show_branchflag_count)
 		return 0;
 	return callchain_append(he->callchain, &callchain_cursor, sample->period);
 }
@@ -1214,95 +1252,120 @@ int callchain_branch_counts(struct callchain_root *root,
 						  cycles_count);
 }
 
-static int counts_str_build(char *bf, int bfsize,
-			     u64 branch_count, u64 predicted_count,
-			     u64 abort_count, u64 cycles_count,
-			     u64 iter_count, u64 samples_count)
+static int count_pri64_printf(int idx, const char *str, u64 value, char *bf, int bfsize)
 {
-	double predicted_percent = 0.0;
-	const char *null_str = "";
-	char iter_str[32];
-	char cycle_str[32];
-	char *istr, *cstr;
-	u64 cycles;
+	/* Append "<str>:<value>" to bf; value is a u64, hence PRIu64. */
 
-	if (branch_count == 0)
-		return scnprintf(bf, bfsize, " (calltrace)");
+	int printed = scnprintf(bf, bfsize, "%s%s:%" PRIu64 "", (idx) ? " " : " (", str, value);
 
-	cycles = cycles_count / branch_count;
+	return printed;
+}
 
-	if (iter_count && samples_count) {
-		if (cycles > 0)
-			scnprintf(iter_str, sizeof(iter_str),
-				 " iterations:%" PRId64 "",
-				 iter_count / samples_count);
-		else
-			scnprintf(iter_str, sizeof(iter_str),
-				 "iterations:%" PRId64 "",
-				 iter_count / samples_count);
-		istr = iter_str;
-	} else
-		istr = (char *)null_str;
+static int count_float_printf(int idx, const char *str, float value,
+			      char *bf, int bfsize, float threshold)
+{
+	int printed;
 
-	if (cycles > 0) {
-		scnprintf(cycle_str, sizeof(cycle_str),
-			  "cycles:%" PRId64 "", cycles);
-		cstr = cycle_str;
-	} else
-		cstr = (char *)null_str;
+	if (threshold != 0.0 && value < threshold)
+		return 0;
 
-	predicted_percent = predicted_count * 100.0 / branch_count;
+	printed = scnprintf(bf, bfsize, "%s%s:%.1f%%", (idx) ? " " : " (", str, value);
 
-	if ((predicted_count == branch_count) && (abort_count == 0)) {
-		if ((cycles > 0) || (istr != (char *)null_str))
-			return scnprintf(bf, bfsize, " (%s%s)", cstr, istr);
-		else
-			return scnprintf(bf, bfsize, "%s", (char *)null_str);
+	return printed;
+}
+
+static int branch_to_str(char *bf, int bfsize,
+			 u64 branch_count, u64 predicted_count,
+			 u64 abort_count,
+			 struct branch_type_stat *brtype_stat)
+{
+	int printed, i = 0;
+
+	printed = branch_type_str(brtype_stat, bf, bfsize);
+	if (printed)
+		i++;
+
+	if (predicted_count < branch_count) {
+		printed += count_float_printf(i++, "predicted",
+				predicted_count * 100.0 / branch_count,
+				bf + printed, bfsize - printed, 0.0);
 	}
 
-	if ((predicted_count < branch_count) && (abort_count == 0)) {
-		if ((cycles > 0) || (istr != (char *)null_str))
-			return scnprintf(bf, bfsize,
-				" (predicted:%.1f%% %s%s)",
-				predicted_percent, cstr, istr);
-		else {
-			return scnprintf(bf, bfsize,
-				" (predicted:%.1f%%)",
-				predicted_percent);
-		}
+	if (abort_count) {
+		printed += count_float_printf(i++, "abort",
+				abort_count * 100.0 / branch_count,
+				bf + printed, bfsize - printed, 0.1);
 	}
 
-	if ((predicted_count == branch_count) && (abort_count > 0)) {
-		if ((cycles > 0) || (istr != (char *)null_str))
-			return scnprintf(bf, bfsize,
-				" (abort:%" PRId64 " %s%s)",
-				abort_count, cstr, istr);
-		else
-			return scnprintf(bf, bfsize,
-				" (abort:%" PRId64 ")",
-				abort_count);
+	if (printed)	/* i also counts an "abort" that was suppressed */
+		printed += scnprintf(bf + printed, bfsize - printed, ")");
+
+	return printed;
+}
+
+/*
+ * "from" side of a branch: " (cycles:N iterations:M)", N = cycles per branch
+ * (omitted if 0), M = iterations per sample (omitted unless both are set).
+ */
+static int branch_from_str(char *bf, int bfsize,
+			   u64 branch_count,
+			   u64 cycles_count, u64 iter_count,
+			   u64 samples_count)
+{
+	const u64 avg_cycles = cycles_count / branch_count;
+	int len = 0, nr = 0;
+
+	if (avg_cycles != 0)
+		len += count_pri64_printf(nr++, "cycles", avg_cycles,
+					  bf + len, bfsize - len);
+
+	if (iter_count != 0 && samples_count != 0)
+		len += count_pri64_printf(nr++, "iterations",
+					  iter_count / samples_count,
+					  bf + len, bfsize - len);
+
+	if (nr == 0)
+		return len;
+
+	return len + scnprintf(bf + len, bfsize - len, ")");
+}
+
+static int counts_str_build(char *bf, int bfsize,
+			     u64 branch_count, u64 predicted_count,
+			     u64 abort_count, u64 cycles_count,
+			     u64 iter_count, u64 samples_count,
+			     struct branch_type_stat *brtype_stat)
+{
+	int printed;
+
+	if (branch_count == 0)
+		return scnprintf(bf, bfsize, " (calltrace)");
+
+	if (brtype_stat->branch_to) {
+		printed = branch_to_str(bf, bfsize, branch_count,
+				predicted_count, abort_count, brtype_stat);
+	} else {
+		printed = branch_from_str(bf, bfsize, branch_count,
+				cycles_count, iter_count, samples_count);
 	}
 
-	if ((cycles > 0) || (istr != (char *)null_str))
-		return scnprintf(bf, bfsize,
-			" (predicted:%.1f%% abort:%" PRId64 " %s%s)",
-			predicted_percent, abort_count, cstr, istr);
+	if (!printed)
+		bf[0] = 0;
 
-	return scnprintf(bf, bfsize,
-			" (predicted:%.1f%% abort:%" PRId64 ")",
-			predicted_percent, abort_count);
+	return printed;
 }
 
 static int callchain_counts_printf(FILE *fp, char *bf, int bfsize,
 				   u64 branch_count, u64 predicted_count,
 				   u64 abort_count, u64 cycles_count,
-				   u64 iter_count, u64 samples_count)
+				   u64 iter_count, u64 samples_count,
+				   struct branch_type_stat *brtype_stat)
 {
-	char str[128];
+	char str[256];
 
 	counts_str_build(str, sizeof(str), branch_count,
 			 predicted_count, abort_count, cycles_count,
-			 iter_count, samples_count);
+			 iter_count, samples_count, brtype_stat);
 
 	if (fp)
 		return fprintf(fp, "%s", str);
@@ -1334,7 +1397,8 @@ int callchain_list_counts__printf_value(struct callchain_node *node,
 
 	return callchain_counts_printf(fp, bf, bfsize, branch_count,
 				       predicted_count, abort_count,
-				       cycles_count, iter_count, samples_count);
+				       cycles_count, iter_count, samples_count,
+				       &clist->brtype_stat);
 }
 
 static void free_callchain_node(struct callchain_node *node)
@@ -1459,7 +1523,8 @@ int callchain_cursor__copy(struct callchain_cursor *dst,
 
 		rc = callchain_cursor_append(dst, node->ip, node->map, node->sym,
 					     node->branch, &node->branch_flags,
-					     node->nr_loop_iter, node->samples);
+					     node->nr_loop_iter, node->samples,
+					     node->branch_from);
 		if (rc)
 			break;
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index c56c23dbbf72..97738201464a 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -7,6 +7,7 @@
 #include "event.h"
 #include "map.h"
 #include "symbol.h"
+#include "branch.h"
 
 #define HELP_PAD "\t\t\t\t"
 
@@ -119,6 +120,7 @@ struct callchain_list {
 	u64			cycles_count;
 	u64			iter_count;
 	u64			samples_count;
+	struct branch_type_stat brtype_stat;
 	char		       *srcline;
 	struct list_head	list;
 };
@@ -135,6 +137,7 @@ struct callchain_cursor_node {
 	struct symbol			*sym;
 	bool				branch;
 	struct branch_flags		branch_flags;
+	u64				branch_from;
 	int				nr_loop_iter;
 	int				samples;
 	struct callchain_cursor_node	*next;
@@ -198,7 +201,7 @@ static inline void callchain_cursor_reset(struct callchain_cursor *cursor)
 int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
 			    struct map *map, struct symbol *sym,
 			    bool branch, struct branch_flags *flags,
-			    int nr_loop_iter, int samples);
+			    int nr_loop_iter, int samples, u64 branch_from);
 
 /* Close a cursor writing session. Initialize for the reader */
 static inline void callchain_cursor_commit(struct callchain_cursor *cursor)
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 03347748f3fa..0e77bc9e5f3c 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -98,8 +98,10 @@ static int add_cgroup(struct perf_evlist *evlist, char *str)
 		cgrp = counter->cgrp;
 		if (!cgrp)
 			continue;
-		if (!strcmp(cgrp->name, str))
+		if (!strcmp(cgrp->name, str)) {
+			refcount_inc(&cgrp->refcnt);
 			break;
+		}
 
 		cgrp = NULL;
 	}
@@ -110,6 +112,7 @@ static int add_cgroup(struct perf_evlist *evlist, char *str)
 			return -1;
 
 		cgrp->name = str;
+		refcount_set(&cgrp->refcnt, 1);
 
 		cgrp->fd = open_cgroup(str);
 		if (cgrp->fd == -1) {
@@ -128,12 +131,11 @@ static int add_cgroup(struct perf_evlist *evlist, char *str)
 			goto found;
 		n++;
 	}
-	if (refcount_read(&cgrp->refcnt) == 0)
+	if (refcount_dec_and_test(&cgrp->refcnt))
 		free(cgrp);
 
 	return -1;
 found:
-	refcount_inc(&cgrp->refcnt);
 	counter->cgrp = cgrp;
 	return 0;
 }
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 31a7dea248d0..bc75596f9e79 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -19,6 +19,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include <linux/string.h>
 
 #include "sane_ctype.h"
 
@@ -433,22 +434,22 @@ static int perf_ui_config(const char *var, const char *value)
 int perf_default_config(const char *var, const char *value,
 			void *dummy __maybe_unused)
 {
-	if (!prefixcmp(var, "core."))
+	if (strstarts(var, "core."))
 		return perf_default_core_config(var, value);
 
-	if (!prefixcmp(var, "hist."))
+	if (strstarts(var, "hist."))
 		return perf_hist_config(var, value);
 
-	if (!prefixcmp(var, "ui."))
+	if (strstarts(var, "ui."))
 		return perf_ui_config(var, value);
 
-	if (!prefixcmp(var, "call-graph."))
+	if (strstarts(var, "call-graph."))
 		return perf_callchain_config(var, value);
 
-	if (!prefixcmp(var, "llvm."))
+	if (strstarts(var, "llvm."))
 		return perf_llvm_config(var, value);
 
-	if (!prefixcmp(var, "buildid."))
+	if (strstarts(var, "buildid."))
 		return perf_buildid_config(var, value);
 
 	/* Add other config variables here. */
diff --git a/tools/perf/util/counts.h b/tools/perf/util/counts.h
index 34d8baaf558a..cb45a6aecf9d 100644
--- a/tools/perf/util/counts.h
+++ b/tools/perf/util/counts.h
@@ -12,6 +12,7 @@ struct perf_counts_values {
 		};
 		u64 values[3];
 	};
+	bool	loaded;
 };
 
 struct perf_counts {
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 3149b70799fd..2346cecb8ea2 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -76,6 +76,8 @@ struct ctf_writer {
 	struct bt_ctf_event_class	*comm_class;
 	struct bt_ctf_event_class	*exit_class;
 	struct bt_ctf_event_class	*fork_class;
+	struct bt_ctf_event_class	*mmap_class;
+	struct bt_ctf_event_class	*mmap2_class;
 };
 
 struct convert {
@@ -506,6 +508,81 @@ put_len_type:
 	return ret;
 }
 
+static int
+add_callchain_output_values(struct bt_ctf_event_class *event_class,
+		      struct bt_ctf_event *event,
+		      struct ip_callchain *callchain)
+{
+	struct bt_ctf_field_type *len_type, *seq_type;
+	struct bt_ctf_field *len_field, *seq_field;
+	unsigned int nr_elements = callchain->nr;
+	unsigned int i;
+	int ret;	/* 0 on success, non-zero on failure */
+	/* Store callchain->nr and callchain->ips[] in the event payload. */
+	len_type = bt_ctf_event_class_get_field_by_name(
+			event_class, "perf_callchain_size");
+	len_field = bt_ctf_field_create(len_type);
+	if (!len_field) {
+		pr_err("failed to create 'perf_callchain_size' for callchain output event\n");
+		ret = -1;
+		goto put_len_type;
+	}
+
+	ret = bt_ctf_field_unsigned_integer_set_value(len_field, nr_elements);
+	if (ret) {
+		pr_err("failed to set field value for perf_callchain_size\n");
+		goto put_len_field;
+	}
+	ret = bt_ctf_event_set_payload(event, "perf_callchain_size", len_field);
+	if (ret) {
+		pr_err("failed to set payload to perf_callchain_size\n");
+		goto put_len_field;
+	}
+
+	seq_type = bt_ctf_event_class_get_field_by_name(
+			event_class, "perf_callchain");
+	seq_field = bt_ctf_field_create(seq_type);
+	if (!seq_field) {
+		pr_err("failed to create 'perf_callchain' for callchain output event\n");
+		ret = -1;
+		goto put_seq_type;
+	}
+	/* The sequence takes its element count from len_field. */
+	ret = bt_ctf_field_sequence_set_length(seq_field, len_field);
+	if (ret) {
+		pr_err("failed to set length of 'perf_callchain'\n");
+		goto put_seq_field;
+	}
+
+	for (i = 0; i < nr_elements; i++) {
+		struct bt_ctf_field *elem_field =
+			bt_ctf_field_sequence_get_field(seq_field, i);
+
+		ret = bt_ctf_field_unsigned_integer_set_value(elem_field,
+				((u64 *)(callchain->ips))[i]);
+		/* Drop the element reference first so the error path needs no put. */
+		bt_ctf_field_put(elem_field);
+		if (ret) {
+			pr_err("failed to set callchain[%u]\n", i);
+			goto put_seq_field;
+		}
+	}
+
+	ret = bt_ctf_event_set_payload(event, "perf_callchain", seq_field);
+	if (ret)
+		pr_err("failed to set payload for perf_callchain\n");
+	/* Success also falls through the labels below, releasing each reference. */
+put_seq_field:
+	bt_ctf_field_put(seq_field);
+put_seq_type:
+	bt_ctf_field_type_put(seq_type);
+put_len_field:
+	bt_ctf_field_put(len_field);
+put_len_type:
+	bt_ctf_field_type_put(len_type);
+	return ret;
+}
+
 static int add_generic_values(struct ctf_writer *cw,
 			      struct bt_ctf_event *event,
 			      struct perf_evsel *evsel,
@@ -519,7 +596,6 @@ static int add_generic_values(struct ctf_writer *cw,
 	 *   PERF_SAMPLE_TIME         - not needed as we have it in
 	 *                              ctf event header
 	 *   PERF_SAMPLE_READ         - TODO
-	 *   PERF_SAMPLE_CALLCHAIN    - TODO
 	 *   PERF_SAMPLE_RAW          - tracepoint fields are handled separately
 	 *   PERF_SAMPLE_BRANCH_STACK - TODO
 	 *   PERF_SAMPLE_REGS_USER    - TODO
@@ -720,6 +796,7 @@ static int process_sample_event(struct perf_tool *tool,
 	struct bt_ctf_event_class *event_class;
 	struct bt_ctf_event *event;
 	int ret;
+	unsigned long type = evsel->attr.sample_type;
 
 	if (WARN_ONCE(!priv, "Failed to setup all events.\n"))
 		return 0;
@@ -751,6 +828,13 @@ static int process_sample_event(struct perf_tool *tool,
 			return -1;
 	}
 
+	if (type & PERF_SAMPLE_CALLCHAIN) {
+		ret = add_callchain_output_values(event_class,
+				event, sample->callchain);
+		if (ret)
+			return -1;
+	}
+
 	if (perf_evsel__is_bpf_output(evsel)) {
 		ret = add_bpf_output_values(event_class, event, sample);
 		if (ret)
@@ -833,6 +917,18 @@ __FUNC_PROCESS_NON_SAMPLE(exit,
 	__NON_SAMPLE_SET_FIELD(fork, u32, ptid);
 	__NON_SAMPLE_SET_FIELD(fork, u64, time);
 )
+__FUNC_PROCESS_NON_SAMPLE(mmap,
+	__NON_SAMPLE_SET_FIELD(mmap, u32, pid);
+	__NON_SAMPLE_SET_FIELD(mmap, u32, tid);
+	__NON_SAMPLE_SET_FIELD(mmap, u64_hex, start);
+	__NON_SAMPLE_SET_FIELD(mmap, string, filename);
+)
+__FUNC_PROCESS_NON_SAMPLE(mmap2,
+	__NON_SAMPLE_SET_FIELD(mmap2, u32, pid);
+	__NON_SAMPLE_SET_FIELD(mmap2, u32, tid);
+	__NON_SAMPLE_SET_FIELD(mmap2, u64_hex, start);
+	__NON_SAMPLE_SET_FIELD(mmap2, string, filename);
+)
 #undef __NON_SAMPLE_SET_FIELD
 #undef __FUNC_PROCESS_NON_SAMPLE
 
@@ -1043,6 +1139,14 @@ static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
 	if (type & PERF_SAMPLE_TRANSACTION)
 		ADD_FIELD(event_class, cw->data.u64, "perf_transaction");
 
+	if (type & PERF_SAMPLE_CALLCHAIN) {
+		ADD_FIELD(event_class, cw->data.u32, "perf_callchain_size");
+		ADD_FIELD(event_class,
+			bt_ctf_field_type_sequence_create(
+				cw->data.u64_hex, "perf_callchain_size"),
+			"perf_callchain");
+	}
+
 #undef ADD_FIELD
 	return 0;
 }
@@ -1164,6 +1268,19 @@ __FUNC_ADD_NON_SAMPLE_EVENT_CLASS(exit,
 	__NON_SAMPLE_ADD_FIELD(u64, time);
 )
 
+__FUNC_ADD_NON_SAMPLE_EVENT_CLASS(mmap,
+	__NON_SAMPLE_ADD_FIELD(u32, pid);
+	__NON_SAMPLE_ADD_FIELD(u32, tid);
+	__NON_SAMPLE_ADD_FIELD(u64_hex, start);
+	__NON_SAMPLE_ADD_FIELD(string, filename);
+)
+
+__FUNC_ADD_NON_SAMPLE_EVENT_CLASS(mmap2,
+	__NON_SAMPLE_ADD_FIELD(u32, pid);
+	__NON_SAMPLE_ADD_FIELD(u32, tid);
+	__NON_SAMPLE_ADD_FIELD(u64_hex, start);
+	__NON_SAMPLE_ADD_FIELD(string, filename);
+)
 #undef __NON_SAMPLE_ADD_FIELD
 #undef __FUNC_ADD_NON_SAMPLE_EVENT_CLASS
 
@@ -1181,6 +1298,12 @@ static int setup_non_sample_events(struct ctf_writer *cw,
 	ret = add_fork_event(cw);
 	if (ret)
 		return ret;
+	ret = add_mmap_event(cw);
+	if (ret)
+		return ret;
+	ret = add_mmap2_event(cw);
+	if (ret)
+		return ret;
 	return 0;
 }
 
@@ -1482,6 +1605,8 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 		c.tool.comm = process_comm_event;
 		c.tool.exit = process_exit_event;
 		c.tool.fork = process_fork_event;
+		c.tool.mmap = process_mmap_event;
+		c.tool.mmap2 = process_mmap2_event;
 	}
 
 	err = perf_config(convert__config, &c);
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 4e7ab611377a..b9e087fb8247 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -32,6 +32,7 @@ char dso__symtab_origin(const struct dso *dso)
 		[DSO_BINARY_TYPE__JAVA_JIT]			= 'j',
 		[DSO_BINARY_TYPE__DEBUGLINK]			= 'l',
 		[DSO_BINARY_TYPE__BUILD_ID_CACHE]		= 'B',
+		[DSO_BINARY_TYPE__BUILD_ID_CACHE_DEBUGINFO]	= 'D',
 		[DSO_BINARY_TYPE__FEDORA_DEBUGINFO]		= 'f',
 		[DSO_BINARY_TYPE__UBUNTU_DEBUGINFO]		= 'u',
 		[DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO]	= 'o',
@@ -97,7 +98,12 @@ int dso__read_binary_type_filename(const struct dso *dso,
 		break;
 	}
 	case DSO_BINARY_TYPE__BUILD_ID_CACHE:
-		if (dso__build_id_filename(dso, filename, size) == NULL)
+		if (dso__build_id_filename(dso, filename, size, false) == NULL)
+			ret = -1;
+		break;
+
+	case DSO_BINARY_TYPE__BUILD_ID_CACHE_DEBUGINFO:
+		if (dso__build_id_filename(dso, filename, size, true) == NULL)
 			ret = -1;
 		break;
 
@@ -504,7 +510,14 @@ static void check_data_close(void);
  */
 static int open_dso(struct dso *dso, struct machine *machine)
 {
-	int fd = __open_dso(dso, machine);
+	int fd;
+	struct nscookie nsc;
+
+	if (dso->binary_type != DSO_BINARY_TYPE__BUILD_ID_CACHE)
+		nsinfo__mountns_enter(dso->nsinfo, &nsc);
+	fd = __open_dso(dso, machine);
+	if (dso->binary_type != DSO_BINARY_TYPE__BUILD_ID_CACHE)
+		nsinfo__mountns_exit(&nsc);
 
 	if (fd >= 0) {
 		dso__list_add(dso);
@@ -1236,6 +1249,7 @@ void dso__delete(struct dso *dso)
 	dso_cache__free(dso);
 	dso__free_a2l(dso);
 	zfree(&dso->symsrc_filename);
+	nsinfo__zput(dso->nsinfo);
 	pthread_mutex_destroy(&dso->lock);
 	free(dso);
 }
@@ -1301,6 +1315,7 @@ bool __dsos__read_build_ids(struct list_head *head, bool with_hits)
 {
 	bool have_build_id = false;
 	struct dso *pos;
+	struct nscookie nsc;
 
 	list_for_each_entry(pos, head, node) {
 		if (with_hits && !pos->hit && !dso__is_vdso(pos))
@@ -1309,11 +1324,13 @@ bool __dsos__read_build_ids(struct list_head *head, bool with_hits)
 			have_build_id = true;
 			continue;
 		}
+		nsinfo__mountns_enter(pos->nsinfo, &nsc);
 		if (filename__read_build_id(pos->long_name, pos->build_id,
 					    sizeof(pos->build_id)) > 0) {
 			have_build_id	  = true;
 			pos->has_build_id = true;
 		}
+		nsinfo__mountns_exit(&nsc);
 	}
 
 	return have_build_id;
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index bd061ba7b47c..f886141678eb 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -10,6 +10,7 @@
 #include <linux/types.h>
 #include <linux/bitops.h>
 #include "map.h"
+#include "namespaces.h"
 #include "build-id.h"
 
 enum dso_binary_type {
@@ -20,6 +21,7 @@ enum dso_binary_type {
 	DSO_BINARY_TYPE__JAVA_JIT,
 	DSO_BINARY_TYPE__DEBUGLINK,
 	DSO_BINARY_TYPE__BUILD_ID_CACHE,
+	DSO_BINARY_TYPE__BUILD_ID_CACHE_DEBUGINFO,
 	DSO_BINARY_TYPE__FEDORA_DEBUGINFO,
 	DSO_BINARY_TYPE__UBUNTU_DEBUGINFO,
 	DSO_BINARY_TYPE__BUILDID_DEBUGINFO,
@@ -187,6 +189,7 @@ struct dso {
 		void	 *priv;
 		u64	 db_id;
 	};
+	struct nsinfo	*nsinfo;
 	refcount_t	 refcnt;
 	char		 name[0];
 };
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index dc5c3bb69d73..1c905ba3641b 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -57,6 +57,7 @@ static const char *perf_event__names[] = {
 	[PERF_RECORD_STAT_ROUND]		= "STAT_ROUND",
 	[PERF_RECORD_EVENT_UPDATE]		= "EVENT_UPDATE",
 	[PERF_RECORD_TIME_CONV]			= "TIME_CONV",
+	[PERF_RECORD_HEADER_FEATURE]		= "FEATURE",
 };
 
 static const char *perf_ns__names[] = {
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 9967c87af7a6..423ac82605f3 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -142,7 +142,8 @@ struct branch_flags {
 	u64 in_tx:1;
 	u64 abort:1;
 	u64 cycles:16;
-	u64 reserved:44;
+	u64 type:4;
+	u64 reserved:40;
 };
 
 struct branch_entry {
@@ -244,6 +245,7 @@ enum perf_user_event_type { /* above any possible kernel type */
 	PERF_RECORD_STAT_ROUND			= 77,
 	PERF_RECORD_EVENT_UPDATE		= 78,
 	PERF_RECORD_TIME_CONV			= 79,
+	PERF_RECORD_HEADER_FEATURE		= 80,
 	PERF_RECORD_HEADER_MAX
 };
 
@@ -609,6 +611,12 @@ struct time_conv_event {
 	u64 time_zero;
 };
 
+struct feature_event {	/* presumably for PERF_RECORD_HEADER_FEATURE - confirm */
+	struct perf_event_header 	header;
+	u64				feat_id;
+	char				data[];	/* flexible array: length not stored here */
+};
+
 union perf_event {
 	struct perf_event_header	header;
 	struct mmap_event		mmap;
@@ -639,6 +647,7 @@ union perf_event {
 	struct stat_event		stat;
 	struct stat_round_event		stat_round;
 	struct time_conv_event		time_conv;
+	struct feature_event		feat;
 };
 
 void perf_event__print_totals(void);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 46c0faf6c502..6a0d7ffbeba0 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -242,9 +242,9 @@ void perf_event_attr__set_max_precise_ip(struct perf_event_attr *attr)
 	}
 }
 
-int perf_evlist__add_default(struct perf_evlist *evlist)
+int __perf_evlist__add_default(struct perf_evlist *evlist, bool precise)
 {
-	struct perf_evsel *evsel = perf_evsel__new_cycles();
+	struct perf_evsel *evsel = perf_evsel__new_cycles(precise);
 
 	if (evsel == NULL)
 		return -ENOMEM;
@@ -1419,8 +1419,6 @@ int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **e
 {
 	struct perf_evsel *evsel;
 	int err = 0;
-	const int ncpus = cpu_map__nr(evlist->cpus),
-		  nthreads = thread_map__nr(evlist->threads);
 
 	evlist__for_each_entry(evlist, evsel) {
 		if (evsel->filter == NULL)
@@ -1430,7 +1428,7 @@ int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **e
 		 * filters only work for tracepoint event, which doesn't have cpu limit.
 		 * So evlist and evsel should always be same.
 		 */
-		err = perf_evsel__apply_filter(evsel, ncpus, nthreads, evsel->filter);
+		err = perf_evsel__apply_filter(evsel, evsel->filter);
 		if (err) {
 			*err_evsel = evsel;
 			break;
@@ -1623,13 +1621,9 @@ void perf_evlist__set_selected(struct perf_evlist *evlist,
 void perf_evlist__close(struct perf_evlist *evlist)
 {
 	struct perf_evsel *evsel;
-	int ncpus = cpu_map__nr(evlist->cpus);
-	int nthreads = thread_map__nr(evlist->threads);
 
-	evlist__for_each_entry_reverse(evlist, evsel) {
-		int n = evsel->cpus ? evsel->cpus->nr : ncpus;
-		perf_evsel__close(evsel, n, nthreads);
-	}
+	evlist__for_each_entry_reverse(evlist, evsel)
+		perf_evsel__close(evsel);
 }
 
 static int perf_evlist__create_syswide_maps(struct perf_evlist *evlist)
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 8d601fbdd8d6..bf2c4936e35f 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -115,7 +115,14 @@ void perf_evlist__delete(struct perf_evlist *evlist);
 
 void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
 void perf_evlist__remove(struct perf_evlist *evlist, struct perf_evsel *evsel);
-int perf_evlist__add_default(struct perf_evlist *evlist);
+
+int __perf_evlist__add_default(struct perf_evlist *evlist, bool precise);
+
+static inline int perf_evlist__add_default(struct perf_evlist *evlist)
+{
+	return __perf_evlist__add_default(evlist, true);	/* precise = true */
+}
+
 int __perf_evlist__add_default_attrs(struct perf_evlist *evlist,
 				     struct perf_event_attr *attrs, size_t nr_attrs);
 
@@ -258,6 +265,11 @@ bool perf_evlist__valid_read_format(struct perf_evlist *evlist);
 void perf_evlist__splice_list_tail(struct perf_evlist *evlist,
 				   struct list_head *list);
 
+static inline bool perf_evlist__empty(struct perf_evlist *evlist)
+{
+	return list_empty(&evlist->entries);	/* true when nothing is linked on entries */
+}
+
 static inline struct perf_evsel *perf_evlist__first(struct perf_evlist *evlist)
 {
 	return list_entry(evlist->entries.next, struct perf_evsel, node);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 413f74df08de..d9bd632ed7db 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -49,6 +49,7 @@ static struct {
 	bool clockid_wrong;
 	bool lbr_flags;
 	bool write_backward;
+	bool group_read;
 } perf_missing_features;
 
 static clockid_t clockid;
@@ -58,6 +59,8 @@ static int perf_evsel__no_extra_init(struct perf_evsel *evsel __maybe_unused)
 	return 0;
 }
 
+void __weak test_attr__ready(void) { }
+
 static void perf_evsel__no_extra_fini(struct perf_evsel *evsel __maybe_unused)
 {
 }
@@ -268,7 +271,7 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx)
 	return evsel;
 }
 
-struct perf_evsel *perf_evsel__new_cycles(void)
+struct perf_evsel *perf_evsel__new_cycles(bool precise)
 {
 	struct perf_event_attr attr = {
 		.type	= PERF_TYPE_HARDWARE,
@@ -278,6 +281,9 @@ struct perf_evsel *perf_evsel__new_cycles(void)
 	struct perf_evsel *evsel;
 
 	event_attr_init(&attr);
+
+	if (!precise)
+		goto new_event;
 	/*
 	 * Unnamed union member, not supported as struct member named
 	 * initializer in older compilers such as gcc 4.4.7
@@ -292,7 +298,7 @@ struct perf_evsel *perf_evsel__new_cycles(void)
 	 * to kick in when we return and before perf_evsel__open() is called.
 	 */
 	attr.sample_period = 0;
-
+new_event:
 	evsel = perf_evsel__new(&attr);
 	if (evsel == NULL)
 		goto out;
@@ -896,8 +902,13 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts,
 	if (opts->no_samples)
 		attr->sample_freq = 0;
 
-	if (opts->inherit_stat)
+	if (opts->inherit_stat) {
+		evsel->attr.read_format |=
+			PERF_FORMAT_TOTAL_TIME_ENABLED |
+			PERF_FORMAT_TOTAL_TIME_RUNNING |
+			PERF_FORMAT_ID;
 		attr->inherit_stat = 1;
+	}
 
 	if (opts->sample_address) {
 		perf_evsel__set_sample_bit(evsel, ADDR);
@@ -1045,16 +1056,13 @@ static int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthread
 	return evsel->fd != NULL ? 0 : -ENOMEM;
 }
 
-static int perf_evsel__run_ioctl(struct perf_evsel *evsel, int ncpus, int nthreads,
+static int perf_evsel__run_ioctl(struct perf_evsel *evsel,
 			  int ioc,  void *arg)
 {
 	int cpu, thread;
 
-	if (evsel->system_wide)
-		nthreads = 1;
-
-	for (cpu = 0; cpu < ncpus; cpu++) {
-		for (thread = 0; thread < nthreads; thread++) {
+	for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) {
+		for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
 			int fd = FD(evsel, cpu, thread),
 			    err = ioctl(fd, ioc, arg);
 
@@ -1066,10 +1074,9 @@ static int perf_evsel__run_ioctl(struct perf_evsel *evsel, int ncpus, int nthrea
 	return 0;
 }
 
-int perf_evsel__apply_filter(struct perf_evsel *evsel, int ncpus, int nthreads,
-			     const char *filter)
+int perf_evsel__apply_filter(struct perf_evsel *evsel, const char *filter)
 {
-	return perf_evsel__run_ioctl(evsel, ncpus, nthreads,
+	return perf_evsel__run_ioctl(evsel,
 				     PERF_EVENT_IOC_SET_FILTER,
 				     (void *)filter);
 }
@@ -1116,20 +1123,14 @@ int perf_evsel__append_addr_filter(struct perf_evsel *evsel, const char *filter)
 
 int perf_evsel__enable(struct perf_evsel *evsel)
 {
-	int nthreads = thread_map__nr(evsel->threads);
-	int ncpus = cpu_map__nr(evsel->cpus);
-
-	return perf_evsel__run_ioctl(evsel, ncpus, nthreads,
+	return perf_evsel__run_ioctl(evsel,
 				     PERF_EVENT_IOC_ENABLE,
 				     0);
 }
 
 int perf_evsel__disable(struct perf_evsel *evsel)
 {
-	int nthreads = thread_map__nr(evsel->threads);
-	int ncpus = cpu_map__nr(evsel->cpus);
-
-	return perf_evsel__run_ioctl(evsel, ncpus, nthreads,
+	return perf_evsel__run_ioctl(evsel,
 				     PERF_EVENT_IOC_DISABLE,
 				     0);
 }
@@ -1179,15 +1180,12 @@ static void perf_evsel__free_config_terms(struct perf_evsel *evsel)
 	}
 }
 
-void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
+void perf_evsel__close_fd(struct perf_evsel *evsel)
 {
 	int cpu, thread;
 
-	if (evsel->system_wide)
-		nthreads = 1;
-
-	for (cpu = 0; cpu < ncpus; cpu++)
-		for (thread = 0; thread < nthreads; ++thread) {
+	for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++)
+		for (thread = 0; thread < xyarray__max_y(evsel->fd); ++thread) {
 			close(FD(evsel, cpu, thread));
 			FD(evsel, cpu, thread) = -1;
 		}
@@ -1256,20 +1254,148 @@ void perf_counts_values__scale(struct perf_counts_values *count,
 		*pscaled = scaled;
 }
 
+static int perf_evsel__read_size(struct perf_evsel *evsel)
+{
+	const u64 fmt = evsel->attr.read_format;
+	int per_event = sizeof(u64);	/* each event always has a value */
+	int header = 0;			/* bytes counted once per read */
+	int nr_events = 1;
+
+	/* Optional words counted once, not per event. */
+	if (fmt & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		header += sizeof(u64);
+	if (fmt & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		header += sizeof(u64);
+	if (fmt & PERF_FORMAT_GROUP) {
+		/* Group reads add one more word and carry nr_members events. */
+		header += sizeof(u64);
+		nr_events = evsel->nr_members;
+	}
+
+	/* Optional id word counted with every value. */
+	if (fmt & PERF_FORMAT_ID)
+		per_event += sizeof(u64);
+
+	return header + per_event * nr_events;
+}
+
 int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
 		     struct perf_counts_values *count)
 {
+	size_t size = perf_evsel__read_size(evsel);
+	/* NOTE(review): size may exceed sizeof(count->values) - confirm. */
 	memset(count, 0, sizeof(*count));
 
 	if (FD(evsel, cpu, thread) < 0)
 		return -EINVAL;
 
-	if (readn(FD(evsel, cpu, thread), count, sizeof(*count)) <= 0)
+	if (readn(FD(evsel, cpu, thread), count->values, size) <= 0)
 		return -errno;
 
 	return 0;
 }
 
+static int
+perf_evsel__read_one(struct perf_evsel *evsel, int cpu, int thread)
+{
+	/* Read straight into this event's own (cpu, thread) counts slot. */
+	return perf_evsel__read(evsel, cpu, thread,
+				perf_counts(evsel->counts, cpu, thread));
+}
+
+static void
+perf_evsel__set_count(struct perf_evsel *counter, int cpu, int thread,
+		      u64 val, u64 ena, u64 run)
+{
+	struct perf_counts_values *slot =
+		perf_counts(counter->counts, cpu, thread);
+
+	/* Store the value/enabled/running triple and flag the slot as loaded. */
+	slot->loaded = true;
+	slot->val = val;
+	slot->ena = ena;
+	slot->run = run;
+}
+
+static int
+perf_evsel__process_group_data(struct perf_evsel *leader,
+			       int cpu, int thread, u64 *data)
+{
+	const u64 fmt = leader->attr.read_format;
+	const u64 nr_values = *data++;	/* leading word: number of entries */
+	struct sample_read_value *entries;
+	u64 enabled = 0, running = 0, idx;
+
+	/* The buffer must describe exactly this group's members. */
+	if (nr_values != (u64) leader->nr_members)
+		return -EINVAL;
+
+	/* Consume whichever time words the read format carries. */
+	if (fmt & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		enabled = *data++;
+	if (fmt & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		running = *data++;
+
+	entries = (struct sample_read_value *) data;
+
+	/* Entry 0 is stored on the leader; the rest are looked up by id. */
+	perf_evsel__set_count(leader, cpu, thread,
+			      entries[0].value, enabled, running);
+
+	for (idx = 1; idx < nr_values; idx++) {
+		struct perf_evsel *member =
+			perf_evlist__id2evsel(leader->evlist, entries[idx].id);
+
+		if (!member)
+			return -EINVAL;
+		perf_evsel__set_count(member, cpu, thread,
+				      entries[idx].value, enabled, running);
+	}
+
+	return 0;
+}
+
+static int
+perf_evsel__read_group(struct perf_evsel *leader, int cpu, int thread)
+{
+	struct perf_stat_evsel *stat_priv = leader->priv;
+	const int nbytes = perf_evsel__read_size(leader);
+	u64 *buf = stat_priv->group_data;
+
+	/* Needs PERF_FORMAT_ID in the read format and a group leader. */
+	if (!(leader->attr.read_format & PERF_FORMAT_ID) ||
+	    !perf_evsel__is_group_leader(leader))
+		return -EINVAL;
+
+	/* Allocate the read buffer on first use and cache it in group_data. */
+	if (buf == NULL) {
+		buf = zalloc(nbytes);
+		if (buf == NULL)
+			return -ENOMEM;
+
+		stat_priv->group_data = buf;
+	}
+
+	if (FD(leader, cpu, thread) < 0)
+		return -EINVAL;
+
+	/* NOTE(review): a 0-byte read returns -errno, which may be 0 - confirm. */
+	if (readn(FD(leader, cpu, thread), buf, nbytes) <= 0)
+		return -errno;
+
+	return perf_evsel__process_group_data(leader, cpu, thread, buf);
+}
+
+int perf_evsel__read_counter(struct perf_evsel *evsel, int cpu, int thread)
+{
+	const u64 fmt = evsel->attr.read_format;
+
+	/* Only PERF_FORMAT_GROUP events take the group read path. */
+	if (!(fmt & PERF_FORMAT_GROUP))
+		return perf_evsel__read_one(evsel, cpu, thread);
+	return perf_evsel__read_group(evsel, cpu, thread);
+}
+
 int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
 			      int cpu, int thread, bool scale)
 {
@@ -1545,6 +1671,8 @@ fallback_missing_features:
 	if (perf_missing_features.lbr_flags)
 		evsel->attr.branch_sample_type &= ~(PERF_SAMPLE_BRANCH_NO_FLAGS |
 				     PERF_SAMPLE_BRANCH_NO_CYCLES);
+	if (perf_missing_features.group_read && evsel->attr.inherit)
+		evsel->attr.read_format &= ~(PERF_FORMAT_GROUP|PERF_FORMAT_ID);
 retry_sample_id:
 	if (perf_missing_features.sample_id_all)
 		evsel->attr.sample_id_all = 0;
@@ -1569,6 +1697,8 @@ retry_open:
 			pr_debug2("sys_perf_event_open: pid %d  cpu %d  group_fd %d  flags %#lx",
 				  pid, cpus->map[cpu], group_fd, flags);
 
+			test_attr__ready();
+
 			fd = sys_perf_event_open(&evsel->attr, pid, cpus->map[cpu],
 						 group_fd, flags);
 
@@ -1664,31 +1794,45 @@ try_fallback:
 	 */
 	if (!perf_missing_features.write_backward && evsel->attr.write_backward) {
 		perf_missing_features.write_backward = true;
+		pr_debug2("switching off write_backward\n");
 		goto out_close;
 	} else if (!perf_missing_features.clockid_wrong && evsel->attr.use_clockid) {
 		perf_missing_features.clockid_wrong = true;
+		pr_debug2("switching off clockid\n");
 		goto fallback_missing_features;
 	} else if (!perf_missing_features.clockid && evsel->attr.use_clockid) {
 		perf_missing_features.clockid = true;
+		pr_debug2("switching off use_clockid\n");
 		goto fallback_missing_features;
 	} else if (!perf_missing_features.cloexec && (flags & PERF_FLAG_FD_CLOEXEC)) {
 		perf_missing_features.cloexec = true;
+		pr_debug2("switching off cloexec flag\n");
 		goto fallback_missing_features;
 	} else if (!perf_missing_features.mmap2 && evsel->attr.mmap2) {
 		perf_missing_features.mmap2 = true;
+		pr_debug2("switching off mmap2\n");
 		goto fallback_missing_features;
 	} else if (!perf_missing_features.exclude_guest &&
 		   (evsel->attr.exclude_guest || evsel->attr.exclude_host)) {
 		perf_missing_features.exclude_guest = true;
+		pr_debug2("switching off exclude_guest, exclude_host\n");
 		goto fallback_missing_features;
 	} else if (!perf_missing_features.sample_id_all) {
 		perf_missing_features.sample_id_all = true;
+		pr_debug2("switching off sample_id_all\n");
 		goto retry_sample_id;
 	} else if (!perf_missing_features.lbr_flags &&
 			(evsel->attr.branch_sample_type &
 			 (PERF_SAMPLE_BRANCH_NO_CYCLES |
 			  PERF_SAMPLE_BRANCH_NO_FLAGS))) {
 		perf_missing_features.lbr_flags = true;
+		pr_debug2("switching off branch sample type no (cycles/flags)\n");
+		goto fallback_missing_features;
+	} else if (!perf_missing_features.group_read &&
+		    evsel->attr.inherit &&
+		   (evsel->attr.read_format & PERF_FORMAT_GROUP)) {
+		perf_missing_features.group_read = true;
+		pr_debug2("switching off group read\n");
 		goto fallback_missing_features;
 	}
 out_close:
@@ -1702,12 +1846,12 @@ out_close:
 	return err;
 }
 
-void perf_evsel__close(struct perf_evsel *evsel, int ncpus, int nthreads)
+void perf_evsel__close(struct perf_evsel *evsel)
 {
 	if (evsel->fd == NULL)
 		return;
 
-	perf_evsel__close_fd(evsel, ncpus, nthreads);
+	perf_evsel__close_fd(evsel);
 	perf_evsel__free_fd(evsel);
 }
 
@@ -2535,7 +2679,9 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
 		 "unprivileged users (without CAP_SYS_ADMIN).\n\n"
 		 "The current value is %d:\n\n"
 		 "  -1: Allow use of (almost) all events by all users\n"
-		 ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n"
+		 "      Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK\n"
+		 ">= 0: Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN\n"
+		 "      Disallow raw tracepoint access by users without CAP_SYS_ADMIN\n"
 		 ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n"
 		 ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN\n\n"
 		 "To make this setting permanent, edit /etc/sysctl.conf too, e.g.:\n\n"
@@ -2610,3 +2756,10 @@ char *perf_evsel__env_arch(struct perf_evsel *evsel)
 		return evsel->evlist->env->arch;
 	return NULL;
 }
+
+char *perf_evsel__env_cpuid(struct perf_evsel *evsel)
+{
+	if (!evsel || !evsel->evlist || !evsel->evlist->env)
+		return NULL;	/* no environment reachable from this evsel */
+	return evsel->evlist->env->cpuid;
+}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index d101695c482c..351d3b2d8887 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -185,7 +185,7 @@ static inline struct perf_evsel *perf_evsel__newtp(const char *sys, const char *
 	return perf_evsel__newtp_idx(sys, name, 0);
 }
 
-struct perf_evsel *perf_evsel__new_cycles(void);
+struct perf_evsel *perf_evsel__new_cycles(bool precise);
 
 struct event_format *event_format__new(const char *sys, const char *name);
 
@@ -226,7 +226,7 @@ const char *perf_evsel__group_name(struct perf_evsel *evsel);
 int perf_evsel__group_desc(struct perf_evsel *evsel, char *buf, size_t size);
 
 int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
-void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
+void perf_evsel__close_fd(struct perf_evsel *evsel);
 
 void __perf_evsel__set_sample_bit(struct perf_evsel *evsel,
 				  enum perf_event_sample_format bit);
@@ -246,8 +246,7 @@ int perf_evsel__set_filter(struct perf_evsel *evsel, const char *filter);
 int perf_evsel__append_tp_filter(struct perf_evsel *evsel, const char *filter);
 int perf_evsel__append_addr_filter(struct perf_evsel *evsel,
 				   const char *filter);
-int perf_evsel__apply_filter(struct perf_evsel *evsel, int ncpus, int nthreads,
-			     const char *filter);
+int perf_evsel__apply_filter(struct perf_evsel *evsel, const char *filter);
 int perf_evsel__enable(struct perf_evsel *evsel);
 int perf_evsel__disable(struct perf_evsel *evsel);
 
@@ -257,7 +256,7 @@ int perf_evsel__open_per_thread(struct perf_evsel *evsel,
 				struct thread_map *threads);
 int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
 		     struct thread_map *threads);
-void perf_evsel__close(struct perf_evsel *evsel, int ncpus, int nthreads);
+void perf_evsel__close(struct perf_evsel *evsel);
 
 struct perf_sample;
 
@@ -299,6 +298,8 @@ static inline bool perf_evsel__match2(struct perf_evsel *e1,
 int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
 		     struct perf_counts_values *count);
 
+int perf_evsel__read_counter(struct perf_evsel *evsel, int cpu, int thread);
+
 int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
 			      int cpu, int thread, bool scale);
 
@@ -436,5 +437,6 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
 			     attr__fprintf_f attr__fprintf, void *priv);
 
 char *perf_evsel__env_arch(struct perf_evsel *evsel);
+char *perf_evsel__env_cpuid(struct perf_evsel *evsel);
 
 #endif /* __PERF_EVSEL_H */
diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
index 9c2760a1a96e..400ef9eab00a 100644
--- a/tools/perf/util/expr.h
+++ b/tools/perf/util/expr.h
@@ -1,7 +1,7 @@
 #ifndef PARSE_CTX_H
 #define PARSE_CTX_H 1
 
-#define EXPR_MAX_OTHER 8
+#define EXPR_MAX_OTHER 15
 #define MAX_PARSE_ID EXPR_MAX_OTHER
 
 struct parse_id {
diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y
index 954556bea36e..432b8560cf51 100644
--- a/tools/perf/util/expr.y
+++ b/tools/perf/util/expr.y
@@ -4,6 +4,7 @@
 #include "util/debug.h"
 #define IN_EXPR_Y 1
 #include "expr.h"
+#include "smt.h"
 #include <string.h>
 
 #define MAXIDLEN 256
@@ -22,13 +23,15 @@
 
 %token <num> NUMBER
 %token <id> ID
+%token MIN MAX IF ELSE SMT_ON
+%left MIN MAX IF
 %left '|'
 %left '^'
 %left '&'
 %left '-' '+'
 %left '*' '/' '%'
 %left NEG NOT
-%type <num> expr
+%type <num> expr if_expr
 
 %{
 static int expr__lex(YYSTYPE *res, const char **pp);
@@ -57,22 +60,33 @@ static int lookup_id(struct parse_ctx *ctx, char *id, double *val)
 %}
 %%
 
-all_expr: expr			{ *final_val = $1; }
+all_expr: if_expr			{ *final_val = $1; }
+	;
+
+if_expr:
+	expr IF expr ELSE expr { $$ = $3 ? $1 : $5; }
+	| expr
 	;
 
 expr:	  NUMBER
 	| ID			{ if (lookup_id(ctx, $1, &$$) < 0) {
-					pr_debug("%s not found", $1);
+					pr_debug("%s not found\n", $1);
 					YYABORT;
 				  }
 				}
+	| expr '|' expr		{ $$ = (long)$1 | (long)$3; }
+	| expr '&' expr		{ $$ = (long)$1 & (long)$3; }
+	| expr '^' expr		{ $$ = (long)$1 ^ (long)$3; }
 	| expr '+' expr		{ $$ = $1 + $3; }
 	| expr '-' expr		{ $$ = $1 - $3; }
 	| expr '*' expr		{ $$ = $1 * $3; }
 	| expr '/' expr		{ if ($3 == 0) YYABORT; $$ = $1 / $3; }
 	| expr '%' expr		{ if ((long)$3 == 0) YYABORT; $$ = (long)$1 % (long)$3; }
 	| '-' expr %prec NEG	{ $$ = -$2; }
-	| '(' expr ')'		{ $$ = $2; }
+	| '(' if_expr ')'	{ $$ = $2; }
+	| MIN '(' expr ',' expr ')' { $$ = $3 < $5 ? $3 : $5; }
+	| MAX '(' expr ',' expr ')' { $$ = $3 > $5 ? $3 : $5; }
+	| SMT_ON		 { $$ = smt_on() > 0; }
 	;
 
 %%
@@ -82,13 +96,47 @@ static int expr__symbol(YYSTYPE *res, const char *p, const char **pp)
 	char *dst = res->id;
 	const char *s = p;
 
-	while (isalnum(*p) || *p == '_' || *p == '.') {
+	if (*p == '#')
+		*dst++ = *p++;
+
+	while (isalnum(*p) || *p == '_' || *p == '.' || *p == ':' || *p == '@' || *p == '\\') {
 		if (p - s >= MAXIDLEN)
 			return -1;
-		*dst++ = *p++;
+		/*
+		 * Allow @ instead of / to be able to specify pmu/event/ without
+		 * conflicts with normal division.
+		 */
+		if (*p == '@')
+			*dst++ = '/';
+		else if (*p == '\\')
+			*dst++ = *++p;
+		else
+			*dst++ = *p;
+		p++;
 	}
 	*dst = 0;
 	*pp = p;
+	dst = res->id;
+	switch (dst[0]) {
+	case 'm':
+		if (!strcmp(dst, "min"))
+			return MIN;
+		if (!strcmp(dst, "max"))
+			return MAX;
+		break;
+	case 'i':
+		if (!strcmp(dst, "if"))
+			return IF;
+		break;
+	case 'e':
+		if (!strcmp(dst, "else"))
+			return ELSE;
+		break;
+	case '#':
+		if (!strcasecmp(dst, "#smt_on"))
+			return SMT_ON;
+		break;
+	}
 	return ID;
 }
 
@@ -102,6 +150,7 @@ static int expr__lex(YYSTYPE *res, const char **pp)
 		p++;
 	s = p;
 	switch (*p++) {
+	case '#':
 	case 'a' ... 'z':
 	case 'A' ... 'Z':
 		return expr__symbol(res, p - 1, pp);
@@ -132,6 +181,19 @@ void expr__ctx_init(struct parse_ctx *ctx)
 	ctx->num_ids = 0;
 }
 
+static bool already_seen(const char *val, const char *one, const char **other,
+			 int num_other)
+{
+	int i;
+
+	if (one && !strcasecmp(one, val))
+		return true;
+	for (i = 0; i < num_other; i++)
+		if (!strcasecmp(other[i], val))
+			return true;
+	return false;
+}
+
 int expr__find_other(const char *p, const char *one, const char ***other,
 		     int *num_otherp)
 {
@@ -151,7 +213,7 @@ int expr__find_other(const char *p, const char *one, const char ***other,
 			err = 0;
 			break;
 		}
-		if (tok == ID && strcasecmp(one, val.id)) {
+		if (tok == ID && !already_seen(val.id, one, *other, num_other)) {
 			if (num_other >= EXPR_MAX_OTHER - 1) {
 				pr_debug("Too many extra events in %s\n", orig);
 				break;
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 76ed7d03e500..605bbd5404fb 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -12,6 +12,7 @@
 #include <linux/list.h>
 #include <linux/kernel.h>
 #include <linux/bitops.h>
+#include <linux/stringify.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/utsname.h>
@@ -34,6 +35,7 @@
 #include "data.h"
 #include <api/fs/fs.h>
 #include "asm/bug.h"
+#include "tool.h"
 
 #include "sane_ctype.h"
 
@@ -59,6 +61,15 @@ struct perf_file_attr {
 	struct perf_file_section	ids;
 };
 
+struct feat_fd {
+	struct perf_header	*ph;
+	int			fd;
+	void			*buf;	/* Either buf != NULL or fd >= 0 */
+	ssize_t			offset;
+	size_t			size;
+	struct perf_evsel	*events;
+};
+
 void perf_header__set_feat(struct perf_header *header, int feat)
 {
 	set_bit(feat, header->adds_features);
@@ -74,28 +85,60 @@ bool perf_header__has_feat(const struct perf_header *header, int feat)
 	return test_bit(feat, header->adds_features);
 }
 
-static int do_write(int fd, const void *buf, size_t size)
+static int __do_write_fd(struct feat_fd *ff, const void *buf, size_t size)
 {
-	while (size) {
-		int ret = write(fd, buf, size);
+	ssize_t ret = writen(ff->fd, buf, size);
 
-		if (ret < 0)
-			return -errno;
+	if (ret != (ssize_t)size)
+		return ret < 0 ? (int)ret : -1;
+	return 0;
+}
+
+static int __do_write_buf(struct feat_fd *ff,  const void *buf, size_t size)
+{
+	/* struct perf_event_header::size is u16 */
+	const size_t max_size = 0xffff - sizeof(struct perf_event_header);
+	size_t new_size = ff->size;
+	void *addr;
 
-		size -= ret;
-		buf += ret;
+	if (size + ff->offset > max_size)
+		return -E2BIG;
+
+	while (size > (new_size - ff->offset))
+		new_size <<= 1;
+	new_size = min(max_size, new_size);
+
+	if (ff->size < new_size) {
+		addr = realloc(ff->buf, new_size);
+		if (!addr)
+			return -ENOMEM;
+		ff->buf = addr;
+		ff->size = new_size;
 	}
 
+	memcpy(ff->buf + ff->offset, buf, size);
+	ff->offset += size;
+
 	return 0;
 }
 
-int write_padded(int fd, const void *bf, size_t count, size_t count_aligned)
+/* Return: 0 if succeeded, -ERR if failed. */
+int do_write(struct feat_fd *ff, const void *buf, size_t size)
+{
+	if (!ff->buf)
+		return __do_write_fd(ff, buf, size);
+	return __do_write_buf(ff, buf, size);
+}
+
+/* Return: 0 if succeeded, -ERR if failed. */
+int write_padded(struct feat_fd *ff, const void *bf,
+		 size_t count, size_t count_aligned)
 {
 	static const char zero_buf[NAME_ALIGN];
-	int err = do_write(fd, bf, count);
+	int err = do_write(ff, bf, count);
 
 	if (!err)
-		err = do_write(fd, zero_buf, count_aligned - count);
+		err = do_write(ff, zero_buf, count_aligned - count);
 
 	return err;
 }
@@ -103,7 +146,8 @@ int write_padded(int fd, const void *bf, size_t count, size_t count_aligned)
 #define string_size(str)						\
 	(PERF_ALIGN((strlen(str) + 1), NAME_ALIGN) + sizeof(u32))
 
-static int do_write_string(int fd, const char *str)
+/* Return: 0 if succeeded, -ERR if failed. */
+static int do_write_string(struct feat_fd *ff, const char *str)
 {
 	u32 len, olen;
 	int ret;
@@ -112,32 +156,80 @@ static int do_write_string(int fd, const char *str)
 	len = PERF_ALIGN(olen, NAME_ALIGN);
 
 	/* write len, incl. \0 */
-	ret = do_write(fd, &len, sizeof(len));
+	ret = do_write(ff, &len, sizeof(len));
 	if (ret < 0)
 		return ret;
 
-	return write_padded(fd, str, olen, len);
+	return write_padded(ff, str, olen, len);
 }
 
-static char *do_read_string(int fd, struct perf_header *ph)
+static int __do_read_fd(struct feat_fd *ff, void *addr, ssize_t size)
+{
+	ssize_t ret = readn(ff->fd, addr, size);
+
+	if (ret != size)
+		return ret < 0 ? (int)ret : -1;
+	return 0;
+}
+
+static int __do_read_buf(struct feat_fd *ff, void *addr, ssize_t size)
+{
+	if (size > (ssize_t)ff->size - ff->offset)
+		return -1;
+
+	memcpy(addr, ff->buf + ff->offset, size);
+	ff->offset += size;
+
+	return 0;
+
+}
+
+static int __do_read(struct feat_fd *ff, void *addr, ssize_t size)
+{
+	if (!ff->buf)
+		return __do_read_fd(ff, addr, size);
+	return __do_read_buf(ff, addr, size);
+}
+
+static int do_read_u32(struct feat_fd *ff, u32 *addr)
+{
+	int ret;
+
+	ret = __do_read(ff, addr, sizeof(*addr));
+	if (ret)
+		return ret;
+
+	if (ff->ph->needs_swap)
+		*addr = bswap_32(*addr);
+	return 0;
+}
+
+static int do_read_u64(struct feat_fd *ff, u64 *addr)
+{
+	int ret;
+
+	ret = __do_read(ff, addr, sizeof(*addr));
+	if (ret)
+		return ret;
+
+	if (ff->ph->needs_swap)
+		*addr = bswap_64(*addr);
+	return 0;
+}
+
+static char *do_read_string(struct feat_fd *ff)
 {
-	ssize_t sz, ret;
 	u32 len;
 	char *buf;
 
-	sz = readn(fd, &len, sizeof(len));
-	if (sz < (ssize_t)sizeof(len))
+	if (do_read_u32(ff, &len))
 		return NULL;
 
-	if (ph->needs_swap)
-		len = bswap_32(len);
-
 	buf = malloc(len);
 	if (!buf)
 		return NULL;
 
-	ret = readn(fd, buf, len);
-	if (ret == (ssize_t)len) {
+	if (!__do_read(ff, buf, len)) {
 		/*
 		 * strings are padded by zeroes
 		 * thus the actual strlen of buf
@@ -150,25 +242,30 @@ static char *do_read_string(int fd, struct perf_header *ph)
 	return NULL;
 }
 
-static int write_tracing_data(int fd, struct perf_header *h __maybe_unused,
-			    struct perf_evlist *evlist)
+static int write_tracing_data(struct feat_fd *ff,
+			      struct perf_evlist *evlist)
 {
-	return read_tracing_data(fd, &evlist->entries);
-}
+	if (WARN(ff->buf, "Error: calling %s in pipe-mode.\n", __func__))
+		return -1;
 
+	return read_tracing_data(ff->fd, &evlist->entries);
+}
 
-static int write_build_id(int fd, struct perf_header *h,
+static int write_build_id(struct feat_fd *ff,
 			  struct perf_evlist *evlist __maybe_unused)
 {
 	struct perf_session *session;
 	int err;
 
-	session = container_of(h, struct perf_session, header);
+	session = container_of(ff->ph, struct perf_session, header);
 
 	if (!perf_session__read_build_ids(session, true))
 		return -1;
 
-	err = perf_session__write_buildid_table(session, fd);
+	if (WARN(ff->buf, "Error: calling %s in pipe-mode.\n", __func__))
+		return -1;
+
+	err = perf_session__write_buildid_table(session, ff);
 	if (err < 0) {
 		pr_debug("failed to write buildid table\n");
 		return err;
@@ -178,7 +275,7 @@ static int write_build_id(int fd, struct perf_header *h,
 	return 0;
 }
 
-static int write_hostname(int fd, struct perf_header *h __maybe_unused,
+static int write_hostname(struct feat_fd *ff,
 			  struct perf_evlist *evlist __maybe_unused)
 {
 	struct utsname uts;
@@ -188,10 +285,10 @@ static int write_hostname(int fd, struct perf_header *h __maybe_unused,
 	if (ret < 0)
 		return -1;
 
-	return do_write_string(fd, uts.nodename);
+	return do_write_string(ff, uts.nodename);
 }
 
-static int write_osrelease(int fd, struct perf_header *h __maybe_unused,
+static int write_osrelease(struct feat_fd *ff,
 			   struct perf_evlist *evlist __maybe_unused)
 {
 	struct utsname uts;
@@ -201,10 +298,10 @@ static int write_osrelease(int fd, struct perf_header *h __maybe_unused,
 	if (ret < 0)
 		return -1;
 
-	return do_write_string(fd, uts.release);
+	return do_write_string(ff, uts.release);
 }
 
-static int write_arch(int fd, struct perf_header *h __maybe_unused,
+static int write_arch(struct feat_fd *ff,
 		      struct perf_evlist *evlist __maybe_unused)
 {
 	struct utsname uts;
@@ -214,16 +311,16 @@ static int write_arch(int fd, struct perf_header *h __maybe_unused,
 	if (ret < 0)
 		return -1;
 
-	return do_write_string(fd, uts.machine);
+	return do_write_string(ff, uts.machine);
 }
 
-static int write_version(int fd, struct perf_header *h __maybe_unused,
+static int write_version(struct feat_fd *ff,
 			 struct perf_evlist *evlist __maybe_unused)
 {
-	return do_write_string(fd, perf_version_string);
+	return do_write_string(ff, perf_version_string);
 }
 
-static int __write_cpudesc(int fd, const char *cpuinfo_proc)
+static int __write_cpudesc(struct feat_fd *ff, const char *cpuinfo_proc)
 {
 	FILE *file;
 	char *buf = NULL;
@@ -273,25 +370,22 @@ static int __write_cpudesc(int fd, const char *cpuinfo_proc)
 		}
 		p++;
 	}
-	ret = do_write_string(fd, s);
+	ret = do_write_string(ff, s);
 done:
 	free(buf);
 	fclose(file);
 	return ret;
 }
 
-static int write_cpudesc(int fd, struct perf_header *h __maybe_unused,
+static int write_cpudesc(struct feat_fd *ff,
 		       struct perf_evlist *evlist __maybe_unused)
 {
-#ifndef CPUINFO_PROC
-#define CPUINFO_PROC {"model name", }
-#endif
 	const char *cpuinfo_procs[] = CPUINFO_PROC;
 	unsigned int i;
 
 	for (i = 0; i < ARRAY_SIZE(cpuinfo_procs); i++) {
 		int ret;
-		ret = __write_cpudesc(fd, cpuinfo_procs[i]);
+		ret = __write_cpudesc(ff, cpuinfo_procs[i]);
 		if (ret >= 0)
 			return ret;
 	}
@@ -299,7 +393,7 @@ static int write_cpudesc(int fd, struct perf_header *h __maybe_unused,
 }
 
 
-static int write_nrcpus(int fd, struct perf_header *h __maybe_unused,
+static int write_nrcpus(struct feat_fd *ff,
 			struct perf_evlist *evlist __maybe_unused)
 {
 	long nr;
@@ -314,14 +408,14 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused,
 
 	nra = (u32)(nr & UINT_MAX);
 
-	ret = do_write(fd, &nrc, sizeof(nrc));
+	ret = do_write(ff, &nrc, sizeof(nrc));
 	if (ret < 0)
 		return ret;
 
-	return do_write(fd, &nra, sizeof(nra));
+	return do_write(ff, &nra, sizeof(nra));
 }
 
-static int write_event_desc(int fd, struct perf_header *h __maybe_unused,
+static int write_event_desc(struct feat_fd *ff,
 			    struct perf_evlist *evlist)
 {
 	struct perf_evsel *evsel;
@@ -333,7 +427,7 @@ static int write_event_desc(int fd, struct perf_header *h __maybe_unused,
 	/*
 	 * write number of events
 	 */
-	ret = do_write(fd, &nre, sizeof(nre));
+	ret = do_write(ff, &nre, sizeof(nre));
 	if (ret < 0)
 		return ret;
 
@@ -341,12 +435,12 @@ static int write_event_desc(int fd, struct perf_header *h __maybe_unused,
 	 * size of perf_event_attr struct
 	 */
 	sz = (u32)sizeof(evsel->attr);
-	ret = do_write(fd, &sz, sizeof(sz));
+	ret = do_write(ff, &sz, sizeof(sz));
 	if (ret < 0)
 		return ret;
 
 	evlist__for_each_entry(evlist, evsel) {
-		ret = do_write(fd, &evsel->attr, sz);
+		ret = do_write(ff, &evsel->attr, sz);
 		if (ret < 0)
 			return ret;
 		/*
@@ -357,27 +451,27 @@ static int write_event_desc(int fd, struct perf_header *h __maybe_unused,
 		 * type of ids,
 		 */
 		nri = evsel->ids;
-		ret = do_write(fd, &nri, sizeof(nri));
+		ret = do_write(ff, &nri, sizeof(nri));
 		if (ret < 0)
 			return ret;
 
 		/*
 		 * write event string as passed on cmdline
 		 */
-		ret = do_write_string(fd, perf_evsel__name(evsel));
+		ret = do_write_string(ff, perf_evsel__name(evsel));
 		if (ret < 0)
 			return ret;
 		/*
 		 * write unique ids for this event
 		 */
-		ret = do_write(fd, evsel->id, evsel->ids * sizeof(u64));
+		ret = do_write(ff, evsel->id, evsel->ids * sizeof(u64));
 		if (ret < 0)
 			return ret;
 	}
 	return 0;
 }
 
-static int write_cmdline(int fd, struct perf_header *h __maybe_unused,
+static int write_cmdline(struct feat_fd *ff,
 			 struct perf_evlist *evlist __maybe_unused)
 {
 	char buf[MAXPATHLEN];
@@ -395,16 +489,16 @@ static int write_cmdline(int fd, struct perf_header *h __maybe_unused,
 	/* account for binary path */
 	n = perf_env.nr_cmdline + 1;
 
-	ret = do_write(fd, &n, sizeof(n));
+	ret = do_write(ff, &n, sizeof(n));
 	if (ret < 0)
 		return ret;
 
-	ret = do_write_string(fd, buf);
+	ret = do_write_string(ff, buf);
 	if (ret < 0)
 		return ret;
 
 	for (i = 0 ; i < perf_env.nr_cmdline; i++) {
-		ret = do_write_string(fd, perf_env.cmdline_argv[i]);
+		ret = do_write_string(ff, perf_env.cmdline_argv[i]);
 		if (ret < 0)
 			return ret;
 	}
@@ -557,8 +651,8 @@ out_free:
 	return tp;
 }
 
-static int write_cpu_topology(int fd, struct perf_header *h __maybe_unused,
-			  struct perf_evlist *evlist __maybe_unused)
+static int write_cpu_topology(struct feat_fd *ff,
+			      struct perf_evlist *evlist __maybe_unused)
 {
 	struct cpu_topo *tp;
 	u32 i;
@@ -568,21 +662,21 @@ static int write_cpu_topology(int fd, struct perf_header *h __maybe_unused,
 	if (!tp)
 		return -1;
 
-	ret = do_write(fd, &tp->core_sib, sizeof(tp->core_sib));
+	ret = do_write(ff, &tp->core_sib, sizeof(tp->core_sib));
 	if (ret < 0)
 		goto done;
 
 	for (i = 0; i < tp->core_sib; i++) {
-		ret = do_write_string(fd, tp->core_siblings[i]);
+		ret = do_write_string(ff, tp->core_siblings[i]);
 		if (ret < 0)
 			goto done;
 	}
-	ret = do_write(fd, &tp->thread_sib, sizeof(tp->thread_sib));
+	ret = do_write(ff, &tp->thread_sib, sizeof(tp->thread_sib));
 	if (ret < 0)
 		goto done;
 
 	for (i = 0; i < tp->thread_sib; i++) {
-		ret = do_write_string(fd, tp->thread_siblings[i]);
+		ret = do_write_string(ff, tp->thread_siblings[i]);
 		if (ret < 0)
 			break;
 	}
@@ -592,11 +686,11 @@ static int write_cpu_topology(int fd, struct perf_header *h __maybe_unused,
 		goto done;
 
 	for (j = 0; j < perf_env.nr_cpus_avail; j++) {
-		ret = do_write(fd, &perf_env.cpu[j].core_id,
+		ret = do_write(ff, &perf_env.cpu[j].core_id,
 			       sizeof(perf_env.cpu[j].core_id));
 		if (ret < 0)
 			return ret;
-		ret = do_write(fd, &perf_env.cpu[j].socket_id,
+		ret = do_write(ff, &perf_env.cpu[j].socket_id,
 			       sizeof(perf_env.cpu[j].socket_id));
 		if (ret < 0)
 			return ret;
@@ -608,8 +702,8 @@ done:
 
 
 
-static int write_total_mem(int fd, struct perf_header *h __maybe_unused,
-			  struct perf_evlist *evlist __maybe_unused)
+static int write_total_mem(struct feat_fd *ff,
+			   struct perf_evlist *evlist __maybe_unused)
 {
 	char *buf = NULL;
 	FILE *fp;
@@ -629,7 +723,7 @@ static int write_total_mem(int fd, struct perf_header *h __maybe_unused,
 	if (!ret) {
 		n = sscanf(buf, "%*s %"PRIu64, &mem);
 		if (n == 1)
-			ret = do_write(fd, &mem, sizeof(mem));
+			ret = do_write(ff, &mem, sizeof(mem));
 	} else
 		ret = -1;
 	free(buf);
@@ -637,7 +731,7 @@ static int write_total_mem(int fd, struct perf_header *h __maybe_unused,
 	return ret;
 }
 
-static int write_topo_node(int fd, int node)
+static int write_topo_node(struct feat_fd *ff, int node)
 {
 	char str[MAXPATHLEN];
 	char field[32];
@@ -667,11 +761,11 @@ static int write_topo_node(int fd, int node)
 	fclose(fp);
 	fp = NULL;
 
-	ret = do_write(fd, &mem_total, sizeof(u64));
+	ret = do_write(ff, &mem_total, sizeof(u64));
 	if (ret)
 		goto done;
 
-	ret = do_write(fd, &mem_free, sizeof(u64));
+	ret = do_write(ff, &mem_free, sizeof(u64));
 	if (ret)
 		goto done;
 
@@ -689,7 +783,7 @@ static int write_topo_node(int fd, int node)
 	if (p)
 		*p = '\0';
 
-	ret = do_write_string(fd, buf);
+	ret = do_write_string(ff, buf);
 done:
 	free(buf);
 	if (fp)
@@ -697,8 +791,8 @@ done:
 	return ret;
 }
 
-static int write_numa_topology(int fd, struct perf_header *h __maybe_unused,
-			  struct perf_evlist *evlist __maybe_unused)
+static int write_numa_topology(struct feat_fd *ff,
+			       struct perf_evlist *evlist __maybe_unused)
 {
 	char *buf = NULL;
 	size_t len = 0;
@@ -725,17 +819,17 @@ static int write_numa_topology(int fd, struct perf_header *h __maybe_unused,
 
 	nr = (u32)node_map->nr;
 
-	ret = do_write(fd, &nr, sizeof(nr));
+	ret = do_write(ff, &nr, sizeof(nr));
 	if (ret < 0)
 		goto done;
 
 	for (i = 0; i < nr; i++) {
 		j = (u32)node_map->map[i];
-		ret = do_write(fd, &j, sizeof(j));
+		ret = do_write(ff, &j, sizeof(j));
 		if (ret < 0)
 			break;
 
-		ret = write_topo_node(fd, i);
+		ret = write_topo_node(ff, i);
 		if (ret < 0)
 			break;
 	}
@@ -758,39 +852,40 @@ done:
  * };
  */
 
-static int write_pmu_mappings(int fd, struct perf_header *h __maybe_unused,
+static int write_pmu_mappings(struct feat_fd *ff,
 			      struct perf_evlist *evlist __maybe_unused)
 {
 	struct perf_pmu *pmu = NULL;
-	off_t offset = lseek(fd, 0, SEEK_CUR);
-	__u32 pmu_num = 0;
+	u32 pmu_num = 0;
 	int ret;
 
-	/* write real pmu_num later */
-	ret = do_write(fd, &pmu_num, sizeof(pmu_num));
+	/*
+	 * Do a first pass to count the number of PMUs to avoid lseek so this
+	 * works in pipe mode as well.
+	 */
+	while ((pmu = perf_pmu__scan(pmu))) {
+		if (!pmu->name)
+			continue;
+		pmu_num++;
+	}
+
+	ret = do_write(ff, &pmu_num, sizeof(pmu_num));
 	if (ret < 0)
 		return ret;
 
 	while ((pmu = perf_pmu__scan(pmu))) {
 		if (!pmu->name)
 			continue;
-		pmu_num++;
 
-		ret = do_write(fd, &pmu->type, sizeof(pmu->type));
+		ret = do_write(ff, &pmu->type, sizeof(pmu->type));
 		if (ret < 0)
 			return ret;
 
-		ret = do_write_string(fd, pmu->name);
+		ret = do_write_string(ff, pmu->name);
 		if (ret < 0)
 			return ret;
 	}
 
-	if (pwrite(fd, &pmu_num, sizeof(pmu_num), offset) != sizeof(pmu_num)) {
-		/* discard all */
-		lseek(fd, offset, SEEK_SET);
-		return -1;
-	}
-
 	return 0;
 }
 
@@ -806,14 +901,14 @@ static int write_pmu_mappings(int fd, struct perf_header *h __maybe_unused,
  *	}[nr_groups];
  * };
  */
-static int write_group_desc(int fd, struct perf_header *h __maybe_unused,
+static int write_group_desc(struct feat_fd *ff,
 			    struct perf_evlist *evlist)
 {
 	u32 nr_groups = evlist->nr_groups;
 	struct perf_evsel *evsel;
 	int ret;
 
-	ret = do_write(fd, &nr_groups, sizeof(nr_groups));
+	ret = do_write(ff, &nr_groups, sizeof(nr_groups));
 	if (ret < 0)
 		return ret;
 
@@ -824,15 +919,15 @@ static int write_group_desc(int fd, struct perf_header *h __maybe_unused,
 			u32 leader_idx = evsel->idx;
 			u32 nr_members = evsel->nr_members;
 
-			ret = do_write_string(fd, name);
+			ret = do_write_string(ff, name);
 			if (ret < 0)
 				return ret;
 
-			ret = do_write(fd, &leader_idx, sizeof(leader_idx));
+			ret = do_write(ff, &leader_idx, sizeof(leader_idx));
 			if (ret < 0)
 				return ret;
 
-			ret = do_write(fd, &nr_members, sizeof(nr_members));
+			ret = do_write(ff, &nr_members, sizeof(nr_members));
 			if (ret < 0)
 				return ret;
 		}
@@ -849,7 +944,7 @@ int __weak get_cpuid(char *buffer __maybe_unused, size_t sz __maybe_unused)
 	return -1;
 }
 
-static int write_cpuid(int fd, struct perf_header *h __maybe_unused,
+static int write_cpuid(struct feat_fd *ff,
 		       struct perf_evlist *evlist __maybe_unused)
 {
 	char buffer[64];
@@ -861,25 +956,27 @@ static int write_cpuid(int fd, struct perf_header *h __maybe_unused,
 
 	return -1;
 write_it:
-	return do_write_string(fd, buffer);
+	return do_write_string(ff, buffer);
 }
 
-static int write_branch_stack(int fd __maybe_unused,
-			      struct perf_header *h __maybe_unused,
-		       struct perf_evlist *evlist __maybe_unused)
+static int write_branch_stack(struct feat_fd *ff __maybe_unused,
+			      struct perf_evlist *evlist __maybe_unused)
 {
 	return 0;
 }
 
-static int write_auxtrace(int fd, struct perf_header *h,
+static int write_auxtrace(struct feat_fd *ff,
 			  struct perf_evlist *evlist __maybe_unused)
 {
 	struct perf_session *session;
 	int err;
 
-	session = container_of(h, struct perf_session, header);
+	if (WARN(ff->buf, "Error: calling %s in pipe-mode.\n", __func__))
+		return -1;
+
+	session = container_of(ff->ph, struct perf_session, header);
 
-	err = auxtrace_index__write(fd, &session->auxtrace_index);
+	err = auxtrace_index__write(ff->fd, &session->auxtrace_index);
 	if (err < 0)
 		pr_err("Failed to write auxtrace index\n");
 	return err;
@@ -1026,8 +1123,8 @@ static int build_caches(struct cpu_cache_level caches[], u32 size, u32 *cntp)
 
 #define MAX_CACHES 2000
 
-static int write_cache(int fd, struct perf_header *h __maybe_unused,
-			  struct perf_evlist *evlist __maybe_unused)
+static int write_cache(struct feat_fd *ff,
+		       struct perf_evlist *evlist __maybe_unused)
 {
 	struct cpu_cache_level caches[MAX_CACHES];
 	u32 cnt = 0, i, version = 1;
@@ -1039,11 +1136,11 @@ static int write_cache(int fd, struct perf_header *h __maybe_unused,
 
 	qsort(&caches, cnt, sizeof(struct cpu_cache_level), cpu_cache_level__sort);
 
-	ret = do_write(fd, &version, sizeof(u32));
+	ret = do_write(ff, &version, sizeof(u32));
 	if (ret < 0)
 		goto out;
 
-	ret = do_write(fd, &cnt, sizeof(u32));
+	ret = do_write(ff, &cnt, sizeof(u32));
 	if (ret < 0)
 		goto out;
 
@@ -1051,7 +1148,7 @@ static int write_cache(int fd, struct perf_header *h __maybe_unused,
 		struct cpu_cache_level *c = &caches[i];
 
 		#define _W(v)					\
-			ret = do_write(fd, &c->v, sizeof(u32));	\
+			ret = do_write(ff, &c->v, sizeof(u32));	\
 			if (ret < 0)				\
 				goto out;
 
@@ -1062,7 +1159,7 @@ static int write_cache(int fd, struct perf_header *h __maybe_unused,
 		#undef _W
 
 		#define _W(v)						\
-			ret = do_write_string(fd, (const char *) c->v);	\
+			ret = do_write_string(ff, (const char *) c->v);	\
 			if (ret < 0)					\
 				goto out;
 
@@ -1078,69 +1175,62 @@ out:
 	return ret;
 }
 
-static int write_stat(int fd __maybe_unused,
-		      struct perf_header *h __maybe_unused,
+static int write_stat(struct feat_fd *ff __maybe_unused,
 		      struct perf_evlist *evlist __maybe_unused)
 {
 	return 0;
 }
 
-static void print_hostname(struct perf_header *ph, int fd __maybe_unused,
-			   FILE *fp)
+static void print_hostname(struct feat_fd *ff, FILE *fp)
 {
-	fprintf(fp, "# hostname : %s\n", ph->env.hostname);
+	fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname);
 }
 
-static void print_osrelease(struct perf_header *ph, int fd __maybe_unused,
-			    FILE *fp)
+static void print_osrelease(struct feat_fd *ff, FILE *fp)
 {
-	fprintf(fp, "# os release : %s\n", ph->env.os_release);
+	fprintf(fp, "# os release : %s\n", ff->ph->env.os_release);
 }
 
-static void print_arch(struct perf_header *ph, int fd __maybe_unused, FILE *fp)
+static void print_arch(struct feat_fd *ff, FILE *fp)
 {
-	fprintf(fp, "# arch : %s\n", ph->env.arch);
+	fprintf(fp, "# arch : %s\n", ff->ph->env.arch);
 }
 
-static void print_cpudesc(struct perf_header *ph, int fd __maybe_unused,
-			  FILE *fp)
+static void print_cpudesc(struct feat_fd *ff, FILE *fp)
 {
-	fprintf(fp, "# cpudesc : %s\n", ph->env.cpu_desc);
+	fprintf(fp, "# cpudesc : %s\n", ff->ph->env.cpu_desc);
 }
 
-static void print_nrcpus(struct perf_header *ph, int fd __maybe_unused,
-			 FILE *fp)
+static void print_nrcpus(struct feat_fd *ff, FILE *fp)
 {
-	fprintf(fp, "# nrcpus online : %u\n", ph->env.nr_cpus_online);
-	fprintf(fp, "# nrcpus avail : %u\n", ph->env.nr_cpus_avail);
+	fprintf(fp, "# nrcpus online : %u\n", ff->ph->env.nr_cpus_online);
+	fprintf(fp, "# nrcpus avail : %u\n", ff->ph->env.nr_cpus_avail);
 }
 
-static void print_version(struct perf_header *ph, int fd __maybe_unused,
-			  FILE *fp)
+static void print_version(struct feat_fd *ff, FILE *fp)
 {
-	fprintf(fp, "# perf version : %s\n", ph->env.version);
+	fprintf(fp, "# perf version : %s\n", ff->ph->env.version);
 }
 
-static void print_cmdline(struct perf_header *ph, int fd __maybe_unused,
-			  FILE *fp)
+static void print_cmdline(struct feat_fd *ff, FILE *fp)
 {
 	int nr, i;
 
-	nr = ph->env.nr_cmdline;
+	nr = ff->ph->env.nr_cmdline;
 
 	fprintf(fp, "# cmdline : ");
 
 	for (i = 0; i < nr; i++)
-		fprintf(fp, "%s ", ph->env.cmdline_argv[i]);
+		fprintf(fp, "%s ", ff->ph->env.cmdline_argv[i]);
 	fputc('\n', fp);
 }
 
-static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused,
-			       FILE *fp)
+static void print_cpu_topology(struct feat_fd *ff, FILE *fp)
 {
+	struct perf_header *ph = ff->ph;
+	int cpu_nr = ph->env.nr_cpus_avail;
 	int nr, i;
 	char *str;
-	int cpu_nr = ph->env.nr_cpus_avail;
 
 	nr = ph->env.nr_sibling_cores;
 	str = ph->env.sibling_cores;
@@ -1181,31 +1271,21 @@ static void free_event_desc(struct perf_evsel *events)
 	free(events);
 }
 
-static struct perf_evsel *
-read_event_desc(struct perf_header *ph, int fd)
+static struct perf_evsel *read_event_desc(struct feat_fd *ff)
 {
 	struct perf_evsel *evsel, *events = NULL;
 	u64 *id;
 	void *buf = NULL;
 	u32 nre, sz, nr, i, j;
-	ssize_t ret;
 	size_t msz;
 
 	/* number of events */
-	ret = readn(fd, &nre, sizeof(nre));
-	if (ret != (ssize_t)sizeof(nre))
+	if (do_read_u32(ff, &nre))
 		goto error;
 
-	if (ph->needs_swap)
-		nre = bswap_32(nre);
-
-	ret = readn(fd, &sz, sizeof(sz));
-	if (ret != (ssize_t)sizeof(sz))
+	if (do_read_u32(ff, &sz))
 		goto error;
 
-	if (ph->needs_swap)
-		sz = bswap_32(sz);
-
 	/* buffer to hold on file attr struct */
 	buf = malloc(sz);
 	if (!buf)
@@ -1227,25 +1307,23 @@ read_event_desc(struct perf_header *ph, int fd)
 		 * must read entire on-file attr struct to
 		 * sync up with layout.
 		 */
-		ret = readn(fd, buf, sz);
-		if (ret != (ssize_t)sz)
+		if (__do_read(ff, buf, sz))
 			goto error;
 
-		if (ph->needs_swap)
+		if (ff->ph->needs_swap)
 			perf_event__attr_swap(buf);
 
 		memcpy(&evsel->attr, buf, msz);
 
-		ret = readn(fd, &nr, sizeof(nr));
-		if (ret != (ssize_t)sizeof(nr))
+		if (do_read_u32(ff, &nr))
 			goto error;
 
-		if (ph->needs_swap) {
-			nr = bswap_32(nr);
+		if (ff->ph->needs_swap)
 			evsel->needs_swap = true;
-		}
 
-		evsel->name = do_read_string(fd, ph);
+		evsel->name = do_read_string(ff);
+		if (!evsel->name)
+			goto error;
 
 		if (!nr)
 			continue;
@@ -1257,11 +1335,8 @@ read_event_desc(struct perf_header *ph, int fd)
 		evsel->id = id;
 
 		for (j = 0 ; j < nr; j++) {
-			ret = readn(fd, id, sizeof(*id));
-			if (ret != (ssize_t)sizeof(*id))
+			if (do_read_u64(ff, id))
 				goto error;
-			if (ph->needs_swap)
-				*id = bswap_64(*id);
 			id++;
 		}
 	}
@@ -1280,12 +1355,17 @@ static int __desc_attr__fprintf(FILE *fp, const char *name, const char *val,
 	return fprintf(fp, ", %s = %s", name, val);
 }
 
-static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
+static void print_event_desc(struct feat_fd *ff, FILE *fp)
 {
-	struct perf_evsel *evsel, *events = read_event_desc(ph, fd);
+	struct perf_evsel *evsel, *events;
 	u32 j;
 	u64 *id;
 
+	if (ff->events)
+		events = ff->events;
+	else
+		events = read_event_desc(ff);
+
 	if (!events) {
 		fprintf(fp, "# event desc: not available or unable to read\n");
 		return;
@@ -1310,22 +1390,21 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
 	}
 
 	free_event_desc(events);
+	ff->events = NULL;
 }
 
-static void print_total_mem(struct perf_header *ph, int fd __maybe_unused,
-			    FILE *fp)
+static void print_total_mem(struct feat_fd *ff, FILE *fp)
 {
-	fprintf(fp, "# total memory : %Lu kB\n", ph->env.total_mem);
+	fprintf(fp, "# total memory : %llu kB\n", ff->ph->env.total_mem);
 }
 
-static void print_numa_topology(struct perf_header *ph, int fd __maybe_unused,
-				FILE *fp)
+static void print_numa_topology(struct feat_fd *ff, FILE *fp)
 {
 	int i;
 	struct numa_node *n;
 
-	for (i = 0; i < ph->env.nr_numa_nodes; i++) {
-		n = &ph->env.numa_nodes[i];
+	for (i = 0; i < ff->ph->env.nr_numa_nodes; i++) {
+		n = &ff->ph->env.numa_nodes[i];
 
 		fprintf(fp, "# node%u meminfo  : total = %"PRIu64" kB,"
 			    " free = %"PRIu64" kB\n",
@@ -1336,56 +1415,51 @@ static void print_numa_topology(struct perf_header *ph, int fd __maybe_unused,
 	}
 }
 
-static void print_cpuid(struct perf_header *ph, int fd __maybe_unused, FILE *fp)
+static void print_cpuid(struct feat_fd *ff, FILE *fp)
 {
-	fprintf(fp, "# cpuid : %s\n", ph->env.cpuid);
+	fprintf(fp, "# cpuid : %s\n", ff->ph->env.cpuid);
 }
 
-static void print_branch_stack(struct perf_header *ph __maybe_unused,
-			       int fd __maybe_unused, FILE *fp)
+static void print_branch_stack(struct feat_fd *ff __maybe_unused, FILE *fp)
 {
 	fprintf(fp, "# contains samples with branch stack\n");
 }
 
-static void print_auxtrace(struct perf_header *ph __maybe_unused,
-			   int fd __maybe_unused, FILE *fp)
+static void print_auxtrace(struct feat_fd *ff __maybe_unused, FILE *fp)
 {
 	fprintf(fp, "# contains AUX area data (e.g. instruction trace)\n");
 }
 
-static void print_stat(struct perf_header *ph __maybe_unused,
-		       int fd __maybe_unused, FILE *fp)
+static void print_stat(struct feat_fd *ff __maybe_unused, FILE *fp)
 {
 	fprintf(fp, "# contains stat data\n");
 }
 
-static void print_cache(struct perf_header *ph __maybe_unused,
-			int fd __maybe_unused, FILE *fp __maybe_unused)
+static void print_cache(struct feat_fd *ff, FILE *fp __maybe_unused)
 {
 	int i;
 
 	fprintf(fp, "# CPU cache info:\n");
-	for (i = 0; i < ph->env.caches_cnt; i++) {
+	for (i = 0; i < ff->ph->env.caches_cnt; i++) {
 		fprintf(fp, "#  ");
-		cpu_cache_level__fprintf(fp, &ph->env.caches[i]);
+		cpu_cache_level__fprintf(fp, &ff->ph->env.caches[i]);
 	}
 }
 
-static void print_pmu_mappings(struct perf_header *ph, int fd __maybe_unused,
-			       FILE *fp)
+static void print_pmu_mappings(struct feat_fd *ff, FILE *fp)
 {
 	const char *delimiter = "# pmu mappings: ";
 	char *str, *tmp;
 	u32 pmu_num;
 	u32 type;
 
-	pmu_num = ph->env.nr_pmu_mappings;
+	pmu_num = ff->ph->env.nr_pmu_mappings;
 	if (!pmu_num) {
 		fprintf(fp, "# pmu mappings: not available\n");
 		return;
 	}
 
-	str = ph->env.pmu_mappings;
+	str = ff->ph->env.pmu_mappings;
 
 	while (pmu_num) {
 		type = strtoul(str, &tmp, 0);
@@ -1408,14 +1482,13 @@ error:
 	fprintf(fp, "# pmu mappings: unable to read\n");
 }
 
-static void print_group_desc(struct perf_header *ph, int fd __maybe_unused,
-			     FILE *fp)
+static void print_group_desc(struct feat_fd *ff, FILE *fp)
 {
 	struct perf_session *session;
 	struct perf_evsel *evsel;
 	u32 nr = 0;
 
-	session = container_of(ph, struct perf_session, header);
+	session = container_of(ff->ph, struct perf_session, header);
 
 	evlist__for_each_entry(session->evlist, evsel) {
 		if (perf_evsel__is_group_leader(evsel) &&
@@ -1588,113 +1661,61 @@ out:
 	return err;
 }
 
-static int process_tracing_data(struct perf_file_section *section __maybe_unused,
-				struct perf_header *ph __maybe_unused,
-				int fd, void *data)
-{
-	ssize_t ret = trace_report(fd, data, false);
-	return ret < 0 ? -1 : 0;
-}
-
-static int process_build_id(struct perf_file_section *section,
-			    struct perf_header *ph, int fd,
-			    void *data __maybe_unused)
-{
-	if (perf_header__read_build_ids(ph, fd, section->offset, section->size))
-		pr_debug("Failed to read buildids, continuing...\n");
-	return 0;
+/* Macro for features that simply need to read and store a string. */
+#define FEAT_PROCESS_STR_FUN(__feat, __feat_env) \
+static int process_##__feat(struct feat_fd *ff, void *data __maybe_unused) \
+{\
+	ff->ph->env.__feat_env = do_read_string(ff); \
+	return ff->ph->env.__feat_env ? 0 : -ENOMEM; \
 }
 
-static int process_hostname(struct perf_file_section *section __maybe_unused,
-			    struct perf_header *ph, int fd,
-			    void *data __maybe_unused)
-{
-	ph->env.hostname = do_read_string(fd, ph);
-	return ph->env.hostname ? 0 : -ENOMEM;
-}
+FEAT_PROCESS_STR_FUN(hostname, hostname);
+FEAT_PROCESS_STR_FUN(osrelease, os_release);
+FEAT_PROCESS_STR_FUN(version, version);
+FEAT_PROCESS_STR_FUN(arch, arch);
+FEAT_PROCESS_STR_FUN(cpudesc, cpu_desc);
+FEAT_PROCESS_STR_FUN(cpuid, cpuid);
 
-static int process_osrelease(struct perf_file_section *section __maybe_unused,
-			     struct perf_header *ph, int fd,
-			     void *data __maybe_unused)
+static int process_tracing_data(struct feat_fd *ff, void *data)
 {
-	ph->env.os_release = do_read_string(fd, ph);
-	return ph->env.os_release ? 0 : -ENOMEM;
-}
+	ssize_t ret = trace_report(ff->fd, data, false);
 
-static int process_version(struct perf_file_section *section __maybe_unused,
-			   struct perf_header *ph, int fd,
-			   void *data __maybe_unused)
-{
-	ph->env.version = do_read_string(fd, ph);
-	return ph->env.version ? 0 : -ENOMEM;
+	return ret < 0 ? -1 : 0;
 }
 
-static int process_arch(struct perf_file_section *section __maybe_unused,
-			struct perf_header *ph,	int fd,
-			void *data __maybe_unused)
+static int process_build_id(struct feat_fd *ff, void *data __maybe_unused)
 {
-	ph->env.arch = do_read_string(fd, ph);
-	return ph->env.arch ? 0 : -ENOMEM;
+	if (perf_header__read_build_ids(ff->ph, ff->fd, ff->offset, ff->size))
+		pr_debug("Failed to read buildids, continuing...\n");
+	return 0;
 }
 
-static int process_nrcpus(struct perf_file_section *section __maybe_unused,
-			  struct perf_header *ph, int fd,
-			  void *data __maybe_unused)
+static int process_nrcpus(struct feat_fd *ff, void *data __maybe_unused)
 {
-	ssize_t ret;
-	u32 nr;
-
-	ret = readn(fd, &nr, sizeof(nr));
-	if (ret != sizeof(nr))
-		return -1;
-
-	if (ph->needs_swap)
-		nr = bswap_32(nr);
-
-	ph->env.nr_cpus_avail = nr;
-
-	ret = readn(fd, &nr, sizeof(nr));
-	if (ret != sizeof(nr))
-		return -1;
+	int ret;
+	u32 nr_cpus_avail, nr_cpus_online;
 
-	if (ph->needs_swap)
-		nr = bswap_32(nr);
+	ret = do_read_u32(ff, &nr_cpus_avail);
+	if (ret)
+		return ret;
 
-	ph->env.nr_cpus_online = nr;
+	ret = do_read_u32(ff, &nr_cpus_online);
+	if (ret)
+		return ret;
+	ff->ph->env.nr_cpus_avail = (int)nr_cpus_avail;
+	ff->ph->env.nr_cpus_online = (int)nr_cpus_online;
 	return 0;
 }
 
-static int process_cpudesc(struct perf_file_section *section __maybe_unused,
-			   struct perf_header *ph, int fd,
-			   void *data __maybe_unused)
-{
-	ph->env.cpu_desc = do_read_string(fd, ph);
-	return ph->env.cpu_desc ? 0 : -ENOMEM;
-}
-
-static int process_cpuid(struct perf_file_section *section __maybe_unused,
-			 struct perf_header *ph,  int fd,
-			 void *data __maybe_unused)
-{
-	ph->env.cpuid = do_read_string(fd, ph);
-	return ph->env.cpuid ? 0 : -ENOMEM;
-}
-
-static int process_total_mem(struct perf_file_section *section __maybe_unused,
-			     struct perf_header *ph, int fd,
-			     void *data __maybe_unused)
+static int process_total_mem(struct feat_fd *ff, void *data __maybe_unused)
 {
-	uint64_t mem;
-	ssize_t ret;
+	u64 total_mem;
+	int ret;
 
-	ret = readn(fd, &mem, sizeof(mem));
-	if (ret != sizeof(mem))
+	ret = do_read_u64(ff, &total_mem);
+	if (ret)
 		return -1;
-
-	if (ph->needs_swap)
-		mem = bswap_64(mem);
-
-	ph->env.total_mem = mem;
+	ff->ph->env.total_mem = (unsigned long long)total_mem;
 	return 0;
 }
 
@@ -1731,43 +1752,42 @@ perf_evlist__set_event_name(struct perf_evlist *evlist,
 }
 
 static int
-process_event_desc(struct perf_file_section *section __maybe_unused,
-		   struct perf_header *header, int fd,
-		   void *data __maybe_unused)
+process_event_desc(struct feat_fd *ff, void *data __maybe_unused)
 {
 	struct perf_session *session;
-	struct perf_evsel *evsel, *events = read_event_desc(header, fd);
+	struct perf_evsel *evsel, *events = read_event_desc(ff);
 
 	if (!events)
 		return 0;
 
-	session = container_of(header, struct perf_session, header);
+	session = container_of(ff->ph, struct perf_session, header);
+
+	if (session->file->is_pipe) {
+		/* Save events for reading later by print_event_desc,
+		 * since they can't be read again in pipe mode. */
+		ff->events = events;
+	}
+
 	for (evsel = events; evsel->attr.size; evsel++)
 		perf_evlist__set_event_name(session->evlist, evsel);
 
-	free_event_desc(events);
+	if (!session->file->is_pipe)
+		free_event_desc(events);
 
 	return 0;
 }
 
-static int process_cmdline(struct perf_file_section *section,
-			   struct perf_header *ph, int fd,
-			   void *data __maybe_unused)
+static int process_cmdline(struct feat_fd *ff, void *data __maybe_unused)
 {
-	ssize_t ret;
 	char *str, *cmdline = NULL, **argv = NULL;
 	u32 nr, i, len = 0;
 
-	ret = readn(fd, &nr, sizeof(nr));
-	if (ret != sizeof(nr))
+	if (do_read_u32(ff, &nr))
 		return -1;
 
-	if (ph->needs_swap)
-		nr = bswap_32(nr);
-
-	ph->env.nr_cmdline = nr;
+	ff->ph->env.nr_cmdline = nr;
 
-	cmdline = zalloc(section->size + nr + 1);
+	cmdline = zalloc(ff->size + nr + 1);
 	if (!cmdline)
 		return -1;
 
@@ -1776,7 +1796,7 @@ static int process_cmdline(struct perf_file_section *section,
 		goto error;
 
 	for (i = 0; i < nr; i++) {
-		str = do_read_string(fd, ph);
+		str = do_read_string(ff);
 		if (!str)
 			goto error;
 
@@ -1785,8 +1805,8 @@ static int process_cmdline(struct perf_file_section *section,
 		len += strlen(str) + 1;
 		free(str);
 	}
-	ph->env.cmdline = cmdline;
-	ph->env.cmdline_argv = (const char **) argv;
+	ff->ph->env.cmdline = cmdline;
+	ff->ph->env.cmdline_argv = (const char **) argv;
 	return 0;
 
 error:
@@ -1795,35 +1815,29 @@ error:
 	return -1;
 }
 
-static int process_cpu_topology(struct perf_file_section *section,
-				struct perf_header *ph, int fd,
-				void *data __maybe_unused)
+static int process_cpu_topology(struct feat_fd *ff, void *data __maybe_unused)
 {
-	ssize_t ret;
 	u32 nr, i;
 	char *str;
 	struct strbuf sb;
-	int cpu_nr = ph->env.nr_cpus_avail;
+	int cpu_nr = ff->ph->env.nr_cpus_avail;
 	u64 size = 0;
+	struct perf_header *ph = ff->ph;
 
 	ph->env.cpu = calloc(cpu_nr, sizeof(*ph->env.cpu));
 	if (!ph->env.cpu)
 		return -1;
 
-	ret = readn(fd, &nr, sizeof(nr));
-	if (ret != sizeof(nr))
+	if (do_read_u32(ff, &nr))
 		goto free_cpu;
 
-	if (ph->needs_swap)
-		nr = bswap_32(nr);
-
 	ph->env.nr_sibling_cores = nr;
 	size += sizeof(u32);
 	if (strbuf_init(&sb, 128) < 0)
 		goto free_cpu;
 
 	for (i = 0; i < nr; i++) {
-		str = do_read_string(fd, ph);
+		str = do_read_string(ff);
 		if (!str)
 			goto error;
 
@@ -1835,18 +1849,14 @@ static int process_cpu_topology(struct perf_file_section *section,
 	}
 	ph->env.sibling_cores = strbuf_detach(&sb, NULL);
 
-	ret = readn(fd, &nr, sizeof(nr));
-	if (ret != sizeof(nr))
+	if (do_read_u32(ff, &nr))
 		return -1;
 
-	if (ph->needs_swap)
-		nr = bswap_32(nr);
-
 	ph->env.nr_sibling_threads = nr;
 	size += sizeof(u32);
 
 	for (i = 0; i < nr; i++) {
-		str = do_read_string(fd, ph);
+		str = do_read_string(ff);
 		if (!str)
 			goto error;
 
@@ -1862,28 +1872,20 @@ static int process_cpu_topology(struct perf_file_section *section,
 	 * The header may be from old perf,
 	 * which doesn't include core id and socket id information.
 	 */
-	if (section->size <= size) {
+	if (ff->size <= size) {
 		zfree(&ph->env.cpu);
 		return 0;
 	}
 
 	for (i = 0; i < (u32)cpu_nr; i++) {
-		ret = readn(fd, &nr, sizeof(nr));
-		if (ret != sizeof(nr))
+		if (do_read_u32(ff, &nr))
 			goto free_cpu;
 
-		if (ph->needs_swap)
-			nr = bswap_32(nr);
-
 		ph->env.cpu[i].core_id = nr;
 
-		ret = readn(fd, &nr, sizeof(nr));
-		if (ret != sizeof(nr))
+		if (do_read_u32(ff, &nr))
 			goto free_cpu;
 
-		if (ph->needs_swap)
-			nr = bswap_32(nr);
-
 		if (nr != (u32)-1 && nr > (u32)cpu_nr) {
 			pr_debug("socket_id number is too big."
 				 "You may need to upgrade the perf tool.\n");
@@ -1902,23 +1904,16 @@ free_cpu:
 	return -1;
 }
 
-static int process_numa_topology(struct perf_file_section *section __maybe_unused,
-				 struct perf_header *ph, int fd,
-				 void *data __maybe_unused)
+static int process_numa_topology(struct feat_fd *ff, void *data __maybe_unused)
 {
 	struct numa_node *nodes, *n;
-	ssize_t ret;
 	u32 nr, i;
 	char *str;
 
 	/* nr nodes */
-	ret = readn(fd, &nr, sizeof(nr));
-	if (ret != sizeof(nr))
+	if (do_read_u32(ff, &nr))
 		return -1;
 
-	if (ph->needs_swap)
-		nr = bswap_32(nr);
-
 	nodes = zalloc(sizeof(*nodes) * nr);
 	if (!nodes)
 		return -ENOMEM;
@@ -1927,25 +1922,16 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse
 		n = &nodes[i];
 
 		/* node number */
-		ret = readn(fd, &n->node, sizeof(u32));
-		if (ret != sizeof(n->node))
+		if (do_read_u32(ff, &n->node))
 			goto error;
 
-		ret = readn(fd, &n->mem_total, sizeof(u64));
-		if (ret != sizeof(u64))
+		if (do_read_u64(ff, &n->mem_total))
 			goto error;
 
-		ret = readn(fd, &n->mem_free, sizeof(u64));
-		if (ret != sizeof(u64))
+		if (do_read_u64(ff, &n->mem_free))
 			goto error;
 
-		if (ph->needs_swap) {
-			n->node      = bswap_32(n->node);
-			n->mem_total = bswap_64(n->mem_total);
-			n->mem_free  = bswap_64(n->mem_free);
-		}
-
-		str = do_read_string(fd, ph);
+		str = do_read_string(ff);
 		if (!str)
 			goto error;
 
@@ -1955,8 +1941,8 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse
 
 		free(str);
 	}
-	ph->env.nr_numa_nodes = nr;
-	ph->env.numa_nodes = nodes;
+	ff->ph->env.nr_numa_nodes = nr;
+	ff->ph->env.numa_nodes = nodes;
 	return 0;
 
 error:
@@ -1964,39 +1950,30 @@ error:
 	return -1;
 }
 
-static int process_pmu_mappings(struct perf_file_section *section __maybe_unused,
-				struct perf_header *ph, int fd,
-				void *data __maybe_unused)
+static int process_pmu_mappings(struct feat_fd *ff, void *data __maybe_unused)
 {
-	ssize_t ret;
 	char *name;
 	u32 pmu_num;
 	u32 type;
 	struct strbuf sb;
 
-	ret = readn(fd, &pmu_num, sizeof(pmu_num));
-	if (ret != sizeof(pmu_num))
+	if (do_read_u32(ff, &pmu_num))
 		return -1;
 
-	if (ph->needs_swap)
-		pmu_num = bswap_32(pmu_num);
-
 	if (!pmu_num) {
 		pr_debug("pmu mappings not available\n");
 		return 0;
 	}
 
-	ph->env.nr_pmu_mappings = pmu_num;
+	ff->ph->env.nr_pmu_mappings = pmu_num;
 	if (strbuf_init(&sb, 128) < 0)
 		return -1;
 
 	while (pmu_num) {
-		if (readn(fd, &type, sizeof(type)) != sizeof(type))
+		if (do_read_u32(ff, &type))
 			goto error;
-		if (ph->needs_swap)
-			type = bswap_32(type);
 
-		name = do_read_string(fd, ph);
+		name = do_read_string(ff);
 		if (!name)
 			goto error;
 
@@ -2007,12 +1984,12 @@ static int process_pmu_mappings(struct perf_file_section *section __maybe_unused
 			goto error;
 
 		if (!strcmp(name, "msr"))
-			ph->env.msr_pmu_type = type;
+			ff->ph->env.msr_pmu_type = type;
 
 		free(name);
 		pmu_num--;
 	}
-	ph->env.pmu_mappings = strbuf_detach(&sb, NULL);
+	ff->ph->env.pmu_mappings = strbuf_detach(&sb, NULL);
 	return 0;
 
 error:
@@ -2020,9 +1997,7 @@ error:
 	return -1;
 }
 
-static int process_group_desc(struct perf_file_section *section __maybe_unused,
-			      struct perf_header *ph, int fd,
-			      void *data __maybe_unused)
+static int process_group_desc(struct feat_fd *ff, void *data __maybe_unused)
 {
 	size_t ret = -1;
 	u32 i, nr, nr_groups;
@@ -2034,13 +2009,10 @@ static int process_group_desc(struct perf_file_section *section __maybe_unused,
 		u32 nr_members;
 	} *desc;
 
-	if (readn(fd, &nr_groups, sizeof(nr_groups)) != sizeof(nr_groups))
+	if (do_read_u32(ff, &nr_groups))
 		return -1;
 
-	if (ph->needs_swap)
-		nr_groups = bswap_32(nr_groups);
-
-	ph->env.nr_groups = nr_groups;
+	ff->ph->env.nr_groups = nr_groups;
 	if (!nr_groups) {
 		pr_debug("group desc not available\n");
 		return 0;
@@ -2051,26 +2023,21 @@ static int process_group_desc(struct perf_file_section *section __maybe_unused,
 		return -1;
 
 	for (i = 0; i < nr_groups; i++) {
-		desc[i].name = do_read_string(fd, ph);
+		desc[i].name = do_read_string(ff);
 		if (!desc[i].name)
 			goto out_free;
 
-		if (readn(fd, &desc[i].leader_idx, sizeof(u32)) != sizeof(u32))
+		if (do_read_u32(ff, &desc[i].leader_idx))
 			goto out_free;
 
-		if (readn(fd, &desc[i].nr_members, sizeof(u32)) != sizeof(u32))
+		if (do_read_u32(ff, &desc[i].nr_members))
 			goto out_free;
-
-		if (ph->needs_swap) {
-			desc[i].leader_idx = bswap_32(desc[i].leader_idx);
-			desc[i].nr_members = bswap_32(desc[i].nr_members);
-		}
 	}
 
 	/*
 	 * Rebuild group relationship based on the group_desc
 	 */
-	session = container_of(ph, struct perf_session, header);
+	session = container_of(ff->ph, struct perf_session, header);
 	session->evlist->nr_groups = nr_groups;
 
 	i = nr = 0;
@@ -2114,44 +2081,34 @@ out_free:
 	return ret;
 }
 
-static int process_auxtrace(struct perf_file_section *section,
-			    struct perf_header *ph, int fd,
-			    void *data __maybe_unused)
+static int process_auxtrace(struct feat_fd *ff, void *data __maybe_unused)
 {
 	struct perf_session *session;
 	int err;
 
-	session = container_of(ph, struct perf_session, header);
+	session = container_of(ff->ph, struct perf_session, header);
 
-	err = auxtrace_index__process(fd, section->size, session,
-				      ph->needs_swap);
+	err = auxtrace_index__process(ff->fd, ff->size, session,
+				      ff->ph->needs_swap);
 	if (err < 0)
 		pr_err("Failed to process auxtrace index\n");
 	return err;
 }
 
-static int process_cache(struct perf_file_section *section __maybe_unused,
-			 struct perf_header *ph __maybe_unused, int fd __maybe_unused,
-			 void *data __maybe_unused)
+static int process_cache(struct feat_fd *ff, void *data __maybe_unused)
 {
 	struct cpu_cache_level *caches;
 	u32 cnt, i, version;
 
-	if (readn(fd, &version, sizeof(version)) != sizeof(version))
+	if (do_read_u32(ff, &version))
 		return -1;
 
-	if (ph->needs_swap)
-		version = bswap_32(version);
-
 	if (version != 1)
 		return -1;
 
-	if (readn(fd, &cnt, sizeof(cnt)) != sizeof(cnt))
+	if (do_read_u32(ff, &cnt))
 		return -1;
 
-	if (ph->needs_swap)
-		cnt = bswap_32(cnt);
-
 	caches = zalloc(sizeof(*caches) * cnt);
 	if (!caches)
 		return -1;
@@ -2160,10 +2117,8 @@ static int process_cache(struct perf_file_section *section __maybe_unused,
 		struct cpu_cache_level c;
 
 		#define _R(v)						\
-			if (readn(fd, &c.v, sizeof(u32)) != sizeof(u32))\
+			if (do_read_u32(ff, &c.v))\
 				goto out_free_caches;			\
-			if (ph->needs_swap)				\
-				c.v = bswap_32(c.v);			\
 
 		_R(level)
 		_R(line_size)
@@ -2171,9 +2126,9 @@ static int process_cache(struct perf_file_section *section __maybe_unused,
 		_R(ways)
 		#undef _R
 
-		#define _R(v)				\
-			c.v = do_read_string(fd, ph);	\
-			if (!c.v)			\
+		#define _R(v)					\
+			c.v = do_read_string(ff);		\
+			if (!c.v)				\
 				goto out_free_caches;
 
 		_R(type)
@@ -2184,8 +2139,8 @@ static int process_cache(struct perf_file_section *section __maybe_unused,
 		caches[i] = c;
 	}
 
-	ph->env.caches = caches;
-	ph->env.caches_cnt = cnt;
+	ff->ph->env.caches = caches;
+	ff->ph->env.caches_cnt = cnt;
 	return 0;
 out_free_caches:
 	free(caches);
@@ -2193,48 +2148,62 @@ out_free_caches:
 }
 
 struct feature_ops {
-	int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist);
-	void (*print)(struct perf_header *h, int fd, FILE *fp);
-	int (*process)(struct perf_file_section *section,
-		       struct perf_header *h, int fd, void *data);
+	int (*write)(struct feat_fd *ff, struct perf_evlist *evlist);
+	void (*print)(struct feat_fd *ff, FILE *fp);
+	int (*process)(struct feat_fd *ff, void *data);
 	const char *name;
 	bool full_only;
+	bool synthesize;
 };
 
-#define FEAT_OPA(n, func) \
-	[n] = { .name = #n, .write = write_##func, .print = print_##func }
-#define FEAT_OPP(n, func) \
-	[n] = { .name = #n, .write = write_##func, .print = print_##func, \
-		.process = process_##func }
-#define FEAT_OPF(n, func) \
-	[n] = { .name = #n, .write = write_##func, .print = print_##func, \
-		.process = process_##func, .full_only = true }
+#define FEAT_OPR(n, func, __full_only) \
+	[HEADER_##n] = {					\
+		.name	    = __stringify(n),			\
+		.write	    = write_##func,			\
+		.print	    = print_##func,			\
+		.full_only  = __full_only,			\
+		.process    = process_##func,			\
+		.synthesize = true				\
+	}
+/* Like FEAT_OPR, but skipped by perf_event__synthesize_features(). */
+#define FEAT_OPN(n, func, __full_only) \
+	[HEADER_##n] = {					\
+		.name	    = __stringify(n),			\
+		.write	    = write_##func,			\
+		.print	    = print_##func,			\
+		.full_only  = __full_only,			\
+		.process    = process_##func			\
+	}
 
 /* feature_ops not implemented: */
 #define print_tracing_data	NULL
 #define print_build_id		NULL
 
+#define process_branch_stack	NULL
+#define process_stat		NULL
+
+
 static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
-	FEAT_OPP(HEADER_TRACING_DATA,	tracing_data),
-	FEAT_OPP(HEADER_BUILD_ID,	build_id),
-	FEAT_OPP(HEADER_HOSTNAME,	hostname),
-	FEAT_OPP(HEADER_OSRELEASE,	osrelease),
-	FEAT_OPP(HEADER_VERSION,	version),
-	FEAT_OPP(HEADER_ARCH,		arch),
-	FEAT_OPP(HEADER_NRCPUS,		nrcpus),
-	FEAT_OPP(HEADER_CPUDESC,	cpudesc),
-	FEAT_OPP(HEADER_CPUID,		cpuid),
-	FEAT_OPP(HEADER_TOTAL_MEM,	total_mem),
-	FEAT_OPP(HEADER_EVENT_DESC,	event_desc),
-	FEAT_OPP(HEADER_CMDLINE,	cmdline),
-	FEAT_OPF(HEADER_CPU_TOPOLOGY,	cpu_topology),
-	FEAT_OPF(HEADER_NUMA_TOPOLOGY,	numa_topology),
-	FEAT_OPA(HEADER_BRANCH_STACK,	branch_stack),
-	FEAT_OPP(HEADER_PMU_MAPPINGS,	pmu_mappings),
-	FEAT_OPP(HEADER_GROUP_DESC,	group_desc),
-	FEAT_OPP(HEADER_AUXTRACE,	auxtrace),
-	FEAT_OPA(HEADER_STAT,		stat),
-	FEAT_OPF(HEADER_CACHE,		cache),
+	FEAT_OPN(TRACING_DATA,	tracing_data,	false),
+	FEAT_OPN(BUILD_ID,	build_id,	false),
+	FEAT_OPR(HOSTNAME,	hostname,	false),
+	FEAT_OPR(OSRELEASE,	osrelease,	false),
+	FEAT_OPR(VERSION,	version,	false),
+	FEAT_OPR(ARCH,		arch,		false),
+	FEAT_OPR(NRCPUS,	nrcpus,		false),
+	FEAT_OPR(CPUDESC,	cpudesc,	false),
+	FEAT_OPR(CPUID,		cpuid,		false),
+	FEAT_OPR(TOTAL_MEM,	total_mem,	false),
+	FEAT_OPR(EVENT_DESC,	event_desc,	false),
+	FEAT_OPR(CMDLINE,	cmdline,	false),
+	FEAT_OPR(CPU_TOPOLOGY,	cpu_topology,	true),
+	FEAT_OPR(NUMA_TOPOLOGY,	numa_topology,	true),
+	FEAT_OPN(BRANCH_STACK,	branch_stack,	false),
+	FEAT_OPR(PMU_MAPPINGS,	pmu_mappings,	false),
+	FEAT_OPN(GROUP_DESC,	group_desc,	false),
+	FEAT_OPN(AUXTRACE,	auxtrace,	false),
+	FEAT_OPN(STAT,		stat,		false),
+	FEAT_OPN(CACHE,		cache,		true),
 };
 
 struct header_print_data {
@@ -2247,6 +2216,7 @@ static int perf_file_section__fprintf_info(struct perf_file_section *section,
 					   int feat, int fd, void *data)
 {
 	struct header_print_data *hd = data;
+	struct feat_fd ff;
 
 	if (lseek(fd, section->offset, SEEK_SET) == (off_t)-1) {
 		pr_debug("Failed to lseek to %" PRIu64 " offset for feature "
@@ -2260,8 +2230,13 @@ static int perf_file_section__fprintf_info(struct perf_file_section *section,
 	if (!feat_ops[feat].print)
 		return 0;
 
+	ff = (struct  feat_fd) {
+		.fd = fd,
+		.ph = ph,
+	};
+
 	if (!feat_ops[feat].full_only || hd->full)
-		feat_ops[feat].print(ph, fd, hd->fp);
+		feat_ops[feat].print(&ff, hd->fp);
 	else
 		fprintf(hd->fp, "# %s info available, use -I to display\n",
 			feat_ops[feat].name);
@@ -2302,29 +2277,32 @@ int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full)
 	return 0;
 }
 
-static int do_write_feat(int fd, struct perf_header *h, int type,
+static int do_write_feat(struct feat_fd *ff, int type,
 			 struct perf_file_section **p,
 			 struct perf_evlist *evlist)
 {
 	int err;
 	int ret = 0;
 
-	if (perf_header__has_feat(h, type)) {
+	if (perf_header__has_feat(ff->ph, type)) {
 		if (!feat_ops[type].write)
 			return -1;
 
-		(*p)->offset = lseek(fd, 0, SEEK_CUR);
+		if (WARN(ff->buf, "Error: calling %s in pipe-mode.\n", __func__))
+			return -1;
 
-		err = feat_ops[type].write(fd, h, evlist);
+		(*p)->offset = lseek(ff->fd, 0, SEEK_CUR);
+
+		err = feat_ops[type].write(ff, evlist);
 		if (err < 0) {
 			pr_debug("failed to write feature %s\n", feat_ops[type].name);
 
 			/* undo anything written */
-			lseek(fd, (*p)->offset, SEEK_SET);
+			lseek(ff->fd, (*p)->offset, SEEK_SET);
 
 			return -1;
 		}
-		(*p)->size = lseek(fd, 0, SEEK_CUR) - (*p)->offset;
+		(*p)->size = lseek(ff->fd, 0, SEEK_CUR) - (*p)->offset;
 		(*p)++;
 	}
 	return ret;
@@ -2334,12 +2312,18 @@ static int perf_header__adds_write(struct perf_header *header,
 				   struct perf_evlist *evlist, int fd)
 {
 	int nr_sections;
+	struct feat_fd ff;
 	struct perf_file_section *feat_sec, *p;
 	int sec_size;
 	u64 sec_start;
 	int feat;
 	int err;
 
+	ff = (struct feat_fd){
+		.fd  = fd,
+		.ph = header,
+	};
+
 	nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS);
 	if (!nr_sections)
 		return 0;
@@ -2354,7 +2338,7 @@ static int perf_header__adds_write(struct perf_header *header,
 	lseek(fd, sec_start + sec_size, SEEK_SET);
 
 	for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) {
-		if (do_write_feat(fd, header, feat, &p, evlist))
+		if (do_write_feat(&ff, feat, &p, evlist))
 			perf_header__clear_feat(header, feat);
 	}
 
@@ -2363,7 +2347,7 @@ static int perf_header__adds_write(struct perf_header *header,
 	 * may write more than needed due to dropped feature, but
 	 * this is okay, reader will skip the mising entries
 	 */
-	err = do_write(fd, feat_sec, sec_size);
+	err = do_write(&ff, feat_sec, sec_size);
 	if (err < 0)
 		pr_debug("failed to write feature section\n");
 	free(feat_sec);
@@ -2373,14 +2357,17 @@ static int perf_header__adds_write(struct perf_header *header,
 int perf_header__write_pipe(int fd)
 {
 	struct perf_pipe_file_header f_header;
+	struct feat_fd ff;
 	int err;
 
+	ff = (struct feat_fd){ .fd = fd };
+
 	f_header = (struct perf_pipe_file_header){
 		.magic	   = PERF_MAGIC,
 		.size	   = sizeof(f_header),
 	};
 
-	err = do_write(fd, &f_header, sizeof(f_header));
+	err = do_write(&ff, &f_header, sizeof(f_header));
 	if (err < 0) {
 		pr_debug("failed to write perf pipe header\n");
 		return err;
@@ -2397,21 +2384,23 @@ int perf_session__write_header(struct perf_session *session,
 	struct perf_file_attr   f_attr;
 	struct perf_header *header = &session->header;
 	struct perf_evsel *evsel;
+	struct feat_fd ff;
 	u64 attr_offset;
 	int err;
 
+	ff = (struct feat_fd){ .fd = fd};
 	lseek(fd, sizeof(f_header), SEEK_SET);
 
 	evlist__for_each_entry(session->evlist, evsel) {
 		evsel->id_offset = lseek(fd, 0, SEEK_CUR);
-		err = do_write(fd, evsel->id, evsel->ids * sizeof(u64));
+		err = do_write(&ff, evsel->id, evsel->ids * sizeof(u64));
 		if (err < 0) {
 			pr_debug("failed to write perf header\n");
 			return err;
 		}
 	}
 
-	attr_offset = lseek(fd, 0, SEEK_CUR);
+	attr_offset = lseek(ff.fd, 0, SEEK_CUR);
 
 	evlist__for_each_entry(evlist, evsel) {
 		f_attr = (struct perf_file_attr){
@@ -2421,7 +2410,7 @@ int perf_session__write_header(struct perf_session *session,
 				.size   = evsel->ids * sizeof(u64),
 			}
 		};
-		err = do_write(fd, &f_attr, sizeof(f_attr));
+		err = do_write(&ff, &f_attr, sizeof(f_attr));
 		if (err < 0) {
 			pr_debug("failed to write perf header attribute\n");
 			return err;
@@ -2456,7 +2445,7 @@ int perf_session__write_header(struct perf_session *session,
 	memcpy(&f_header.adds_features, &header->adds_features, sizeof(header->adds_features));
 
 	lseek(fd, 0, SEEK_SET);
-	err = do_write(fd, &f_header, sizeof(f_header));
+	err = do_write(&ff, &f_header, sizeof(f_header));
 	if (err < 0) {
 		pr_debug("failed to write perf header\n");
 		return err;
@@ -2710,6 +2699,13 @@ static int perf_file_section__process(struct perf_file_section *section,
 				      struct perf_header *ph,
 				      int feat, int fd, void *data)
 {
+	struct feat_fd fdd = {
+		.fd	= fd,
+		.ph	= ph,
+		.size	= section->size,
+		.offset	= section->offset,
+	};
+
 	if (lseek(fd, section->offset, SEEK_SET) == (off_t)-1) {
 		pr_debug("Failed to lseek to %" PRIu64 " offset for feature "
 			  "%d, continuing...\n", section->offset, feat);
@@ -2724,13 +2720,17 @@ static int perf_file_section__process(struct perf_file_section *section,
 	if (!feat_ops[feat].process)
 		return 0;
 
-	return feat_ops[feat].process(section, ph, fd, data);
+	return feat_ops[feat].process(&fdd, data);
 }
 
 static int perf_file_header__read_pipe(struct perf_pipe_file_header *header,
 				       struct perf_header *ph, int fd,
 				       bool repipe)
 {
+	struct feat_fd ff = {
+		.fd = STDOUT_FILENO,
+		.ph = ph,
+	};
 	ssize_t ret;
 
 	ret = readn(fd, header, sizeof(*header));
@@ -2745,7 +2745,7 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *header,
 	if (ph->needs_swap)
 		header->size = bswap_64(header->size);
 
-	if (repipe && do_write(STDOUT_FILENO, header, sizeof(*header)) < 0)
+	if (repipe && do_write(&ff, header, sizeof(*header)) < 0)
 		return -1;
 
 	return 0;
@@ -2995,6 +2995,103 @@ int perf_event__synthesize_attr(struct perf_tool *tool,
 	return err;
 }
 
+int perf_event__synthesize_features(struct perf_tool *tool,
+				    struct perf_session *session,
+				    struct perf_evlist *evlist,
+				    perf_event__handler_t process)
+{
+	struct perf_header *header = &session->header;
+	struct feat_fd ff;
+	struct feature_event *fe;
+	size_t sz, sz_hdr;
+	int feat, ret;
+
+	sz_hdr = sizeof(fe->header);
+	sz = sizeof(union perf_event);
+	/* get a nice alignment */
+	sz = PERF_ALIGN(sz, page_size);
+	/* Scratch buffer reused for every feature; ff.fd and ff.ph stay zeroed. */
+	memset(&ff, 0, sizeof(ff));
+
+	ff.buf = malloc(sz);
+	if (!ff.buf)
+		return -ENOMEM;
+
+	ff.size = sz - sz_hdr;
+	/* Hand one event to process() per set feature bit marked .synthesize. */
+	for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) {
+		if (!feat_ops[feat].synthesize) {
+			pr_debug("No record header feature for header :%d\n", feat);
+			continue;
+		}
+		/* Start past the feature_event head, which is filled in below. */
+		ff.offset = sizeof(*fe);
+
+		ret = feat_ops[feat].write(&ff, evlist);
+		if (ret || ff.offset <= (ssize_t)sizeof(*fe)) {
+			pr_debug("Error writing feature\n");
+			continue;
+		}
+		/* ff.buf may have changed due to realloc in do_write() */
+		fe = ff.buf;
+		memset(fe, 0, sizeof(*fe));
+
+		fe->feat_id = feat;
+		fe->header.type = PERF_RECORD_HEADER_FEATURE;
+		fe->header.size = ff.offset;
+
+		ret = process(tool, ff.buf, NULL, NULL);
+		if (ret) {
+			free(ff.buf);
+			return ret;
+		}
+	}
+	free(ff.buf);
+	return 0;
+}
+
+int perf_event__process_feature(struct perf_tool *tool,
+				union perf_event *event,
+				struct perf_session *session)
+{
+	struct feat_fd ff = { .fd = 0 };
+	struct feature_event *fe = (struct feature_event *)event;
+	int type = fe->header.type;
+	u64 feat = fe->feat_id;
+
+	if (type < 0 || type >= PERF_RECORD_HEADER_MAX) {
+		pr_warning("invalid record type %d in pipe-mode\n", type);
+		return 0;
+	}
+	if (feat == HEADER_RESERVED || feat >= HEADER_LAST_FEATURE) {
+		pr_warning("invalid feature %" PRIu64 " in pipe-mode\n", feat);
+		return -1;
+	}
+	/* feat_ops[] has HEADER_LAST_FEATURE entries, so feat is a valid index. */
+	if (!feat_ops[feat].process)
+		return 0;
+	/* Payload = record minus the whole feature_event head, as synthesized. */
+	ff.buf  = (void *)fe->data;
+	ff.size = event->header.size - sizeof(*fe);
+	ff.ph = &session->header;
+
+	if (feat_ops[feat].process(&ff, NULL))
+		return -1;
+	/* Printing is optional and gated by the tool's show_feat_hdr level. */
+	if (!feat_ops[feat].print || !tool->show_feat_hdr)
+		return 0;
+
+	if (!feat_ops[feat].full_only ||
+	    tool->show_feat_hdr >= SHOW_FEAT_HEADER_FULL_INFO) {
+		feat_ops[feat].print(&ff, stdout);
+	} else {
+		fprintf(stdout, "# %s info available, use -I to display\n",
+			feat_ops[feat].name);
+	}
+
+	return 0;
+}
+
 static struct event_update_event *
 event_update_event__new(size_t size, u64 type, u64 id)
 {
@@ -3253,6 +3350,7 @@ int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd,
 	union perf_event ev;
 	struct tracing_data *tdata;
 	ssize_t size = 0, aligned_size = 0, padding;
+	struct feat_fd ff;
 	int err __maybe_unused = 0;
 
 	/*
@@ -3287,7 +3385,9 @@ int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd,
 	 */
 	tracing_data_put(tdata);
 
-	write_padded(fd, NULL, 0, padding);
+	ff = (struct feat_fd){ .fd = fd };
+	if (write_padded(&ff, NULL, 0, padding))
+		return -1;
 
 	return aligned_size;
 }
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index d30109b421ee..f7a16ee527b8 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -101,6 +101,15 @@ int perf_header__process_sections(struct perf_header *header, int fd,
 
 int perf_header__fprintf_info(struct perf_session *s, FILE *fp, bool full);
 
+int perf_event__synthesize_features(struct perf_tool *tool,
+				    struct perf_session *session,
+				    struct perf_evlist *evlist,
+				    perf_event__handler_t process);
+
+int perf_event__process_feature(struct perf_tool *tool,
+				union perf_event *event,
+				struct perf_session *session);
+
 int perf_event__synthesize_attr(struct perf_tool *tool,
 				struct perf_event_attr *attr, u32 ids, u64 *id,
 				perf_event__handler_t process);
@@ -144,7 +153,12 @@ bool is_perf_magic(u64 magic);
 
 #define NAME_ALIGN 64
 
-int write_padded(int fd, const void *bf, size_t count, size_t count_aligned);
+struct feat_fd;
+
+int do_write(struct feat_fd *fd, const void *buf, size_t size);
+
+int write_padded(struct feat_fd *fd, const void *bf,
+		 size_t count, size_t count_aligned);
 
 /*
  * arch specific callback
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index cf0186a088c1..9453b2e27015 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -749,12 +749,9 @@ iter_prepare_branch_entry(struct hist_entry_iter *iter, struct addr_location *al
 }
 
 static int
-iter_add_single_branch_entry(struct hist_entry_iter *iter,
+iter_add_single_branch_entry(struct hist_entry_iter *iter __maybe_unused,
 			     struct addr_location *al __maybe_unused)
 {
-	/* to avoid calling callback function */
-	iter->he = NULL;
-
 	return 0;
 }
 
@@ -1762,6 +1759,8 @@ void perf_evsel__output_resort(struct perf_evsel *evsel, struct ui_progress *pro
 	else
 		use_callchain = symbol_conf.use_callchain;
 
+	use_callchain |= symbol_conf.show_branchflag_count;
+
 	output_resort(evsel__hists(evsel), prog, use_callchain, NULL);
 }
 
diff --git a/tools/perf/util/intel-pt-decoder/Build b/tools/perf/util/intel-pt-decoder/Build
index 7aca5d6d7e1f..10e0814bb8d2 100644
--- a/tools/perf/util/intel-pt-decoder/Build
+++ b/tools/perf/util/intel-pt-decoder/Build
@@ -25,6 +25,6 @@ $(OUTPUT)util/intel-pt-decoder/intel-pt-insn-decoder.o: util/intel-pt-decoder/in
 
 CFLAGS_intel-pt-insn-decoder.o += -I$(OUTPUT)util/intel-pt-decoder
 
-ifneq ($(CC), clang)
+ifeq ($(CC_NO_CLANG), 1)
   CFLAGS_intel-pt-insn-decoder.o += -Wno-override-init
 endif
diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c
index c6a15f204c03..209b0c82eff4 100644
--- a/tools/perf/util/llvm-utils.c
+++ b/tools/perf/util/llvm-utils.c
@@ -33,7 +33,7 @@ struct llvm_param llvm_param = {
 
 int perf_llvm_config(const char *var, const char *value)
 {
-	if (prefixcmp(var, "llvm."))
+	if (!strstarts(var, "llvm."))
 		return 0;
 	var += sizeof("llvm.") - 1;
 
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 2e9eb6aa3ce2..5c8eacaca4f4 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -705,7 +705,8 @@ size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp)
 
 	if (kdso->has_build_id) {
 		char filename[PATH_MAX];
-		if (dso__build_id_filename(kdso, filename, sizeof(filename)))
+		if (dso__build_id_filename(kdso, filename, sizeof(filename),
+					   false))
 			printed += fprintf(fp, "[0] %s\n", filename);
 	}
 
@@ -1137,7 +1138,8 @@ int __weak arch__fix_module_text_start(u64 *start __maybe_unused,
 	return 0;
 }
 
-static int machine__create_module(void *arg, const char *name, u64 start)
+static int machine__create_module(void *arg, const char *name, u64 start,
+				  u64 size)
 {
 	struct machine *machine = arg;
 	struct map *map;
@@ -1148,6 +1150,7 @@ static int machine__create_module(void *arg, const char *name, u64 start)
 	map = machine__findnew_module_map(machine, start, name);
 	if (map == NULL)
 		return -1;
+	map->end = start + size;
 
 	dso__kernel_module_get_build_id(map->dso, machine->root_dir);
 
@@ -1392,7 +1395,7 @@ int machine__process_mmap2_event(struct machine *machine,
 
 	map = map__new(machine, event->mmap2.start,
 			event->mmap2.len, event->mmap2.pgoff,
-			event->mmap2.pid, event->mmap2.maj,
+			event->mmap2.maj,
 			event->mmap2.min, event->mmap2.ino,
 			event->mmap2.ino_generation,
 			event->mmap2.prot,
@@ -1450,7 +1453,7 @@ int machine__process_mmap_event(struct machine *machine, union perf_event *event
 
 	map = map__new(machine, event->mmap.start,
 			event->mmap.len, event->mmap.pgoff,
-			event->mmap.pid, 0, 0, 0, 0, 0, 0,
+			0, 0, 0, 0, 0, 0,
 			event->mmap.filename,
 			type, thread);
 
@@ -1681,7 +1684,8 @@ static int add_callchain_ip(struct thread *thread,
 			    bool branch,
 			    struct branch_flags *flags,
 			    int nr_loop_iter,
-			    int samples)
+			    int samples,
+			    u64 branch_from)
 {
 	struct addr_location al;
 
@@ -1734,7 +1738,8 @@ static int add_callchain_ip(struct thread *thread,
 	if (symbol_conf.hide_unresolved && al.sym == NULL)
 		return 0;
 	return callchain_cursor_append(cursor, al.addr, al.map, al.sym,
-				       branch, flags, nr_loop_iter, samples);
+				       branch, flags, nr_loop_iter, samples,
+				       branch_from);
 }
 
 struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
@@ -1813,7 +1818,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
 	struct ip_callchain *chain = sample->callchain;
 	int chain_nr = min(max_stack, (int)chain->nr), i;
 	u8 cpumode = PERF_RECORD_MISC_USER;
-	u64 ip;
+	u64 ip, branch_from = 0;
 
 	for (i = 0; i < chain_nr; i++) {
 		if (chain->ips[i] == PERF_CONTEXT_USER)
@@ -1855,6 +1860,8 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
 					ip = lbr_stack->entries[0].to;
 					branch = true;
 					flags = &lbr_stack->entries[0].flags;
+					branch_from =
+						lbr_stack->entries[0].from;
 				}
 			} else {
 				if (j < lbr_nr) {
@@ -1869,12 +1876,15 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
 					ip = lbr_stack->entries[0].to;
 					branch = true;
 					flags = &lbr_stack->entries[0].flags;
+					branch_from =
+						lbr_stack->entries[0].from;
 				}
 			}
 
 			err = add_callchain_ip(thread, cursor, parent,
 					       root_al, &cpumode, ip,
-					       branch, flags, 0, 0);
+					       branch, flags, 0, 0,
+					       branch_from);
 			if (err)
 				return (err < 0) ? err : 0;
 		}
@@ -1894,13 +1904,16 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 {
 	struct branch_stack *branch = sample->branch_stack;
 	struct ip_callchain *chain = sample->callchain;
-	int chain_nr = chain->nr;
+	int chain_nr = 0;
 	u8 cpumode = PERF_RECORD_MISC_USER;
 	int i, j, err, nr_entries;
 	int skip_idx = -1;
 	int first_call = 0;
 	int nr_loop_iter;
 
+	if (chain)
+		chain_nr = chain->nr;
+
 	if (perf_evsel__has_branch_callstack(evsel)) {
 		err = resolve_lbr_callchain_sample(thread, cursor, sample, parent,
 						   root_al, max_stack);
@@ -1938,6 +1951,10 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 		for (i = 0; i < nr; i++) {
 			if (callchain_param.order == ORDER_CALLEE) {
 				be[i] = branch->entries[i];
+
+				if (chain == NULL)
+					continue;
+
 				/*
 				 * Check for overlap into the callchain.
 				 * The return address is one off compared to
@@ -1973,24 +1990,29 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 						       root_al,
 						       NULL, be[i].to,
 						       true, &be[i].flags,
-						       nr_loop_iter, 1);
+						       nr_loop_iter, 1,
+						       be[i].from);
 			else
 				err = add_callchain_ip(thread, cursor, parent,
 						       root_al,
 						       NULL, be[i].to,
 						       true, &be[i].flags,
-						       0, 0);
+						       0, 0, be[i].from);
 
 			if (!err)
 				err = add_callchain_ip(thread, cursor, parent, root_al,
 						       NULL, be[i].from,
 						       true, &be[i].flags,
-						       0, 0);
+						       0, 0, 0);
 			if (err == -EINVAL)
 				break;
 			if (err)
 				return err;
 		}
+
+		if (chain_nr == 0)
+			return 0;
+
 		chain_nr -= nr;
 	}
 
@@ -2015,7 +2037,7 @@ check_calls:
 
 		err = add_callchain_ip(thread, cursor, parent,
 				       root_al, &cpumode, ip,
-				       false, NULL, 0, 0);
+				       false, NULL, 0, 0, 0);
 
 		if (err)
 			return (err < 0) ? err : 0;
@@ -2032,7 +2054,7 @@ static int unwind_entry(struct unwind_entry *entry, void *arg)
 		return 0;
 	return callchain_cursor_append(cursor, entry->ip,
 				       entry->map, entry->sym,
-				       false, NULL, 0, 0);
+				       false, NULL, 0, 0, 0);
 }
 
 static int thread__resolve_callchain_unwind(struct thread *thread,
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 2179b2deb730..bdaa0a4edc17 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -16,6 +16,7 @@
 #include "machine.h"
 #include <linux/string.h>
 #include "srcline.h"
+#include "namespaces.h"
 #include "unwind.h"
 
 static void __maps__insert(struct maps *maps, struct map *map);
@@ -145,11 +146,13 @@ void map__init(struct map *map, enum map_type type,
 }
 
 struct map *map__new(struct machine *machine, u64 start, u64 len,
-		     u64 pgoff, u32 pid, u32 d_maj, u32 d_min, u64 ino,
+		     u64 pgoff, u32 d_maj, u32 d_min, u64 ino,
 		     u64 ino_gen, u32 prot, u32 flags, char *filename,
 		     enum map_type type, struct thread *thread)
 {
 	struct map *map = malloc(sizeof(*map));
+	struct nsinfo *nsi = NULL;
+	struct nsinfo *nnsi;
 
 	if (map != NULL) {
 		char newfilename[PATH_MAX];
@@ -167,9 +170,11 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
 		map->ino_generation = ino_gen;
 		map->prot = prot;
 		map->flags = flags;
+		nsi = nsinfo__get(thread->nsinfo);
 
-		if ((anon || no_dso) && type == MAP__FUNCTION) {
-			snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", pid);
+		if ((anon || no_dso) && nsi && type == MAP__FUNCTION) {
+			snprintf(newfilename, sizeof(newfilename),
+				 "/tmp/perf-%d.map", nsi->pid);
 			filename = newfilename;
 		}
 
@@ -179,6 +184,16 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
 		}
 
 		if (vdso) {
+			/* The vdso maps are always on the host and not the
+			 * container.  Ensure that we don't use setns to look
+			 * them up.
+			 */
+			nnsi = nsinfo__copy(nsi);
+			if (nnsi) {
+				nsinfo__put(nsi);
+				nnsi->need_setns = false;
+				nsi = nnsi;
+			}
 			pgoff = 0;
 			dso = machine__findnew_vdso(machine, thread);
 		} else
@@ -200,10 +215,12 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
 			if (type != MAP__FUNCTION)
 				dso__set_loaded(dso, map->type);
 		}
+		dso->nsinfo = nsi;
 		dso__put(dso);
 	}
 	return map;
 out_delete:
+	nsinfo__put(nsi);
 	free(map);
 	return NULL;
 }
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index f9e8ac8a52cd..73aacf7a7dc4 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -141,7 +141,7 @@ struct thread;
 void map__init(struct map *map, enum map_type type,
 	       u64 start, u64 end, u64 pgoff, struct dso *dso);
 struct map *map__new(struct machine *machine, u64 start, u64 len,
-		     u64 pgoff, u32 pid, u32 d_maj, u32 d_min, u64 ino,
+		     u64 pgoff, u32 d_maj, u32 d_min, u64 ino,
 		     u64 ino_gen, u32 prot, u32 flags,
 		     char *filename, enum map_type type, struct thread *thread);
 struct map *map__new2(u64 start, struct dso *dso, enum map_type type);
diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
index 06f5a3a4295c..28afe5fa84d6 100644
--- a/tools/perf/util/mem-events.c
+++ b/tools/perf/util/mem-events.c
@@ -166,11 +166,20 @@ static const char * const mem_lvl[] = {
 	"Uncached",
 };
 
+static const char * const mem_lvlnum[] = {
+	[PERF_MEM_LVLNUM_ANY_CACHE] = "Any cache",
+	[PERF_MEM_LVLNUM_LFB] = "LFB",
+	[PERF_MEM_LVLNUM_RAM] = "RAM",
+	[PERF_MEM_LVLNUM_PMEM] = "PMEM",
+	[PERF_MEM_LVLNUM_NA] = "N/A",
+};
+
 int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
 {
 	size_t i, l = 0;
 	u64 m =  PERF_MEM_LVL_NA;
 	u64 hit, miss;
+	int printed;
 
 	if (mem_info)
 		m  = mem_info->data_src.mem_lvl;
@@ -184,17 +193,37 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
 	/* already taken care of */
 	m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS);
 
+
+	if (mem_info && mem_info->data_src.mem_remote) {
+		strcat(out, "Remote ");
+		l += 7;
+	}
+
+	printed = 0;
 	for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) {
 		if (!(m & 0x1))
 			continue;
-		if (l) {
+		if (printed++) {
 			strcat(out, " or ");
 			l += 4;
 		}
 		l += scnprintf(out + l, sz - l, mem_lvl[i]);
 	}
-	if (*out == '\0')
-		l += scnprintf(out, sz - l, "N/A");
+
+	if (mem_info && mem_info->data_src.mem_lvl_num) {
+		int lvl = mem_info->data_src.mem_lvl_num;
+		if (printed++) {
+			strcat(out, " or ");
+			l += 4;
+		}
+		if (mem_lvlnum[lvl])
+			l += scnprintf(out + l, sz - l, mem_lvlnum[lvl]);
+		else
+			l += scnprintf(out + l, sz - l, "L%d", lvl);
+	}
+
+	if (l == 0)
+		l += scnprintf(out + l, sz - l, "N/A");
 	if (hit)
 		l += scnprintf(out + l, sz - l, " hit");
 	if (miss)
@@ -231,6 +260,14 @@ int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info)
 		}
 		l += scnprintf(out + l, sz - l, snoop_access[i]);
 	}
+	if (mem_info &&
+	     (mem_info->data_src.mem_snoopx & PERF_MEM_SNOOPX_FWD)) {
+		if (l) {
+			strcat(out, " or ");
+			l += 4;
+		}
+		l += scnprintf(out + l, sz - l, "Fwd");
+	}
 
 	if (*out == '\0')
 		l += scnprintf(out, sz - l, "N/A");
@@ -279,6 +316,11 @@ int c2c_decode_stats(struct c2c_stats *stats, struct mem_info *mi)
 	u64 lvl    = data_src->mem_lvl;
 	u64 snoop  = data_src->mem_snoop;
 	u64 lock   = data_src->mem_lock;
+	/*
+	 * Skylake might report unknown remote level via this
+	 * bit, consider it when evaluating remote HITMs.
+	 */
+	bool mrem  = data_src->mem_remote;
 	int err = 0;
 
 #define HITM_INC(__f)		\
@@ -324,7 +366,8 @@ do {				\
 			}
 
 			if ((lvl & P(LVL, REM_RAM1)) ||
-			    (lvl & P(LVL, REM_RAM2))) {
+			    (lvl & P(LVL, REM_RAM2)) ||
+			     mrem) {
 				stats->rmt_dram++;
 				if (snoop & P(SNOOP, HIT))
 					stats->ld_shared++;
@@ -334,7 +377,8 @@ do {				\
 		}
 
 		if ((lvl & P(LVL, REM_CCE1)) ||
-		    (lvl & P(LVL, REM_CCE2))) {
+		    (lvl & P(LVL, REM_CCE2)) ||
+		     mrem) {
 			if (snoop & P(SNOOP, HIT))
 				stats->rmt_hit++;
 			else if (snoop & P(SNOOP, HITM))
diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c
index 67dcbcc73c7d..a58e91197729 100644
--- a/tools/perf/util/namespaces.c
+++ b/tools/perf/util/namespaces.c
@@ -9,9 +9,14 @@
 #include "namespaces.h"
 #include "util.h"
 #include "event.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <limits.h>
+#include <sched.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
+#include <unistd.h>
 
 struct namespaces *namespaces__new(struct namespaces_event *event)
 {
@@ -35,3 +40,209 @@ void namespaces__free(struct namespaces *namespaces)
 {
 	free(namespaces);
 }
+
+/* Record whether @nsi->pid lives in a different mount namespace and look up
+ * its innermost tgid.  Returns 0 on success, -1 on allocation or /proc failure.
+ */
+int nsinfo__init(struct nsinfo *nsi)
+{
+	char spath[PATH_MAX];
+	char *newns = NULL;
+	char *statln = NULL, *tab;
+	struct stat old_stat;
+	struct stat new_stat;
+	FILE *f = NULL;
+	size_t linesz = 0;
+	int rv = -1;
+
+	if (asprintf(&newns, "/proc/%d/ns/mnt", nsi->pid) == -1)
+		return rv;
+
+	if (stat("/proc/self/ns/mnt", &old_stat) < 0 ||
+	    stat(newns, &new_stat) < 0)
+		goto out;
+
+	/* Check if the mount namespaces differ, if so then indicate that we
+	 * want to switch as part of looking up dso/map data.
+	 */
+	if (old_stat.st_ino != new_stat.st_ino) {
+		nsi->need_setns = true;
+		nsi->mntns_path = newns;
+		newns = NULL;
+	}
+
+	/* If we're dealing with a process that is in a different PID namespace,
+	 * attempt to work out the innermost tgid for the process.
+	 */
+	if (snprintf(spath, PATH_MAX, "/proc/%d/status", nsi->pid) >= PATH_MAX)
+		goto out;
+
+	f = fopen(spath, "r");
+	if (f == NULL)
+		goto out;
+
+	while (getline(&statln, &linesz, f) != -1) {
+		/* No tab means no value field; strtol(NULL) would crash. */
+		tab = strrchr(statln, '\t');
+		if (tab == NULL)
+			continue;
+
+		/* Use tgid if CONFIG_PID_NS is not defined. */
+		if (strstr(statln, "Tgid:") != NULL) {
+			nsi->tgid = (pid_t)strtol(tab, NULL, 10);
+			nsi->nstgid = nsi->tgid;
+		}
+
+		if (strstr(statln, "NStgid:") != NULL) {
+			nsi->nstgid = (pid_t)strtol(tab, NULL, 10);
+			break;
+		}
+	}
+	rv = 0;
+
+out:
+	if (f != NULL)
+		(void) fclose(f);
+	free(statln);
+	free(newns);
+	return rv;
+}
+
+/* Allocate an nsinfo for @pid holding one reference; NULL on pid 0 or OOM. */
+struct nsinfo *nsinfo__new(pid_t pid)
+{
+	struct nsinfo *nsi;
+
+	if (pid == 0)
+		return NULL;
+
+	nsi = calloc(1, sizeof(*nsi));
+	if (nsi == NULL)
+		return NULL;
+
+	nsi->pid = pid;
+	nsi->tgid = pid;
+	nsi->nstgid = pid;
+	nsi->need_setns = false;
+
+	/* Init may fail if the process exits while we're looking at its proc
+	 * information.  Then keep the pid but don't try to enter the namespace.
+	 */
+	if (nsinfo__init(nsi) == -1)
+		nsi->need_setns = false;
+	refcount_set(&nsi->refcnt, 1);
+	return nsi;
+}
+
+/* Deep-copy @nsi (refcount 1); returns NULL if @nsi is NULL or on OOM. */
+struct nsinfo *nsinfo__copy(struct nsinfo *nsi)
+{
+	struct nsinfo *nnsi = nsi ? calloc(1, sizeof(*nnsi)) : NULL;
+
+	if (nnsi == NULL)
+		return NULL;
+
+	nnsi->pid = nsi->pid;
+	nnsi->tgid = nsi->tgid;
+	nnsi->nstgid = nsi->nstgid;
+	nnsi->need_setns = nsi->need_setns;
+	if (nsi->mntns_path) {
+		nnsi->mntns_path = strdup(nsi->mntns_path);
+		if (!nnsi->mntns_path) {
+			free(nnsi);
+			return NULL;
+		}
+	}
+	refcount_set(&nnsi->refcnt, 1);
+	return nnsi;
+}
+
+void nsinfo__delete(struct nsinfo *nsi)
+{
+	zfree(&nsi->mntns_path);	/* allocated by asprintf()/strdup() */
+	free(nsi);
+}
+
+struct nsinfo *nsinfo__get(struct nsinfo *nsi)
+{
+	if (nsi)	/* NULL is passed straight through */
+		refcount_inc(&nsi->refcnt);
+	return nsi;
+}
+
+void nsinfo__put(struct nsinfo *nsi)
+{
+	if (nsi && refcount_dec_and_test(&nsi->refcnt))
+		nsinfo__delete(nsi);	/* that was the last reference */
+}
+
+/*
+ * Enter @nsi's mount namespace if nsi->need_setns is set.  On success @nc
+ * holds the fds nsinfo__mountns_exit() needs to switch back; in every other
+ * case both of its fields are left at -1.
+ */
+void nsinfo__mountns_enter(struct nsinfo *nsi,
+				  struct nscookie *nc)
+{
+	int oldns;
+	int newns;
+
+	if (nc == NULL)
+		return;
+
+	nc->oldns = -1;
+	nc->newns = -1;
+
+	if (!nsi || !nsi->need_setns)
+		return;
+
+	/* Keep a handle on our own namespace so exit() can return to it. */
+	oldns = open("/proc/self/ns/mnt", O_RDONLY);
+	if (oldns < 0)
+		return;
+
+	newns = open(nsi->mntns_path, O_RDONLY);
+	if (newns < 0) {
+		close(oldns);
+		return;
+	}
+
+	if (setns(newns, CLONE_NEWNS) < 0) {
+		close(oldns);
+		close(newns);
+		return;
+	}
+
+	nc->oldns = oldns;
+	nc->newns = newns;
+}
+
+/* Leave the namespace entered by nsinfo__mountns_enter() and close its fds. */
+void nsinfo__mountns_exit(struct nscookie *nc)
+{
+	/* -1 in either slot means enter() never switched namespaces. */
+	if (!nc || nc->oldns == -1 || nc->newns == -1)
+		return;
+
+	setns(nc->oldns, CLONE_NEWNS);
+	if (nc->oldns >= 0) {
+		close(nc->oldns);
+		nc->oldns = -1;
+	}
+	if (nc->newns >= 0) {
+		close(nc->newns);
+		nc->newns = -1;
+	}
+}
+
+/* Resolve @path as seen from @nsi's mount namespace via realpath(path, NULL). */
+char *nsinfo__realpath(const char *path, struct nsinfo *nsi)
+{
+	struct nscookie cookie;
+	char *resolved;
+
+	nsinfo__mountns_enter(nsi, &cookie);
+	resolved = realpath(path, NULL);
+	nsinfo__mountns_exit(&cookie);
+	return resolved;
+}
diff --git a/tools/perf/util/namespaces.h b/tools/perf/util/namespaces.h
index 468f1e9a1484..05d82601c9a6 100644
--- a/tools/perf/util/namespaces.h
+++ b/tools/perf/util/namespaces.h
@@ -11,6 +11,7 @@
 
 #include "../perf.h"
 #include <linux/list.h>
+#include <linux/refcount.h>
 
 struct namespaces_event;
 
@@ -23,4 +24,41 @@ struct namespaces {
 struct namespaces *namespaces__new(struct namespaces_event *event);
 void namespaces__free(struct namespaces *namespaces);
 
+struct nsinfo {
+	pid_t			pid;
+	pid_t			tgid;
+	pid_t			nstgid;		/* tgid in innermost PID ns */
+	bool			need_setns;	/* mount ns differs from ours */
+	char			*mntns_path;	/* /proc/<pid>/ns/mnt path */
+	refcount_t		refcnt;
+};
+
+struct nscookie {
+	int			oldns;	/* fd of the ns to return to, or -1 */
+	int			newns;	/* fd of the entered ns, or -1 */
+};
+
+int nsinfo__init(struct nsinfo *nsi);
+struct nsinfo *nsinfo__new(pid_t pid);
+struct nsinfo *nsinfo__copy(struct nsinfo *nsi);
+void nsinfo__delete(struct nsinfo *nsi);
+
+struct nsinfo *nsinfo__get(struct nsinfo *nsi);
+void nsinfo__put(struct nsinfo *nsi);
+
+void nsinfo__mountns_enter(struct nsinfo *nsi, struct nscookie *nc);
+void nsinfo__mountns_exit(struct nscookie *nc);
+
+char *nsinfo__realpath(const char *path, struct nsinfo *nsi);
+
+static inline void __nsinfo__zput(struct nsinfo **nsip)
+{
+	if (nsip) {
+		nsinfo__put(*nsip);
+		*nsip = NULL;	/* caller's pointer must not outlive its ref */
+	}
+}
+
+#define nsinfo__zput(nsi) __nsinfo__zput(&(nsi))
+
 #endif  /* __PERF_NAMESPACES_H */
diff --git a/tools/perf/util/parse-branch-options.c b/tools/perf/util/parse-branch-options.c
index 38fd11504015..e71fb5f31e84 100644
--- a/tools/perf/util/parse-branch-options.c
+++ b/tools/perf/util/parse-branch-options.c
@@ -28,6 +28,7 @@ static const struct branch_mode branch_modes[] = {
 	BRANCH_OPT("cond", PERF_SAMPLE_BRANCH_COND),
 	BRANCH_OPT("ind_jmp", PERF_SAMPLE_BRANCH_IND_JUMP),
 	BRANCH_OPT("call", PERF_SAMPLE_BRANCH_CALL),
+	BRANCH_OPT("save_type", PERF_SAMPLE_BRANCH_TYPE_SAVE),
 	BRANCH_END
 };
 
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 01e779b91c8e..f44aeba51d1f 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -34,7 +34,7 @@
 #ifdef PARSER_DEBUG
 extern int parse_events_debug;
 #endif
-int parse_events_parse(void *data, void *scanner);
+int parse_events_parse(void *parse_state, void *scanner);
 static int get_config_terms(struct list_head *head_config,
 			    struct list_head *head_terms __maybe_unused);
 
@@ -589,7 +589,7 @@ static int add_tracepoint_multi_sys(struct list_head *list, int *idx,
 }
 
 struct __add_bpf_event_param {
-	struct parse_events_evlist *data;
+	struct parse_events_state *parse_state;
 	struct list_head *list;
 	struct list_head *head_config;
 };
@@ -599,7 +599,7 @@ static int add_bpf_event(const char *group, const char *event, int fd,
 {
 	LIST_HEAD(new_evsels);
 	struct __add_bpf_event_param *param = _param;
-	struct parse_events_evlist *evlist = param->data;
+	struct parse_events_state *parse_state = param->parse_state;
 	struct list_head *list = param->list;
 	struct perf_evsel *pos;
 	int err;
@@ -607,8 +607,8 @@ static int add_bpf_event(const char *group, const char *event, int fd,
 	pr_debug("add bpf event %s:%s and attach bpf program %d\n",
 		 group, event, fd);
 
-	err = parse_events_add_tracepoint(&new_evsels, &evlist->idx, group,
-					  event, evlist->error,
+	err = parse_events_add_tracepoint(&new_evsels, &parse_state->idx, group,
+					  event, parse_state->error,
 					  param->head_config);
 	if (err) {
 		struct perf_evsel *evsel, *tmp;
@@ -632,14 +632,14 @@ static int add_bpf_event(const char *group, const char *event, int fd,
 	return 0;
 }
 
-int parse_events_load_bpf_obj(struct parse_events_evlist *data,
+int parse_events_load_bpf_obj(struct parse_events_state *parse_state,
 			      struct list_head *list,
 			      struct bpf_object *obj,
 			      struct list_head *head_config)
 {
 	int err;
 	char errbuf[BUFSIZ];
-	struct __add_bpf_event_param param = {data, list, head_config};
+	struct __add_bpf_event_param param = {parse_state, list, head_config};
 	static bool registered_unprobe_atexit = false;
 
 	if (IS_ERR(obj) || !obj) {
@@ -680,13 +680,13 @@ int parse_events_load_bpf_obj(struct parse_events_evlist *data,
 
 	return 0;
 errout:
-	data->error->help = strdup("(add -v to see detail)");
-	data->error->str = strdup(errbuf);
+	parse_state->error->help = strdup("(add -v to see detail)");
+	parse_state->error->str = strdup(errbuf);
 	return err;
 }
 
 static int
-parse_events_config_bpf(struct parse_events_evlist *data,
+parse_events_config_bpf(struct parse_events_state *parse_state,
 			struct bpf_object *obj,
 			struct list_head *head_config)
 {
@@ -705,28 +705,28 @@ parse_events_config_bpf(struct parse_events_evlist *data,
 				 "Invalid config term for BPF object");
 			errbuf[BUFSIZ - 1] = '\0';
 
-			data->error->idx = term->err_term;
-			data->error->str = strdup(errbuf);
+			parse_state->error->idx = term->err_term;
+			parse_state->error->str = strdup(errbuf);
 			return -EINVAL;
 		}
 
-		err = bpf__config_obj(obj, term, data->evlist, &error_pos);
+		err = bpf__config_obj(obj, term, parse_state->evlist, &error_pos);
 		if (err) {
-			bpf__strerror_config_obj(obj, term, data->evlist,
+			bpf__strerror_config_obj(obj, term, parse_state->evlist,
 						 &error_pos, err, errbuf,
 						 sizeof(errbuf));
-			data->error->help = strdup(
+			parse_state->error->help = strdup(
 "Hint:\tValid config terms:\n"
 "     \tmap:[<arraymap>].value<indices>=[value]\n"
 "     \tmap:[<eventmap>].event<indices>=[event]\n"
 "\n"
 "     \twhere <indices> is something like [0,3...5] or [all]\n"
 "     \t(add -v to see detail)");
-			data->error->str = strdup(errbuf);
+			parse_state->error->str = strdup(errbuf);
 			if (err == -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE)
-				data->error->idx = term->err_val;
+				parse_state->error->idx = term->err_val;
 			else
-				data->error->idx = term->err_term + error_pos;
+				parse_state->error->idx = term->err_term + error_pos;
 			return err;
 		}
 	}
@@ -762,7 +762,7 @@ split_bpf_config_terms(struct list_head *evt_head_config,
 			list_move_tail(&term->list, obj_head_config);
 }
 
-int parse_events_load_bpf(struct parse_events_evlist *data,
+int parse_events_load_bpf(struct parse_events_state *parse_state,
 			  struct list_head *list,
 			  char *bpf_file_name,
 			  bool source,
@@ -790,15 +790,15 @@ int parse_events_load_bpf(struct parse_events_evlist *data,
 						   -err, errbuf,
 						   sizeof(errbuf));
 
-		data->error->help = strdup("(add -v to see detail)");
-		data->error->str = strdup(errbuf);
+		parse_state->error->help = strdup("(add -v to see detail)");
+		parse_state->error->str = strdup(errbuf);
 		return err;
 	}
 
-	err = parse_events_load_bpf_obj(data, list, obj, head_config);
+	err = parse_events_load_bpf_obj(parse_state, list, obj, head_config);
 	if (err)
 		return err;
-	err = parse_events_config_bpf(data, obj, &obj_head_config);
+	err = parse_events_config_bpf(parse_state, obj, &obj_head_config);
 
 	/*
 	 * Caller doesn't know anything about obj_head_config,
@@ -1184,7 +1184,7 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx,
 					    err, head_config);
 }
 
-int parse_events_add_numeric(struct parse_events_evlist *data,
+int parse_events_add_numeric(struct parse_events_state *parse_state,
 			     struct list_head *list,
 			     u32 type, u64 config,
 			     struct list_head *head_config)
@@ -1197,7 +1197,7 @@ int parse_events_add_numeric(struct parse_events_evlist *data,
 	attr.config = config;
 
 	if (head_config) {
-		if (config_attr(&attr, head_config, data->error,
+		if (config_attr(&attr, head_config, parse_state->error,
 				config_term_common))
 			return -EINVAL;
 
@@ -1205,11 +1205,11 @@ int parse_events_add_numeric(struct parse_events_evlist *data,
 			return -ENOMEM;
 	}
 
-	return add_event(list, &data->idx, &attr,
+	return add_event(list, &parse_state->idx, &attr,
 			 get_config_name(head_config), &config_terms);
 }
 
-int parse_events_add_pmu(struct parse_events_evlist *data,
+int parse_events_add_pmu(struct parse_events_state *parse_state,
 			 struct list_head *list, char *name,
 			 struct list_head *head_config)
 {
@@ -1232,7 +1232,7 @@ int parse_events_add_pmu(struct parse_events_evlist *data,
 
 	if (!head_config) {
 		attr.type = pmu->type;
-		evsel = __add_event(list, &data->idx, &attr, NULL, pmu->cpus, NULL);
+		evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu->cpus, NULL);
 		return evsel ? 0 : -ENOMEM;
 	}
 
@@ -1243,16 +1243,16 @@ int parse_events_add_pmu(struct parse_events_evlist *data,
 	 * Configure hardcoded terms first, no need to check
 	 * return value when called with fail == 0 ;)
 	 */
-	if (config_attr(&attr, head_config, data->error, config_term_pmu))
+	if (config_attr(&attr, head_config, parse_state->error, config_term_pmu))
 		return -EINVAL;
 
 	if (get_config_terms(head_config, &config_terms))
 		return -ENOMEM;
 
-	if (perf_pmu__config(pmu, &attr, head_config, data->error))
+	if (perf_pmu__config(pmu, &attr, head_config, parse_state->error))
 		return -EINVAL;
 
-	evsel = __add_event(list, &data->idx, &attr,
+	evsel = __add_event(list, &parse_state->idx, &attr,
 			    get_config_name(head_config), pmu->cpus,
 			    &config_terms);
 	if (evsel) {
@@ -1267,7 +1267,7 @@ int parse_events_add_pmu(struct parse_events_evlist *data,
 	return evsel ? 0 : -ENOMEM;
 }
 
-int parse_events_multi_pmu_add(struct parse_events_evlist *data,
+int parse_events_multi_pmu_add(struct parse_events_state *parse_state,
 			       char *str, struct list_head **listp)
 {
 	struct list_head *head;
@@ -1296,7 +1296,7 @@ int parse_events_multi_pmu_add(struct parse_events_evlist *data,
 					return -1;
 				list_add_tail(&term->list, head);
 
-				if (!parse_events_add_pmu(data, list,
+				if (!parse_events_add_pmu(parse_state, list,
 						  pmu->name, head)) {
 					pr_debug("%s -> %s/%s/\n", str,
 						 pmu->name, alias->str);
@@ -1628,7 +1628,7 @@ perf_pmu__parse_check(const char *name)
 	return r ? r->type : PMU_EVENT_SYMBOL_ERR;
 }
 
-static int parse_events__scanner(const char *str, void *data, int start_token)
+static int parse_events__scanner(const char *str, void *parse_state, int start_token)
 {
 	YY_BUFFER_STATE buffer;
 	void *scanner;
@@ -1643,7 +1643,7 @@ static int parse_events__scanner(const char *str, void *data, int start_token)
 #ifdef PARSER_DEBUG
 	parse_events_debug = 1;
 #endif
-	ret = parse_events_parse(data, scanner);
+	ret = parse_events_parse(parse_state, scanner);
 
 	parse_events__flush_buffer(buffer, scanner);
 	parse_events__delete_buffer(buffer, scanner);
@@ -1656,45 +1656,45 @@ static int parse_events__scanner(const char *str, void *data, int start_token)
  */
 int parse_events_terms(struct list_head *terms, const char *str)
 {
-	struct parse_events_terms data = {
+	struct parse_events_state parse_state = {
 		.terms = NULL,
 	};
 	int ret;
 
-	ret = parse_events__scanner(str, &data, PE_START_TERMS);
+	ret = parse_events__scanner(str, &parse_state, PE_START_TERMS);
 	if (!ret) {
-		list_splice(data.terms, terms);
-		zfree(&data.terms);
+		list_splice(parse_state.terms, terms);
+		zfree(&parse_state.terms);
 		return 0;
 	}
 
-	parse_events_terms__delete(data.terms);
+	parse_events_terms__delete(parse_state.terms);
 	return ret;
 }
 
 int parse_events(struct perf_evlist *evlist, const char *str,
 		 struct parse_events_error *err)
 {
-	struct parse_events_evlist data = {
-		.list   = LIST_HEAD_INIT(data.list),
+	struct parse_events_state parse_state = {
+		.list   = LIST_HEAD_INIT(parse_state.list),
 		.idx    = evlist->nr_entries,
 		.error  = err,
 		.evlist = evlist,
 	};
 	int ret;
 
-	ret = parse_events__scanner(str, &data, PE_START_EVENTS);
+	ret = parse_events__scanner(str, &parse_state, PE_START_EVENTS);
 	perf_pmu__parse_cleanup();
 	if (!ret) {
 		struct perf_evsel *last;
 
-		if (list_empty(&data.list)) {
+		if (list_empty(&parse_state.list)) {
 			WARN_ONCE(true, "WARNING: event parser found nothing");
 			return -1;
 		}
 
-		perf_evlist__splice_list_tail(evlist, &data.list);
-		evlist->nr_groups += data.nr_groups;
+		perf_evlist__splice_list_tail(evlist, &parse_state.list);
+		evlist->nr_groups += parse_state.nr_groups;
 		last = perf_evlist__last(evlist);
 		last->cmdline_group_boundary = true;
 
@@ -2124,7 +2124,7 @@ void print_sdt_events(const char *subsys_glob, const char *event_glob,
 		return;
 	}
 	strlist__for_each_entry(nd, bidlist) {
-		pcache = probe_cache__new(nd->s);
+		pcache = probe_cache__new(nd->s, NULL);
 		if (!pcache)
 			continue;
 		list_for_each_entry(ent, &pcache->entries, node) {
@@ -2520,10 +2520,10 @@ void parse_events__clear_array(struct parse_events_array *a)
 	zfree(&a->ranges);
 }
 
-void parse_events_evlist_error(struct parse_events_evlist *data,
+void parse_events_evlist_error(struct parse_events_state *parse_state,
 			       int idx, const char *str)
 {
-	struct parse_events_error *err = data->error;
+	struct parse_events_error *err = parse_state->error;
 
 	if (!err)
 		return;
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index a235f4d6d5e5..635135125111 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -108,16 +108,13 @@ struct parse_events_error {
 	char *help;	/* optional help string */
 };
 
-struct parse_events_evlist {
+struct parse_events_state {
 	struct list_head	   list;
 	int			   idx;
 	int			   nr_groups;
 	struct parse_events_error *error;
 	struct perf_evlist	  *evlist;
-};
-
-struct parse_events_terms {
-	struct list_head *terms;
+	struct list_head	  *terms;
 };
 
 void parse_events__shrink_config_terms(void);
@@ -143,18 +140,18 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx,
 				const char *sys, const char *event,
 				struct parse_events_error *error,
 				struct list_head *head_config);
-int parse_events_load_bpf(struct parse_events_evlist *data,
+int parse_events_load_bpf(struct parse_events_state *parse_state,
 			  struct list_head *list,
 			  char *bpf_file_name,
 			  bool source,
 			  struct list_head *head_config);
 /* Provide this function for perf test */
 struct bpf_object;
-int parse_events_load_bpf_obj(struct parse_events_evlist *data,
+int parse_events_load_bpf_obj(struct parse_events_state *parse_state,
 			      struct list_head *list,
 			      struct bpf_object *obj,
 			      struct list_head *head_config);
-int parse_events_add_numeric(struct parse_events_evlist *data,
+int parse_events_add_numeric(struct parse_events_state *parse_state,
 			     struct list_head *list,
 			     u32 type, u64 config,
 			     struct list_head *head_config);
@@ -164,11 +161,11 @@ int parse_events_add_cache(struct list_head *list, int *idx,
 			   struct list_head *head_config);
 int parse_events_add_breakpoint(struct list_head *list, int *idx,
 				void *ptr, char *type, u64 len);
-int parse_events_add_pmu(struct parse_events_evlist *data,
+int parse_events_add_pmu(struct parse_events_state *parse_state,
 			 struct list_head *list, char *name,
 			 struct list_head *head_config);
 
-int parse_events_multi_pmu_add(struct parse_events_evlist *data,
+int parse_events_multi_pmu_add(struct parse_events_state *parse_state,
 			       char *str,
 			       struct list_head **listp);
 
@@ -180,7 +177,7 @@ perf_pmu__parse_check(const char *name);
 void parse_events__set_leader(char *name, struct list_head *list);
 void parse_events_update_lists(struct list_head *list_event,
 			       struct list_head *list_all);
-void parse_events_evlist_error(struct parse_events_evlist *data,
+void parse_events_evlist_error(struct parse_events_state *parse_state,
 			       int idx, const char *str);
 
 void print_events(const char *event_glob, bool name_only, bool quiet,
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 660fca05bc93..c42edeac451f 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -53,6 +53,21 @@ static int str(yyscan_t scanner, int token)
 	return token;
 }
 
+/* Check that the scanner text ends in ".c", ".o" or ".obj". */
+static bool isbpf(yyscan_t scanner)
+{
+	const char *tok = parse_events_get_text(scanner);
+	int n = strlen(tok);
+	const char *tail;
+
+	if (n < 2)
+		return false;
+	tail = tok + n - 2;
+	if (tail[0] == '.' && (tail[1] == 'c' || tail[1] == 'o'))
+		return true;
+	return n > 4 && strcmp(tok + n - 4, ".obj") == 0;
+}
+
 /*
  * This function is called when the parser gets two kind of input:
  *
@@ -136,8 +151,8 @@ do {							\
 group		[^,{}/]*[{][^}]*[}][^,{}/]*
 event_pmu	[^,{}/]+[/][^/]*[/][^,{}/]*
 event		[^,{}/]+
-bpf_object	[^,{}]+\.(o|bpf)
-bpf_source	[^,{}]+\.c
+bpf_object	[^,{}]+\.(o|bpf)[a-zA-Z0-9._]*
+bpf_source	[^,{}]+\.c[a-zA-Z0-9._]*
 
 num_dec		[0-9]+
 num_hex		0x[a-fA-F0-9]+
@@ -307,8 +322,8 @@ r{num_raw_hex}		{ return raw(yyscanner); }
 {num_hex}		{ return value(yyscanner, 16); }
 
 {modifier_event}	{ return str(yyscanner, PE_MODIFIER_EVENT); }
-{bpf_object}		{ return str(yyscanner, PE_BPF_OBJECT); }
-{bpf_source}		{ return str(yyscanner, PE_BPF_SOURCE); }
+{bpf_object}		{ if (!isbpf(yyscanner)) REJECT; return str(yyscanner, PE_BPF_OBJECT); }
+{bpf_source}		{ if (!isbpf(yyscanner)) REJECT; return str(yyscanner, PE_BPF_SOURCE); }
 {name}			{ return pmu_str_check(yyscanner); }
 "/"			{ BEGIN(config); return '/'; }
 -			{ return '-'; }
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y
index 04fd8c9af9f9..e81a20ea8d7d 100644
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -1,5 +1,5 @@
 %pure-parser
-%parse-param {void *_data}
+%parse-param {void *_parse_state}
 %parse-param {void *scanner}
 %lex-param {void* scanner}
 %locations
@@ -17,7 +17,7 @@
 #include "parse-events.h"
 #include "parse-events-bison.h"
 
-void parse_events_error(YYLTYPE *loc, void *data, void *scanner, char const *msg);
+void parse_events_error(YYLTYPE *loc, void *parse_state, void *scanner, char const *msg);
 
 #define ABORT_ON(val) \
 do { \
@@ -33,11 +33,11 @@ do { \
 } while (0)
 
 static void inc_group_count(struct list_head *list,
-		       struct parse_events_evlist *data)
+		       struct parse_events_state *parse_state)
 {
 	/* Count groups only have more than 1 members */
 	if (!list_is_last(list->next, list))
-		data->nr_groups++;
+		parse_state->nr_groups++;
 }
 
 %}
@@ -115,9 +115,9 @@ PE_START_TERMS  start_terms
 
 start_events: groups
 {
-	struct parse_events_evlist *data = _data;
+	struct parse_events_state *parse_state = _parse_state;
 
-	parse_events_update_lists($1, &data->list);
+	parse_events_update_lists($1, &parse_state->list);
 }
 
 groups:
@@ -159,7 +159,7 @@ PE_NAME '{' events '}'
 {
 	struct list_head *list = $3;
 
-	inc_group_count(list, _data);
+	inc_group_count(list, _parse_state);
 	parse_events__set_leader($1, list);
 	$$ = list;
 }
@@ -168,7 +168,7 @@ PE_NAME '{' events '}'
 {
 	struct list_head *list = $2;
 
-	inc_group_count(list, _data);
+	inc_group_count(list, _parse_state);
 	parse_events__set_leader(NULL, list);
 	$$ = list;
 }
@@ -225,14 +225,13 @@ event_def: event_pmu |
 event_pmu:
 PE_NAME opt_event_config
 {
-	struct parse_events_evlist *data = _data;
 	struct list_head *list, *orig_terms, *terms;
 
 	if (parse_events_copy_term_list($2, &orig_terms))
 		YYABORT;
 
 	ALLOC_LIST(list);
-	if (parse_events_add_pmu(data, list, $1, $2)) {
+	if (parse_events_add_pmu(_parse_state, list, $1, $2)) {
 		struct perf_pmu *pmu = NULL;
 		int ok = 0;
 
@@ -245,7 +244,7 @@ PE_NAME opt_event_config
 			if (!strncmp($1, name, strlen($1))) {
 				if (parse_events_copy_term_list(orig_terms, &terms))
 					YYABORT;
-				if (!parse_events_add_pmu(data, list, pmu->name, terms))
+				if (!parse_events_add_pmu(_parse_state, list, pmu->name, terms))
 					ok++;
 				parse_events_terms__delete(terms);
 			}
@@ -262,7 +261,7 @@ PE_KERNEL_PMU_EVENT sep_dc
 {
 	struct list_head *list;
 
-	if (parse_events_multi_pmu_add(_data, $1, &list) < 0)
+	if (parse_events_multi_pmu_add(_parse_state, $1, &list) < 0)
 		YYABORT;
 	$$ = list;
 }
@@ -273,7 +272,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
 	char pmu_name[128];
 
 	snprintf(&pmu_name, 128, "%s-%s", $1, $3);
-	if (parse_events_multi_pmu_add(_data, pmu_name, &list) < 0)
+	if (parse_events_multi_pmu_add(_parse_state, pmu_name, &list) < 0)
 		YYABORT;
 	$$ = list;
 }
@@ -286,62 +285,60 @@ PE_VALUE_SYM_SW
 event_legacy_symbol:
 value_sym '/' event_config '/'
 {
-	struct parse_events_evlist *data = _data;
 	struct list_head *list;
 	int type = $1 >> 16;
 	int config = $1 & 255;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(data, list, type, config, $3));
+	ABORT_ON(parse_events_add_numeric(_parse_state, list, type, config, $3));
 	parse_events_terms__delete($3);
 	$$ = list;
 }
 |
 value_sym sep_slash_dc
 {
-	struct parse_events_evlist *data = _data;
 	struct list_head *list;
 	int type = $1 >> 16;
 	int config = $1 & 255;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(data, list, type, config, NULL));
+	ABORT_ON(parse_events_add_numeric(_parse_state, list, type, config, NULL));
 	$$ = list;
 }
 
 event_legacy_cache:
 PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT opt_event_config
 {
-	struct parse_events_evlist *data = _data;
-	struct parse_events_error *error = data->error;
+	struct parse_events_state *parse_state = _parse_state;
+	struct parse_events_error *error = parse_state->error;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, $5, error, $6));
+	ABORT_ON(parse_events_add_cache(list, &parse_state->idx, $1, $3, $5, error, $6));
 	parse_events_terms__delete($6);
 	$$ = list;
 }
 |
 PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT opt_event_config
 {
-	struct parse_events_evlist *data = _data;
-	struct parse_events_error *error = data->error;
+	struct parse_events_state *parse_state = _parse_state;
+	struct parse_events_error *error = parse_state->error;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, NULL, error, $4));
+	ABORT_ON(parse_events_add_cache(list, &parse_state->idx, $1, $3, NULL, error, $4));
 	parse_events_terms__delete($4);
 	$$ = list;
 }
 |
 PE_NAME_CACHE_TYPE opt_event_config
 {
-	struct parse_events_evlist *data = _data;
-	struct parse_events_error *error = data->error;
+	struct parse_events_state *parse_state = _parse_state;
+	struct parse_events_error *error = parse_state->error;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_cache(list, &data->idx, $1, NULL, NULL, error, $2));
+	ABORT_ON(parse_events_add_cache(list, &parse_state->idx, $1, NULL, NULL, error, $2));
 	parse_events_terms__delete($2);
 	$$ = list;
 }
@@ -349,44 +346,44 @@ PE_NAME_CACHE_TYPE opt_event_config
 event_legacy_mem:
 PE_PREFIX_MEM PE_VALUE '/' PE_VALUE ':' PE_MODIFIER_BP sep_dc
 {
-	struct parse_events_evlist *data = _data;
+	struct parse_events_state *parse_state = _parse_state;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_breakpoint(list, &data->idx,
+	ABORT_ON(parse_events_add_breakpoint(list, &parse_state->idx,
 					     (void *) $2, $6, $4));
 	$$ = list;
 }
 |
 PE_PREFIX_MEM PE_VALUE '/' PE_VALUE sep_dc
 {
-	struct parse_events_evlist *data = _data;
+	struct parse_events_state *parse_state = _parse_state;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_breakpoint(list, &data->idx,
+	ABORT_ON(parse_events_add_breakpoint(list, &parse_state->idx,
 					     (void *) $2, NULL, $4));
 	$$ = list;
 }
 |
 PE_PREFIX_MEM PE_VALUE ':' PE_MODIFIER_BP sep_dc
 {
-	struct parse_events_evlist *data = _data;
+	struct parse_events_state *parse_state = _parse_state;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_breakpoint(list, &data->idx,
+	ABORT_ON(parse_events_add_breakpoint(list, &parse_state->idx,
 					     (void *) $2, $4, 0));
 	$$ = list;
 }
 |
 PE_PREFIX_MEM PE_VALUE sep_dc
 {
-	struct parse_events_evlist *data = _data;
+	struct parse_events_state *parse_state = _parse_state;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_breakpoint(list, &data->idx,
+	ABORT_ON(parse_events_add_breakpoint(list, &parse_state->idx,
 					     (void *) $2, NULL, 0));
 	$$ = list;
 }
@@ -394,15 +391,15 @@ PE_PREFIX_MEM PE_VALUE sep_dc
 event_legacy_tracepoint:
 tracepoint_name opt_event_config
 {
-	struct parse_events_evlist *data = _data;
-	struct parse_events_error *error = data->error;
+	struct parse_events_state *parse_state = _parse_state;
+	struct parse_events_error *error = parse_state->error;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
 	if (error)
 		error->idx = @1.first_column;
 
-	if (parse_events_add_tracepoint(list, &data->idx, $1.sys, $1.event,
+	if (parse_events_add_tracepoint(list, &parse_state->idx, $1.sys, $1.event,
 					error, $2))
 		return -1;
 
@@ -432,11 +429,10 @@ PE_NAME ':' PE_NAME
 event_legacy_numeric:
 PE_VALUE ':' PE_VALUE opt_event_config
 {
-	struct parse_events_evlist *data = _data;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, $4));
+	ABORT_ON(parse_events_add_numeric(_parse_state, list, (u32)$1, $3, $4));
 	parse_events_terms__delete($4);
 	$$ = list;
 }
@@ -444,11 +440,10 @@ PE_VALUE ':' PE_VALUE opt_event_config
 event_legacy_raw:
 PE_RAW opt_event_config
 {
-	struct parse_events_evlist *data = _data;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, $2));
+	ABORT_ON(parse_events_add_numeric(_parse_state, list, PERF_TYPE_RAW, $1, $2));
 	parse_events_terms__delete($2);
 	$$ = list;
 }
@@ -456,23 +451,22 @@ PE_RAW opt_event_config
 event_bpf_file:
 PE_BPF_OBJECT opt_event_config
 {
-	struct parse_events_evlist *data = _data;
-	struct parse_events_error *error = data->error;
+	struct parse_events_state *parse_state = _parse_state;
+	struct parse_events_error *error = parse_state->error;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_load_bpf(data, list, $1, false, $2));
+	ABORT_ON(parse_events_load_bpf(parse_state, list, $1, false, $2));
 	parse_events_terms__delete($2);
 	$$ = list;
 }
 |
 PE_BPF_SOURCE opt_event_config
 {
-	struct parse_events_evlist *data = _data;
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_load_bpf(data, list, $1, true, $2));
+	ABORT_ON(parse_events_load_bpf(_parse_state, list, $1, true, $2));
 	parse_events_terms__delete($2);
 	$$ = list;
 }
@@ -494,8 +488,8 @@ opt_event_config:
 
 start_terms: event_config
 {
-	struct parse_events_terms *data = _data;
-	data->terms = $1;
+	struct parse_events_state *parse_state = _parse_state;
+	parse_state->terms = $1;
 }
 
 event_config:
@@ -685,9 +679,9 @@ sep_slash_dc: '/' | ':' |
 
 %%
 
-void parse_events_error(YYLTYPE *loc, void *data,
+void parse_events_error(YYLTYPE *loc, void *parse_state,
 			void *scanner __maybe_unused,
 			char const *msg __maybe_unused)
 {
-	parse_events_evlist_error(data, loc->last_column, "parser error");
+	parse_events_evlist_error(parse_state, loc->last_column, "parser error");
 }
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index a2670e9d652d..b7aaf9b2294d 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -184,13 +184,19 @@ static struct map *kernel_get_module_map(const char *module)
 	return NULL;
 }
 
-struct map *get_target_map(const char *target, bool user)
+struct map *get_target_map(const char *target, struct nsinfo *nsi, bool user)
 {
 	/* Init maps of given executable or kernel */
-	if (user)
-		return dso__new_map(target);
-	else
+	if (user) {
+		struct map *map;
+
+		map = dso__new_map(target);
+		if (map && map->dso)
+			map->dso->nsinfo = nsinfo__get(nsi);
+		return map;
+	} else {
 		return kernel_get_module_map(target);
+	}
 }
 
 static int convert_exec_to_group(const char *exec, char **result)
@@ -366,7 +372,8 @@ found:
 static int find_alternative_probe_point(struct debuginfo *dinfo,
 					struct perf_probe_point *pp,
 					struct perf_probe_point *result,
-					const char *target, bool uprobes)
+					const char *target, struct nsinfo *nsi,
+					bool uprobes)
 {
 	struct map *map = NULL;
 	struct symbol *sym;
@@ -377,7 +384,7 @@ static int find_alternative_probe_point(struct debuginfo *dinfo,
 	if (!pp->function || pp->file)
 		return -ENOTSUP;
 
-	map = get_target_map(target, uprobes);
+	map = get_target_map(target, nsi, uprobes);
 	if (!map)
 		return -EINVAL;
 
@@ -421,8 +428,8 @@ static int get_alternative_probe_event(struct debuginfo *dinfo,
 
 	memcpy(tmp, &pev->point, sizeof(*tmp));
 	memset(&pev->point, 0, sizeof(pev->point));
-	ret = find_alternative_probe_point(dinfo, tmp, &pev->point,
-					   pev->target, pev->uprobes);
+	ret = find_alternative_probe_point(dinfo, tmp, &pev->point, pev->target,
+					   pev->nsi, pev->uprobes);
 	if (ret < 0)
 		memcpy(&pev->point, tmp, sizeof(*tmp));
 
@@ -444,7 +451,7 @@ static int get_alternative_line_range(struct debuginfo *dinfo,
 	if (lr->end != INT_MAX)
 		len = lr->end - lr->start;
 	ret = find_alternative_probe_point(dinfo, &pp, &result,
-					   target, user);
+					   target, NULL, user);
 	if (!ret) {
 		lr->function = result.function;
 		lr->file = result.file;
@@ -457,12 +464,14 @@ static int get_alternative_line_range(struct debuginfo *dinfo,
 }
 
 /* Open new debuginfo of given module */
-static struct debuginfo *open_debuginfo(const char *module, bool silent)
+static struct debuginfo *open_debuginfo(const char *module, struct nsinfo *nsi,
+					bool silent)
 {
 	const char *path = module;
 	char reason[STRERR_BUFSIZE];
 	struct debuginfo *ret = NULL;
 	struct dso *dso = NULL;
+	struct nscookie nsc;
 	int err;
 
 	if (!module || !strchr(module, '/')) {
@@ -480,6 +489,7 @@ static struct debuginfo *open_debuginfo(const char *module, bool silent)
 		}
 		path = dso->long_name;
 	}
+	nsinfo__mountns_enter(nsi, &nsc);
 	ret = debuginfo__new(path);
 	if (!ret && !silent) {
 		pr_warning("The %s file has no debug information.\n", path);
@@ -489,6 +499,7 @@ static struct debuginfo *open_debuginfo(const char *module, bool silent)
 			pr_warning("Rebuild with -g, ");
 		pr_warning("or install an appropriate debuginfo package.\n");
 	}
+	nsinfo__mountns_exit(&nsc);
 	return ret;
 }
 
@@ -516,7 +527,7 @@ static struct debuginfo *debuginfo_cache__open(const char *module, bool silent)
 		goto out;
 	}
 
-	debuginfo_cache = open_debuginfo(module, silent);
+	debuginfo_cache = open_debuginfo(module, NULL, silent);
 	if (!debuginfo_cache)
 		zfree(&debuginfo_cache_path);
 out:
@@ -531,14 +542,18 @@ static void debuginfo_cache__exit(void)
 }
 
 
-static int get_text_start_address(const char *exec, unsigned long *address)
+static int get_text_start_address(const char *exec, unsigned long *address,
+				  struct nsinfo *nsi)
 {
 	Elf *elf;
 	GElf_Ehdr ehdr;
 	GElf_Shdr shdr;
 	int fd, ret = -ENOENT;
+	struct nscookie nsc;
 
+	nsinfo__mountns_enter(nsi, &nsc);
 	fd = open(exec, O_RDONLY);
+	nsinfo__mountns_exit(&nsc);
 	if (fd < 0)
 		return -errno;
 
@@ -582,7 +597,7 @@ static int find_perf_probe_point_from_dwarf(struct probe_trace_point *tp,
 			ret = -EINVAL;
 			goto error;
 		}
-		ret = get_text_start_address(tp->module, &stext);
+		ret = get_text_start_address(tp->module, &stext, NULL);
 		if (ret < 0)
 			goto error;
 		addr += stext;
@@ -659,7 +674,7 @@ post_process_offline_probe_trace_events(struct probe_trace_event *tevs,
 
 	/* Prepare a map for offline binary */
 	map = dso__new_map(pathname);
-	if (!map || get_text_start_address(pathname, &stext) < 0) {
+	if (!map || get_text_start_address(pathname, &stext, NULL) < 0) {
 		pr_warning("Failed to get ELF symbols for %s\n", pathname);
 		return -EINVAL;
 	}
@@ -676,7 +691,8 @@ post_process_offline_probe_trace_events(struct probe_trace_event *tevs,
 }
 
 static int add_exec_to_probe_trace_events(struct probe_trace_event *tevs,
-					  int ntevs, const char *exec)
+					  int ntevs, const char *exec,
+					  struct nsinfo *nsi)
 {
 	int i, ret = 0;
 	unsigned long stext = 0;
@@ -684,7 +700,7 @@ static int add_exec_to_probe_trace_events(struct probe_trace_event *tevs,
 	if (!exec)
 		return 0;
 
-	ret = get_text_start_address(exec, &stext);
+	ret = get_text_start_address(exec, &stext, nsi);
 	if (ret < 0)
 		return ret;
 
@@ -715,7 +731,7 @@ post_process_module_probe_trace_events(struct probe_trace_event *tevs,
 	if (!module)
 		return 0;
 
-	map = get_target_map(module, false);
+	map = get_target_map(module, NULL, false);
 	if (!map || debuginfo__get_text_offset(dinfo, &text_offs, true) < 0) {
 		pr_warning("Failed to get ELF symbols for %s\n", module);
 		return -EINVAL;
@@ -802,7 +818,8 @@ static int post_process_probe_trace_events(struct perf_probe_event *pev,
 	int ret;
 
 	if (uprobe)
-		ret = add_exec_to_probe_trace_events(tevs, ntevs, module);
+		ret = add_exec_to_probe_trace_events(tevs, ntevs, module,
+						     pev->nsi);
 	else if (module)
 		/* Currently ref_reloc_sym based probe is not for drivers */
 		ret = post_process_module_probe_trace_events(tevs, ntevs,
@@ -825,7 +842,7 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
 	struct debuginfo *dinfo;
 	int ntevs, ret = 0;
 
-	dinfo = open_debuginfo(pev->target, !need_dwarf);
+	dinfo = open_debuginfo(pev->target, pev->nsi, !need_dwarf);
 	if (!dinfo) {
 		if (need_dwarf)
 			return -ENOENT;
@@ -945,7 +962,7 @@ static int __show_line_range(struct line_range *lr, const char *module,
 	char sbuf[STRERR_BUFSIZE];
 
 	/* Search a line range */
-	dinfo = open_debuginfo(module, false);
+	dinfo = open_debuginfo(module, NULL, false);
 	if (!dinfo)
 		return -ENOENT;
 
@@ -1021,14 +1038,18 @@ end:
 	return ret;
 }
 
-int show_line_range(struct line_range *lr, const char *module, bool user)
+int show_line_range(struct line_range *lr, const char *module,
+		    struct nsinfo *nsi, bool user)
 {
 	int ret;
+	struct nscookie nsc;
 
 	ret = init_probe_symbol_maps(user);
 	if (ret < 0)
 		return ret;
+	nsinfo__mountns_enter(nsi, &nsc);
 	ret = __show_line_range(lr, module, user);
+	nsinfo__mountns_exit(&nsc);
 	exit_probe_symbol_maps();
 
 	return ret;
@@ -1111,7 +1132,7 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs,
 	if (ret < 0)
 		return ret;
 
-	dinfo = open_debuginfo(pevs->target, false);
+	dinfo = open_debuginfo(pevs->target, pevs->nsi, false);
 	if (!dinfo) {
 		ret = -ENOENT;
 		goto out;
@@ -1155,6 +1176,7 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
 
 int show_line_range(struct line_range *lr __maybe_unused,
 		    const char *module __maybe_unused,
+		    struct nsinfo *nsi __maybe_unused,
 		    bool user __maybe_unused)
 {
 	pr_warning("Debuginfo-analysis is not supported.\n");
@@ -2373,7 +2395,7 @@ kprobe_blacklist__find_by_address(struct list_head *blacklist,
 	struct kprobe_blacklist_node *node;
 
 	list_for_each_entry(node, blacklist, list) {
-		if (node->start <= address && address <= node->end)
+		if (node->start <= address && address < node->end)
 			return node;
 	}
 
@@ -2703,6 +2725,7 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
 	struct probe_trace_event *tev = NULL;
 	struct probe_cache *cache = NULL;
 	struct strlist *namelist[2] = {NULL, NULL};
+	struct nscookie nsc;
 
 	up = pev->uprobes ? 1 : 0;
 	fd[up] = __open_probe_file_and_namelist(up, &namelist[up]);
@@ -2729,7 +2752,9 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
 		if (ret < 0)
 			break;
 
+		nsinfo__mountns_enter(pev->nsi, &nsc);
 		ret = probe_file__add_event(fd[up], tev);
+		nsinfo__mountns_exit(&nsc);
 		if (ret < 0)
 			break;
 
@@ -2744,7 +2769,7 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
 	if (ret == -EINVAL && pev->uprobes)
 		warn_uprobe_event_compat(tev);
 	if (ret == 0 && probe_conf.cache) {
-		cache = probe_cache__new(pev->target);
+		cache = probe_cache__new(pev->target, pev->nsi);
 		if (!cache ||
 		    probe_cache__add_entry(cache, pev, tevs, ntevs) < 0 ||
 		    probe_cache__commit(cache) < 0)
@@ -2805,7 +2830,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 	int ret, i, j, skipped = 0;
 	char *mod_name;
 
-	map = get_target_map(pev->target, pev->uprobes);
+	map = get_target_map(pev->target, pev->nsi, pev->uprobes);
 	if (!map) {
 		ret = -EINVAL;
 		goto out;
@@ -3094,7 +3119,7 @@ static int find_cached_events(struct perf_probe_event *pev,
 	int ntevs = 0;
 	int ret = 0;
 
-	cache = probe_cache__new(target);
+	cache = probe_cache__new(target, pev->nsi);
 	/* Return 0 ("not found") if the target has no probe cache. */
 	if (!cache)
 		return 0;
@@ -3184,7 +3209,7 @@ static int find_probe_trace_events_from_cache(struct perf_probe_event *pev,
 		else
 			return find_cached_events(pev, tevs, pev->target);
 	}
-	cache = probe_cache__new(pev->target);
+	cache = probe_cache__new(pev->target, pev->nsi);
 	if (!cache)
 		return 0;
 
@@ -3345,13 +3370,16 @@ int apply_perf_probe_events(struct perf_probe_event *pevs, int npevs)
 void cleanup_perf_probe_events(struct perf_probe_event *pevs, int npevs)
 {
 	int i, j;
+	struct perf_probe_event *pev;
 
 	/* Loop 3: cleanup and free trace events  */
 	for (i = 0; i < npevs; i++) {
+		pev = &pevs[i];
 		for (j = 0; j < pevs[i].ntevs; j++)
 			clear_probe_trace_event(&pevs[i].tevs[j]);
 		zfree(&pevs[i].tevs);
 		pevs[i].ntevs = 0;
+		nsinfo__zput(pev->nsi);
 		clear_perf_probe_event(&pevs[i]);
 	}
 }
@@ -3409,8 +3437,8 @@ out:
 	return ret;
 }
 
-int show_available_funcs(const char *target, struct strfilter *_filter,
-					bool user)
+int show_available_funcs(const char *target, struct nsinfo *nsi,
+			 struct strfilter *_filter, bool user)
 {
         struct rb_node *nd;
 	struct map *map;
@@ -3421,7 +3449,7 @@ int show_available_funcs(const char *target, struct strfilter *_filter,
 		return ret;
 
 	/* Get a symbol map */
-	map = get_target_map(target, user);
+	map = get_target_map(target, nsi, user);
 	if (!map) {
 		pr_err("Failed to get a map for %s\n", (target) ? : "kernel");
 		return -EINVAL;
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 5812947418dd..078681d12168 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -4,6 +4,7 @@
 #include <linux/compiler.h>
 #include <stdbool.h>
 #include "intlist.h"
+#include "namespaces.h"
 
 /* Probe related configurations */
 struct probe_conf {
@@ -92,6 +93,7 @@ struct perf_probe_event {
 	struct perf_probe_arg	*args;	/* Arguments */
 	struct probe_trace_event *tevs;
 	int			ntevs;
+	struct nsinfo		*nsi;	/* Target namespace */
 };
 
 /* Line range */
@@ -163,10 +165,12 @@ int show_perf_probe_event(const char *group, const char *event,
 			  struct perf_probe_event *pev,
 			  const char *module, bool use_stdout);
 int show_perf_probe_events(struct strfilter *filter);
-int show_line_range(struct line_range *lr, const char *module, bool user);
+int show_line_range(struct line_range *lr, const char *module,
+		    struct nsinfo *nsi, bool user);
 int show_available_vars(struct perf_probe_event *pevs, int npevs,
 			struct strfilter *filter);
-int show_available_funcs(const char *module, struct strfilter *filter, bool user);
+int show_available_funcs(const char *module, struct nsinfo *nsi,
+			 struct strfilter *filter, bool user);
 void arch__fix_tev_from_maps(struct perf_probe_event *pev,
 			     struct probe_trace_event *tev, struct map *map,
 			     struct symbol *sym);
@@ -180,7 +184,7 @@ int e_snprintf(char *str, size_t size, const char *format, ...) __printf(3, 4);
 int copy_to_probe_trace_arg(struct probe_trace_arg *tvar,
 			    struct perf_probe_arg *pvar);
 
-struct map *get_target_map(const char *target, bool user);
+struct map *get_target_map(const char *target, struct nsinfo *nsi, bool user);
 
 void arch__post_process_probe_trace_events(struct perf_probe_event *pev,
 					   int ntevs);
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c
index d679389e627c..cdf8d83a484c 100644
--- a/tools/perf/util/probe-file.c
+++ b/tools/perf/util/probe-file.c
@@ -412,13 +412,15 @@ int probe_cache_entry__get_event(struct probe_cache_entry *entry,
 }
 
 /* For the kernel probe caches, pass target = NULL or DSO__NAME_KALLSYMS */
-static int probe_cache__open(struct probe_cache *pcache, const char *target)
+static int probe_cache__open(struct probe_cache *pcache, const char *target,
+			     struct nsinfo *nsi)
 {
 	char cpath[PATH_MAX];
 	char sbuildid[SBUILD_ID_SIZE];
 	char *dir_name = NULL;
 	bool is_kallsyms = false;
 	int ret, fd;
+	struct nscookie nsc;
 
 	if (target && build_id_cache__cached(target)) {
 		/* This is a cached buildid */
@@ -431,8 +433,11 @@ static int probe_cache__open(struct probe_cache *pcache, const char *target)
 		target = DSO__NAME_KALLSYMS;
 		is_kallsyms = true;
 		ret = sysfs__sprintf_build_id("/", sbuildid);
-	} else
+	} else {
+		nsinfo__mountns_enter(nsi, &nsc);
 		ret = filename__sprintf_build_id(target, sbuildid);
+		nsinfo__mountns_exit(&nsc);
+	}
 
 	if (ret < 0) {
 		pr_debug("Failed to get build-id from %s.\n", target);
@@ -441,7 +446,7 @@ static int probe_cache__open(struct probe_cache *pcache, const char *target)
 
 	/* If we have no buildid cache, make it */
 	if (!build_id_cache__cached(sbuildid)) {
-		ret = build_id_cache__add_s(sbuildid, target,
+		ret = build_id_cache__add_s(sbuildid, target, nsi,
 					    is_kallsyms, NULL);
 		if (ret < 0) {
 			pr_debug("Failed to add build-id cache: %s\n", target);
@@ -449,7 +454,7 @@ static int probe_cache__open(struct probe_cache *pcache, const char *target)
 		}
 	}
 
-	dir_name = build_id_cache__cachedir(sbuildid, target, is_kallsyms,
+	dir_name = build_id_cache__cachedir(sbuildid, target, nsi, is_kallsyms,
 					    false);
 found:
 	if (!dir_name) {
@@ -554,7 +559,7 @@ void probe_cache__delete(struct probe_cache *pcache)
 	free(pcache);
 }
 
-struct probe_cache *probe_cache__new(const char *target)
+struct probe_cache *probe_cache__new(const char *target, struct nsinfo *nsi)
 {
 	struct probe_cache *pcache = probe_cache__alloc();
 	int ret;
@@ -562,7 +567,7 @@ struct probe_cache *probe_cache__new(const char *target)
 	if (!pcache)
 		return NULL;
 
-	ret = probe_cache__open(pcache, target);
+	ret = probe_cache__open(pcache, target, nsi);
 	if (ret < 0) {
 		pr_debug("Cache open error: %d\n", ret);
 		goto out_err;
@@ -974,7 +979,7 @@ int probe_cache__show_all_caches(struct strfilter *filter)
 		return -EINVAL;
 	}
 	strlist__for_each_entry(nd, bidlist) {
-		pcache = probe_cache__new(nd->s);
+		pcache = probe_cache__new(nd->s, NULL);
 		if (!pcache)
 			continue;
 		if (!list_empty(&pcache->entries)) {
diff --git a/tools/perf/util/probe-file.h b/tools/perf/util/probe-file.h
index 5ecc9d3925db..2ca4163abafe 100644
--- a/tools/perf/util/probe-file.h
+++ b/tools/perf/util/probe-file.h
@@ -51,7 +51,7 @@ int probe_file__del_strlist(int fd, struct strlist *namelist);
 int probe_cache_entry__get_event(struct probe_cache_entry *entry,
 				 struct probe_trace_event **tevs);
 
-struct probe_cache *probe_cache__new(const char *target);
+struct probe_cache *probe_cache__new(const char *target, struct nsinfo *nsi);
 int probe_cache__add_entry(struct probe_cache *pcache,
 			   struct perf_probe_event *pev,
 			   struct probe_trace_event *tevs, int ntevs);
@@ -69,7 +69,7 @@ int probe_cache__show_all_caches(struct strfilter *filter);
 bool probe_type_is_available(enum probe_type type);
 bool kretprobe_offset_is_supported(void);
 #else	/* ! HAVE_LIBELF_SUPPORT */
-static inline struct probe_cache *probe_cache__new(const char *tgt __maybe_unused)
+static inline struct probe_cache *probe_cache__new(const char *tgt __maybe_unused, struct nsinfo *nsi __maybe_unused)
 {
 	return NULL;
 }
diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources
index 9f3b0d9754a8..e66dc495809a 100644
--- a/tools/perf/util/python-ext-sources
+++ b/tools/perf/util/python-ext-sources
@@ -10,6 +10,7 @@ util/ctype.c
 util/evlist.c
 util/evsel.c
 util/cpumap.c
+util/namespaces.c
 ../lib/bitmap.c
 ../lib/find_bit.c
 ../lib/hweight.c
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 57b7a00e6f16..c7187f067d31 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -116,6 +116,34 @@ static PyObject *get_handler(const char *handler_name)
 	return handler;
 }
 
+/*
+ * Return the handler's co_argcount; the result stays 0 when the code object
+ * or its co_argcount attribute is unavailable.  The code object is looked up
+ * as func_code (Python 2) first, then as __code__ (Python 3.0+).
+ */
+static int get_argument_count(PyObject *handler)
+{
+	PyObject *code, *argc_obj;
+	int nr_args = 0;
+
+	code = PyObject_GetAttrString(handler, "func_code");
+	if (PyErr_Occurred()) {
+		PyErr_Clear();
+		code = PyObject_GetAttrString(handler, "__code__");
+	}
+	PyErr_Clear();
+	if (!code)
+		return nr_args;
+
+	argc_obj = PyObject_GetAttrString(code, "co_argcount");
+	if (argc_obj) {
+		nr_args = (int) PyInt_AsLong(argc_obj);
+		Py_DECREF(argc_obj);
+	}
+	Py_DECREF(code);
+	return nr_args;
+}
+
 static void call_object(PyObject *handler, PyObject *args, const char *die_msg)
 {
 	PyObject *retval;
@@ -391,13 +419,115 @@ exit:
 	return pylist;
 }
 
+/* Build the Python tuple (id, value) for one sample_read_value. */
+static PyObject *get_sample_value_as_tuple(struct sample_read_value *value)
+{
+	PyObject *pair = PyTuple_New(2);
+
+	if (pair == NULL)
+		Py_FatalError("couldn't create Python tuple");
+	PyTuple_SetItem(pair, 0, PyLong_FromUnsignedLongLong(value->id));
+	PyTuple_SetItem(pair, 1, PyLong_FromUnsignedLongLong(value->value));
+	return pair;
+}
+
+/*
+ * Store sample->read in @dict_sample: "time_enabled" and "time_running"
+ * when the evsel's read_format includes them, and "values", a list of
+ * (id, value) tuples - one per group entry, or just one when ungrouped.
+ */
+static void set_sample_read_in_dict(PyObject *dict_sample,
+					 struct perf_sample *sample,
+					 struct perf_evsel *evsel)
+{
+	u64 read_format = evsel->attr.read_format;
+	u64 grouped = read_format & PERF_FORMAT_GROUP;
+	PyObject *values, *item;
+	unsigned int i;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		pydict_set_item_string_decref(dict_sample, "time_enabled",
+			PyLong_FromUnsignedLongLong(sample->read.time_enabled));
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		pydict_set_item_string_decref(dict_sample, "time_running",
+			PyLong_FromUnsignedLongLong(sample->read.time_running));
+	}
+	values = PyList_New(grouped ? sample->read.group.nr : 1);
+	if (!values)
+		Py_FatalError("couldn't create Python list");
+
+	if (!grouped) {
+		item = get_sample_value_as_tuple(&sample->read.one);
+		PyList_SET_ITEM(values, 0, item);
+	} else {
+		for (i = 0; i < sample->read.group.nr; i++) {
+			item = get_sample_value_as_tuple(&sample->read.group.values[i]);
+			PyList_SET_ITEM(values, i, item);
+		}
+	}
+	pydict_set_item_string_decref(dict_sample, "values", values);
+}
+
+/*
+ * Build the dict passed to Python handlers: "ev_name", "attr", a nested
+ * "sample" dict, "raw_buf", "comm", optional "dso"/"symbol", "callchain".
+ */
+static PyObject *get_perf_sample_dict(struct perf_sample *sample,
+					 struct perf_evsel *evsel,
+					 struct addr_location *al,
+					 PyObject *callchain)
+{
+	PyObject *dict_sample, *dict = PyDict_New();
+
+	if (!dict)
+		Py_FatalError("couldn't create Python dictionary");
+	dict_sample = PyDict_New();
+	if (!dict_sample)
+		Py_FatalError("couldn't create Python dictionary");
+
+	pydict_set_item_string_decref(dict_sample, "pid",
+			PyInt_FromLong(sample->pid));
+	pydict_set_item_string_decref(dict_sample, "tid",
+			PyInt_FromLong(sample->tid));
+	pydict_set_item_string_decref(dict_sample, "cpu",
+			PyInt_FromLong(sample->cpu));
+	pydict_set_item_string_decref(dict_sample, "ip",
+			PyLong_FromUnsignedLongLong(sample->ip));
+	pydict_set_item_string_decref(dict_sample, "time",
+			PyLong_FromUnsignedLongLong(sample->time));
+	pydict_set_item_string_decref(dict_sample, "period",
+			PyLong_FromUnsignedLongLong(sample->period));
+	set_sample_read_in_dict(dict_sample, sample, evsel);
+
+	pydict_set_item_string_decref(dict, "ev_name", PyString_FromString(perf_evsel__name(evsel)));
+	pydict_set_item_string_decref(dict, "attr", PyString_FromStringAndSize(
+			(const char *)&evsel->attr, sizeof(evsel->attr)));
+	pydict_set_item_string_decref(dict, "sample", dict_sample);
+	pydict_set_item_string_decref(dict, "raw_buf", PyString_FromStringAndSize(
+			(const char *)sample->raw_data, sample->raw_size));
+	pydict_set_item_string_decref(dict, "comm",
+			PyString_FromString(thread__comm_str(al->thread)));
+	if (al->map) {
+		pydict_set_item_string_decref(dict, "dso",
+			PyString_FromString(al->map->dso->name));
+	}
+	if (al->sym) {
+		pydict_set_item_string_decref(dict, "symbol",
+			PyString_FromString(al->sym->name));
+	}
+	pydict_set_item_string_decref(dict, "callchain", callchain);
+
+	return dict;
+}
+
 static void python_process_tracepoint(struct perf_sample *sample,
 				      struct perf_evsel *evsel,
 				      struct addr_location *al)
 {
 	struct event_format *event = evsel->tp_format;
 	PyObject *handler, *context, *t, *obj = NULL, *callchain;
-	PyObject *dict = NULL;
+	PyObject *dict = NULL, *all_entries_dict = NULL;
 	static char handler_name[256];
 	struct format_field *field;
 	unsigned long s, ns;
@@ -407,10 +537,7 @@ static void python_process_tracepoint(struct perf_sample *sample,
 	void *data = sample->raw_data;
 	unsigned long long nsecs = sample->time;
 	const char *comm = thread__comm_str(al->thread);
-
-	t = PyTuple_New(MAX_FIELDS);
-	if (!t)
-		Py_FatalError("couldn't create Python tuple");
+	const char *default_handler_name = "trace_unhandled";
 
 	if (!event) {
 		snprintf(handler_name, sizeof(handler_name),
@@ -427,10 +554,19 @@ static void python_process_tracepoint(struct perf_sample *sample,
 
 	handler = get_handler(handler_name);
 	if (!handler) {
+		handler = get_handler(default_handler_name);
+		if (!handler)
+			return;
 		dict = PyDict_New();
 		if (!dict)
 			Py_FatalError("couldn't create Python dict");
 	}
+
+	t = PyTuple_New(MAX_FIELDS);
+	if (!t)
+		Py_FatalError("couldn't create Python tuple");
+
+
 	s = nsecs / NSEC_PER_SEC;
 	ns = nsecs - s * NSEC_PER_SEC;
 
@@ -444,8 +580,10 @@ static void python_process_tracepoint(struct perf_sample *sample,
 
 	/* ip unwinding */
 	callchain = python_process_callchain(sample, evsel, al);
+	/* Need an additional reference for the perf_sample dict */
+	Py_INCREF(callchain);
 
-	if (handler) {
+	if (!dict) {
 		PyTuple_SetItem(t, n++, PyInt_FromLong(cpu));
 		PyTuple_SetItem(t, n++, PyInt_FromLong(s));
 		PyTuple_SetItem(t, n++, PyInt_FromLong(ns));
@@ -484,26 +622,35 @@ static void python_process_tracepoint(struct perf_sample *sample,
 		} else { /* FIELD_IS_NUMERIC */
 			obj = get_field_numeric_entry(event, field, data);
 		}
-		if (handler)
+		if (!dict)
 			PyTuple_SetItem(t, n++, obj);
 		else
 			pydict_set_item_string_decref(dict, field->name, obj);
 
 	}
 
-	if (!handler)
+	if (dict)
 		PyTuple_SetItem(t, n++, dict);
 
+	if (get_argument_count(handler) == (int) n + 1) {
+		all_entries_dict = get_perf_sample_dict(sample, evsel, al,
+			callchain);
+		PyTuple_SetItem(t, n++,	all_entries_dict);
+	} else {
+		Py_DECREF(callchain);
+	}
+
 	if (_PyTuple_Resize(&t, n) == -1)
 		Py_FatalError("error resizing Python tuple");
 
-	if (handler) {
+	if (!dict) {
 		call_object(handler, t, handler_name);
 	} else {
-		try_call_object("trace_unhandled", t);
+		call_object(handler, t, default_handler_name);
 		Py_DECREF(dict);
 	}
 
+	Py_XDECREF(all_entries_dict);
 	Py_DECREF(t);
 }
 
@@ -795,10 +942,16 @@ static void python_process_general_event(struct perf_sample *sample,
 					 struct perf_evsel *evsel,
 					 struct addr_location *al)
 {
-	PyObject *handler, *t, *dict, *callchain, *dict_sample;
+	PyObject *handler, *t, *dict, *callchain;
 	static char handler_name[64];
 	unsigned n = 0;
 
+	snprintf(handler_name, sizeof(handler_name), "%s", "process_event");
+
+	handler = get_handler(handler_name);
+	if (!handler)
+		return;
+
 	/*
 	 * Use the MAX_FIELDS to make the function expandable, though
 	 * currently there is only one item for the tuple.
@@ -807,61 +960,16 @@ static void python_process_general_event(struct perf_sample *sample,
 	if (!t)
 		Py_FatalError("couldn't create Python tuple");
 
-	dict = PyDict_New();
-	if (!dict)
-		Py_FatalError("couldn't create Python dictionary");
-
-	dict_sample = PyDict_New();
-	if (!dict_sample)
-		Py_FatalError("couldn't create Python dictionary");
-
-	snprintf(handler_name, sizeof(handler_name), "%s", "process_event");
-
-	handler = get_handler(handler_name);
-	if (!handler)
-		goto exit;
-
-	pydict_set_item_string_decref(dict, "ev_name", PyString_FromString(perf_evsel__name(evsel)));
-	pydict_set_item_string_decref(dict, "attr", PyString_FromStringAndSize(
-			(const char *)&evsel->attr, sizeof(evsel->attr)));
-
-	pydict_set_item_string_decref(dict_sample, "pid",
-			PyInt_FromLong(sample->pid));
-	pydict_set_item_string_decref(dict_sample, "tid",
-			PyInt_FromLong(sample->tid));
-	pydict_set_item_string_decref(dict_sample, "cpu",
-			PyInt_FromLong(sample->cpu));
-	pydict_set_item_string_decref(dict_sample, "ip",
-			PyLong_FromUnsignedLongLong(sample->ip));
-	pydict_set_item_string_decref(dict_sample, "time",
-			PyLong_FromUnsignedLongLong(sample->time));
-	pydict_set_item_string_decref(dict_sample, "period",
-			PyLong_FromUnsignedLongLong(sample->period));
-	pydict_set_item_string_decref(dict, "sample", dict_sample);
-
-	pydict_set_item_string_decref(dict, "raw_buf", PyString_FromStringAndSize(
-			(const char *)sample->raw_data, sample->raw_size));
-	pydict_set_item_string_decref(dict, "comm",
-			PyString_FromString(thread__comm_str(al->thread)));
-	if (al->map) {
-		pydict_set_item_string_decref(dict, "dso",
-			PyString_FromString(al->map->dso->name));
-	}
-	if (al->sym) {
-		pydict_set_item_string_decref(dict, "symbol",
-			PyString_FromString(al->sym->name));
-	}
-
 	/* ip unwinding */
 	callchain = python_process_callchain(sample, evsel, al);
-	pydict_set_item_string_decref(dict, "callchain", callchain);
+	dict = get_perf_sample_dict(sample, evsel, al, callchain);
 
 	PyTuple_SetItem(t, n++, dict);
 	if (_PyTuple_Resize(&t, n) == -1)
 		Py_FatalError("error resizing Python tuple");
 
 	call_object(handler, t, handler_name);
-exit:
+
 	Py_DECREF(dict);
 	Py_DECREF(t);
 }
@@ -1259,6 +1367,12 @@ static int python_generate_script(struct pevent *pevent, const char *outfile)
 
 			fprintf(ofp, "%s", f->name);
 		}
+		if (not_first++)
+			fprintf(ofp, ", ");
+		if (++count % 5 == 0)
+			fprintf(ofp, "\n\t\t");
+		fprintf(ofp, "perf_sample_dict");
+
 		fprintf(ofp, "):\n");
 
 		fprintf(ofp, "\t\tprint_header(event_name, common_cpu, "
@@ -1328,6 +1442,9 @@ static int python_generate_script(struct pevent *pevent, const char *outfile)
 
 		fprintf(ofp, ")\n\n");
 
+		fprintf(ofp, "\t\tprint 'Sample: {'+"
+			"get_dict_as_string(perf_sample_dict['sample'], ', ')+'}'\n\n");
+
 		fprintf(ofp, "\t\tfor node in common_callchain:");
 		fprintf(ofp, "\n\t\t\tif 'sym' in node:");
 		fprintf(ofp, "\n\t\t\t\tprint \"\\t[%%x] %%s\" %% (node['ip'], node['sym']['name'])");
@@ -1338,15 +1455,20 @@ static int python_generate_script(struct pevent *pevent, const char *outfile)
 	}
 
 	fprintf(ofp, "def trace_unhandled(event_name, context, "
-		"event_fields_dict):\n");
+		"event_fields_dict, perf_sample_dict):\n");
 
-	fprintf(ofp, "\t\tprint ' '.join(['%%s=%%s'%%(k,str(v))"
-		"for k,v in sorted(event_fields_dict.items())])\n\n");
+	fprintf(ofp, "\t\tprint get_dict_as_string(event_fields_dict)\n");
+	fprintf(ofp, "\t\tprint 'Sample: {'+"
+		"get_dict_as_string(perf_sample_dict['sample'], ', ')+'}'\n\n");
 
 	fprintf(ofp, "def print_header("
 		"event_name, cpu, secs, nsecs, pid, comm):\n"
 		"\tprint \"%%-20s %%5u %%05u.%%09u %%8u %%-20s \" %% \\\n\t"
-		"(event_name, cpu, secs, nsecs, pid, comm),\n");
+		"(event_name, cpu, secs, nsecs, pid, comm),\n\n");
+
+	fprintf(ofp, "def get_dict_as_string(a_dict, delimiter=' '):\n"
+		"\treturn delimiter.join"
+		"(['%%s=%%s'%%(k,str(v))for k,v in sorted(a_dict.items())])\n");
 
 	fclose(ofp);
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index d19c40a81040..ac863691605f 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -428,6 +428,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 		tool->stat_round = process_stat_round_stub;
 	if (tool->time_conv == NULL)
 		tool->time_conv = process_event_op2_stub;
+	if (tool->feature == NULL)
+		tool->feature = process_event_op2_stub;
 }
 
 static void swap_sample_id_all(union perf_event *event, void *data)
@@ -1125,6 +1127,30 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
 		sample_read__printf(sample, evsel->attr.read_format);
 }
 
+/* With dump_trace set, print the fields of a PERF_RECORD_READ event. */
+static void dump_read(struct perf_evsel *evsel, union perf_event *event)
+{
+	struct read_event *read_event = &event->read;
+	u64 read_format;
+
+	if (!dump_trace)
+		return;
+
+	printf(": %d %d %s %" PRIu64 "\n", event->read.pid, event->read.tid,
+	       evsel ? perf_evsel__name(evsel) : "FAIL",
+	       event->read.value);
+	/* Without an evsel there is no read_format to decode the rest with. */
+	if (!evsel)
+		return;
+	read_format = evsel->attr.read_format;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		printf("... time enabled : %" PRIu64 "\n", read_event->time_enabled);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		printf("... time running : %" PRIu64 "\n", read_event->time_running);
+	if (read_format & PERF_FORMAT_ID)
+		printf("... id           : %" PRIu64 "\n", read_event->id);
+}
+
 static struct machine *machines__find_for_cpumode(struct machines *machines,
 					       union perf_event *event,
 					       struct perf_sample *sample)
@@ -1269,6 +1295,7 @@ static int machines__deliver_event(struct machines *machines,
 			evlist->stats.total_lost_samples += event->lost_samples.lost;
 		return tool->lost_samples(tool, event, sample, machine);
 	case PERF_RECORD_READ:
+		dump_read(evsel, event);
 		return tool->read(tool, event, sample, evsel, machine);
 	case PERF_RECORD_THROTTLE:
 		return tool->throttle(tool, event, sample, machine);
@@ -1371,6 +1398,8 @@ static s64 perf_session__process_user_event(struct perf_session *session,
 	case PERF_RECORD_TIME_CONV:
 		session->time_conv = event->time_conv;
 		return tool->time_conv(tool, event, session);
+	case PERF_RECORD_HEADER_FEATURE:
+		return tool->feature(tool, event, session);
 	default:
 		return -EINVAL;
 	}
diff --git a/tools/perf/util/setns.c b/tools/perf/util/setns.c
new file mode 100644
index 000000000000..ce8fc290fce8
--- /dev/null
+++ b/tools/perf/util/setns.c
@@ -0,0 +1,8 @@
+#include "util.h"
+#include <unistd.h>
+#include <sys/syscall.h>
+
+int setns(int fd, int nstype)
+{
+	return (int)syscall(__NR_setns, fd, nstype); /* raw setns system call */
+}
diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
new file mode 100644
index 000000000000..453f6f6f29f3
--- /dev/null
+++ b/tools/perf/util/smt.c
@@ -0,0 +1,44 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <linux/bitops.h>
+#include "api/fs/fs.h"
+#include "smt.h"
+
+/* Return 1 if any CPU lists more than one thread sibling in sysfs, else 0. */
+int smt_on(void)
+{
+	static bool cached;
+	static int cached_result;
+	int cpu, ncpu;
+
+	if (cached)
+		return cached_result;
+
+	ncpu = sysconf(_SC_NPROCESSORS_CONF);
+	for (cpu = 0; cpu < ncpu && !cached_result; cpu++) {
+		char fn[256], *str;
+		size_t len, i;
+		int nbits = 0;
+
+		snprintf(fn, sizeof fn,
+			"devices/system/cpu/cpu%d/topology/thread_siblings",
+			cpu);
+		if (sysfs__read_str(fn, &str, &len) < 0)
+			continue;
+		/*
+		 * The mask is hex without 0x, in comma-separated 32-bit words;
+		 * strtoull() stops at the first comma, so count bits per digit.
+		 */
+		for (i = 0; i < len && str[i]; i++) {
+			int c = str[i] | 0x20;	/* fold A-F onto a-f */
+			int v = (c >= 'a' && c <= 'f') ? c - 'a' + 10 :
+				(c >= '0' && c <= '9') ? c - '0' : 0;
+			nbits += (v & 1) + ((v >> 1) & 1) + ((v >> 2) & 1) + (v >> 3);
+		}
+		free(str);
+		cached_result = nbits > 1;
+	}
+	cached = true;
+	return cached_result;
+}
diff --git a/tools/perf/util/smt.h b/tools/perf/util/smt.h
new file mode 100644
index 000000000000..b8414b7bcbc8
--- /dev/null
+++ b/tools/perf/util/smt.h
@@ -0,0 +1,6 @@
+#ifndef SMT_H
+#define SMT_H 1
+
+int smt_on(void);
+
+#endif
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 8b327c955a4f..12359bd986db 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -2563,7 +2563,7 @@ static const char *get_default_sort_order(struct perf_evlist *evlist)
 
 	BUG_ON(sort__mode >= ARRAY_SIZE(default_sort_orders));
 
-	if (evlist == NULL)
+	if (evlist == NULL || perf_evlist__empty(evlist))
 		goto out_no_evlist;
 
 	evlist__for_each_entry(evlist, evsel) {
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index ebc88a74e67b..ed8e8d2de942 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -155,6 +155,9 @@ static void find_address_in_section(bfd *abfd, asection *section, void *data)
 	a2l->found = bfd_find_nearest_line(abfd, section, a2l->syms, pc - vma,
 					   &a2l->filename, &a2l->funcname,
 					   &a2l->line);
+
+	if (a2l->filename && !strlen(a2l->filename))
+		a2l->filename = NULL;
 }
 
 static struct a2l_data *addr2line_init(const char *path)
@@ -248,6 +251,9 @@ static int addr2line(const char *dso_name, u64 addr,
 					     &a2l->funcname, &a2l->line) &&
 		       cnt++ < MAX_INLINE_NEST) {
 
+			if (a2l->filename && !strlen(a2l->filename))
+				a2l->filename = NULL;
+
 			if (node != NULL) {
 				if (inline_list__append_dso_a2l(dso, node))
 					return 0;
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 719d6cb86952..a04cf56d3517 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -70,7 +70,11 @@ static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
 		return a->ctx - b->ctx;
 	if (a->cpu != b->cpu)
 		return a->cpu - b->cpu;
-	return a->evsel - b->evsel;
+	if (a->evsel == b->evsel)
+		return 0;
+	if ((char *)a->evsel < (char *)b->evsel)
+		return -1;
+	return +1;
 }
 
 static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused,
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 53b9a994a3dc..35e9848734d6 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -128,6 +128,10 @@ static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
 
 static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
 {
+	struct perf_stat_evsel *ps = evsel->priv;
+
+	if (ps)
+		free(ps->group_data);
 	zfree(&evsel->priv);
 }
 
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 7522bf10b03e..eacaf958e19d 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -28,8 +28,9 @@ enum perf_stat_evsel_id {
 };
 
 struct perf_stat_evsel {
-	struct stats		res_stats[3];
-	enum perf_stat_evsel_id	id;
+	struct stats		 res_stats[3];
+	enum perf_stat_evsel_id	 id;
+	u64			*group_data;
 };
 
 enum aggr_mode {
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 502505cf236a..5c39f420111e 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -259,7 +259,7 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *
 {
 	uint32_t nr_rel_entries, idx;
 	GElf_Sym sym;
-	u64 plt_offset;
+	u64 plt_offset, plt_header_size, plt_entry_size;
 	GElf_Shdr shdr_plt;
 	struct symbol *f;
 	GElf_Shdr shdr_rel_plt, shdr_dynsym;
@@ -326,6 +326,23 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *
 
 	nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize;
 	plt_offset = shdr_plt.sh_offset;
+	switch (ehdr.e_machine) {
+		case EM_ARM:
+			plt_header_size = 20;
+			plt_entry_size = 12;
+			break;
+
+		case EM_AARCH64:
+			plt_header_size = 32;
+			plt_entry_size = 16;
+			break;
+
+		default: /* FIXME: s390/alpha/mips/parisc/powerpc/sh/sparc/xtensa need to be checked */
+			plt_header_size = shdr_plt.sh_entsize;
+			plt_entry_size = shdr_plt.sh_entsize;
+			break;
+	}
+	plt_offset += plt_header_size;
 
 	if (shdr_rel_plt.sh_type == SHT_RELA) {
 		GElf_Rela pos_mem, *pos;
@@ -335,7 +352,6 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *
 			const char *elf_name = NULL;
 			char *demangled = NULL;
 			symidx = GELF_R_SYM(pos->r_info);
-			plt_offset += shdr_plt.sh_entsize;
 			gelf_getsym(syms, symidx, &sym);
 
 			elf_name = elf_sym__name(&sym, symstrs);
@@ -346,11 +362,12 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *
 				 "%s@plt", elf_name);
 			free(demangled);
 
-			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+			f = symbol__new(plt_offset, plt_entry_size,
 					STB_GLOBAL, sympltname);
 			if (!f)
 				goto out_elf_end;
 
+			plt_offset += plt_entry_size;
 			symbols__insert(&dso->symbols[map->type], f);
 			++nr;
 		}
@@ -361,7 +378,6 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *
 			const char *elf_name = NULL;
 			char *demangled = NULL;
 			symidx = GELF_R_SYM(pos->r_info);
-			plt_offset += shdr_plt.sh_entsize;
 			gelf_getsym(syms, symidx, &sym);
 
 			elf_name = elf_sym__name(&sym, symstrs);
@@ -372,11 +388,12 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map *
 				 "%s@plt", elf_name);
 			free(demangled);
 
-			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+			f = symbol__new(plt_offset, plt_entry_size,
 					STB_GLOBAL, sympltname);
 			if (!f)
 				goto out_elf_end;
 
+			plt_offset += plt_entry_size;
 			symbols__insert(&dso->symbols[map->type], f);
 			++nr;
 		}
@@ -391,7 +408,7 @@ out_elf_end:
 	return 0;
 }
 
-char *dso__demangle_sym(struct dso *dso, int kmodule, char *elf_name)
+char *dso__demangle_sym(struct dso *dso, int kmodule, const char *elf_name)
 {
 	return demangle_sym(dso, kmodule, elf_name);
 }
@@ -793,6 +810,12 @@ static u64 ref_reloc(struct kmap *kmap)
 void __weak arch__sym_update(struct symbol *s __maybe_unused,
 		GElf_Sym *sym __maybe_unused) { }
 
+void __weak arch__adjust_sym_map_offset(GElf_Sym *sym, GElf_Shdr *shdr,
+				       struct map *map __maybe_unused)
+{
+	sym->st_value += shdr->sh_offset - shdr->sh_addr; /* to file offset */
+}
+
 int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 		  struct symsrc *runtime_ss, int kmodule)
 {
@@ -973,7 +996,7 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 
 			/* Adjust symbol to map to file offset */
 			if (adjust_kernel_syms)
-				sym.st_value -= shdr.sh_addr - shdr.sh_offset;
+				arch__adjust_sym_map_offset(&sym, &shdr, map);
 
 			if (strcmp(section_name,
 				   (curr_dso->short_name +
@@ -1442,7 +1465,7 @@ static int kcore_copy__parse_kallsyms(struct kcore_copy_info *kci,
 
 static int kcore_copy__process_modules(void *arg,
 				       const char *name __maybe_unused,
-				       u64 start)
+				       u64 start, u64 size __maybe_unused)
 {
 	struct kcore_copy_info *kci = arg;
 
diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c
index 40bf5d4c0bfd..1a5aa35b0100 100644
--- a/tools/perf/util/symbol-minimal.c
+++ b/tools/perf/util/symbol-minimal.c
@@ -377,7 +377,7 @@ void symbol__elf_init(void)
 
 char *dso__demangle_sym(struct dso *dso __maybe_unused,
 			int kmodule __maybe_unused,
-			char *elf_name __maybe_unused)
+			const char *elf_name __maybe_unused)
 {
 	return NULL;
 }
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index e7a98dbd2aed..5909ee4c7ade 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -18,6 +18,7 @@
 #include "symbol.h"
 #include "strlist.h"
 #include "intlist.h"
+#include "namespaces.h"
 #include "header.h"
 #include "path.h"
 #include "sane_ctype.h"
@@ -52,6 +53,7 @@ static enum dso_binary_type binary_type_symtab[] = {
 	DSO_BINARY_TYPE__JAVA_JIT,
 	DSO_BINARY_TYPE__DEBUGLINK,
 	DSO_BINARY_TYPE__BUILD_ID_CACHE,
+	DSO_BINARY_TYPE__BUILD_ID_CACHE_DEBUGINFO,
 	DSO_BINARY_TYPE__FEDORA_DEBUGINFO,
 	DSO_BINARY_TYPE__UBUNTU_DEBUGINFO,
 	DSO_BINARY_TYPE__BUILDID_DEBUGINFO,
@@ -231,7 +233,8 @@ void __map_groups__fixup_end(struct map_groups *mg, enum map_type type)
 		goto out_unlock;
 
 	for (next = map__next(curr); next; next = map__next(curr)) {
-		curr->end = next->start;
+		if (!curr->end)
+			curr->end = next->start;
 		curr = next;
 	}
 
@@ -239,7 +242,8 @@ void __map_groups__fixup_end(struct map_groups *mg, enum map_type type)
 	 * We still haven't the actual symbols, so guess the
 	 * last map final address.
 	 */
-	curr->end = ~0ULL;
+	if (!curr->end)
+		curr->end = ~0ULL;
 
 out_unlock:
 	pthread_rwlock_unlock(&maps->lock);
@@ -550,7 +554,7 @@ void dso__sort_by_name(struct dso *dso, enum map_type type)
 
 int modules__parse(const char *filename, void *arg,
 		   int (*process_module)(void *arg, const char *name,
-					 u64 start))
+					 u64 start, u64 size))
 {
 	char *line = NULL;
 	size_t n;
@@ -563,8 +567,8 @@ int modules__parse(const char *filename, void *arg,
 
 	while (1) {
 		char name[PATH_MAX];
-		u64 start;
-		char *sep;
+		u64 start, size;
+		char *sep, *endptr;
 		ssize_t line_len;
 
 		line_len = getline(&line, &n, file);
@@ -596,7 +600,11 @@ int modules__parse(const char *filename, void *arg,
 
 		scnprintf(name, sizeof(name), "[%s]", line);
 
-		err = process_module(arg, name, start);
+		size = strtoul(sep + 1, &endptr, 0);
+		if (*endptr != ' ' && *endptr != '\t')
+			continue;
+
+		err = process_module(arg, name, start, size);
 		if (err)
 			break;
 	}
@@ -943,7 +951,8 @@ static struct module_info *find_module(const char *name,
 	return NULL;
 }
 
-static int __read_proc_modules(void *arg, const char *name, u64 start)
+static int __read_proc_modules(void *arg, const char *name, u64 start,
+			       u64 size __maybe_unused)
 {
 	struct rb_root *modules = arg;
 	struct module_info *mi;
@@ -1325,14 +1334,15 @@ int dso__load_kallsyms(struct dso *dso, const char *filename,
 	return __dso__load_kallsyms(dso, filename, map, false);
 }
 
-static int dso__load_perf_map(struct dso *dso, struct map *map)
+static int dso__load_perf_map(const char *map_path, struct dso *dso,
+			      struct map *map)
 {
 	char *line = NULL;
 	size_t n;
 	FILE *file;
 	int nr_syms = 0;
 
-	file = fopen(dso->long_name, "r");
+	file = fopen(map_path, "r");
 	if (file == NULL)
 		goto out_failure;
 
@@ -1416,6 +1426,7 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod,
 		return kmod && dso->symtab_type == type;
 
 	case DSO_BINARY_TYPE__BUILD_ID_CACHE:
+	case DSO_BINARY_TYPE__BUILD_ID_CACHE_DEBUGINFO:
 		return true;
 
 	case DSO_BINARY_TYPE__NOT_FOUND:
@@ -1424,6 +1435,44 @@ static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod,
 	}
 }
 
+/*
+ * Pick the perf-<pid>.map path to use and store it in @filebuf.  If the
+ * process is in a separate mount namespace, first look inside that namespace
+ * using the pid of the innermost pid namespace.  If it's not in a namespace,
+ * or the file can't be accessed there, fall back to the mount namespace of
+ * the tracing process using our view of its pid; *nsip is then replaced by
+ * a copy with need_setns cleared.  Returns 0 on success, non-zero otherwise.
+ */
+static int dso__find_perf_map(char *filebuf, size_t bufsz,
+			      struct nsinfo **nsip)
+{
+	struct nsinfo *cur = *nsip, *fallback;
+	struct nscookie nsc;
+	int rc = -1;
+
+	if (cur->need_setns) {
+		snprintf(filebuf, bufsz, "/tmp/perf-%d.map", cur->nstgid);
+		nsinfo__mountns_enter(cur, &nsc);
+		rc = access(filebuf, R_OK);
+		nsinfo__mountns_exit(&nsc);
+		if (!rc)
+			return 0;
+	}
+
+	fallback = nsinfo__copy(cur);
+	if (!fallback)
+		return rc;
+
+	/* Swap the caller's nsinfo for the copy: put the old, publish the new. */
+	nsinfo__put(cur);
+
+	fallback->need_setns = false;
+	snprintf(filebuf, bufsz, "/tmp/perf-%d.map", fallback->tgid);
+	*nsip = fallback;
+
+	return 0;
+}
+
 int dso__load(struct dso *dso, struct map *map)
 {
 	char *name;
@@ -1435,8 +1484,21 @@ int dso__load(struct dso *dso, struct map *map)
 	struct symsrc ss_[2];
 	struct symsrc *syms_ss = NULL, *runtime_ss = NULL;
 	bool kmod;
+	bool perfmap;
 	unsigned char build_id[BUILD_ID_SIZE];
+	struct nscookie nsc;
+	char newmapname[PATH_MAX];
+	const char *map_path = dso->long_name;
+
+	perfmap = strncmp(dso->name, "/tmp/perf-", 10) == 0;
+	if (perfmap) {
+		if (dso->nsinfo && (dso__find_perf_map(newmapname,
+		    sizeof(newmapname), &dso->nsinfo) == 0)) {
+			map_path = newmapname;
+		}
+	}
 
+	nsinfo__mountns_enter(dso->nsinfo, &nsc);
 	pthread_mutex_lock(&dso->lock);
 
 	/* check again under the dso->lock */
@@ -1461,19 +1523,19 @@ int dso__load(struct dso *dso, struct map *map)
 
 	dso->adjust_symbols = 0;
 
-	if (strncmp(dso->name, "/tmp/perf-", 10) == 0) {
+	if (perfmap) {
 		struct stat st;
 
-		if (lstat(dso->name, &st) < 0)
+		if (lstat(map_path, &st) < 0)
 			goto out;
 
 		if (!symbol_conf.force && st.st_uid && (st.st_uid != geteuid())) {
 			pr_warning("File %s not owned by current user or root, "
-				   "ignoring it (use -f to override).\n", dso->name);
+				   "ignoring it (use -f to override).\n", map_path);
 			goto out;
 		}
 
-		ret = dso__load_perf_map(dso, map);
+		ret = dso__load_perf_map(map_path, dso, map);
 		dso->symtab_type = ret > 0 ? DSO_BINARY_TYPE__JAVA_JIT :
 					     DSO_BINARY_TYPE__NOT_FOUND;
 		goto out;
@@ -1511,9 +1573,15 @@ int dso__load(struct dso *dso, struct map *map)
 	for (i = 0; i < DSO_BINARY_TYPE__SYMTAB_CNT; i++) {
 		struct symsrc *ss = &ss_[ss_pos];
 		bool next_slot = false;
+		bool is_reg;
+		bool nsexit;
+		int sirc;
 
 		enum dso_binary_type symtab_type = binary_type_symtab[i];
 
+		nsexit = (symtab_type == DSO_BINARY_TYPE__BUILD_ID_CACHE ||
+		    symtab_type == DSO_BINARY_TYPE__BUILD_ID_CACHE_DEBUGINFO);
+
 		if (!dso__is_compatible_symtab_type(dso, kmod, symtab_type))
 			continue;
 
@@ -1521,12 +1589,20 @@ int dso__load(struct dso *dso, struct map *map)
 						   root_dir, name, PATH_MAX))
 			continue;
 
-		if (!is_regular_file(name))
-			continue;
+		if (nsexit)
+			nsinfo__mountns_exit(&nsc);
+
+		is_reg = is_regular_file(name);
+		sirc = symsrc__init(ss, dso, name, symtab_type);
 
-		/* Name is now the name of the next image to try */
-		if (symsrc__init(ss, dso, name, symtab_type) < 0)
+		if (nsexit)
+			nsinfo__mountns_enter(dso->nsinfo, &nsc);
+
+		if (!is_reg || sirc < 0) {
+			if (sirc >= 0)
+				symsrc__destroy(ss);
 			continue;
+		}
 
 		if (!syms_ss && symsrc__has_symtab(ss)) {
 			syms_ss = ss;
@@ -1584,6 +1660,7 @@ out_free:
 out:
 	dso__set_loaded(dso, map->type);
 	pthread_mutex_unlock(&dso->lock);
+	nsinfo__mountns_exit(&nsc);
 
 	return ret;
 }
@@ -1660,7 +1737,7 @@ int dso__load_vmlinux_path(struct dso *dso, struct map *map)
 	}
 
 	if (!symbol_conf.ignore_vmlinux_buildid)
-		filename = dso__build_id_filename(dso, NULL, 0);
+		filename = dso__build_id_filename(dso, NULL, 0, false);
 	if (filename != NULL) {
 		err = dso__load_vmlinux(dso, map, filename, true);
 		if (err > 0)
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 41ebba9a2eb2..d00a012cfdfb 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -273,7 +273,7 @@ int filename__read_build_id(const char *filename, void *bf, size_t size);
 int sysfs__read_build_id(const char *filename, void *bf, size_t size);
 int modules__parse(const char *filename, void *arg,
 		   int (*process_module)(void *arg, const char *name,
-					 u64 start));
+					 u64 start, u64 size));
 int filename__read_debuglink(const char *filename, char *debuglink,
 			     size_t size);
 
@@ -306,7 +306,7 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss,
 				struct map *map);
 
-char *dso__demangle_sym(struct dso *dso, int kmodule, char *elf_name);
+char *dso__demangle_sym(struct dso *dso, int kmodule, const char *elf_name);
 
 void __symbols__insert(struct rb_root *symbols, struct symbol *sym, bool kernel);
 void symbols__insert(struct rb_root *symbols, struct symbol *sym);
@@ -343,6 +343,9 @@ int setup_intlist(struct intlist **list, const char *list_str,
 #ifdef HAVE_LIBELF_SUPPORT
 bool elf__needs_adjust_symbols(GElf_Ehdr ehdr);
 void arch__sym_update(struct symbol *s, GElf_Sym *sym);
+void arch__adjust_sym_map_offset(GElf_Sym *sym,
+				 GElf_Shdr *shdr __maybe_unused,
+				 struct map *map __maybe_unused);
 #endif
 
 #define SYMBOL_A 0
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 378c418ca0c1..aee9a42102ba 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -59,6 +59,8 @@ struct thread *thread__new(pid_t pid, pid_t tid)
 		list_add(&comm->list, &thread->comm_list);
 		refcount_set(&thread->refcnt, 1);
 		RB_CLEAR_NODE(&thread->rb_node);
+		/* Thread holds first ref to nsdata. */
+		thread->nsinfo = nsinfo__new(pid);
 	}
 
 	return thread;
@@ -91,6 +93,7 @@ void thread__delete(struct thread *thread)
 		comm__free(comm);
 	}
 	unwind__finish_access(thread);
+	nsinfo__zput(thread->nsinfo);
 
 	free(thread);
 }
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 4eb849e9098f..cb1a5dd5c2b9 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -34,6 +34,7 @@ struct thread {
 
 	void			*priv;
 	struct thread_stack	*ts;
+	struct nsinfo		*nsinfo;
 #ifdef HAVE_LIBUNWIND_SUPPORT
 	void				*addr_space;
 	struct unwind_libunwind_ops	*unwind_libunwind_ops;
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 829471a1c6d7..d549e50db397 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -34,6 +34,12 @@ typedef int (*event_oe)(struct perf_tool *tool, union perf_event *event,
 typedef s64 (*event_op3)(struct perf_tool *tool, union perf_event *event,
 			 struct perf_session *session);
 
+enum show_feature_header {
+	SHOW_FEAT_NO_HEADER = 0,
+	SHOW_FEAT_HEADER,
+	SHOW_FEAT_HEADER_FULL_INFO,
+};
+
 struct perf_tool {
 	event_sample	sample,
 			read;
@@ -63,11 +69,13 @@ struct perf_tool {
 			cpu_map,
 			stat_config,
 			stat,
-			stat_round;
+			stat_round,
+			feature;
 	event_op3	auxtrace;
 	bool		ordered_events;
 	bool		ordering_requires_timestamps;
 	bool		namespace_events;
+	enum show_feature_header show_feat_hdr;
 };
 
 #endif /* __PERF_TOOL_H */
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 988111e0bab5..4c360daa4e24 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -143,13 +143,17 @@ out:
 	return list;
 }
 
-static int slow_copyfile(const char *from, const char *to)
+static int slow_copyfile(const char *from, const char *to, struct nsinfo *nsi)
 {
 	int err = -1;
 	char *line = NULL;
 	size_t n;
-	FILE *from_fp = fopen(from, "r"), *to_fp;
+	FILE *from_fp, *to_fp;
+	struct nscookie nsc;
 
+	nsinfo__mountns_enter(nsi, &nsc);
+	from_fp = fopen(from, "r");
+	nsinfo__mountns_exit(&nsc);
 	if (from_fp == NULL)
 		goto out;
 
@@ -198,15 +202,21 @@ int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size)
 	return size ? -1 : 0;
 }
 
-int copyfile_mode(const char *from, const char *to, mode_t mode)
+static int copyfile_mode_ns(const char *from, const char *to, mode_t mode,
+			    struct nsinfo *nsi)
 {
 	int fromfd, tofd;
 	struct stat st;
-	int err = -1;
+	int err;
 	char *tmp = NULL, *ptr = NULL;
+	struct nscookie nsc;
 
-	if (stat(from, &st))
+	nsinfo__mountns_enter(nsi, &nsc);
+	err = stat(from, &st);
+	nsinfo__mountns_exit(&nsc);
+	if (err)
 		goto out;
+	err = -1;
 
 	/* extra 'x' at the end is to reserve space for '.' */
 	if (asprintf(&tmp, "%s.XXXXXXx", to) < 0) {
@@ -227,11 +237,13 @@ int copyfile_mode(const char *from, const char *to, mode_t mode)
 		goto out_close_to;
 
 	if (st.st_size == 0) { /* /proc? do it slowly... */
-		err = slow_copyfile(from, tmp);
+		err = slow_copyfile(from, tmp, nsi);
 		goto out_close_to;
 	}
 
+	nsinfo__mountns_enter(nsi, &nsc);
 	fromfd = open(from, O_RDONLY);
+	nsinfo__mountns_exit(&nsc);
 	if (fromfd < 0)
 		goto out_close_to;
 
@@ -248,6 +260,16 @@ out:
 	return err;
 }
 
+int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi)
+{
+	return copyfile_mode_ns(from, to, 0755, nsi);
+}
+
+int copyfile_mode(const char *from, const char *to, mode_t mode)
+{
+	return copyfile_mode_ns(from, to, mode, NULL);
+}
+
 int copyfile(const char *from, const char *to)
 {
 	return copyfile_mode(from, to, 0755);
@@ -259,6 +281,7 @@ static ssize_t ion(bool is_read, int fd, void *buf, size_t n)
 	size_t left = n;
 
 	while (left) {
+		/* buf must be treated as const if !is_read. */
 		ssize_t ret = is_read ? read(fd, buf, left) :
 					write(fd, buf, left);
 
@@ -286,9 +309,10 @@ ssize_t readn(int fd, void *buf, size_t n)
 /*
  * Write exactly 'n' bytes or return an error.
  */
-ssize_t writen(int fd, void *buf, size_t n)
+ssize_t writen(int fd, const void *buf, size_t n)
 {
-	return ion(false, fd, buf, n);
+	/* ion does not modify buf. */
+	return ion(false, fd, (void *)buf, n);
 }
 
 size_t hex_width(u64 v)
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 2c9e58a45310..b136c271125f 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -12,6 +12,7 @@
 #include <stdarg.h>
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include "namespaces.h"
 
 /* General helper functions */
 void usage(const char *err) __noreturn;
@@ -33,10 +34,11 @@ struct strlist *lsdir(const char *name, bool (*filter)(const char *, struct dire
 bool lsdir_no_dot_filter(const char *name, struct dirent *d);
 int copyfile(const char *from, const char *to);
 int copyfile_mode(const char *from, const char *to, mode_t mode);
+int copyfile_ns(const char *from, const char *to, struct nsinfo *nsi);
 int copyfile_offset(int fromfd, loff_t from_ofs, int tofd, loff_t to_ofs, u64 size);
 
 ssize_t readn(int fd, void *buf, size_t n);
-ssize_t writen(int fd, void *buf, size_t n);
+ssize_t writen(int fd, const void *buf, size_t n);
 
 size_t hex_width(u64 v);
 int hex2u64(const char *ptr, u64 *val);
@@ -58,4 +60,8 @@ const char *perf_tip(const char *dirpath);
 int sched_getcpu(void);
 #endif
 
+#ifndef HAVE_SETNS_SUPPORT
+int setns(int fd, int nstype);
+#endif
+
 #endif /* GIT_COMPAT_UTIL_H */
diff --git a/tools/perf/util/values.c b/tools/perf/util/values.c
index 5de2e15e2eda..8a32bb0095e5 100644
--- a/tools/perf/util/values.c
+++ b/tools/perf/util/values.c
@@ -12,7 +12,7 @@ int perf_read_values_init(struct perf_read_values *values)
 	values->threads_max = 16;
 	values->pid = malloc(values->threads_max * sizeof(*values->pid));
 	values->tid = malloc(values->threads_max * sizeof(*values->tid));
-	values->value = malloc(values->threads_max * sizeof(*values->value));
+	values->value = zalloc(values->threads_max * sizeof(*values->value));
 	if (!values->pid || !values->tid || !values->value) {
 		pr_debug("failed to allocate read_values threads arrays");
 		goto out_free_pid;
@@ -98,15 +98,16 @@ static int perf_read_values__findnew_thread(struct perf_read_values *values,
 			return i;
 	}
 
-	i = values->threads + 1;
-	values->value[i] = malloc(values->counters_max * sizeof(**values->value));
+	i = values->threads;
+
+	values->value[i] = zalloc(values->counters_max * sizeof(**values->value));
 	if (!values->value[i]) {
 		pr_debug("failed to allocate read_values counters array");
 		return -ENOMEM;
 	}
 	values->pid[i] = pid;
 	values->tid[i] = tid;
-	values->threads = i;
+	values->threads = i + 1;
 
 	return i;
 }
@@ -130,12 +131,16 @@ static int perf_read_values__enlarge_counters(struct perf_read_values *values)
 
 	for (i = 0; i < values->threads; i++) {
 		u64 *value = realloc(values->value[i], counters_max * sizeof(**values->value));
+		int j;
 
-		if (value) {
+		if (!value) {
 			pr_debug("failed to enlarge read_values ->values array");
 			goto out_free_name;
 		}
 
+		for (j = values->counters_max; j < counters_max; j++)
+			value[j] = 0;
+
 		values->value[i] = value;
 	}
 
@@ -187,7 +192,7 @@ int perf_read_values_add_value(struct perf_read_values *values,
 	if (cindex < 0)
 		return cindex;
 
-	values->value[tindex][cindex] = value;
+	values->value[tindex][cindex] += value;
 	return 0;
 }
 
diff --git a/tools/perf/util/xyarray.c b/tools/perf/util/xyarray.c
index 7251fdbabced..c8f415d9877b 100644
--- a/tools/perf/util/xyarray.c
+++ b/tools/perf/util/xyarray.c
@@ -12,6 +12,8 @@ struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size)
 		xy->entry_size = entry_size;
 		xy->row_size   = row_size;
 		xy->entries    = xlen * ylen;
+		xy->max_x      = xlen;
+		xy->max_y      = ylen;
 	}
 
 	return xy;
diff --git a/tools/perf/util/xyarray.h b/tools/perf/util/xyarray.h
index 7f30af371b7e..4ba726c90870 100644
--- a/tools/perf/util/xyarray.h
+++ b/tools/perf/util/xyarray.h
@@ -7,6 +7,8 @@ struct xyarray {
 	size_t row_size;
 	size_t entry_size;
 	size_t entries;
+	size_t max_x;
+	size_t max_y;
 	char contents[];
 };
 
@@ -19,4 +21,16 @@ static inline void *xyarray__entry(struct xyarray *xy, int x, int y)
 	return &xy->contents[x * xy->row_size + y * xy->entry_size];
 }
 
+/* Length of the y dimension (the ylen given to xyarray__new()). */
+static inline int xyarray__max_y(struct xyarray *xy)
+{
+	return xy->max_y;
+}
+
+/* Length of the x dimension (the xlen given to xyarray__new()). */
+static inline int xyarray__max_x(struct xyarray *xy)
+{
+	return xy->max_x;
+}
+
 #endif /* _PERF_XYARRAY_H_ */
diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include
index ccad8ce925e4..1e8b6116ba3c 100644
--- a/tools/scripts/Makefile.include
+++ b/tools/scripts/Makefile.include
@@ -39,7 +39,9 @@ EXTRA_WARNINGS += -Wundef
 EXTRA_WARNINGS += -Wwrite-strings
 EXTRA_WARNINGS += -Wformat
 
-ifneq ($(CC), clang)
+CC_NO_CLANG := $(shell $(CC) -dM -E -x c /dev/null | grep -Fq "__clang__"; echo $$?)
+
+ifeq ($(CC_NO_CLANG), 1)
 EXTRA_WARNINGS += -Wstrict-aliasing=3
 endif
 
diff --git a/tools/testing/selftests/rcutorture/bin/config_override.sh b/tools/testing/selftests/rcutorture/bin/config_override.sh
new file mode 100755
index 000000000000..49fa51726ce3
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/config_override.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+#
+# config_override.sh base override
+#
+# Combines base and override, removing any Kconfig options from base
+# that conflict with any in override, concatenating what remains and
+# sending the result to standard output.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright (C) IBM Corporation, 2017
+#
+# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+# Quote the operand: with a missing or empty argument an unquoted
+# "test -r $base" degenerates to "test -r", which is always true.
+base=$1
+if ! test -r "$base"
+then
+	echo Base file $base unreadable!!!
+	exit 1
+fi
+
+# Same check for the override file, whose options take precedence
+# over conflicting ones in the base file.
+override=$2
+if ! test -r "$override"
+then
+	echo Override file $override unreadable!!!
+	exit 1
+fi
+
+T=/tmp/config_override.sh.$$
+trap 'rm -rf $T' 0
+mkdir $T
+
+sed < $override -e 's/^/grep -v "/' -e 's/=.*$/="/' |
+	awk '
+	{
+		if (last)
+			print last " |";
+		last = $0;
+	}
+	END {
+		if (last)
+			print last;
+	}' > $T/script
+sh $T/script < $base
+cat $override
diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
index 1426a9b97494..07a13779eece 100644
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -66,9 +66,34 @@ configfrag_boot_params () {
 
 # configfrag_boot_cpus bootparam-string config-fragment-file config-cpus
 #
-# Decreases number of CPUs based on any maxcpus= boot parameters specified.
+# Decreases number of CPUs based on any nr_cpus= boot parameters specified.
 configfrag_boot_cpus () {
 	local bootargs="`configfrag_boot_params "$1" "$2"`"
+	local nr_cpus
+	if echo "${bootargs}" | grep -q 'nr_cpus=[0-9]'
+	then
+		nr_cpus="`echo "${bootargs}" | sed -e 's/^.*nr_cpus=\([0-9]*\).*$/\1/'`"
+		if test "$3" -gt "$nr_cpus"
+		then
+			echo $nr_cpus
+		else
+			echo $3
+		fi
+	else
+		echo $3
+	fi
+}
+
+# configfrag_boot_maxcpus bootparam-string config-fragment-file config-cpus
+#
+# Decreases number of CPUs based on any maxcpus= boot parameters specified.
+# This allows tests where additional CPUs come online later during the
+# test run.  However, the torture parameters will be set based on the
+# number of CPUs initially present, so the scripting should schedule
+# test runs based on the maxcpus= boot parameter controlling the initial
+# number of CPUs instead of on the ultimate number of CPUs.
+configfrag_boot_maxcpus () {
+	local bootargs="`configfrag_boot_params "$1" "$2"`"
 	local maxcpus
 	if echo "${bootargs}" | grep -q 'maxcpus=[0-9]'
 	then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
index c29f2ec0bf9f..46752c164676 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
@@ -2,7 +2,7 @@
 #
 # Build a kvm-ready Linux kernel from the tree in the current directory.
 #
-# Usage: kvm-build.sh config-template build-dir more-configs
+# Usage: kvm-build.sh config-template build-dir
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -34,24 +34,17 @@ then
 	echo "kvm-build.sh :$builddir: Not a writable directory, cannot build into it"
 	exit 1
 fi
-moreconfigs=${3}
-if test -z "$moreconfigs" -o ! -r "$moreconfigs"
-then
-	echo "kvm-build.sh :$moreconfigs: Not a readable file"
-	exit 1
-fi
 
 T=/tmp/test-linux.sh.$$
 trap 'rm -rf $T' 0
 mkdir $T
 
-grep -v 'CONFIG_[A-Z]*_TORTURE_TEST=' < ${config_template} > $T/config
+cp ${config_template} $T/config
 cat << ___EOF___ >> $T/config
 CONFIG_INITRAMFS_SOURCE="$TORTURE_INITRD"
 CONFIG_VIRTIO_PCI=y
 CONFIG_VIRTIO_CONSOLE=y
 ___EOF___
-cat $moreconfigs >> $T/config
 
 configinit.sh $T/config O=$builddir
 retval=$?
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 93eede4e8fbe..0af36a721b9c 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -40,7 +40,7 @@
 
 T=/tmp/kvm-test-1-run.sh.$$
 trap 'rm -rf $T' 0
-touch $T
+mkdir $T
 
 . $KVM/bin/functions.sh
 . $CONFIGFRAG/ver_functions.sh
@@ -60,37 +60,33 @@ then
 	echo "kvm-test-1-run.sh :$resdir: Not a writable directory, cannot store results into it"
 	exit 1
 fi
-cp $config_template $resdir/ConfigFragment
 echo ' ---' `date`: Starting build
 echo ' ---' Kconfig fragment at: $config_template >> $resdir/log
+touch $resdir/ConfigFragment.input $resdir/ConfigFragment
 if test -r "$config_dir/CFcommon"
 then
-	cat < $config_dir/CFcommon >> $T
+	echo " --- $config_dir/CFcommon" >> $resdir/ConfigFragment.input
+	cat < $config_dir/CFcommon >> $resdir/ConfigFragment.input
+	config_override.sh $config_dir/CFcommon $config_template > $T/Kc1
+	grep '#CHECK#' $config_dir/CFcommon >> $resdir/ConfigFragment
+else
+	cp $config_template $T/Kc1
 fi
-# Optimizations below this point
-# CONFIG_USB=n
-# CONFIG_SECURITY=n
-# CONFIG_NFS_FS=n
-# CONFIG_SOUND=n
-# CONFIG_INPUT_JOYSTICK=n
-# CONFIG_INPUT_TABLET=n
-# CONFIG_INPUT_TOUCHSCREEN=n
-# CONFIG_INPUT_MISC=n
-# CONFIG_INPUT_MOUSE=n
-# # CONFIG_NET=n # disables console access, so accept the slower build.
-# CONFIG_SCSI=n
-# CONFIG_ATA=n
-# CONFIG_FAT_FS=n
-# CONFIG_MSDOS_FS=n
-# CONFIG_VFAT_FS=n
-# CONFIG_ISO9660_FS=n
-# CONFIG_QUOTA=n
-# CONFIG_HID=n
-# CONFIG_CRYPTO=n
-# CONFIG_PCCARD=n
-# CONFIG_PCMCIA=n
-# CONFIG_CARDBUS=n
-# CONFIG_YENTA=n
+echo " --- $config_template" >> $resdir/ConfigFragment.input
+cat $config_template >> $resdir/ConfigFragment.input
+grep '#CHECK#' $config_template >> $resdir/ConfigFragment
+if test -n "$TORTURE_KCONFIG_ARG"
+then
+	echo $TORTURE_KCONFIG_ARG | tr -s " " "\012" > $T/cmdline
+	echo " --- --kconfig argument" >> $resdir/ConfigFragment.input
+	cat $T/cmdline >> $resdir/ConfigFragment.input
+	config_override.sh $T/Kc1 $T/cmdline > $T/Kc2
+	# Note that "#CHECK#" is not permitted on commandline.
+else
+	cp $T/Kc1 $T/Kc2
+fi
+cat $T/Kc2 >> $resdir/ConfigFragment
+
 base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'`
 if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux
 then
@@ -100,7 +96,9 @@ then
 	KERNEL=$base_resdir/${BOOT_IMAGE##*/} # use the last component of ${BOOT_IMAGE}
 	ln -s $base_resdir/Make*.out $resdir  # for kvm-recheck.sh
 	ln -s $base_resdir/.config $resdir  # for kvm-recheck.sh
-elif kvm-build.sh $config_template $builddir $T
+	# Arch-independent indicator
+	touch $resdir/builtkernel
+elif kvm-build.sh $T/Kc2 $builddir
 then
 	# Had to build a kernel for this test.
 	QEMU="`identify_qemu $builddir/vmlinux`"
@@ -112,6 +110,8 @@ then
 	then
 		cp $builddir/$BOOT_IMAGE $resdir
 		KERNEL=$resdir/${BOOT_IMAGE##*/}
+		# Arch-independent indicator
+		touch $resdir/builtkernel
 	else
 		echo No identifiable boot image, not running KVM, see $resdir.
 		echo Do the torture scripts know about your architecture?
@@ -149,7 +149,7 @@ fi
 
 # Generate -smp qemu argument.
 qemu_args="-enable-kvm -nographic $qemu_args"
-cpu_count=`configNR_CPUS.sh $config_template`
+cpu_count=`configNR_CPUS.sh $resdir/ConfigFragment`
 cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"`
 vcpus=`identify_qemu_vcpus`
 if test $cpu_count -gt $vcpus
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 50091de3a911..b55895fb10ed 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -41,6 +41,7 @@ PATH=${KVM}/bin:$PATH; export PATH
 TORTURE_DEFCONFIG=defconfig
 TORTURE_BOOT_IMAGE=""
 TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD
+TORTURE_KCONFIG_ARG=""
 TORTURE_KMAKE_ARG=""
 TORTURE_SHUTDOWN_GRACE=180
 TORTURE_SUITE=rcu
@@ -65,6 +66,7 @@ usage () {
 	echo "       --duration minutes"
 	echo "       --interactive"
 	echo "       --jitter N [ maxsleep (us) [ maxspin (us) ] ]"
+	echo "       --kconfig Kconfig-options"
 	echo "       --kmake-arg kernel-make-arguments"
 	echo "       --mac nn:nn:nn:nn:nn:nn"
 	echo "       --no-initrd"
@@ -129,6 +131,11 @@ do
 		jitter="$2"
 		shift
 		;;
+	--kconfig)
+		checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\)*$' '^error$'
+		TORTURE_KCONFIG_ARG="$2"
+		shift
+		;;
 	--kmake-arg)
 		checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'
 		TORTURE_KMAKE_ARG="$2"
@@ -205,6 +212,7 @@ do
 	then
 		cpu_count=`configNR_CPUS.sh $CONFIGFRAG/$CF1`
 		cpu_count=`configfrag_boot_cpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"`
+		cpu_count=`configfrag_boot_maxcpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"`
 		for ((cur_rep=0;cur_rep<$config_reps;cur_rep++))
 		do
 			echo $CF1 $cpu_count >> $T/cfgcpu
@@ -275,6 +283,7 @@ TORTURE_BOOT_IMAGE="$TORTURE_BOOT_IMAGE"; export TORTURE_BOOT_IMAGE
 TORTURE_BUILDONLY="$TORTURE_BUILDONLY"; export TORTURE_BUILDONLY
 TORTURE_DEFCONFIG="$TORTURE_DEFCONFIG"; export TORTURE_DEFCONFIG
 TORTURE_INITRD="$TORTURE_INITRD"; export TORTURE_INITRD
+TORTURE_KCONFIG_ARG="$TORTURE_KCONFIG_ARG"; export TORTURE_KCONFIG_ARG
 TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG
 TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD
 TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE
@@ -324,6 +333,7 @@ function dump(first, pastlast, batchnum)
 {
 	print "echo ----Start batch " batchnum ": `date`";
 	print "echo ----Start batch " batchnum ": `date` >> " rd "/log";
+	print "needqemurun="
 	jn=1
 	for (j = first; j < pastlast; j++) {
 		builddir=KVM "/b" jn
@@ -359,10 +369,11 @@ function dump(first, pastlast, batchnum)
 	for (j = 1; j < jn; j++) {
 		builddir=KVM "/b" j
 		print "rm -f " builddir ".ready"
-		print "if test -z \"$TORTURE_BUILDONLY\""
+		print "if test -f \"" rd cfr[j] "/builtkernel\""
 		print "then"
-		print "\techo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date`";
-		print "\techo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date` >> " rd "/log";
+		print "\techo ----", cfr[j], cpusr[j] ovf ": Kernel present. `date`";
+		print "\techo ----", cfr[j], cpusr[j] ovf ": Kernel present. `date` >> " rd "/log";
+		print "\tneedqemurun=1"
 		print "fi"
 	}
 	njitter = 0;
@@ -377,13 +388,22 @@ function dump(first, pastlast, batchnum)
 		njitter = 0;
 		print "echo Build-only run, so suppressing jitter >> " rd "/log"
 	}
-	for (j = 0; j < njitter; j++)
-		print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&"
-	print "wait"
-	print "if test -z \"$TORTURE_BUILDONLY\""
+	if (TORTURE_BUILDONLY) {
+		print "needqemurun="
+	}
+	print "if test -n \"$needqemurun\""
 	print "then"
+	print "\techo ---- Starting kernels. `date`";
+	print "\techo ---- Starting kernels. `date` >> " rd "/log";
+	for (j = 0; j < njitter; j++)
+		print "\tjitter.sh " j " " dur " " ja[2] " " ja[3] "&"
+	print "\twait"
 	print "\techo ---- All kernel runs complete. `date`";
 	print "\techo ---- All kernel runs complete. `date` >> " rd "/log";
+	print "else"
+	print "\twait"
+	print "\techo ---- No kernel runs. `date`";
+	print "\techo ---- No kernel runs. `date` >> " rd "/log";
 	print "fi"
 	for (j = 1; j < jn; j++) {
 		builddir=KVM "/b" j
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot
index 6804f9dcfc1b..be7728db42fd 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot
@@ -1 +1 @@
-rcutorture.torture_type=rcu_busted
+rcutorture.torture_type=busted
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot
deleted file mode 100644
index 84a7d51b7481..000000000000
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot
+++ /dev/null
@@ -1 +0,0 @@
-rcutorture.torture_type=srcud
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
index 6bc24e99862f..c15ada821e45 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
@@ -4,6 +4,7 @@ CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=n
 #CHECK#CONFIG_TINY_SRCU=y
 CONFIG_RCU_TRACE=n
-CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
 CONFIG_PREEMPT_COUNT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
index 1d14e1383016..9f3a4d28e508 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
@@ -1,4 +1,4 @@
-rcutorture.torture_type=rcu_bh maxcpus=8
+rcutorture.torture_type=rcu_bh maxcpus=8 nr_cpus=43
 rcutree.gp_preinit_delay=3
 rcutree.gp_init_delay=3
 rcutree.gp_cleanup_delay=3
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index 9ad3f89c8dc7..af6fca03602f 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -69,11 +69,11 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE
 CONFIG_PREEMPT_RCU
 CONFIG_TREE_RCU
 CONFIG_TINY_RCU
+CONFIG_TASKS_RCU
 
 	These are controlled by CONFIG_PREEMPT and/or CONFIG_SMP.
 
 CONFIG_SRCU
-CONFIG_TASKS_RCU
 
 	Selected by CONFIG_RCU_TORTURE_TEST, so cannot disable.
 
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
index b4967d875236..f249e042b3b5 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -285,9 +285,12 @@ static void *threadproc(void *ctx)
 	}
 }
 
-static void set_gs_and_switch_to(unsigned long local, unsigned long remote)
+static void set_gs_and_switch_to(unsigned long local,
+				 unsigned short force_sel,
+				 unsigned long remote)
 {
 	unsigned long base;
+	unsigned short sel_pre_sched, sel_post_sched;
 
 	bool hard_zero = false;
 	if (local == HARD_ZERO) {
@@ -297,6 +300,8 @@ static void set_gs_and_switch_to(unsigned long local, unsigned long remote)
 
 	printf("[RUN]\tARCH_SET_GS(0x%lx)%s, then schedule to 0x%lx\n",
 	       local, hard_zero ? " and clear gs" : "", remote);
+	if (force_sel)
+		printf("\tBefore schedule, set selector to 0x%hx\n", force_sel);
 	if (syscall(SYS_arch_prctl, ARCH_SET_GS, local) != 0)
 		err(1, "ARCH_SET_GS");
 	if (hard_zero)
@@ -307,18 +312,35 @@ static void set_gs_and_switch_to(unsigned long local, unsigned long remote)
 		printf("[FAIL]\tGSBASE wasn't set as expected\n");
 	}
 
+	if (force_sel) {
+		asm volatile ("mov %0, %%gs" : : "rm" (force_sel));
+		sel_pre_sched = force_sel;
+		local = read_base(GS);
+
+		/*
+		 * Signal delivery seems to mess up weird selectors.  Put it
+		 * back.
+		 */
+		asm volatile ("mov %0, %%gs" : : "rm" (force_sel));
+	} else {
+		asm volatile ("mov %%gs, %0" : "=rm" (sel_pre_sched));
+	}
+
 	remote_base = remote;
 	ftx = 1;
 	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
 	while (ftx != 0)
 		syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);
 
+	asm volatile ("mov %%gs, %0" : "=rm" (sel_post_sched));
 	base = read_base(GS);
-	if (base == local) {
-		printf("[OK]\tGSBASE remained 0x%lx\n", local);
+	if (base == local && sel_pre_sched == sel_post_sched) {
+		printf("[OK]\tGS/BASE remained 0x%hx/0x%lx\n",
+		       sel_pre_sched, local);
 	} else {
 		nerrs++;
-		printf("[FAIL]\tGSBASE changed to 0x%lx\n", base);
+		printf("[FAIL]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx\n",
+		       sel_pre_sched, local, sel_post_sched, base);
 	}
 }
 
@@ -381,8 +403,15 @@ int main()
 
 	for (int local = 0; local < 4; local++) {
 		for (int remote = 0; remote < 4; remote++) {
-			set_gs_and_switch_to(bases_with_hard_zero[local],
-					     bases_with_hard_zero[remote]);
+			for (unsigned short s = 0; s < 5; s++) {
+				unsigned short sel = s;
+				if (s == 4)
+					asm ("mov %%ss, %0" : "=rm" (sel));
+				set_gs_and_switch_to(
+					bases_with_hard_zero[local],
+					sel,
+					bases_with_hard_zero[remote]);
+			}
 		}
 	}