140 files changed, 5076 insertions, 1324 deletions
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
new file mode 100644
index 000000000000..5550bfdcce5f
--- /dev/null
+++ b/Documentation/atomic_bitops.txt
@@ -0,0 +1,66 @@
+
+On atomic bitops.
+
+
+While our bitmap_{}() functions are non-atomic, we have a number of operations
+operating on single bits in a bitmap that are atomic.
+
+
+API
+---
+
+The single bit operations are:
+
+Non-RMW ops:
+
+  test_bit()
+
+RMW atomic operations without return value:
+
+  {set,clear,change}_bit()
+  clear_bit_unlock()
+
+RMW atomic operations with return value:
+
+  test_and_{set,clear,change}_bit()
+  test_and_set_bit_lock()
+
+Barriers:
+
+  smp_mb__{before,after}_atomic()
+
+
+All RMW atomic operations have a '__' prefixed variant which is non-atomic.
+
+
+SEMANTICS
+---------
+
+Non-atomic ops:
+
+In particular __clear_bit_unlock() suffers the same issue as atomic_set(),
+which is why the generic version maps to clear_bit_unlock(), see atomic_t.txt.
+
+
+RMW ops:
+
+The test_and_{}_bit() operations return the original value of the bit.
+
+
+ORDERING
+--------
+
+Like with atomic_t, the rule of thumb is:
+
+ - non-RMW operations are unordered;
+
+ - RMW operations that have no return value are unordered;
+
+ - RMW operations that have a return value are fully ordered.
+
+Except for test_and_set_bit_lock() which has ACQUIRE semantics and
+clear_bit_unlock() which has RELEASE semantics.
+
+Since a platform only has a single means of achieving atomic operations
+the same barriers as for atomic_t are used, see atomic_t.txt.
+
diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt
new file mode 100644
index 000000000000..913396ac5824
--- /dev/null
+++ b/Documentation/atomic_t.txt
@@ -0,0 +1,242 @@
+
+On atomic types (atomic_t atomic64_t and atomic_long_t).
+
+The atomic type provides an interface to the architecture's means of atomic
+RMW operations between CPUs (atomic operations on MMIO are not supported and
+can lead to fatal traps on some platforms).
+
+API
+---
+
+The 'full' API consists of (atomic64_ and atomic_long_ prefixes omitted for
+brevity):
+
+Non-RMW ops:
+
+  atomic_read(), atomic_set()
+  atomic_read_acquire(), atomic_set_release()
+
+
+RMW atomic operations:
+
+Arithmetic:
+
+  atomic_{add,sub,inc,dec}()
+  atomic_{add,sub,inc,dec}_return{,_relaxed,_acquire,_release}()
+  atomic_fetch_{add,sub,inc,dec}{,_relaxed,_acquire,_release}()
+
+
+Bitwise:
+
+  atomic_{and,or,xor,andnot}()
+  atomic_fetch_{and,or,xor,andnot}{,_relaxed,_acquire,_release}()
+
+
+Swap:
+
+  atomic_xchg{,_relaxed,_acquire,_release}()
+  atomic_cmpxchg{,_relaxed,_acquire,_release}()
+  atomic_try_cmpxchg{,_relaxed,_acquire,_release}()
+
+
+Reference count (but please see refcount_t):
+
+  atomic_add_unless(), atomic_inc_not_zero()
+  atomic_sub_and_test(), atomic_dec_and_test()
+
+
+Misc:
+
+  atomic_inc_and_test(), atomic_add_negative()
+  atomic_dec_unless_positive(), atomic_inc_unless_negative()
+
+
+Barriers:
+
+  smp_mb__{before,after}_atomic()
+
+
+
+SEMANTICS
+---------
+
+Non-RMW ops:
+
+The non-RMW ops are (typically) regular LOADs and STOREs and are canonically
+implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and
+smp_store_release() respectively.
+
+The one detail to this is that atomic_set{}() should be observable to the RMW
+ops. That is:
+
+  C atomic-set
+
+  {
+    atomic_set(v, 1);
+  }
+
+  P1(atomic_t *v)
+  {
+    atomic_add_unless(v, 1, 0);
+  }
+
+  P2(atomic_t *v)
+  {
+    atomic_set(v, 0);
+  }
+
+  exists
+  (v=2)
+
+In this case we would expect the atomic_set() from CPU1 to either happen
+before the atomic_add_unless(), in which case that latter one would no-op, or
+_after_ in which case we'd overwrite its result. In no case is "2" a valid
+outcome.
+
+This is typically true on 'normal' platforms, where a regular competing STORE
+will invalidate a LL/SC or fail a CMPXCHG.
+
+The obvious case where this is not so is when we need to implement atomic ops
+with a lock:
+
+  CPU0						CPU1
+
+  atomic_add_unless(v, 1, 0);
+    lock();
+    ret = READ_ONCE(v->counter); // == 1
+						atomic_set(v, 0);
+    if (ret != u)				  WRITE_ONCE(v->counter, 0);
+      WRITE_ONCE(v->counter, ret + 1);
+    unlock();
+
+the typical solution is to then implement atomic_set{}() with atomic_xchg().
+
+
+RMW ops:
+
+These come in various forms:
+
+ - plain operations without return value: atomic_{}()
+
+ - operations which return the modified value: atomic_{}_return()
+
+   these are limited to the arithmetic operations because those are
+   reversible. Bitops are irreversible and therefore the modified value
+   is of dubious utility.
+
+ - operations which return the original value: atomic_fetch_{}()
+
+ - swap operations: xchg(), cmpxchg() and try_cmpxchg()
+
+ - misc; the special purpose operations that are commonly used and would,
+   given the interface, normally be implemented using (try_)cmpxchg loops but
+   are time critical and can, (typically) on LL/SC architectures, be more
+   efficiently implemented.
+
+All these operations are SMP atomic; that is, the operations (for a single
+atomic variable) can be fully ordered and no intermediate state is lost or
+visible.
+
+
+ORDERING  (go read memory-barriers.txt first)
+--------
+
+The rule of thumb:
+
+ - non-RMW operations are unordered;
+
+ - RMW operations that have no return value are unordered;
+
+ - RMW operations that have a return value are fully ordered;
+
+ - RMW operations that are conditional are unordered on FAILURE,
+   otherwise the above rules apply.
+
+Except of course when an operation has an explicit ordering like:
+
+ {}_relaxed: unordered
+ {}_acquire: the R of the RMW (or atomic_read) is an ACQUIRE
+ {}_release: the W of the RMW (or atomic_set)  is a  RELEASE
+
+Where 'unordered' is against other memory locations. Address dependencies are
+not defeated.
+
+Fully ordered primitives are ordered against everything prior and everything
+subsequent. Therefore a fully ordered primitive is like having an smp_mb()
+before and an smp_mb() after the primitive.
+
+
+The barriers:
+
+  smp_mb__{before,after}_atomic()
+
+only apply to the RMW ops and can be used to augment/upgrade the ordering
+inherent to the used atomic op. These barriers provide a full smp_mb().
+
+These helper barriers exist because architectures have varying implicit
+ordering on their SMP atomic primitives. For example our TSO architectures
+provide full ordered atomics and these barriers are no-ops.
+
+Thus:
+
+  atomic_fetch_add();
+
+is equivalent to:
+
+  smp_mb__before_atomic();
+  atomic_fetch_add_relaxed();
+  smp_mb__after_atomic();
+
+However the atomic_fetch_add() might be implemented more efficiently.
+
+Further, while something like:
+
+  smp_mb__before_atomic();
+  atomic_dec(&X);
+
+is a 'typical' RELEASE pattern, the barrier is strictly stronger than
+a RELEASE. Similarly for something like:
+
+  atomic_inc(&X);
+  smp_mb__after_atomic();
+
+is an ACQUIRE pattern (though very much not typical), but again the barrier is
+strictly stronger than ACQUIRE. As illustrated:
+
+  C strong-acquire
+
+  {
+  }
+
+  P1(int *x, atomic_t *y)
+  {
+    r0 = READ_ONCE(*x);
+    smp_rmb();
+    r1 = atomic_read(y);
+  }
+
+  P2(int *x, atomic_t *y)
+  {
+    atomic_inc(y);
+    smp_mb__after_atomic();
+    WRITE_ONCE(*x, 1);
+  }
+
+  exists
+  (r0=1 /\ r1=0)
+
+This should not happen; but a hypothetical atomic_inc_acquire() --
+(void)atomic_fetch_inc_acquire() for instance -- would allow the outcome,
+since then:
+
+  P1			P2
+
+			t = LL.acq *y (0)
+			t++;
+			*x = 1;
+  r0 = *x (1)
+  RMB
+  r1 = *y (0)
+			SC *y, t;
+
+is allowed.
diff --git a/Documentation/locking/crossrelease.txt b/Documentation/locking/crossrelease.txt
new file mode 100644
index 000000000000..bdf1423d5f99
--- /dev/null
+++ b/Documentation/locking/crossrelease.txt
@@ -0,0 +1,874 @@
+Crossrelease
+============
+
+Started by Byungchul Park <byungchul.park@lge.com>
+
+Contents:
+
+ (*) Background
+
+     - What causes deadlock
+     - How lockdep works
+
+ (*) Limitation
+
+     - Limit lockdep
+     - Pros from the limitation
+     - Cons from the limitation
+     - Relax the limitation
+
+ (*) Crossrelease
+
+     - Introduce crossrelease
+     - Introduce commit
+
+ (*) Implementation
+
+     - Data structures
+     - How crossrelease works
+
+ (*) Optimizations
+
+     - Avoid duplication
+     - Lockless for hot paths
+
+ (*) APPENDIX A: What lockdep does to work aggresively
+
+ (*) APPENDIX B: How to avoid adding false dependencies
+
+
+==========
+Background
+==========
+
+What causes deadlock
+--------------------
+
+A deadlock occurs when a context is waiting for an event to happen,
+which is impossible because another (or the) context who can trigger the
+event is also waiting for another (or the) event to happen, which is
+also impossible due to the same reason.
+
+For example:
+
+   A context going to trigger event C is waiting for event A to happen.
+   A context going to trigger event A is waiting for event B to happen.
+   A context going to trigger event B is waiting for event C to happen.
+
+A deadlock occurs when these three wait operations run at the same time,
+because event C cannot be triggered if event A does not happen, which in
+turn cannot be triggered if event B does not happen, which in turn
+cannot be triggered if event C does not happen. After all, no event can
+be triggered since any of them never meets its condition to wake up.
+
+A dependency might exist between two waiters and a deadlock might happen
+due to an incorrect releationship between dependencies. Thus, we must
+define what a dependency is first. A dependency exists between them if:
+
+   1. There are two waiters waiting for each event at a given time.
+   2. The only way to wake up each waiter is to trigger its event.
+   3. Whether one can be woken up depends on whether the other can.
+
+Each wait in the example creates its dependency like:
+
+   Event C depends on event A.
+   Event A depends on event B.
+   Event B depends on event C.
+
+   NOTE: Precisely speaking, a dependency is one between whether a
+   waiter for an event can be woken up and whether another waiter for
+   another event can be woken up. However from now on, we will describe
+   a dependency as if it's one between an event and another event for
+   simplicity.
+
+And they form circular dependencies like:
+
+    -> C -> A -> B -
+   /                \
+   \                /
+    ----------------
+
+   where 'A -> B' means that event A depends on event B.
+
+Such circular dependencies lead to a deadlock since no waiter can meet
+its condition to wake up as described.
+
+CONCLUSION
+
+Circular dependencies cause a deadlock.
+
+
+How lockdep works
+-----------------
+
+Lockdep tries to detect a deadlock by checking dependencies created by
+lock operations, acquire and release. Waiting for a lock corresponds to
+waiting for an event, and releasing a lock corresponds to triggering an
+event in the previous section.
+
+In short, lockdep does:
+
+   1. Detect a new dependency.
+   2. Add the dependency into a global graph.
+   3. Check if that makes dependencies circular.
+   4. Report a deadlock or its possibility if so.
+
+For example, consider a graph built by lockdep that looks like:
+
+   A -> B -
+           \
+            -> E
+           /
+   C -> D -
+
+   where A, B,..., E are different lock classes.
+
+Lockdep will add a dependency into the graph on detection of a new
+dependency. For example, it will add a dependency 'E -> C' when a new
+dependency between lock E and lock C is detected. Then the graph will be:
+
+       A -> B -
+               \
+                -> E -
+               /      \
+    -> C -> D -        \
+   /                   /
+   \                  /
+    ------------------
+
+   where A, B,..., E are different lock classes.
+
+This graph contains a subgraph which demonstrates circular dependencies:
+
+                -> E -
+               /      \
+    -> C -> D -        \
+   /                   /
+   \                  /
+    ------------------
+
+   where C, D and E are different lock classes.
+
+This is the condition under which a deadlock might occur. Lockdep
+reports it on detection after adding a new dependency. This is the way
+how lockdep works.
+
+CONCLUSION
+
+Lockdep detects a deadlock or its possibility by checking if circular
+dependencies were created after adding each new dependency.
+
+
+==========
+Limitation
+==========
+
+Limit lockdep
+-------------
+
+Limiting lockdep to work on only typical locks e.g. spin locks and
+mutexes, which are released within the acquire context, the
+implementation becomes simple but its capacity for detection becomes
+limited. Let's check pros and cons in next section.
+
+
+Pros from the limitation
+------------------------
+
+Given the limitation, when acquiring a lock, locks in a held_locks
+cannot be released if the context cannot acquire it so has to wait to
+acquire it, which means all waiters for the locks in the held_locks are
+stuck. It's an exact case to create dependencies between each lock in
+the held_locks and the lock to acquire.
+
+For example:
+
+   CONTEXT X
+   ---------
+   acquire A
+   acquire B /* Add a dependency 'A -> B' */
+   release B
+   release A
+
+   where A and B are different lock classes.
+
+When acquiring lock A, the held_locks of CONTEXT X is empty thus no
+dependency is added. But when acquiring lock B, lockdep detects and adds
+a new dependency 'A -> B' between lock A in the held_locks and lock B.
+They can be simply added whenever acquiring each lock.
+
+And data required by lockdep exists in a local structure, held_locks
+embedded in task_struct. Forcing to access the data within the context,
+lockdep can avoid racy problems without explicit locks while handling
+the local data.
+
+Lastly, lockdep only needs to keep locks currently being held, to build
+a dependency graph. However, relaxing the limitation, it needs to keep
+even locks already released, because a decision whether they created
+dependencies might be long-deferred.
+
+To sum up, we can expect several advantages from the limitation:
+
+   1. Lockdep can easily identify a dependency when acquiring a lock.
+   2. Races are avoidable while accessing local locks in a held_locks.
+   3. Lockdep only needs to keep locks currently being held.
+
+CONCLUSION
+
+Given the limitation, the implementation becomes simple and efficient.
+
+
+Cons from the limitation
+------------------------
+
+Given the limitation, lockdep is applicable only to typical locks. For
+example, page locks for page access or completions for synchronization
+cannot work with lockdep.
+
+Can we detect deadlocks below, under the limitation?
+
+Example 1:
+
+   CONTEXT X	   CONTEXT Y	   CONTEXT Z
+   ---------	   ---------	   ----------
+		   mutex_lock A
+   lock_page B
+		   lock_page B
+				   mutex_lock A /* DEADLOCK */
+				   unlock_page B held by X
+		   unlock_page B
+		   mutex_unlock A
+				   mutex_unlock A
+
+   where A and B are different lock classes.
+
+No, we cannot.
+
+Example 2:
+
+   CONTEXT X		   CONTEXT Y
+   ---------		   ---------
+			   mutex_lock A
+   mutex_lock A
+			   wait_for_complete B /* DEADLOCK */
+   complete B
+			   mutex_unlock A
+   mutex_unlock A
+
+   where A is a lock class and B is a completion variable.
+
+No, we cannot.
+
+CONCLUSION
+
+Given the limitation, lockdep cannot detect a deadlock or its
+possibility caused by page locks or completions.
+
+
+Relax the limitation
+--------------------
+
+Under the limitation, things to create dependencies are limited to
+typical locks. However, synchronization primitives like page locks and
+completions, which are allowed to be released in any context, also
+create dependencies and can cause a deadlock. So lockdep should track
+these locks to do a better job. We have to relax the limitation for
+these locks to work with lockdep.
+
+Detecting dependencies is very important for lockdep to work because
+adding a dependency means adding an opportunity to check whether it
+causes a deadlock. The more lockdep adds dependencies, the more it
+thoroughly works. Thus Lockdep has to do its best to detect and add as
+many true dependencies into a graph as possible.
+
+For example, considering only typical locks, lockdep builds a graph like:
+
+   A -> B -
+           \
+            -> E
+           /
+   C -> D -
+
+   where A, B,..., E are different lock classes.
+
+On the other hand, under the relaxation, additional dependencies might
+be created and added. Assuming additional 'FX -> C' and 'E -> GX' are
+added thanks to the relaxation, the graph will be:
+
+         A -> B -
+                 \
+                  -> E -> GX
+                 /
+   FX -> C -> D -
+
+   where A, B,..., E, FX and GX are different lock classes, and a suffix
+   'X' is added on non-typical locks.
+
+The latter graph gives us more chances to check circular dependencies
+than the former. However, it might suffer performance degradation since
+relaxing the limitation, with which design and implementation of lockdep
+can be efficient, might introduce inefficiency inevitably. So lockdep
+should provide two options, strong detection and efficient detection.
+
+Choosing efficient detection:
+
+   Lockdep works with only locks restricted to be released within the
+   acquire context. However, lockdep works efficiently.
+
+Choosing strong detection:
+
+   Lockdep works with all synchronization primitives. However, lockdep
+   suffers performance degradation.
+
+CONCLUSION
+
+Relaxing the limitation, lockdep can add additional dependencies giving
+additional opportunities to check circular dependencies.
+
+
+============
+Crossrelease
+============
+
+Introduce crossrelease
+----------------------
+
+In order to allow lockdep to handle additional dependencies by what
+might be released in any context, namely 'crosslock', we have to be able
+to identify those created by crosslocks. The proposed 'crossrelease'
+feature provoides a way to do that.
+
+Crossrelease feature has to do:
+
+   1. Identify dependencies created by crosslocks.
+   2. Add the dependencies into a dependency graph.
+
+That's all. Once a meaningful dependency is added into graph, then
+lockdep would work with the graph as it did. The most important thing
+crossrelease feature has to do is to correctly identify and add true
+dependencies into the global graph.
+
+A dependency e.g. 'A -> B' can be identified only in the A's release
+context because a decision required to identify the dependency can be
+made only in the release context. That is to decide whether A can be
+released so that a waiter for A can be woken up. It cannot be made in
+other than the A's release context.
+
+It's no matter for typical locks because each acquire context is same as
+its release context, thus lockdep can decide whether a lock can be
+released in the acquire context. However for crosslocks, lockdep cannot
+make the decision in the acquire context but has to wait until the
+release context is identified.
+
+Therefore, deadlocks by crosslocks cannot be detected just when it
+happens, because those cannot be identified until the crosslocks are
+released. However, deadlock possibilities can be detected and it's very
+worth. See 'APPENDIX A' section to check why.
+
+CONCLUSION
+
+Using crossrelease feature, lockdep can work with what might be released
+in any context, namely crosslock.
+
+
+Introduce commit
+----------------
+
+Since crossrelease defers the work adding true dependencies of
+crosslocks until they are actually released, crossrelease has to queue
+all acquisitions which might create dependencies with the crosslocks.
+Then it identifies dependencies using the queued data in batches at a
+proper time. We call it 'commit'.
+
+There are four types of dependencies:
+
+1. TT type: 'typical lock A -> typical lock B'
+
+   Just when acquiring B, lockdep can see it's in the A's release
+   context. So the dependency between A and B can be identified
+   immediately. Commit is unnecessary.
+
+2. TC type: 'typical lock A -> crosslock BX'
+
+   Just when acquiring BX, lockdep can see it's in the A's release
+   context. So the dependency between A and BX can be identified
+   immediately. Commit is unnecessary, too.
+
+3. CT type: 'crosslock AX -> typical lock B'
+
+   When acquiring B, lockdep cannot identify the dependency because
+   there's no way to know if it's in the AX's release context. It has
+   to wait until the decision can be made. Commit is necessary.
+
+4. CC type: 'crosslock AX -> crosslock BX'
+
+   When acquiring BX, lockdep cannot identify the dependency because
+   there's no way to know if it's in the AX's release context. It has
+   to wait until the decision can be made. Commit is necessary.
+   But, handling CC type is not implemented yet. It's a future work.
+
+Lockdep can work without commit for typical locks, but commit step is
+necessary once crosslocks are involved. Introducing commit, lockdep
+performs three steps. What lockdep does in each step is:
+
+1. Acquisition: For typical locks, lockdep does what it originally did
+   and queues the lock so that CT type dependencies can be checked using
+   it at the commit step. For crosslocks, it saves data which will be
+   used at the commit step and increases a reference count for it.
+
+2. Commit: No action is reauired for typical locks. For crosslocks,
+   lockdep adds CT type dependencies using the data saved at the
+   acquisition step.
+
+3. Release: No changes are required for typical locks. When a crosslock
+   is released, it decreases a reference count for it.
+
+CONCLUSION
+
+Crossrelease introduces commit step to handle dependencies of crosslocks
+in batches at a proper time.
+
+
+==============
+Implementation
+==============
+
+Data structures
+---------------
+
+Crossrelease introduces two main data structures.
+
+1. hist_lock
+
+   This is an array embedded in task_struct, for keeping lock history so
+   that dependencies can be added using them at the commit step. Since
+   it's local data, it can be accessed locklessly in the owner context.
+   The array is filled at the acquisition step and consumed at the
+   commit step. And it's managed in circular manner.
+
+2. cross_lock
+
+   One per lockdep_map exists. This is for keeping data of crosslocks
+   and used at the commit step.
+
+
+How crossrelease works
+----------------------
+
+It's the key of how crossrelease works, to defer necessary works to an
+appropriate point in time and perform in at once at the commit step.
+Let's take a look with examples step by step, starting from how lockdep
+works without crossrelease for typical locks.
+
+   acquire A /* Push A onto held_locks */
+   acquire B /* Push B onto held_locks and add 'A -> B' */
+   acquire C /* Push C onto held_locks and add 'B -> C' */
+   release C /* Pop C from held_locks */
+   release B /* Pop B from held_locks */
+   release A /* Pop A from held_locks */
+
+   where A, B and C are different lock classes.
+
+   NOTE: This document assumes that readers already understand how
+   lockdep works without crossrelease thus omits details. But there's
+   one thing to note. Lockdep pretends to pop a lock from held_locks
+   when releasing it. But it's subtly different from the original pop
+   operation because lockdep allows other than the top to be poped.
+
+In this case, lockdep adds 'the top of held_locks -> the lock to acquire'
+dependency every time acquiring a lock.
+
+After adding 'A -> B', a dependency graph will be:
+
+   A -> B
+
+   where A and B are different lock classes.
+
+And after adding 'B -> C', the graph will be:
+
+   A -> B -> C
+
+   where A, B and C are different lock classes.
+
+Let's performs commit step even for typical locks to add dependencies.
+Of course, commit step is not necessary for them, however, it would work
+well because this is a more general way.
+
+   acquire A
+   /*
+    * Queue A into hist_locks
+    *
+    * In hist_locks: A
+    * In graph: Empty
+    */
+
+   acquire B
+   /*
+    * Queue B into hist_locks
+    *
+    * In hist_locks: A, B
+    * In graph: Empty
+    */
+
+   acquire C
+   /*
+    * Queue C into hist_locks
+    *
+    * In hist_locks: A, B, C
+    * In graph: Empty
+    */
+
+   commit C
+   /*
+    * Add 'C -> ?'
+    * Answer the following to decide '?'
+    * What has been queued since acquire C: Nothing
+    *
+    * In hist_locks: A, B, C
+    * In graph: Empty
+    */
+
+   release C
+
+   commit B
+   /*
+    * Add 'B -> ?'
+    * Answer the following to decide '?'
+    * What has been queued since acquire B: C
+    *
+    * In hist_locks: A, B, C
+    * In graph: 'B -> C'
+    */
+
+   release B
+
+   commit A
+   /*
+    * Add 'A -> ?'
+    * Answer the following to decide '?'
+    * What has been queued since acquire A: B, C
+    *
+    * In hist_locks: A, B, C
+    * In graph: 'B -> C', 'A -> B', 'A -> C'
+    */
+
+   release A
+
+   where A, B and C are different lock classes.
+
+In this case, dependencies are added at the commit step as described.
+
+After commits for A, B and C, the graph will be:
+
+   A -> B -> C
+
+   where A, B and C are different lock classes.
+
+   NOTE: A dependency 'A -> C' is optimized out.
+
+We can see the former graph built without commit step is same as the
+latter graph built using commit steps. Of course the former way leads to
+earlier finish for building the graph, which means we can detect a
+deadlock or its possibility sooner. So the former way would be prefered
+when possible. But we cannot avoid using the latter way for crosslocks.
+
+Let's look at how commit steps work for crosslocks. In this case, the
+commit step is performed only on crosslock AX as real. And it assumes
+that the AX release context is different from the AX acquire context.
+
+   BX RELEASE CONTEXT		   BX ACQUIRE CONTEXT
+   ------------------		   ------------------
+				   acquire A
+				   /*
+				    * Push A onto held_locks
+				    * Queue A into hist_locks
+				    *
+				    * In held_locks: A
+				    * In hist_locks: A
+				    * In graph: Empty
+				    */
+
+				   acquire BX
+				   /*
+				    * Add 'the top of held_locks -> BX'
+				    *
+				    * In held_locks: A
+				    * In hist_locks: A
+				    * In graph: 'A -> BX'
+				    */
+
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+   It must be guaranteed that the following operations are seen after
+   acquiring BX globally. It can be done by things like barrier.
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   acquire C
+   /*
+    * Push C onto held_locks
+    * Queue C into hist_locks
+    *
+    * In held_locks: C
+    * In hist_locks: C
+    * In graph: 'A -> BX'
+    */
+
+   release C
+   /*
+    * Pop C from held_locks
+    *
+    * In held_locks: Empty
+    * In hist_locks: C
+    * In graph: 'A -> BX'
+    */
+				   acquire D
+				   /*
+				    * Push D onto held_locks
+				    * Queue D into hist_locks
+				    * Add 'the top of held_locks -> D'
+				    *
+				    * In held_locks: A, D
+				    * In hist_locks: A, D
+				    * In graph: 'A -> BX', 'A -> D'
+				    */
+   acquire E
+   /*
+    * Push E onto held_locks
+    * Queue E into hist_locks
+    *
+    * In held_locks: E
+    * In hist_locks: C, E
+    * In graph: 'A -> BX', 'A -> D'
+    */
+
+   release E
+   /*
+    * Pop E from held_locks
+    *
+    * In held_locks: Empty
+    * In hist_locks: D, E
+    * In graph: 'A -> BX', 'A -> D'
+    */
+				   release D
+				   /*
+				    * Pop D from held_locks
+				    *
+				    * In held_locks: A
+				    * In hist_locks: A, D
+				    * In graph: 'A -> BX', 'A -> D'
+				    */
+   commit BX
+   /*
+    * Add 'BX -> ?'
+    * What has been queued since acquire BX: C, E
+    *
+    * In held_locks: Empty
+    * In hist_locks: D, E
+    * In graph: 'A -> BX', 'A -> D',
+    *           'BX -> C', 'BX -> E'
+    */
+
+   release BX
+   /*
+    * In held_locks: Empty
+    * In hist_locks: D, E
+    * In graph: 'A -> BX', 'A -> D',
+    *           'BX -> C', 'BX -> E'
+    */
+				   release A
+				   /*
+				    * Pop A from held_locks
+				    *
+				    * In held_locks: Empty
+				    * In hist_locks: A, D
+				    * In graph: 'A -> BX', 'A -> D',
+				    *           'BX -> C', 'BX -> E'
+				    */
+
+   where A, BX, C,..., E are different lock classes, and a suffix 'X' is
+   added on crosslocks.
+
+Crossrelease considers all acquisitions after acqiuring BX are
+candidates which might create dependencies with BX. True dependencies
+will be determined when identifying the release context of BX. Meanwhile,
+all typical locks are queued so that they can be used at the commit step.
+And then two dependencies 'BX -> C' and 'BX -> E' are added at the
+commit step when identifying the release context.
+
+The final graph will be, with crossrelease:
+
+               -> C
+              /
+       -> BX -
+      /       \
+   A -         -> E
+      \
+       -> D
+
+   where A, BX, C,..., E are different lock classes, and a suffix 'X' is
+   added on crosslocks.
+
+However, the final graph will be, without crossrelease:
+
+   A -> D
+
+   where A and D are different lock classes.
+
+The former graph has three more dependencies, 'A -> BX', 'BX -> C' and
+'BX -> E' giving additional opportunities to check if they cause
+deadlocks. This way lockdep can detect a deadlock or its possibility
+caused by crosslocks.
+
+CONCLUSION
+
+We checked how crossrelease works with several examples.
+
+
+=============
+Optimizations
+=============
+
+Avoid duplication
+-----------------
+
+Crossrelease feature uses a cache like what lockdep already uses for
+dependency chains, but this time it's for caching CT type dependencies.
+Once that dependency is cached, the same will never be added again.
+
+
+Lockless for hot paths
+----------------------
+
+To keep all locks for later use at the commit step, crossrelease adopts
+a local array embedded in task_struct, which makes access to the data
+lockless by forcing it to happen only within the owner context. It's
+like how lockdep handles held_locks. Lockless implmentation is important
+since typical locks are very frequently acquired and released.
+
+
+=================================================
+APPENDIX A: What lockdep does to work aggresively
+=================================================
+
+A deadlock actually occurs when all wait operations creating circular
+dependencies run at the same time. Even though they don't, a potential
+deadlock exists if the problematic dependencies exist. Thus it's
+meaningful to detect not only an actual deadlock but also its potential
+possibility. The latter is rather valuable. When a deadlock occurs
+actually, we can identify what happens in the system by some means or
+other even without lockdep. However, there's no way to detect possiblity
+without lockdep unless the whole code is parsed in head. It's terrible.
+Lockdep does the both, and crossrelease only focuses on the latter.
+
+Whether or not a deadlock actually occurs depends on several factors.
+For example, what order contexts are switched in is a factor. Assuming
+circular dependencies exist, a deadlock would occur when contexts are
+switched so that all wait operations creating the dependencies run
+simultaneously. Thus to detect a deadlock possibility even in the case
+that it has not occured yet, lockdep should consider all possible
+combinations of dependencies, trying to:
+
+1. Use a global dependency graph.
+
+   Lockdep combines all dependencies into one global graph and uses them,
+   regardless of which context generates them or what order contexts are
+   switched in. Aggregated dependencies are only considered so they are
+   prone to be circular if a problem exists.
+
+2. Check dependencies between classes instead of instances.
+
+   What actually causes a deadlock are instances of lock. However,
+   lockdep checks dependencies between classes instead of instances.
+   This way lockdep can detect a deadlock which has not happened but
+   might happen in future by others but the same class.
+
+3. Assume all acquisitions lead to waiting.
+
+   Although locks might be acquired without waiting which is essential
+   to create dependencies, lockdep assumes all acquisitions lead to
+   waiting since it might be true some time or another.
+
+CONCLUSION
+
+Lockdep detects not only an actual deadlock but also its possibility,
+and the latter is more valuable.
+
+
+==================================================
+APPENDIX B: How to avoid adding false dependencies
+==================================================
+
+Remind what a dependency is. A dependency exists if:
+
+   1. There are two waiters waiting for each event at a given time.
+   2. The only way to wake up each waiter is to trigger its event.
+   3. Whether one can be woken up depends on whether the other can.
+
+For example:
+
+   acquire A
+   acquire B /* A dependency 'A -> B' exists */
+   release B
+   release A
+
+   where A and B are different lock classes.
+
+A depedency 'A -> B' exists since:
+
+   1. A waiter for A and a waiter for B might exist when acquiring B.
+   2. Only way to wake up each is to release what it waits for.
+   3. Whether the waiter for A can be woken up depends on whether the
+      other can. IOW, TASK X cannot release A if it fails to acquire B.
+
+For another example:
+
+   TASK X			   TASK Y
+   ------			   ------
+				   acquire AX
+   acquire B /* A dependency 'AX -> B' exists */
+   release B
+   release AX held by Y
+
+   where AX and B are different lock classes, and a suffix 'X' is added
+   on crosslocks.
+
+Even in this case involving crosslocks, the same rule can be applied. A
+depedency 'AX -> B' exists since:
+
+   1. A waiter for AX and a waiter for B might exist when acquiring B.
+   2. Only way to wake up each is to release what it waits for.
+   3. Whether the waiter for AX can be woken up depends on whether the
+      other can. IOW, TASK X cannot release AX if it fails to acquire B.
+
+Let's take a look at more complicated example:
+
+   TASK X			   TASK Y
+   ------			   ------
+   acquire B
+   release B
+   fork Y
+				   acquire AX
+   acquire C /* A dependency 'AX -> C' exists */
+   release C
+   release AX held by Y
+
+   where AX, B and C are different lock classes, and a suffix 'X' is
+   added on crosslocks.
+
+Does a dependency 'AX -> B' exist? Nope.
+
+Two waiters are essential to create a dependency. However, waiters for
+AX and B to create 'AX -> B' cannot exist at the same time in this
+example. Thus the dependency 'AX -> B' cannot be created.
+
+It would be ideal if the full set of true ones can be considered. But
+we can ensure nothing but what actually happened. Relying on what
+actually happens at runtime, we can anyway add only true ones, though
+they might be a subset of true ones. It's similar to how lockdep works
+for typical locks. There might be more true dependencies than what
+lockdep has detected in runtime. Lockdep has no choice but to rely on
+what actually happens. Crossrelease also relies on it.
+
+CONCLUSION
+
+Relying on what actually happens, lockdep can avoid adding false
+dependencies.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index c4ddfcd5ee32..d1d1716f904b 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -498,11 +498,11 @@ And a couple of implicit varieties:
      This means that ACQUIRE acts as a minimal "acquire" operation and
      RELEASE acts as a minimal "release" operation.
 
-A subset of the atomic operations described in core-api/atomic_ops.rst have
-ACQUIRE and RELEASE variants in addition to fully-ordered and relaxed (no
-barrier semantics) definitions.  For compound atomics performing both a load
-and a store, ACQUIRE semantics apply only to the load and RELEASE semantics
-apply only to the store portion of the operation.
+A subset of the atomic operations described in atomic_t.txt have ACQUIRE and
+RELEASE variants in addition to fully-ordered and relaxed (no barrier
+semantics) definitions.  For compound atomics performing both a load and a
+store, ACQUIRE semantics apply only to the load and RELEASE semantics apply
+only to the store portion of the operation.
 
 Memory barriers are only required where there's a possibility of interaction
 between two CPUs or between a CPU and a device.  If it can be guaranteed that
@@ -1876,8 +1876,7 @@ There are some more advanced barrier functions:
      This makes sure that the death mark on the object is perceived to be set
      *before* the reference counter is decremented.
 
-     See Documentation/core-api/atomic_ops.rst for more information.  See the
-     "Atomic operations" subsection for information on where to use these.
+     See Documentation/atomic_{t,bitops}.txt for more information.
 
 
  (*) lockless_dereference();
@@ -1982,10 +1981,7 @@ for each construct.  These operations all imply certain barriers:
      ACQUIRE operation has completed.
 
      Memory operations issued before the ACQUIRE may be completed after
-     the ACQUIRE operation has completed.  An smp_mb__before_spinlock(),
-     combined with a following ACQUIRE, orders prior stores against
-     subsequent loads and stores.  Note that this is weaker than smp_mb()!
-     The smp_mb__before_spinlock() primitive is free on many architectures.
+     the ACQUIRE operation has completed.
 
  (2) RELEASE operation implication:
 
@@ -2503,88 +2499,7 @@ operations are noted specially as some of them imply full memory barriers and
 some don't, but they're very heavily relied on as a group throughout the
 kernel.
 
-Any atomic operation that modifies some state in memory and returns information
-about the state (old or new) implies an SMP-conditional general memory barrier
-(smp_mb()) on each side of the actual operation (with the exception of
-explicit lock operations, described later).  These include:
-
-	xchg();
-	atomic_xchg();			atomic_long_xchg();
-	atomic_inc_return();		atomic_long_inc_return();
-	atomic_dec_return();		atomic_long_dec_return();
-	atomic_add_return();		atomic_long_add_return();
-	atomic_sub_return();		atomic_long_sub_return();
-	atomic_inc_and_test();		atomic_long_inc_and_test();
-	atomic_dec_and_test();		atomic_long_dec_and_test();
-	atomic_sub_and_test();		atomic_long_sub_and_test();
-	atomic_add_negative();		atomic_long_add_negative();
-	test_and_set_bit();
-	test_and_clear_bit();
-	test_and_change_bit();
-
-	/* when succeeds */
-	cmpxchg();
-	atomic_cmpxchg();		atomic_long_cmpxchg();
-	atomic_add_unless();		atomic_long_add_unless();
-
-These are used for such things as implementing ACQUIRE-class and RELEASE-class
-operations and adjusting reference counters towards object destruction, and as
-such the implicit memory barrier effects are necessary.
-
-
-The following operations are potential problems as they do _not_ imply memory
-barriers, but might be used for implementing such things as RELEASE-class
-operations:
-
-	atomic_set();
-	set_bit();
-	clear_bit();
-	change_bit();
-
-With these the appropriate explicit memory barrier should be used if necessary
-(smp_mb__before_atomic() for instance).
-
-
-The following also do _not_ imply memory barriers, and so may require explicit
-memory barriers under some circumstances (smp_mb__before_atomic() for
-instance):
-
-	atomic_add();
-	atomic_sub();
-	atomic_inc();
-	atomic_dec();
-
-If they're used for statistics generation, then they probably don't need memory
-barriers, unless there's a coupling between statistical data.
-
-If they're used for reference counting on an object to control its lifetime,
-they probably don't need memory barriers because either the reference count
-will be adjusted inside a locked section, or the caller will already hold
-sufficient references to make the lock, and thus a memory barrier unnecessary.
-
-If they're used for constructing a lock of some description, then they probably
-do need memory barriers as a lock primitive generally has to do things in a
-specific order.
-
-Basically, each usage case has to be carefully considered as to whether memory
-barriers are needed or not.
-
-The following operations are special locking primitives:
-
-	test_and_set_bit_lock();
-	clear_bit_unlock();
-	__clear_bit_unlock();
-
-These implement ACQUIRE-class and RELEASE-class operations.  These should be
-used in preference to other operations when implementing locking primitives,
-because their implementations can be optimised on many architectures.
-
-[!] Note that special memory barrier primitives are available for these
-situations because on some CPUs the atomic instructions used imply full memory
-barriers, and so barrier instructions are superfluous in conjunction with them,
-and in such cases the special barrier primitives will be no-ops.
-
-See Documentation/core-api/atomic_ops.rst for more information.
+See Documentation/atomic_t.txt for more information.
 
 
 ACCESSING DEVICES
diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt
index b83dfa1c0602..ab16efe0c79d 100644
--- a/Documentation/static-keys.txt
+++ b/Documentation/static-keys.txt
@@ -149,6 +149,26 @@ static_branch_inc(), will change the branch back to true. Likewise, if the
 key is initialized false, a 'static_branch_inc()', will change the branch to
 true. And then a 'static_branch_dec()', will again make the branch false.
 
+The state and the reference count can be retrieved with 'static_key_enabled()'
+and 'static_key_count()'.  In general, if you use these functions, they
+should be protected with the same mutex used around the enable/disable
+or increment/decrement function.
+
+Note that switching branches results in some locks being taken,
+particularly the CPU hotplug lock (in order to avoid races against
+CPUs being brought in the kernel whilst the kernel is getting
+patched). Calling the static key API from within a hotplug notifier is
+thus a sure deadlock recipe. In order to still allow use of the
+functionnality, the following functions are provided:
+
+	static_key_enable_cpuslocked()
+	static_key_disable_cpuslocked()
+	static_branch_enable_cpuslocked()
+	static_branch_disable_cpuslocked()
+
+These functions are *not* general purpose, and must only be used when
+you really know that you're in the above context, and no other.
+
 Where an array of keys is required, it can be defined as::
 
 	DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count);
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index 38310dcd6620..bc80fc0e210f 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -1956,10 +1956,7 @@ MMIO 쓰기 배리어
      뒤에 완료됩니다.
 
      ACQUIRE 앞에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 후에
-     완료될 수 있습니다.  smp_mb__before_spinlock() 뒤에 ACQUIRE 가 실행되는
-     코드 블록은 블록 앞의 스토어를 블록 뒤의 로드와 스토어에 대해 순서
-     맞춥니다.  이건 smp_mb() 보다 완화된 것임을 기억하세요!  많은 아키텍쳐에서
-     smp_mb__before_spinlock() 은 사실 아무일도 하지 않습니다.
+     완료될 수 있습니다.
 
  (2) RELEASE 오퍼레이션의 영향:
 
diff --git a/arch/Kconfig b/arch/Kconfig
index 21d0089117fe..2520ca5b42eb 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -931,6 +931,18 @@ config STRICT_MODULE_RWX
 config ARCH_WANT_RELAX_ORDER
 	bool
 
+config ARCH_HAS_REFCOUNT
+	bool
+	help
+	  An architecture selects this when it has implemented refcount_t
+	  using open coded assembly primitives that provide an optimized
+	  refcount_t implementation, possibly at the expense of some full
+	  refcount state checks of CONFIG_REFCOUNT_FULL=y.
+
+	  The refcount overflow check behavior, however, must be retained.
+	  Catching overflows is the primary security concern for protecting
+	  against bugs in reference counts.
+
 config REFCOUNT_FULL
 	bool "Perform full reference count validation at the expense of speed"
 	help
diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h
index fb01dfb760c2..05a70edd57b6 100644
--- a/arch/alpha/include/asm/futex.h
+++ b/arch/alpha/include/asm/futex.h
@@ -25,18 +25,10 @@
 	:	"r" (uaddr), "r"(oparg)				\
 	:	"memory")
 
-static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -62,17 +54,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 54b54da6384c..11859287c52a 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -123,6 +123,8 @@ static inline void atomic_set(atomic_t *v, int i)
 	atomic_ops_unlock(flags);
 }
 
+#define atomic_set_release(v, i)	atomic_set((v), (i))
+
 #endif
 
 /*
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h
index 11e1b1f3acda..eb887dd13e74 100644
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -73,20 +73,11 @@
 
 #endif
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
-		return -EFAULT;
-
 #ifndef CONFIG_ARC_HAS_LLSC
 	preempt_disable();	/* to guarantee atomic r-m-w of futex op */
 #endif
@@ -118,30 +109,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	preempt_enable();
 #endif
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 6795368ad023..cc414382dab4 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -128,20 +128,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 #endif /* !SMP */
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret, tmp;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 #ifndef CONFIG_SMP
 	preempt_disable();
 #endif
@@ -172,17 +162,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 	preempt_enable();
 #endif
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index f32b42e8725d..5bb2fd4674e7 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -48,20 +48,10 @@ do {									\
 } while (0)
 
 static inline int
-futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (int)(encoded_op << 8) >> 20;
-	int cmparg = (int)(encoded_op << 20) >> 20;
 	int oldval = 0, ret, tmp;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1U << (oparg & 0x1f);
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -91,17 +81,9 @@ futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index cae331d553f8..ae4241ab19a8 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -358,14 +358,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 #define arch_read_relax(lock)	cpu_relax()
 #define arch_write_relax(lock)	cpu_relax()
 
-/*
- * Accesses appearing in program order before a spin_lock() operation
- * can be reordered with accesses inside the critical section, by virtue
- * of arch_spin_lock being constructed using acquire semantics.
- *
- * In cases where this is problematic (e.g. try_to_wake_up), an
- * smp_mb__before_spinlock() can restore the required ordering.
- */
-#define smp_mb__before_spinlock()	smp_mb()
+/* See include/linux/spinlock.h */
+#define smp_mb__after_spinlock()	smp_mb()
 
 #endif /* __ASM_SPINLOCK_H */
diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h
index 2e1da71e27a4..ab346f5f8820 100644
--- a/arch/frv/include/asm/futex.h
+++ b/arch/frv/include/asm/futex.h
@@ -7,7 +7,8 @@
 #include <asm/errno.h>
 #include <linux/uaccess.h>
 
-extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr);
+extern int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr);
 
 static inline int
 futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c
index d155ca9e5098..37f7b2bf7f73 100644
--- a/arch/frv/kernel/futex.c
+++ b/arch/frv/kernel/futex.c
@@ -186,20 +186,10 @@ static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_o
 /*
  * do the futex operations
  */
-int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -225,18 +215,9 @@ int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS; break;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
 
 	return ret;
 
-} /* end futex_atomic_op_inuser() */
+} /* end arch_futex_atomic_op_inuser() */
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index a62ba368b27d..fb3dfb2a667e 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -42,6 +42,8 @@ static inline void atomic_set(atomic_t *v, int new)
 	);
 }
 
+#define atomic_set_release(v, i)	atomic_set((v), (i))
+
 /**
  * atomic_read - reads a word, atomically
  * @v: pointer to atomic value
diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h
index 7e597f8434da..c607b77c8215 100644
--- a/arch/hexagon/include/asm/futex.h
+++ b/arch/hexagon/include/asm/futex.h
@@ -31,18 +31,9 @@
 
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -72,30 +63,9 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h
index 76acbcd5c060..6d67dc1eaf2b 100644
--- a/arch/ia64/include/asm/futex.h
+++ b/arch/ia64/include/asm/futex.h
@@ -45,18 +45,9 @@ do {									\
 } while (0)
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -84,17 +75,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h
index 6c1380a8a0d4..eee779f26cc4 100644
--- a/arch/metag/include/asm/atomic_lock1.h
+++ b/arch/metag/include/asm/atomic_lock1.h
@@ -37,6 +37,8 @@ static inline int atomic_set(atomic_t *v, int i)
 	return i;
 }
 
+#define atomic_set_release(v, i) atomic_set((v), (i))
+
 #define ATOMIC_OP(op, c_op)						\
 static inline void atomic_##op(int i, atomic_t *v)			\
 {									\
diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h
index 01848f056f43..a9dad9e5e132 100644
--- a/arch/microblaze/include/asm/futex.h
+++ b/arch/microblaze/include/asm/futex.h
@@ -29,18 +29,9 @@
 })
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -66,30 +57,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h
index 1de190bdfb9c..a9e61ea54ca9 100644
--- a/arch/mips/include/asm/futex.h
+++ b/arch/mips/include/asm/futex.h
@@ -83,18 +83,9 @@
 }
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -125,17 +116,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 6bace7695788..c7cbddfcdc3b 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one);
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 
 static DEFINE_PER_CPU(atomic_t, tick_broadcast_count);
-static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd);
+static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
 
 void tick_broadcast(const struct cpumask *mask)
 {
 	atomic_t *count;
-	struct call_single_data *csd;
+	call_single_data_t *csd;
 	int cpu;
 
 	for_each_cpu(cpu, mask) {
@@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info)
 
 static int __init tick_broadcast_init(void)
 {
-	struct call_single_data *csd;
+	call_single_data_t *csd;
 	int cpu;
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h
index 778087341977..8fed278a24b8 100644
--- a/arch/openrisc/include/asm/futex.h
+++ b/arch/openrisc/include/asm/futex.h
@@ -30,20 +30,10 @@
 })
 
 static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -68,30 +58,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 5394b9c5f914..17b98a87e5e2 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -65,6 +65,8 @@ static __inline__ void atomic_set(atomic_t *v, int i)
 	_atomic_spin_unlock_irqrestore(v, flags);
 }
 
+#define atomic_set_release(v, i)	atomic_set((v), (i))
+
 static __inline__ int atomic_read(const atomic_t *v)
 {
 	return READ_ONCE((v)->counter);
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index 0ba14300cd8e..c601aab2fb36 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -32,22 +32,12 @@ _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags)
 }
 
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
 {
 	unsigned long int flags;
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval, ret;
 	u32 tmp;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr)))
-		return -EFAULT;
-
 	_futex_spin_lock_irqsave(uaddr, &flags);
 	pagefault_disable();
 
@@ -85,17 +75,9 @@ out_pagefault_enable:
 	pagefault_enable();
 	_futex_spin_unlock_irqrestore(uaddr, &flags);
 
-	if (ret == 0) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index 25d42bd3f114..9c601adfc500 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -74,13 +74,6 @@ do {									\
 	___p1;								\
 })
 
-/*
- * This must resolve to hwsync on SMP for the context switch path.
- * See _switch, and core scheduler context switch memory ordering
- * comments.
- */
-#define smp_mb__before_spinlock()   smp_mb()
-
 #include <asm-generic/barrier.h>
 
 #endif /* _ASM_POWERPC_BARRIER_H */
diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h
index eaada6c92344..719ed9b61ea7 100644
--- a/arch/powerpc/include/asm/futex.h
+++ b/arch/powerpc/include/asm/futex.h
@@ -29,18 +29,10 @@
 	: "b" (uaddr), "i" (-EFAULT), "r" (oparg) \
 	: "cr0", "memory")
 
-static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -66,17 +58,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 8c1b913de6d7..c1b1ec94b06c 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -342,5 +342,8 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 #define arch_read_relax(lock)	__rw_yield(lock)
 #define arch_write_relax(lock)	__rw_yield(lock)
 
+/* See include/linux/spinlock.h */
+#define smp_mb__after_spinlock()   smp_mb()
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_SPINLOCK_H */
diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h
index a4811aa0304d..8f8eec9e1198 100644
--- a/arch/s390/include/asm/futex.h
+++ b/arch/s390/include/asm/futex.h
@@ -21,17 +21,12 @@
 		: "0" (-EFAULT), "d" (oparg), "a" (uaddr),		\
 		  "m" (*uaddr) : "cc");
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, newval, ret;
 
 	load_kernel_asce();
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
 
 	pagefault_disable();
 	switch (op) {
@@ -60,17 +55,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	}
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h
index d0078747d308..8f8cf941a8cd 100644
--- a/arch/sh/include/asm/futex.h
+++ b/arch/sh/include/asm/futex.h
@@ -27,21 +27,12 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval);
 }
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	u32 oparg = (encoded_op << 8) >> 20;
-	u32 cmparg = (encoded_op << 20) >> 20;
 	u32 oldval, newval, prev;
 	int ret;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	do {
@@ -80,17 +71,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = ((int)oldval < (int)cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = ((int)oldval >= (int)cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = ((int)oldval <= (int)cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = ((int)oldval > (int)cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
 
 	return ret;
 }
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index ee3f11c43cda..7643e979e333 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -29,6 +29,8 @@ int atomic_xchg(atomic_t *, int);
 int __atomic_add_unless(atomic_t *, int, int);
 void atomic_set(atomic_t *, int);
 
+#define atomic_set_release(v, i)	atomic_set((v), (i))
+
 #define atomic_read(v)          ACCESS_ONCE((v)->counter)
 
 #define atomic_add(i, v)	((void)atomic_add_return( (int)(i), (v)))
diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h
index 4e899b0dabf7..1cfd89d92208 100644
--- a/arch/sparc/include/asm/futex_64.h
+++ b/arch/sparc/include/asm/futex_64.h
@@ -29,22 +29,14 @@
 	: "r" (uaddr), "r" (oparg), "i" (-EFAULT)	\
 	: "memory")
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret, tem;
 
-	if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
-		return -EFAULT;
 	if (unlikely((((unsigned long) uaddr) & 0x3UL)))
 		return -EINVAL;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -69,17 +61,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h
index a93774255136..53a423e7cb92 100644
--- a/arch/tile/include/asm/atomic_32.h
+++ b/arch/tile/include/asm/atomic_32.h
@@ -101,6 +101,8 @@ static inline void atomic_set(atomic_t *v, int n)
 	_atomic_xchg(&v->counter, n);
 }
 
+#define atomic_set_release(v, i)	atomic_set((v), (i))
+
 /* A 64bit atomic type */
 
 typedef struct {
diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h
index e64a1b75fc38..83c1e639b411 100644
--- a/arch/tile/include/asm/futex.h
+++ b/arch/tile/include/asm/futex.h
@@ -106,12 +106,9 @@
 	lock = __atomic_hashed_lock((int __force *)uaddr)
 #endif
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int uninitialized_var(val), ret;
 
 	__futex_prolog();
@@ -119,12 +116,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	/* The 32-bit futex code makes this assumption, so validate it here. */
 	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int));
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -148,30 +139,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	}
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (val == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (val != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (val < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (val >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (val <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (val > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = val;
+
 	return ret;
 }
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 323cb065be5e..efd9df53b9e7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -55,6 +55,8 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MMIO_FLUSH
 	select ARCH_HAS_PMEM_API		if X86_64
+	# Causing hangs/crashes, see the commit that added this change for details.
+	select ARCH_HAS_REFCOUNT		if BROKEN
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SG_CHAIN
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index cd20ca0b4043..1fc519f3c49e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -305,8 +305,6 @@ config DEBUG_ENTRY
 	  Some of these sanity checks may slow down kernel entries and
 	  exits or otherwise impact performance.
 
-	  This is currently used to help test NMI code.
-
 	  If unsure, say N.
 
 config DEBUG_NMI_SELFTEST
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 9976fcecd17e..af28a8a24366 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -2,7 +2,6 @@
 # Makefile for the x86 low level entry code
 #
 
-OBJECT_FILES_NON_STANDARD_entry_$(BITS).o   := y
 OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
 
 CFLAGS_syscall_64.o		+= $(call cc-option,-Wno-override-init,)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 05ed3d393da7..640aafebdc00 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,4 +1,5 @@
 #include <linux/jump_label.h>
+#include <asm/unwind_hints.h>
 
 /*
 
@@ -112,6 +113,7 @@ For 32-bit we have the following conventions - kernel is built with
 	movq %rdx, 12*8+\offset(%rsp)
 	movq %rsi, 13*8+\offset(%rsp)
 	movq %rdi, 14*8+\offset(%rsp)
+	UNWIND_HINT_REGS offset=\offset extra=0
 	.endm
 	.macro SAVE_C_REGS offset=0
 	SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
@@ -136,6 +138,7 @@ For 32-bit we have the following conventions - kernel is built with
 	movq %r12, 3*8+\offset(%rsp)
 	movq %rbp, 4*8+\offset(%rsp)
 	movq %rbx, 5*8+\offset(%rsp)
+	UNWIND_HINT_REGS offset=\offset
 	.endm
 
 	.macro RESTORE_EXTRA_REGS offset=0
@@ -145,6 +148,7 @@ For 32-bit we have the following conventions - kernel is built with
 	movq 3*8+\offset(%rsp), %r12
 	movq 4*8+\offset(%rsp), %rbp
 	movq 5*8+\offset(%rsp), %rbx
+	UNWIND_HINT_REGS offset=\offset extra=0
 	.endm
 
 	.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
@@ -167,6 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
 	.endif
 	movq 13*8(%rsp), %rsi
 	movq 14*8(%rsp), %rdi
+	UNWIND_HINT_IRET_REGS offset=16*8
 	.endm
 	.macro RESTORE_C_REGS
 	RESTORE_C_REGS_HELPER 1,1,1,1,1
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 6d078b89a5e8..64b233ab7cad 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,6 +36,7 @@
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
 #include <asm/export.h>
+#include <asm/frame.h>
 #include <linux/err.h>
 
 .code64
@@ -43,9 +44,10 @@
 
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_usergs_sysret64)
+	UNWIND_HINT_EMPTY
 	swapgs
 	sysretq
-ENDPROC(native_usergs_sysret64)
+END(native_usergs_sysret64)
 #endif /* CONFIG_PARAVIRT */
 
 .macro TRACE_IRQS_IRETQ
@@ -134,6 +136,7 @@ ENDPROC(native_usergs_sysret64)
  */
 
 ENTRY(entry_SYSCALL_64)
+	UNWIND_HINT_EMPTY
 	/*
 	 * Interrupts are off on entry.
 	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
@@ -169,6 +172,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
 	pushq	%r10				/* pt_regs->r10 */
 	pushq	%r11				/* pt_regs->r11 */
 	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
+	UNWIND_HINT_REGS extra=0
 
 	/*
 	 * If we need to do entry work or if we guess we'll need to do
@@ -223,6 +227,7 @@ entry_SYSCALL_64_fastpath:
 	movq	EFLAGS(%rsp), %r11
 	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq	RSP(%rsp), %rsp
+	UNWIND_HINT_EMPTY
 	USERGS_SYSRET64
 
 1:
@@ -316,6 +321,7 @@ syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq	RSP(%rsp), %rsp
+	UNWIND_HINT_EMPTY
 	USERGS_SYSRET64
 
 opportunistic_sysret_failed:
@@ -343,6 +349,7 @@ ENTRY(stub_ptregs_64)
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	popq	%rax
+	UNWIND_HINT_REGS extra=0
 	jmp	entry_SYSCALL64_slow_path
 
 1:
@@ -351,6 +358,7 @@ END(stub_ptregs_64)
 
 .macro ptregs_stub func
 ENTRY(ptregs_\func)
+	UNWIND_HINT_FUNC
 	leaq	\func(%rip), %rax
 	jmp	stub_ptregs_64
 END(ptregs_\func)
@@ -367,6 +375,7 @@ END(ptregs_\func)
  * %rsi: next task
  */
 ENTRY(__switch_to_asm)
+	UNWIND_HINT_FUNC
 	/*
 	 * Save callee-saved registers
 	 * This must match the order in inactive_task_frame
@@ -406,6 +415,7 @@ END(__switch_to_asm)
  * r12: kernel thread arg
  */
 ENTRY(ret_from_fork)
+	UNWIND_HINT_EMPTY
 	movq	%rax, %rdi
 	call	schedule_tail			/* rdi: 'prev' task parameter */
 
@@ -413,6 +423,7 @@ ENTRY(ret_from_fork)
 	jnz	1f				/* kernel threads are uncommon */
 
 2:
+	UNWIND_HINT_REGS
 	movq	%rsp, %rdi
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
@@ -440,13 +451,102 @@ END(ret_from_fork)
 ENTRY(irq_entries_start)
     vector=FIRST_EXTERNAL_VECTOR
     .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+	UNWIND_HINT_IRET_REGS
 	pushq	$(~vector+0x80)			/* Note: always in signed byte range */
-    vector=vector+1
 	jmp	common_interrupt
 	.align	8
+	vector=vector+1
     .endr
 END(irq_entries_start)
 
+.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
+#ifdef CONFIG_DEBUG_ENTRY
+	pushfq
+	testl $X86_EFLAGS_IF, (%rsp)
+	jz .Lokay_\@
+	ud2
+.Lokay_\@:
+	addq $8, %rsp
+#endif
+.endm
+
+/*
+ * Enters the IRQ stack if we're not already using it.  NMI-safe.  Clobbers
+ * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
+ * Requires kernel GSBASE.
+ *
+ * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
+ */
+.macro ENTER_IRQ_STACK regs=1 old_rsp
+	DEBUG_ENTRY_ASSERT_IRQS_OFF
+	movq	%rsp, \old_rsp
+
+	.if \regs
+	UNWIND_HINT_REGS base=\old_rsp
+	.endif
+
+	incl	PER_CPU_VAR(irq_count)
+	jnz	.Lirq_stack_push_old_rsp_\@
+
+	/*
+	 * Right now, if we just incremented irq_count to zero, we've
+	 * claimed the IRQ stack but we haven't switched to it yet.
+	 *
+	 * If anything is added that can interrupt us here without using IST,
+	 * it must be *extremely* careful to limit its stack usage.  This
+	 * could include kprobes and a hypothetical future IST-less #DB
+	 * handler.
+	 *
+	 * The OOPS unwinder relies on the word at the top of the IRQ
+	 * stack linking back to the previous RSP for the entire time we're
+	 * on the IRQ stack.  For this to work reliably, we need to write
+	 * it before we actually move ourselves to the IRQ stack.
+	 */
+
+	movq	\old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8)
+	movq	PER_CPU_VAR(irq_stack_ptr), %rsp
+
+#ifdef CONFIG_DEBUG_ENTRY
+	/*
+	 * If the first movq above becomes wrong due to IRQ stack layout
+	 * changes, the only way we'll notice is if we try to unwind right
+	 * here.  Assert that we set up the stack right to catch this type
+	 * of bug quickly.
+	 */
+	cmpq	-8(%rsp), \old_rsp
+	je	.Lirq_stack_okay\@
+	ud2
+	.Lirq_stack_okay\@:
+#endif
+
+.Lirq_stack_push_old_rsp_\@:
+	pushq	\old_rsp
+
+	.if \regs
+	UNWIND_HINT_REGS indirect=1
+	.endif
+.endm
+
+/*
+ * Undoes ENTER_IRQ_STACK.
+ */
+.macro LEAVE_IRQ_STACK regs=1
+	DEBUG_ENTRY_ASSERT_IRQS_OFF
+	/* We need to be off the IRQ stack before decrementing irq_count. */
+	popq	%rsp
+
+	.if \regs
+	UNWIND_HINT_REGS
+	.endif
+
+	/*
+	 * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
+	 * the irq stack but we're not on it.
+	 */
+
+	decl	PER_CPU_VAR(irq_count)
+.endm
+
 /*
  * Interrupt entry/exit.
  *
@@ -485,17 +585,7 @@ END(irq_entries_start)
 	CALL_enter_from_user_mode
 
 1:
-	/*
-	 * Save previous stack pointer, optionally switch to interrupt stack.
-	 * irq_count is used to check if a CPU is already on an interrupt stack
-	 * or not. While this is essentially redundant with preempt_count it is
-	 * a little cheaper to use a separate counter in the PDA (short of
-	 * moving irq_enter into assembly, which would be too much work)
-	 */
-	movq	%rsp, %rdi
-	incl	PER_CPU_VAR(irq_count)
-	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
-	pushq	%rdi
+	ENTER_IRQ_STACK old_rsp=%rdi
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
 
@@ -515,10 +605,8 @@ common_interrupt:
 ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
-	decl	PER_CPU_VAR(irq_count)
 
-	/* Restore saved previous stack */
-	popq	%rsp
+	LEAVE_IRQ_STACK
 
 	testb	$3, CS(%rsp)
 	jz	retint_kernel
@@ -561,6 +649,7 @@ restore_c_regs_and_iret:
 	INTERRUPT_RETURN
 
 ENTRY(native_iret)
+	UNWIND_HINT_IRET_REGS
 	/*
 	 * Are we returning to a stack segment from the LDT?  Note: in
 	 * 64-bit mode SS:RSP on the exception stack is always valid.
@@ -633,6 +722,7 @@ native_irq_return_ldt:
 	orq	PER_CPU_VAR(espfix_stack), %rax
 	SWAPGS
 	movq	%rax, %rsp
+	UNWIND_HINT_IRET_REGS offset=8
 
 	/*
 	 * At this point, we cannot write to the stack any more, but we can
@@ -654,6 +744,7 @@ END(common_interrupt)
  */
 .macro apicinterrupt3 num sym do_sym
 ENTRY(\sym)
+	UNWIND_HINT_IRET_REGS
 	ASM_CLAC
 	pushq	$~(\num)
 .Lcommon_\sym:
@@ -740,6 +831,8 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
 
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
+	UNWIND_HINT_IRET_REGS offset=8
+
 	/* Sanity check */
 	.if \shift_ist != -1 && \paranoid == 0
 	.error "using shift_ist requires paranoid=1"
@@ -763,6 +856,7 @@ ENTRY(\sym)
 	.else
 	call	error_entry
 	.endif
+	UNWIND_HINT_REGS
 	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 
 	.if \paranoid
@@ -860,6 +954,7 @@ idtentry simd_coprocessor_error		do_simd_coprocessor_error	has_error_code=0
 	 * edi:  new selector
 	 */
 ENTRY(native_load_gs_index)
+	FRAME_BEGIN
 	pushfq
 	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
 	SWAPGS
@@ -868,8 +963,9 @@ ENTRY(native_load_gs_index)
 2:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
 	SWAPGS
 	popfq
+	FRAME_END
 	ret
-END(native_load_gs_index)
+ENDPROC(native_load_gs_index)
 EXPORT_SYMBOL(native_load_gs_index)
 
 	_ASM_EXTABLE(.Lgs_change, bad_gs)
@@ -892,14 +988,12 @@ bad_gs:
 ENTRY(do_softirq_own_stack)
 	pushq	%rbp
 	mov	%rsp, %rbp
-	incl	PER_CPU_VAR(irq_count)
-	cmove	PER_CPU_VAR(irq_stack_ptr), %rsp
-	push	%rbp				/* frame pointer backlink */
+	ENTER_IRQ_STACK regs=0 old_rsp=%r11
 	call	__do_softirq
+	LEAVE_IRQ_STACK regs=0
 	leaveq
-	decl	PER_CPU_VAR(irq_count)
 	ret
-END(do_softirq_own_stack)
+ENDPROC(do_softirq_own_stack)
 
 #ifdef CONFIG_XEN
 idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
@@ -923,14 +1017,14 @@ ENTRY(xen_do_hypervisor_callback)		/* do_hypervisor_callback(struct *pt_regs) */
  * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
  * see the correct pointer to the pt_regs
  */
+	UNWIND_HINT_FUNC
 	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
-11:	incl	PER_CPU_VAR(irq_count)
-	movq	%rsp, %rbp
-	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
-	pushq	%rbp				/* frame pointer backlink */
+	UNWIND_HINT_REGS
+
+	ENTER_IRQ_STACK old_rsp=%r10
 	call	xen_evtchn_do_upcall
-	popq	%rsp
-	decl	PER_CPU_VAR(irq_count)
+	LEAVE_IRQ_STACK
+
 #ifndef CONFIG_PREEMPT
 	call	xen_maybe_preempt_hcall
 #endif
@@ -951,6 +1045,7 @@ END(xen_do_hypervisor_callback)
  * with its current contents: any discrepancy means we in category 1.
  */
 ENTRY(xen_failsafe_callback)
+	UNWIND_HINT_EMPTY
 	movl	%ds, %ecx
 	cmpw	%cx, 0x10(%rsp)
 	jne	1f
@@ -970,11 +1065,13 @@ ENTRY(xen_failsafe_callback)
 	pushq	$0				/* RIP */
 	pushq	%r11
 	pushq	%rcx
+	UNWIND_HINT_IRET_REGS offset=8
 	jmp	general_protection
 1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
 	movq	(%rsp), %rcx
 	movq	8(%rsp), %r11
 	addq	$0x30, %rsp
+	UNWIND_HINT_IRET_REGS
 	pushq	$-1 /* orig_ax = -1 => not a system call */
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
@@ -1020,6 +1117,7 @@ idtentry machine_check					has_error_code=0	paranoid=1 do_sym=*machine_check_vec
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
+	UNWIND_HINT_FUNC
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
@@ -1047,6 +1145,7 @@ END(paranoid_entry)
  * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
  */
 ENTRY(paranoid_exit)
+	UNWIND_HINT_REGS
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF_DEBUG
 	testl	%ebx, %ebx			/* swapgs needed? */
@@ -1068,6 +1167,7 @@ END(paranoid_exit)
  * Return: EBX=0: came from user mode; EBX=1: otherwise
  */
 ENTRY(error_entry)
+	UNWIND_HINT_FUNC
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
@@ -1152,6 +1252,7 @@ END(error_entry)
  *   0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
  */
 ENTRY(error_exit)
+	UNWIND_HINT_REGS
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	testl	%ebx, %ebx
@@ -1161,6 +1262,7 @@ END(error_exit)
 
 /* Runs on exception stack */
 ENTRY(nmi)
+	UNWIND_HINT_IRET_REGS
 	/*
 	 * Fix up the exception frame if we're on Xen.
 	 * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
@@ -1234,11 +1336,13 @@ ENTRY(nmi)
 	cld
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+	UNWIND_HINT_IRET_REGS base=%rdx offset=8
 	pushq	5*8(%rdx)	/* pt_regs->ss */
 	pushq	4*8(%rdx)	/* pt_regs->rsp */
 	pushq	3*8(%rdx)	/* pt_regs->flags */
 	pushq	2*8(%rdx)	/* pt_regs->cs */
 	pushq	1*8(%rdx)	/* pt_regs->rip */
+	UNWIND_HINT_IRET_REGS
 	pushq   $-1		/* pt_regs->orig_ax */
 	pushq   %rdi		/* pt_regs->di */
 	pushq   %rsi		/* pt_regs->si */
@@ -1255,6 +1359,7 @@ ENTRY(nmi)
 	pushq	%r13		/* pt_regs->r13 */
 	pushq	%r14		/* pt_regs->r14 */
 	pushq	%r15		/* pt_regs->r15 */
+	UNWIND_HINT_REGS
 	ENCODE_FRAME_POINTER
 
 	/*
@@ -1409,6 +1514,7 @@ first_nmi:
 	.rept 5
 	pushq	11*8(%rsp)
 	.endr
+	UNWIND_HINT_IRET_REGS
 
 	/* Everything up to here is safe from nested NMIs */
 
@@ -1424,6 +1530,7 @@ first_nmi:
 	pushq	$__KERNEL_CS	/* CS */
 	pushq	$1f		/* RIP */
 	INTERRUPT_RETURN	/* continues at repeat_nmi below */
+	UNWIND_HINT_IRET_REGS
 1:
 #endif
 
@@ -1473,6 +1580,7 @@ end_repeat_nmi:
 	 * exceptions might do.
 	 */
 	call	paranoid_entry
+	UNWIND_HINT_REGS
 
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
 	movq	%rsp, %rdi
@@ -1510,17 +1618,19 @@ nmi_restore:
 END(nmi)
 
 ENTRY(ignore_sysret)
+	UNWIND_HINT_EMPTY
 	mov	$-ENOSYS, %eax
 	sysret
 END(ignore_sysret)
 
 ENTRY(rewind_stack_do_exit)
+	UNWIND_HINT_FUNC
 	/* Prevent any naive code from trying to unwind to our caller. */
 	xorl	%ebp, %ebp
 
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rax
-	leaq	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp
+	leaq	-PTREGS_SIZE(%rax), %rsp
+	UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE
 
 	call	do_exit
-1:	jmp 1b
 END(rewind_stack_do_exit)
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 7a9df3beb89b..676ee5807d86 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -74,6 +74,9 @@
 # define _ASM_EXTABLE_EX(from, to)				\
 	_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
 
+# define _ASM_EXTABLE_REFCOUNT(from, to)			\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
+
 # define _ASM_NOKPROBE(entry)					\
 	.pushsection "_kprobe_blacklist","aw" ;			\
 	_ASM_ALIGN ;						\
@@ -123,6 +126,9 @@
 # define _ASM_EXTABLE_EX(from, to)				\
 	_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
 
+# define _ASM_EXTABLE_REFCOUNT(from, to)			\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
+
 /* For C file, we already have NOKPROBE_SYMBOL macro */
 #endif
 
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 33380b871463..0874ebda3069 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new)
 	return xchg(&v->counter, new);
 }
 
-#define ATOMIC_OP(op)							\
-static inline void atomic_##op(int i, atomic_t *v)			\
-{									\
-	asm volatile(LOCK_PREFIX #op"l %1,%0"				\
-			: "+m" (v->counter)				\
-			: "ir" (i)					\
-			: "memory");					\
+static inline void atomic_and(int i, atomic_t *v)
+{
+	asm volatile(LOCK_PREFIX "andl %1,%0"
+			: "+m" (v->counter)
+			: "ir" (i)
+			: "memory");
+}
+
+static inline int atomic_fetch_and(int i, atomic_t *v)
+{
+	int val = atomic_read(v);
+
+	do { } while (!atomic_try_cmpxchg(v, &val, val & i));
+
+	return val;
 }
 
-#define ATOMIC_FETCH_OP(op, c_op)					\
-static inline int atomic_fetch_##op(int i, atomic_t *v)			\
-{									\
-	int val = atomic_read(v);					\
-	do {								\
-	} while (!atomic_try_cmpxchg(v, &val, val c_op i));		\
-	return val;							\
+static inline void atomic_or(int i, atomic_t *v)
+{
+	asm volatile(LOCK_PREFIX "orl %1,%0"
+			: "+m" (v->counter)
+			: "ir" (i)
+			: "memory");
 }
 
-#define ATOMIC_OPS(op, c_op)						\
-	ATOMIC_OP(op)							\
-	ATOMIC_FETCH_OP(op, c_op)
+static inline int atomic_fetch_or(int i, atomic_t *v)
+{
+	int val = atomic_read(v);
 
-ATOMIC_OPS(and, &)
-ATOMIC_OPS(or , |)
-ATOMIC_OPS(xor, ^)
+	do { } while (!atomic_try_cmpxchg(v, &val, val | i));
 
-#undef ATOMIC_OPS
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP
+	return val;
+}
+
+static inline void atomic_xor(int i, atomic_t *v)
+{
+	asm volatile(LOCK_PREFIX "xorl %1,%0"
+			: "+m" (v->counter)
+			: "ir" (i)
+			: "memory");
+}
+
+static inline int atomic_fetch_xor(int i, atomic_t *v)
+{
+	int val = atomic_read(v);
+
+	do { } while (!atomic_try_cmpxchg(v, &val, val ^ i));
+
+	return val;
+}
 
 /**
  * __atomic_add_unless - add unless the number is already a given value
@@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^)
 static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int c = atomic_read(v);
+
 	do {
 		if (unlikely(c == u))
 			break;
 	} while (!atomic_try_cmpxchg(v, &c, c + a));
+
 	return c;
 }
 
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 71d7705fb303..9e206f31ce2a 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
 #undef alternative_atomic64
 #undef __alternative_atomic64
 
-#define ATOMIC64_OP(op, c_op)						\
-static inline void atomic64_##op(long long i, atomic64_t *v)		\
-{									\
-	long long old, c = 0;						\
-	while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c)		\
-		c = old;						\
+static inline void atomic64_and(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
+		c = old;
 }
 
-#define ATOMIC64_FETCH_OP(op, c_op)					\
-static inline long long atomic64_fetch_##op(long long i, atomic64_t *v)	\
-{									\
-	long long old, c = 0;						\
-	while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c)		\
-		c = old;						\
-	return old;							\
+static inline long long atomic64_fetch_and(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c & i)) != c)
+		c = old;
+
+	return old;
 }
 
-ATOMIC64_FETCH_OP(add, +)
+static inline void atomic64_or(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
 
-#define atomic64_fetch_sub(i, v)	atomic64_fetch_add(-(i), (v))
+	while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
+		c = old;
+}
+
+static inline long long atomic64_fetch_or(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c | i)) != c)
+		c = old;
+
+	return old;
+}
 
-#define ATOMIC64_OPS(op, c_op)						\
-	ATOMIC64_OP(op, c_op)						\
-	ATOMIC64_FETCH_OP(op, c_op)
+static inline void atomic64_xor(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
+		c = old;
+}
 
-ATOMIC64_OPS(and, &)
-ATOMIC64_OPS(or, |)
-ATOMIC64_OPS(xor, ^)
+static inline long long atomic64_fetch_xor(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c)
+		c = old;
+
+	return old;
+}
 
-#undef ATOMIC64_OPS
-#undef ATOMIC64_FETCH_OP
-#undef ATOMIC64_OP
+static inline long long atomic64_fetch_add(long long i, atomic64_t *v)
+{
+	long long old, c = 0;
+
+	while ((old = atomic64_cmpxchg(v, c, c + i)) != c)
+		c = old;
+
+	return old;
+}
+
+#define atomic64_fetch_sub(i, v)	atomic64_fetch_add(-(i), (v))
 
 #endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 6189a433c9a9..5d9de36a2f04 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
 }
 
 #define atomic64_try_cmpxchg atomic64_try_cmpxchg
-static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new)
+static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new)
 {
 	return try_cmpxchg(&v->counter, old, new);
 }
@@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
  */
 static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
 {
-	long c = atomic64_read(v);
+	s64 c = atomic64_read(v);
 	do {
 		if (unlikely(c == u))
 			return false;
@@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
  */
 static inline long atomic64_dec_if_positive(atomic64_t *v)
 {
-	long dec, c = atomic64_read(v);
+	s64 dec, c = atomic64_read(v);
 	do {
 		dec = c - 1;
 		if (unlikely(dec < 0))
@@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
 	return dec;
 }
 
-#define ATOMIC64_OP(op)							\
-static inline void atomic64_##op(long i, atomic64_t *v)			\
-{									\
-	asm volatile(LOCK_PREFIX #op"q %1,%0"				\
-			: "+m" (v->counter)				\
-			: "er" (i)					\
-			: "memory");					\
+static inline void atomic64_and(long i, atomic64_t *v)
+{
+	asm volatile(LOCK_PREFIX "andq %1,%0"
+			: "+m" (v->counter)
+			: "er" (i)
+			: "memory");
 }
 
-#define ATOMIC64_FETCH_OP(op, c_op)					\
-static inline long atomic64_fetch_##op(long i, atomic64_t *v)		\
-{									\
-	long val = atomic64_read(v);					\
-	do {								\
-	} while (!atomic64_try_cmpxchg(v, &val, val c_op i));		\
-	return val;							\
+static inline long atomic64_fetch_and(long i, atomic64_t *v)
+{
+	s64 val = atomic64_read(v);
+
+	do {
+	} while (!atomic64_try_cmpxchg(v, &val, val & i));
+	return val;
 }
 
-#define ATOMIC64_OPS(op, c_op)						\
-	ATOMIC64_OP(op)							\
-	ATOMIC64_FETCH_OP(op, c_op)
+static inline void atomic64_or(long i, atomic64_t *v)
+{
+	asm volatile(LOCK_PREFIX "orq %1,%0"
+			: "+m" (v->counter)
+			: "er" (i)
+			: "memory");
+}
 
-ATOMIC64_OPS(and, &)
-ATOMIC64_OPS(or, |)
-ATOMIC64_OPS(xor, ^)
+static inline long atomic64_fetch_or(long i, atomic64_t *v)
+{
+	s64 val = atomic64_read(v);
 
-#undef ATOMIC64_OPS
-#undef ATOMIC64_FETCH_OP
-#undef ATOMIC64_OP
+	do {
+	} while (!atomic64_try_cmpxchg(v, &val, val | i));
+	return val;
+}
+
+static inline void atomic64_xor(long i, atomic64_t *v)
+{
+	asm volatile(LOCK_PREFIX "xorq %1,%0"
+			: "+m" (v->counter)
+			: "er" (i)
+			: "memory");
+}
+
+static inline long atomic64_fetch_xor(long i, atomic64_t *v)
+{
+	s64 val = atomic64_read(v);
+
+	do {
+	} while (!atomic64_try_cmpxchg(v, &val, val ^ i));
+	return val;
+}
 
 #endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d90296d061e8..b5069e802d5c 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -157,7 +157,7 @@ extern void __add_wrong_size(void)
 #define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock)		\
 ({									\
 	bool success;							\
-	__typeof__(_ptr) _old = (_pold);				\
+	__typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);		\
 	__typeof__(*(_ptr)) __old = *_old;				\
 	__typeof__(*(_ptr)) __new = (_new);				\
 	switch (size) {							\
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index b4c1f5453436..f4dc9b63bdda 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -41,20 +41,11 @@
 		       "+m" (*uaddr), "=&r" (tem)		\
 		     : "r" (oparg), "i" (-EFAULT), "1" (0))
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret, tem;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
-
 	pagefault_disable();
 
 	switch (op) {
@@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ:
-			ret = (oldval == cmparg);
-			break;
-		case FUTEX_OP_CMP_NE:
-			ret = (oldval != cmparg);
-			break;
-		case FUTEX_OP_CMP_LT:
-			ret = (oldval < cmparg);
-			break;
-		case FUTEX_OP_CMP_GE:
-			ret = (oldval >= cmparg);
-			break;
-		case FUTEX_OP_CMP_LE:
-			ret = (oldval <= cmparg);
-			break;
-		case FUTEX_OP_CMP_GT:
-			ret = (oldval > cmparg);
-			break;
-		default:
-			ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 48febf07e828..1310e1f1cd65 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -69,6 +69,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", )
 build_mmio_write(__writew, "w", unsigned short, "r", )
 build_mmio_write(__writel, "l", unsigned int, "r", )
 
+#define readb readb
+#define readw readw
+#define readl readl
 #define readb_relaxed(a) __readb(a)
 #define readw_relaxed(a) __readw(a)
 #define readl_relaxed(a) __readl(a)
@@ -76,6 +79,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
 #define __raw_readw __readw
 #define __raw_readl __readl
 
+#define writeb writeb
+#define writew writew
+#define writel writel
 #define writeb_relaxed(v, a) __writeb(v, a)
 #define writew_relaxed(v, a) __writew(v, a)
 #define writel_relaxed(v, a) __writel(v, a)
@@ -88,13 +94,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
 #ifdef CONFIG_X86_64
 
 build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
+build_mmio_read(__readq, "q", unsigned long, "=r", )
 build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
+build_mmio_write(__writeq, "q", unsigned long, "r", )
 
-#define readq_relaxed(a)	readq(a)
-#define writeq_relaxed(v, a)	writeq(v, a)
+#define readq_relaxed(a)	__readq(a)
+#define writeq_relaxed(v, a)	__writeq(v, a)
 
-#define __raw_readq(a)		readq(a)
-#define __raw_writeq(val, addr)	writeq(val, addr)
+#define __raw_readq		__readq
+#define __raw_writeq		__writeq
 
 /* Let people know that we have them */
 #define readq			readq
@@ -119,6 +127,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address)
 {
 	return __pa(address);
 }
+#define virt_to_phys virt_to_phys
 
 /**
  *	phys_to_virt	-	map physical address to virtual
@@ -137,6 +146,7 @@ static inline void *phys_to_virt(phys_addr_t address)
 {
 	return __va(address);
 }
+#define phys_to_virt phys_to_virt
 
 /*
  * Change "struct page" to physical address.
@@ -169,11 +179,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
  * else, you probably want one of the following.
  */
 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+#define ioremap_nocache ioremap_nocache
 extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
 #define ioremap_uc ioremap_uc
 
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+#define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
+#define ioremap_prot ioremap_prot
 
 /**
  * ioremap     -   map bus memory into CPU space
@@ -193,8 +206,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
 {
 	return ioremap_nocache(offset, size);
 }
+#define ioremap ioremap
 
 extern void iounmap(volatile void __iomem *addr);
+#define iounmap iounmap
 
 extern void set_iounmap_nonlazy(void);
 
@@ -203,53 +218,6 @@ extern void set_iounmap_nonlazy(void);
 #include <asm-generic/iomap.h>
 
 /*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)	p
-
-/**
- * memset_io	Set a range of I/O memory to a constant value
- * @addr:	The beginning of the I/O-memory range to set
- * @val:	The value to set the memory to
- * @count:	The number of bytes to set
- *
- * Set a range of I/O memory to a given value.
- */
-static inline void
-memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
-{
-	memset((void __force *)addr, val, count);
-}
-
-/**
- * memcpy_fromio	Copy a block of data from I/O memory
- * @dst:		The (RAM) destination for the copy
- * @src:		The (I/O memory) source for the data
- * @count:		The number of bytes to copy
- *
- * Copy a block of data from I/O memory.
- */
-static inline void
-memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
-{
-	memcpy(dst, (const void __force *)src, count);
-}
-
-/**
- * memcpy_toio		Copy a block of data into I/O memory
- * @dst:		The (I/O memory) destination for the copy
- * @src:		The (RAM) source for the data
- * @count:		The number of bytes to copy
- *
- * Copy a block of data to I/O memory.
- */
-static inline void
-memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
-{
-	memcpy((void __force *)dst, src, count);
-}
-
-/*
  * ISA space is 'always mapped' on a typical x86 system, no need to
  * explicitly ioremap() it. The fact that the ISA IO space is mapped
  * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
@@ -341,13 +309,38 @@ BUILDIO(b, b, char)
 BUILDIO(w, w, short)
 BUILDIO(l, , int)
 
+#define inb inb
+#define inw inw
+#define inl inl
+#define inb_p inb_p
+#define inw_p inw_p
+#define inl_p inl_p
+#define insb insb
+#define insw insw
+#define insl insl
+
+#define outb outb
+#define outw outw
+#define outl outl
+#define outb_p outb_p
+#define outw_p outw_p
+#define outl_p outl_p
+#define outsb outsb
+#define outsw outsw
+#define outsl outsl
+
 extern void *xlate_dev_mem_ptr(phys_addr_t phys);
 extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
 
+#define xlate_dev_mem_ptr xlate_dev_mem_ptr
+#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
+
 extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 				enum page_cache_mode pcm);
 extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+#define ioremap_wc ioremap_wc
 extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
+#define ioremap_wt ioremap_wt
 
 extern bool is_early_ioremap_ptep(pte_t *ptep);
 
@@ -365,6 +358,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
 
 #define IO_SPACE_LIMIT 0xffff
 
+#include <asm-generic/io.h>
+#undef PCI_IOBASE
+
 #ifdef CONFIG_MTRR
 extern int __must_check arch_phys_wc_index(int handle);
 #define arch_phys_wc_index arch_phys_wc_index
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
new file mode 100644
index 000000000000..7dc777a6cb40
--- /dev/null
+++ b/arch/x86/include/asm/orc_types.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ORC_TYPES_H
+#define _ORC_TYPES_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+/*
+ * The ORC_REG_* registers are base registers which are used to find other
+ * registers on the stack.
+ *
+ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
+ * address of the previous frame: the caller's SP before it called the current
+ * function.
+ *
+ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in
+ * the current frame.
+ *
+ * The most commonly used base registers are SP and BP -- which the previous SP
+ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is
+ * usually based on.
+ *
+ * The rest of the base registers are needed for special cases like entry code
+ * and GCC realigned stacks.
+ */
+#define ORC_REG_UNDEFINED		0
+#define ORC_REG_PREV_SP			1
+#define ORC_REG_DX			2
+#define ORC_REG_DI			3
+#define ORC_REG_BP			4
+#define ORC_REG_SP			5
+#define ORC_REG_R10			6
+#define ORC_REG_R13			7
+#define ORC_REG_BP_INDIRECT		8
+#define ORC_REG_SP_INDIRECT		9
+#define ORC_REG_MAX			15
+
+/*
+ * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the
+ * caller's SP right before it made the call).  Used for all callable
+ * functions, i.e. all C code and all callable asm functions.
+ *
+ * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points
+ * to a fully populated pt_regs from a syscall, interrupt, or exception.
+ *
+ * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset
+ * points to the iret return frame.
+ *
+ * The UNWIND_HINT macros are used only for the unwind_hint struct.  They
+ * aren't used in struct orc_entry due to size and complexity constraints.
+ * Objtool converts them to real types when it converts the hints to orc
+ * entries.
+ */
+#define ORC_TYPE_CALL			0
+#define ORC_TYPE_REGS			1
+#define ORC_TYPE_REGS_IRET		2
+#define UNWIND_HINT_TYPE_SAVE		3
+#define UNWIND_HINT_TYPE_RESTORE	4
+
+#ifndef __ASSEMBLY__
+/*
+ * This struct is more or less a vastly simplified version of the DWARF Call
+ * Frame Information standard.  It contains only the necessary parts of DWARF
+ * CFI, simplified for ease of access by the in-kernel unwinder.  It tells the
+ * unwinder how to find the previous SP and BP (and sometimes entry regs) on
+ * the stack for a given code address.  Each instance of the struct corresponds
+ * to one or more code locations.
+ */
+struct orc_entry {
+	s16		sp_offset;
+	s16		bp_offset;
+	unsigned	sp_reg:4;
+	unsigned	bp_reg:4;
+	unsigned	type:2;
+};
+
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack for the ORC unwinder.
+ *
+ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*.
+ */
+struct unwind_hint {
+	u32		ip;
+	s16		sp_offset;
+	u8		sp_reg;
+	u8		type;
+};
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ORC_TYPES_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 028245e1c42b..0b03d655db7c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@ struct vm86;
 #include <asm/nops.h>
 #include <asm/special_insns.h>
 #include <asm/fpu/types.h>
+#include <asm/unwind_hints.h>
 
 #include <linux/personality.h>
 #include <linux/cache.h>
@@ -684,6 +685,7 @@ static inline void sync_core(void)
 	unsigned int tmp;
 
 	asm volatile (
+		UNWIND_HINT_SAVE
 		"mov %%ss, %0\n\t"
 		"pushq %q0\n\t"
 		"pushq %%rsp\n\t"
@@ -693,6 +695,7 @@ static inline void sync_core(void)
 		"pushq %q0\n\t"
 		"pushq $1f\n\t"
 		"iretq\n\t"
+		UNWIND_HINT_RESTORE
 		"1:"
 		: "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
 #endif
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000000..ff871210b9f2
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,109 @@
+#ifndef __ASM_X86_REFCOUNT_H
+#define __ASM_X86_REFCOUNT_H
+/*
+ * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from
+ * PaX/grsecurity.
+ */
+#include <linux/refcount.h>
+
+/*
+ * This is the first portion of the refcount error handling, which lives in
+ * .text.unlikely, and is jumped to from the CPU flag check (in the
+ * following macros). This saves the refcount value location into CX for
+ * the exception handler to use (in mm/extable.c), and then triggers the
+ * central refcount exception. The fixup address for the exception points
+ * back to the regular execution flow in .text.
+ */
+#define _REFCOUNT_EXCEPTION				\
+	".pushsection .text.unlikely\n"			\
+	"111:\tlea %[counter], %%" _ASM_CX "\n"		\
+	"112:\t" ASM_UD0 "\n"				\
+	ASM_UNREACHABLE					\
+	".popsection\n"					\
+	"113:\n"					\
+	_ASM_EXTABLE_REFCOUNT(112b, 113b)
+
+/* Trigger refcount exception if refcount result is negative. */
+#define REFCOUNT_CHECK_LT_ZERO				\
+	"js 111f\n\t"					\
+	_REFCOUNT_EXCEPTION
+
+/* Trigger refcount exception if refcount result is zero or negative. */
+#define REFCOUNT_CHECK_LE_ZERO				\
+	"jz 111f\n\t"					\
+	REFCOUNT_CHECK_LT_ZERO
+
+/* Trigger refcount exception unconditionally. */
+#define REFCOUNT_ERROR					\
+	"jmp 111f\n\t"					\
+	_REFCOUNT_EXCEPTION
+
+static __always_inline void refcount_add(unsigned int i, refcount_t *r)
+{
+	asm volatile(LOCK_PREFIX "addl %1,%0\n\t"
+		REFCOUNT_CHECK_LT_ZERO
+		: [counter] "+m" (r->refs.counter)
+		: "ir" (i)
+		: "cc", "cx");
+}
+
+static __always_inline void refcount_inc(refcount_t *r)
+{
+	asm volatile(LOCK_PREFIX "incl %0\n\t"
+		REFCOUNT_CHECK_LT_ZERO
+		: [counter] "+m" (r->refs.counter)
+		: : "cc", "cx");
+}
+
+static __always_inline void refcount_dec(refcount_t *r)
+{
+	asm volatile(LOCK_PREFIX "decl %0\n\t"
+		REFCOUNT_CHECK_LE_ZERO
+		: [counter] "+m" (r->refs.counter)
+		: : "cc", "cx");
+}
+
+static __always_inline __must_check
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+	GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO,
+				  r->refs.counter, "er", i, "%0", e);
+}
+
+static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
+{
+	GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO,
+				 r->refs.counter, "%0", e);
+}
+
+static __always_inline __must_check
+bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+{
+	int c, result;
+
+	c = atomic_read(&(r->refs));
+	do {
+		if (unlikely(c == 0))
+			return false;
+
+		result = c + i;
+
+		/* Did we try to increment from/to an undesirable state? */
+		if (unlikely(c < 0 || c == INT_MAX || result < c)) {
+			asm volatile(REFCOUNT_ERROR
+				     : : [counter] "m" (r->refs.counter)
+				     : "cc", "cx");
+			break;
+		}
+
+	} while (!atomic_try_cmpxchg(&(r->refs), &c, result));
+
+	return c != 0;
+}
+
+static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r)
+{
+	return refcount_add_not_zero(1, r);
+}
+
+#endif
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 661dd305694a..045f99211a99 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -1,45 +1,56 @@
 #ifndef _ASM_X86_RMWcc
 #define _ASM_X86_RMWcc
 
+#define __CLOBBERS_MEM		"memory"
+#define __CLOBBERS_MEM_CC_CX	"memory", "cc", "cx"
+
 #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
 
 /* Use asm goto */
 
-#define __GEN_RMWcc(fullop, var, cc, ...)				\
+#define __GEN_RMWcc(fullop, var, cc, clobbers, ...)			\
 do {									\
 	asm_volatile_goto (fullop "; j" #cc " %l[cc_label]"		\
-			: : "m" (var), ## __VA_ARGS__ 			\
-			: "memory" : cc_label);				\
+			: : [counter] "m" (var), ## __VA_ARGS__		\
+			: clobbers : cc_label);				\
 	return 0;							\
 cc_label:								\
 	return 1;							\
 } while (0)
 
-#define GEN_UNARY_RMWcc(op, var, arg0, cc) 				\
-	__GEN_RMWcc(op " " arg0, var, cc)
+#define __BINARY_RMWcc_ARG	" %1, "
 
-#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
-	__GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
 
 #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
 
 /* Use flags output or a set instruction */
 
-#define __GEN_RMWcc(fullop, var, cc, ...)				\
+#define __GEN_RMWcc(fullop, var, cc, clobbers, ...)			\
 do {									\
 	bool c;								\
 	asm volatile (fullop ";" CC_SET(cc)				\
-			: "+m" (var), CC_OUT(cc) (c)			\
-			: __VA_ARGS__ : "memory");			\
+			: [counter] "+m" (var), CC_OUT(cc) (c)		\
+			: __VA_ARGS__ : clobbers);			\
 	return c;							\
 } while (0)
 
+#define __BINARY_RMWcc_ARG	" %2, "
+
+#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+
 #define GEN_UNARY_RMWcc(op, var, arg0, cc)				\
-	__GEN_RMWcc(op " " arg0, var, cc)
+	__GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM)
+
+#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc)		\
+	__GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc,			\
+		    __CLOBBERS_MEM_CC_CX)
 
 #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
-	__GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val))
+	__GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc,		\
+		    __CLOBBERS_MEM, vcon (val))
 
-#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
+#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc)	\
+	__GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc,	\
+		    __CLOBBERS_MEM_CC_CX, vcon (val))
 
 #endif /* _ASM_X86_RMWcc */
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
new file mode 100644
index 000000000000..5e02b11c9b86
--- /dev/null
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -0,0 +1,103 @@
+#ifndef _ASM_X86_UNWIND_HINTS_H
+#define _ASM_X86_UNWIND_HINTS_H
+
+#include "orc_types.h"
+
+#ifdef __ASSEMBLY__
+
+/*
+ * In asm, there are two kinds of code: normal C-type callable functions and
+ * the rest.  The normal callable functions can be called by other code, and
+ * don't do anything unusual with the stack.  Such normal callable functions
+ * are annotated with the ENTRY/ENDPROC macros.  Most asm code falls in this
+ * category.  In this case, no special debugging annotations are needed because
+ * objtool can automatically generate the ORC data for the ORC unwinder to read
+ * at runtime.
+ *
+ * Anything which doesn't fall into the above category, such as syscall and
+ * interrupt handlers, tends to not be called directly by other functions, and
+ * often does unusual non-C-function-type things with the stack pointer.  Such
+ * code needs to be annotated such that objtool can understand it.  The
+ * following CFI hint macros are for this type of code.
+ *
+ * These macros provide hints to objtool about the state of the stack at each
+ * instruction.  Objtool starts from the hints and follows the code flow,
+ * making automatic CFI adjustments when it sees pushes and pops, filling out
+ * the debuginfo as necessary.  It will also warn if it sees any
+ * inconsistencies.
+ */
+.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL
+#ifdef CONFIG_STACK_VALIDATION
+.Lunwind_hint_ip_\@:
+	.pushsection .discard.unwind_hints
+		/* struct unwind_hint */
+		.long .Lunwind_hint_ip_\@ - .
+		.short \sp_offset
+		.byte \sp_reg
+		.byte \type
+	.popsection
+#endif
+.endm
+
+.macro UNWIND_HINT_EMPTY
+	UNWIND_HINT sp_reg=ORC_REG_UNDEFINED
+.endm
+
+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0
+	.if \base == %rsp && \indirect
+		.set sp_reg, ORC_REG_SP_INDIRECT
+	.elseif \base == %rsp
+		.set sp_reg, ORC_REG_SP
+	.elseif \base == %rbp
+		.set sp_reg, ORC_REG_BP
+	.elseif \base == %rdi
+		.set sp_reg, ORC_REG_DI
+	.elseif \base == %rdx
+		.set sp_reg, ORC_REG_DX
+	.elseif \base == %r10
+		.set sp_reg, ORC_REG_R10
+	.else
+		.error "UNWIND_HINT_REGS: bad base register"
+	.endif
+
+	.set sp_offset, \offset
+
+	.if \iret
+		.set type, ORC_TYPE_REGS_IRET
+	.elseif \extra == 0
+		.set type, ORC_TYPE_REGS_IRET
+		.set sp_offset, \offset + (16*8)
+	.else
+		.set type, ORC_TYPE_REGS
+	.endif
+
+	UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
+.endm
+
+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
+	UNWIND_HINT_REGS base=\base offset=\offset iret=1
+.endm
+
+.macro UNWIND_HINT_FUNC sp_offset=8
+	UNWIND_HINT sp_offset=\sp_offset
+.endm
+
+#else /* !__ASSEMBLY__ */
+
+#define UNWIND_HINT(sp_reg, sp_offset, type)			\
+	"987: \n\t"						\
+	".pushsection .discard.unwind_hints\n\t"		\
+	/* struct unwind_hint */				\
+	".long 987b - .\n\t"					\
+	".short " __stringify(sp_offset) "\n\t"		\
+	".byte " __stringify(sp_reg) "\n\t"			\
+	".byte " __stringify(type) "\n\t"			\
+	".popsection\n\t"
+
+#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE)
+
+#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_UNWIND_HINTS_H */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index dbce3cca94cb..bd265a4cf108 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		if (stack_name)
 			printk("%s <%s>\n", log_lvl, stack_name);
 
+		if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
+			__show_regs(regs, 0);
+
 		/*
 		 * Scan the stack, printing any text addresses we find.  At the
 		 * same time, follow proper stack frames with the unwinder.
@@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			 * Don't print regs->ip again if it was already printed
 			 * by __show_regs() below.
 			 */
-			if (regs && stack == &regs->ip) {
-				unwind_next_frame(&state);
-				continue;
-			}
+			if (regs && stack == &regs->ip)
+				goto next;
 
 			if (stack == ret_addr_p)
 				reliable = 1;
@@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			if (!reliable)
 				continue;
 
+next:
 			/*
 			 * Get the next frame from the unwinder.  No need to
 			 * check for an error: if anything goes wrong, the rest
@@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
 			/* if the frame has entry regs, print them */
 			regs = unwind_get_entry_regs(&state);
-			if (regs)
+			if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
 				__show_regs(regs, 0);
 		}
 
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index e5f0b40e66d2..4f0481474903 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -37,7 +37,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
 	 * This is a software stack, so 'end' can be a valid stack pointer.
 	 * It just means the stack is empty.
 	 */
-	if (stack < begin || stack > end)
+	if (stack <= begin || stack > end)
 		return false;
 
 	info->type	= STACK_TYPE_IRQ;
@@ -62,7 +62,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
 	 * This is a software stack, so 'end' can be a valid stack pointer.
 	 * It just means the stack is empty.
 	 */
-	if (stack < begin || stack > end)
+	if (stack <= begin || stack > end)
 		return false;
 
 	info->type	= STACK_TYPE_SOFTIRQ;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 3e1471d57487..225af4184f06 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -55,7 +55,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
 		begin = end - (exception_stack_sizes[k] / sizeof(long));
 		regs  = (struct pt_regs *)end - 1;
 
-		if (stack < begin || stack >= end)
+		if (stack <= begin || stack >= end)
 			continue;
 
 		info->type	= STACK_TYPE_EXCEPTION + k;
@@ -78,7 +78,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
 	 * This is a software stack, so 'end' can be a valid stack pointer.
 	 * It just means the stack is empty.
 	 */
-	if (stack < begin || stack > end)
+	if (stack <= begin || stack > end)
 		return false;
 
 	info->type	= STACK_TYPE_IRQ;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c3169be4c596..2987e3991c2b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -279,6 +279,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 	unsigned prev_fsindex, prev_gsindex;
 
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+		     this_cpu_read(irq_count) != -1);
+
 	switch_fpu_prepare(prev_fpu, cpu);
 
 	/* We must save %fs and %gs before load_TLS() because
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 0ea8afcb929c..761fc88cd820 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup,
 }
 EXPORT_SYMBOL_GPL(ex_handler_fault);
 
+/*
+ * Handler for UD0 exception following a failed test against the
+ * result of a refcount inc/dec/add/sub.
+ */
+bool ex_handler_refcount(const struct exception_table_entry *fixup,
+			 struct pt_regs *regs, int trapnr)
+{
+	/* First unconditionally saturate the refcount. */
+	*(int *)regs->cx = INT_MIN / 2;
+
+	/*
+	 * Strictly speaking, this reports the fixup destination, not
+	 * the fault location, and not the actually overflowing
+	 * instruction, which is the instruction before the "js", but
+	 * since that instruction could be a variety of lengths, just
+	 * report the location after the overflow, which should be close
+	 * enough for finding the overflow, as it's at least back in
+	 * the function, having returned from .text.unlikely.
+	 */
+	regs->ip = ex_fixup_addr(fixup);
+
+	/*
+	 * This function has been called because either a negative refcount
+	 * value was seen by any of the refcount functions, or a zero
+	 * refcount value was seen by refcount_dec().
+	 *
+	 * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result
+	 * wrapped around) will be set. Additionally, seeing the refcount
+	 * reach 0 will set ZF (Zero Flag: result was zero). In each of
+	 * these cases we want a report, since it's a boundary condition.
+	 *
+	 */
+	if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
+		bool zero = regs->flags & X86_EFLAGS_ZF;
+
+		refcount_error_report(regs, zero ? "hit zero" : "overflow");
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_refcount);
+
 bool ex_handler_ext(const struct exception_table_entry *fixup,
 		   struct pt_regs *regs, int trapnr)
 {
diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h
index b39531babec0..eaaf1ebcc7a4 100644
--- a/arch/xtensa/include/asm/futex.h
+++ b/arch/xtensa/include/asm/futex.h
@@ -44,18 +44,10 @@
 	: "r" (uaddr), "I" (-EFAULT), "r" (oparg)	\
 	: "memory")
 
-static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+		u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 #if !XCHAL_HAVE_S32C1I
 	return -ENOSYS;
@@ -89,19 +81,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (ret)
-		return ret;
+	if (!ret)
+		*oval = oldval;
 
-	switch (cmp) {
-	case FUTEX_OP_CMP_EQ: return (oldval == cmparg);
-	case FUTEX_OP_CMP_NE: return (oldval != cmparg);
-	case FUTEX_OP_CMP_LT: return (oldval < cmparg);
-	case FUTEX_OP_CMP_GE: return (oldval >= cmparg);
-	case FUTEX_OP_CMP_LE: return (oldval <= cmparg);
-	case FUTEX_OP_CMP_GT: return (oldval > cmparg);
-	}
-
-	return -ENOSYS;
+	return ret;
 }
 
 static inline int
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 87b7df4851bf..07125e7941f4 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -60,7 +60,7 @@ static void trigger_softirq(void *data)
 static int raise_blk_irq(int cpu, struct request *rq)
 {
 	if (cpu_online(cpu)) {
-		struct call_single_data *data = &rq->csd;
+		call_single_data_t *data = &rq->csd;
 
 		data->func = trigger_softirq;
 		data->info = rq;
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 19182d091587..1893e416e7c0 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2884,7 +2884,7 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
 	 * need to be interruptible while waiting.
 	 */
 	INIT_WORK_ONSTACK(&flush.work, flush_probe);
-	COMPLETION_INITIALIZER_ONSTACK(flush.cmp);
+	init_completion(&flush.cmp);
 	queue_work(nfit_wq, &flush.work);
 	mutex_unlock(&acpi_desc->init_mutex);
 
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 85c24cace973..81142ce781da 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -13,7 +13,7 @@
 struct nullb_cmd {
 	struct list_head list;
 	struct llist_node ll_list;
-	struct call_single_data csd;
+	call_single_data_t csd;
 	struct request *rq;
 	struct bio *bio;
 	unsigned int tag;
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 72bbfccef113..fd4b7f684bd0 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -455,7 +455,11 @@ void arch_timer_enable_workaround(const struct arch_timer_erratum_workaround *wa
 			per_cpu(timer_unstable_counter_workaround, i) = wa;
 	}
 
-	static_branch_enable(&arch_timer_read_ool_enabled);
+	/*
+	 * Use the locked version, as we're called from the CPU
+	 * hotplug framework. Otherwise, we end-up in deadlock-land.
+	 */
+	static_branch_enable_cpuslocked(&arch_timer_read_ool_enabled);
 
 	/*
 	 * Don't use the vdso fastpath if errata require using the
diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 71e586d7df71..147f38ea0fcd 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -119,13 +119,13 @@ struct cpuidle_coupled {
 
 #define CPUIDLE_COUPLED_NOT_IDLE	(-1)
 
-static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb);
+static DEFINE_PER_CPU(call_single_data_t, cpuidle_coupled_poke_cb);
 
 /*
  * The cpuidle_coupled_poke_pending mask is used to avoid calling
- * __smp_call_function_single with the per cpu call_single_data struct already
+ * __smp_call_function_single with the per cpu call_single_data_t struct already
  * in use.  This prevents a deadlock where two cpus are waiting for each others
- * call_single_data struct to be available
+ * call_single_data_t struct to be available
  */
 static cpumask_t cpuidle_coupled_poke_pending;
 
@@ -339,7 +339,7 @@ static void cpuidle_coupled_handle_poke(void *info)
  */
 static void cpuidle_coupled_poke(int cpu)
 {
-	struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
+	call_single_data_t *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu);
 
 	if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending))
 		smp_call_function_single_async(cpu, csd);
@@ -651,7 +651,7 @@ int cpuidle_coupled_register_device(struct cpuidle_device *dev)
 {
 	int cpu;
 	struct cpuidle_device *other_dev;
-	struct call_single_data *csd;
+	call_single_data_t *csd;
 	struct cpuidle_coupled *coupled;
 
 	if (cpumask_empty(&dev->coupled_cpus))
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 48572b157222..a36216bd2a84 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -28,6 +28,7 @@
 
 #include <linux/debugfs.h>
 #include <linux/sort.h>
+#include <linux/sched/mm.h>
 #include "intel_drv.h"
 
 static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
@@ -4305,7 +4306,7 @@ i915_drop_caches_set(void *data, u64 val)
 		mutex_unlock(&dev->struct_mutex);
 	}
 
-	lockdep_set_current_reclaim_state(GFP_KERNEL);
+	fs_reclaim_acquire(GFP_KERNEL);
 	if (val & DROP_BOUND)
 		i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND);
 
@@ -4314,7 +4315,7 @@ i915_drop_caches_set(void *data, u64 val)
 
 	if (val & DROP_SHRINK_ALL)
 		i915_gem_shrink_all(dev_priv);
-	lockdep_clear_current_reclaim_state();
+	fs_reclaim_release(GFP_KERNEL);
 
 	if (val & DROP_FREED) {
 		synchronize_rcu();
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 51583ae4b1eb..120b6e537b28 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2468,7 +2468,7 @@ static void liquidio_napi_drv_callback(void *arg)
 	if (OCTEON_CN23XX_PF(oct) || droq->cpu_id == this_cpu) {
 		napi_schedule_irqoff(&droq->napi);
 	} else {
-		struct call_single_data *csd = &droq->csd;
+		call_single_data_t *csd = &droq->csd;
 
 		csd->func = napi_schedule_wrapper;
 		csd->info = &droq->napi;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
index 6efd139b894d..f91bc84d1719 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h
@@ -328,7 +328,7 @@ struct octeon_droq {
 
 	u32 cpu_id;
 
-	struct call_single_data csd;
+	call_single_data_t csd;
 };
 
 #define OCT_DROQ_SIZE   (sizeof(struct octeon_droq))
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 3d424a51cabb..f0fd3adb1693 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -446,14 +446,14 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
 
 			ovl_path_upper(dentry, &upperpath);
 			realfile = ovl_path_open(&upperpath, O_RDONLY);
-			smp_mb__before_spinlock();
+
 			inode_lock(inode);
 			if (!od->upperfile) {
 				if (IS_ERR(realfile)) {
 					inode_unlock(inode);
 					return PTR_ERR(realfile);
 				}
-				od->upperfile = realfile;
+				smp_store_release(&od->upperfile, realfile);
 			} else {
 				/* somebody has beaten us to it */
 				if (!IS_ERR(realfile))
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b0d5897bc4e6..886085b47c75 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
 		goto out;
 	WRITE_ONCE(uwq->waken, true);
 	/*
-	 * The implicit smp_mb__before_spinlock in try_to_wake_up()
-	 * renders uwq->waken visible to other CPUs before the task is
-	 * waken.
+	 * The Program-Order guarantees provided by the scheduler
+	 * ensure uwq->waken is visible before the task is woken.
 	 */
 	ret = wake_up_state(wq->private, mode);
-	if (ret)
+	if (ret) {
 		/*
 		 * Wake only once, autoremove behavior.
 		 *
-		 * After the effect of list_del_init is visible to the
-		 * other CPUs, the waitqueue may disappear from under
-		 * us, see the !list_empty_careful() in
-		 * handle_userfault(). try_to_wake_up() has an
-		 * implicit smp_mb__before_spinlock, and the
-		 * wq->private is read before calling the extern
-		 * function "wake_up_state" (which in turns calls
-		 * try_to_wake_up). While the spin_lock;spin_unlock;
-		 * wouldn't be enough, the smp_mb__before_spinlock is
-		 * enough to avoid an explicit smp_mb() here.
+		 * After the effect of list_del_init is visible to the other
+		 * CPUs, the waitqueue may disappear from under us, see the
+		 * !list_empty_careful() in handle_userfault().
+		 *
+		 * try_to_wake_up() has an implicit smp_mb(), and the
+		 * wq->private is read before calling the extern function
+		 * "wake_up_state" (which in turns calls try_to_wake_up).
 		 */
 		list_del_init(&wq->entry);
+	}
 out:
 	return ret;
 }
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index dad68bf46c77..8d28eb010d0d 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -21,6 +21,8 @@ typedef struct {
 extern long long atomic64_read(const atomic64_t *v);
 extern void	 atomic64_set(atomic64_t *v, long long i);
 
+#define atomic64_set_release(v, i)	atomic64_set((v), (i))
+
 #define ATOMIC64_OP(op)							\
 extern void	 atomic64_##op(long long a, atomic64_t *v);
 
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index bf2d34c9d804..f0d8b1c51343 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -13,7 +13,7 @@
  */
 
 /**
- * futex_atomic_op_inuser() - Atomic arithmetic operation with constant
+ * arch_futex_atomic_op_inuser() - Atomic arithmetic operation with constant
  *			  argument and comparison of the previous
  *			  futex value with another constant.
  *
@@ -25,18 +25,11 @@
  * <0 - On error
  */
 static inline int
-futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval, ret;
 	u32 tmp;
 
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
 	preempt_disable();
 	pagefault_disable();
 
@@ -74,17 +67,9 @@ out_pagefault_enable:
 	pagefault_enable();
 	preempt_enable();
 
-	if (ret == 0) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (ret == 0)
+		*oval = oldval;
+
 	return ret;
 }
 
@@ -126,18 +111,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 
 #else
 static inline int
-futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
+arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr)
 {
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
 	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32)))
-		return -EFAULT;
 
 	pagefault_disable();
 
@@ -153,17 +129,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 
 	pagefault_enable();
 
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
+	if (!ret)
+		*oval = oldval;
+
 	return ret;
 }
 
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 7ef015eb3403..b4531e3b2120 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -915,6 +915,9 @@ extern void ioport_unmap(void __iomem *p);
 #endif /* CONFIG_GENERIC_IOMAP */
 #endif /* CONFIG_HAS_IOPORT_MAP */
 
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
 #ifndef xlate_dev_kmem_ptr
 #define xlate_dev_kmem_ptr xlate_dev_kmem_ptr
 static inline void *xlate_dev_kmem_ptr(void *addr)
@@ -954,6 +957,14 @@ static inline void *bus_to_virt(unsigned long address)
 
 #ifndef memset_io
 #define memset_io memset_io
+/**
+ * memset_io	Set a range of I/O memory to a constant value
+ * @addr:	The beginning of the I/O-memory range to set
+ * @val:	The value to set the memory to
+ * @count:	The number of bytes to set
+ *
+ * Set a range of I/O memory to a given value.
+ */
 static inline void memset_io(volatile void __iomem *addr, int value,
 			     size_t size)
 {
@@ -963,6 +974,14 @@ static inline void memset_io(volatile void __iomem *addr, int value,
 
 #ifndef memcpy_fromio
 #define memcpy_fromio memcpy_fromio
+/**
+ * memcpy_fromio	Copy a block of data from I/O memory
+ * @dst:		The (RAM) destination for the copy
+ * @src:		The (I/O memory) source for the data
+ * @count:		The number of bytes to copy
+ *
+ * Copy a block of data from I/O memory.
+ */
 static inline void memcpy_fromio(void *buffer,
 				 const volatile void __iomem *addr,
 				 size_t size)
@@ -973,6 +992,14 @@ static inline void memcpy_fromio(void *buffer,
 
 #ifndef memcpy_toio
 #define memcpy_toio memcpy_toio
+/**
+ * memcpy_toio		Copy a block of data into I/O memory
+ * @dst:		The (I/O memory) destination for the copy
+ * @src:		The (RAM) source for the data
+ * @count:		The number of bytes to copy
+ *
+ * Copy a block of data to I/O memory.
+ */
 static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer,
 			       size_t size)
 {
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index c56be7410130..40d6bfec0e0d 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -38,6 +38,9 @@
  * Besides, if an arch has a special barrier for acquire/release, it could
  * implement its own __atomic_op_* and use the same framework for building
  * variants
+ *
+ * If an architecture overrides __atomic_op_acquire() it will probably want
+ * to define smp_mb__after_spinlock().
  */
 #ifndef __atomic_op_acquire
 #define __atomic_op_acquire(op, args...)				\
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2a5d52fa90f5..4b99b13c7e68 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -134,7 +134,7 @@ typedef __u32 __bitwise req_flags_t;
 struct request {
 	struct list_head queuelist;
 	union {
-		struct call_single_data csd;
+		call_single_data_t csd;
 		u64 fifo_time;
 	};
 
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index bdb80c4aef6e..10825052b03f 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -203,11 +203,16 @@
 
 #ifdef CONFIG_STACK_VALIDATION
 #define annotate_unreachable() ({					\
-	asm("%c0:\t\n"							\
-	    ".pushsection .discard.unreachable\t\n"			\
-	    ".long %c0b - .\t\n"					\
-	    ".popsection\t\n" : : "i" (__LINE__));			\
+	asm("%c0:\n\t"							\
+	    ".pushsection .discard.unreachable\n\t"			\
+	    ".long %c0b - .\n\t"					\
+	    ".popsection\n\t" : : "i" (__LINE__));			\
 })
+#define ASM_UNREACHABLE							\
+	"999:\n\t"							\
+	".pushsection .discard.unreachable\n\t"				\
+	".long 999b - .\n\t"						\
+	".popsection\n\t"
 #else
 #define annotate_unreachable()
 #endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 043b60de041e..535504312fc3 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -185,6 +185,9 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 #endif
 
 /* Unreachable code */
+#ifndef ASM_UNREACHABLE
+# define ASM_UNREACHABLE
+#endif
 #ifndef unreachable
 # define unreachable() do { } while (1)
 #endif
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 5d5aaae3af43..cae5400022a3 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -9,6 +9,9 @@
  */
 
 #include <linux/wait.h>
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
+#include <linux/lockdep.h>
+#endif
 
 /*
  * struct completion - structure used to maintain state for a "completion"
@@ -25,13 +28,53 @@
 struct completion {
 	unsigned int done;
 	wait_queue_head_t wait;
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
+	struct lockdep_map_cross map;
+#endif
 };
 
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
+static inline void complete_acquire(struct completion *x)
+{
+	lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_);
+}
+
+static inline void complete_release(struct completion *x)
+{
+	lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_);
+}
+
+static inline void complete_release_commit(struct completion *x)
+{
+	lock_commit_crosslock((struct lockdep_map *)&x->map);
+}
+
+#define init_completion(x)						\
+do {									\
+	static struct lock_class_key __key;				\
+	lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map,	\
+			"(complete)" #x,				\
+			&__key, 0);					\
+	__init_completion(x);						\
+} while (0)
+#else
+#define init_completion(x) __init_completion(x)
+static inline void complete_acquire(struct completion *x) {}
+static inline void complete_release(struct completion *x) {}
+static inline void complete_release_commit(struct completion *x) {}
+#endif
+
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
+#define COMPLETION_INITIALIZER(work) \
+	{ 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
+	STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
+#else
 #define COMPLETION_INITIALIZER(work) \
 	{ 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
+#endif
 
 #define COMPLETION_INITIALIZER_ONSTACK(work) \
-	({ init_completion(&work); work; })
+	(*({ init_completion(&work); &work; }))
 
 /**
  * DECLARE_COMPLETION - declare and initialize a completion structure
@@ -70,7 +113,7 @@ struct completion {
  * This inline function will initialize a dynamically created completion
  * structure.
  */
-static inline void init_completion(struct completion *x)
+static inline void __init_completion(struct completion *x)
 {
 	x->done = 0;
 	init_waitqueue_head(&x->wait);
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 898cfe2eeb42..e74655d941b7 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -37,12 +37,6 @@ static inline bool cpusets_enabled(void)
 	return static_branch_unlikely(&cpusets_enabled_key);
 }
 
-static inline int nr_cpusets(void)
-{
-	/* jump label reference count + the top-level cpuset */
-	return static_key_count(&cpusets_enabled_key.key) + 1;
-}
-
 static inline void cpuset_inc(void)
 {
 	static_branch_inc(&cpusets_pre_enable_key);
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 7c5b694864cd..f36bfd26f998 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -54,7 +54,6 @@ union futex_key {
 
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
-extern void exit_pi_state_list(struct task_struct *curr);
 #ifdef CONFIG_HAVE_FUTEX_CMPXCHG
 #define futex_cmpxchg_enabled 1
 #else
@@ -64,8 +63,14 @@ extern int futex_cmpxchg_enabled;
 static inline void exit_robust_list(struct task_struct *curr)
 {
 }
+#endif
+
+#ifdef CONFIG_FUTEX_PI
+extern void exit_pi_state_list(struct task_struct *curr);
+#else
 static inline void exit_pi_state_list(struct task_struct *curr)
 {
 }
 #endif
+
 #endif
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 5dd1272d1ab2..5fdd93bb9300 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -23,10 +23,26 @@
 # define trace_softirq_context(p)	((p)->softirq_context)
 # define trace_hardirqs_enabled(p)	((p)->hardirqs_enabled)
 # define trace_softirqs_enabled(p)	((p)->softirqs_enabled)
-# define trace_hardirq_enter()	do { current->hardirq_context++; } while (0)
-# define trace_hardirq_exit()	do { current->hardirq_context--; } while (0)
-# define lockdep_softirq_enter()	do { current->softirq_context++; } while (0)
-# define lockdep_softirq_exit()	do { current->softirq_context--; } while (0)
+# define trace_hardirq_enter()			\
+do {						\
+	current->hardirq_context++;		\
+	crossrelease_hist_start(XHLOCK_HARD);	\
+} while (0)
+# define trace_hardirq_exit()			\
+do {						\
+	current->hardirq_context--;		\
+	crossrelease_hist_end(XHLOCK_HARD);	\
+} while (0)
+# define lockdep_softirq_enter()		\
+do {						\
+	current->softirq_context++;		\
+	crossrelease_hist_start(XHLOCK_SOFT);	\
+} while (0)
+# define lockdep_softirq_exit()			\
+do {						\
+	current->softirq_context--;		\
+	crossrelease_hist_end(XHLOCK_SOFT);	\
+} while (0)
 # define INIT_TRACE_IRQFLAGS	.softirqs_enabled = 1,
 #else
 # define trace_hardirqs_on()		do { } while (0)
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 2afd74b9d844..cd5861651b17 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -163,6 +163,8 @@ extern void jump_label_apply_nops(struct module *mod);
 extern int static_key_count(struct static_key *key);
 extern void static_key_enable(struct static_key *key);
 extern void static_key_disable(struct static_key *key);
+extern void static_key_enable_cpuslocked(struct static_key *key);
+extern void static_key_disable_cpuslocked(struct static_key *key);
 
 /*
  * We should be using ATOMIC_INIT() for initializing .enabled, but
@@ -234,24 +236,29 @@ static inline int jump_label_apply_nops(struct module *mod)
 
 static inline void static_key_enable(struct static_key *key)
 {
-	int count = static_key_count(key);
-
-	WARN_ON_ONCE(count < 0 || count > 1);
+	STATIC_KEY_CHECK_USE();
 
-	if (!count)
-		static_key_slow_inc(key);
+	if (atomic_read(&key->enabled) != 0) {
+		WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
+		return;
+	}
+	atomic_set(&key->enabled, 1);
 }
 
 static inline void static_key_disable(struct static_key *key)
 {
-	int count = static_key_count(key);
-
-	WARN_ON_ONCE(count < 0 || count > 1);
+	STATIC_KEY_CHECK_USE();
 
-	if (count)
-		static_key_slow_dec(key);
+	if (atomic_read(&key->enabled) != 1) {
+		WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
+		return;
+	}
+	atomic_set(&key->enabled, 0);
 }
 
+#define static_key_enable_cpuslocked(k)		static_key_enable((k))
+#define static_key_disable_cpuslocked(k)	static_key_disable((k))
+
 #define STATIC_KEY_INIT_TRUE	{ .enabled = ATOMIC_INIT(1) }
 #define STATIC_KEY_INIT_FALSE	{ .enabled = ATOMIC_INIT(0) }
 
@@ -413,8 +420,10 @@ extern bool ____wrong_branch_error(void);
  * Normal usage; boolean enable/disable.
  */
 
-#define static_branch_enable(x)		static_key_enable(&(x)->key)
-#define static_branch_disable(x)	static_key_disable(&(x)->key)
+#define static_branch_enable(x)			static_key_enable(&(x)->key)
+#define static_branch_disable(x)		static_key_disable(&(x)->key)
+#define static_branch_enable_cpuslocked(x)	static_key_enable_cpuslocked(&(x)->key)
+#define static_branch_disable_cpuslocked(x)	static_key_disable_cpuslocked(&(x)->key)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
index b7f8aced7870..41960fecf783 100644
--- a/include/linux/kasan-checks.h
+++ b/include/linux/kasan-checks.h
@@ -2,11 +2,13 @@
 #define _LINUX_KASAN_CHECKS_H
 
 #ifdef CONFIG_KASAN
-void kasan_check_read(const void *p, unsigned int size);
-void kasan_check_write(const void *p, unsigned int size);
+void kasan_check_read(const volatile void *p, unsigned int size);
+void kasan_check_write(const volatile void *p, unsigned int size);
 #else
-static inline void kasan_check_read(const void *p, unsigned int size) { }
-static inline void kasan_check_write(const void *p, unsigned int size) { }
+static inline void kasan_check_read(const volatile void *p, unsigned int size)
+{ }
+static inline void kasan_check_write(const volatile void *p, unsigned int size)
+{ }
 #endif
 
 #endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index bd6d96cf80b1..6607225d0ea4 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -277,6 +277,13 @@ extern int oops_may_print(void);
 void do_exit(long error_code) __noreturn;
 void complete_and_exit(struct completion *, long) __noreturn;
 
+#ifdef CONFIG_ARCH_HAS_REFCOUNT
+void refcount_error_report(struct pt_regs *regs, const char *err);
+#else
+static inline void refcount_error_report(struct pt_regs *regs, const char *err)
+{ }
+#endif
+
 /* Internal, do not use. */
 int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
 int __must_check _kstrtol(const char *s, unsigned int base, long *res);
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index fffe49f188e6..bfa8e0b0d6f1 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -18,6 +18,8 @@ extern int lock_stat;
 
 #define MAX_LOCKDEP_SUBCLASSES		8UL
 
+#include <linux/types.h>
+
 #ifdef CONFIG_LOCKDEP
 
 #include <linux/linkage.h>
@@ -29,7 +31,7 @@ extern int lock_stat;
  * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
  * the total number of states... :-(
  */
-#define XXX_LOCK_USAGE_STATES		(1+3*4)
+#define XXX_LOCK_USAGE_STATES		(1+2*4)
 
 /*
  * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
@@ -155,6 +157,12 @@ struct lockdep_map {
 	int				cpu;
 	unsigned long			ip;
 #endif
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+	/*
+	 * Whether it's a crosslock.
+	 */
+	int				cross;
+#endif
 };
 
 static inline void lockdep_copy_map(struct lockdep_map *to,
@@ -258,8 +266,95 @@ struct held_lock {
 	unsigned int hardirqs_off:1;
 	unsigned int references:12;					/* 32 bits */
 	unsigned int pin_count;
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+	/*
+	 * Generation id.
+	 *
+	 * A value of cross_gen_id will be stored when holding this,
+	 * which is globally increased whenever each crosslock is held.
+	 */
+	unsigned int gen_id;
+#endif
+};
+
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+#define MAX_XHLOCK_TRACE_ENTRIES 5
+
+/*
+ * This is for keeping locks waiting for commit so that true dependencies
+ * can be added at commit step.
+ */
+struct hist_lock {
+	/*
+	 * Id for each entry in the ring buffer. This is used to
+	 * decide whether the ring buffer was overwritten or not.
+	 *
+	 * For example,
+	 *
+	 *           |<----------- hist_lock ring buffer size ------->|
+	 *           pppppppppppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiii
+	 * wrapped > iiiiiiiiiiiiiiiiiiiiiiiiiii.......................
+	 *
+	 *           where 'p' represents an acquisition in process
+	 *           context, 'i' represents an acquisition in irq
+	 *           context.
+	 *
+	 * In this example, the ring buffer was overwritten by
+	 * acquisitions in irq context, that should be detected on
+	 * rollback or commit.
+	 */
+	unsigned int hist_id;
+
+	/*
+	 * Seperate stack_trace data. This will be used at commit step.
+	 */
+	struct stack_trace	trace;
+	unsigned long		trace_entries[MAX_XHLOCK_TRACE_ENTRIES];
+
+	/*
+	 * Seperate hlock instance. This will be used at commit step.
+	 *
+	 * TODO: Use a smaller data structure containing only necessary
+	 * data. However, we should make lockdep code able to handle the
+	 * smaller one first.
+	 */
+	struct held_lock	hlock;
+};
+
+/*
+ * To initialize a lock as crosslock, lockdep_init_map_crosslock() should
+ * be called instead of lockdep_init_map().
+ */
+struct cross_lock {
+	/*
+	 * When more than one acquisition of crosslocks are overlapped,
+	 * we have to perform commit for them based on cross_gen_id of
+	 * the first acquisition, which allows us to add more true
+	 * dependencies.
+	 *
+	 * Moreover, when no acquisition of a crosslock is in progress,
+	 * we should not perform commit because the lock might not exist
+	 * any more, which might cause incorrect memory access. So we
+	 * have to track the number of acquisitions of a crosslock.
+	 */
+	int nr_acquire;
+
+	/*
+	 * Seperate hlock instance. This will be used at commit step.
+	 *
+	 * TODO: Use a smaller data structure containing only necessary
+	 * data. However, we should make lockdep code able to handle the
+	 * smaller one first.
+	 */
+	struct held_lock	hlock;
 };
 
+struct lockdep_map_cross {
+	struct lockdep_map map;
+	struct cross_lock xlock;
+};
+#endif
+
 /*
  * Initialization, self-test and debugging-output methods:
  */
@@ -282,13 +377,6 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name,
 			     struct lock_class_key *key, int subclass);
 
 /*
- * To initialize a lockdep_map statically use this macro.
- * Note that _name must not be NULL.
- */
-#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
-	{ .name = (_name), .key = (void *)(_key), }
-
-/*
  * Reinitialize a lock key - for cases where there is special locking or
  * special initialization of locks so that the validator gets the scope
  * of dependencies wrong: they are either too broad (they need a class-split)
@@ -363,10 +451,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock,
 
 extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);
 
-extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
-extern void lockdep_clear_current_reclaim_state(void);
-extern void lockdep_trace_alloc(gfp_t mask);
-
 struct pin_cookie { unsigned int val; };
 
 #define NIL_COOKIE (struct pin_cookie){ .val = 0U, }
@@ -375,7 +459,7 @@ extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
 extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
 extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 
-# define INIT_LOCKDEP				.lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
+# define INIT_LOCKDEP				.lockdep_recursion = 0,
 
 #define lockdep_depth(tsk)	(debug_locks ? (tsk)->lockdep_depth : 0)
 
@@ -416,9 +500,6 @@ static inline void lockdep_on(void)
 # define lock_downgrade(l, i)			do { } while (0)
 # define lock_set_class(l, n, k, s, i)		do { } while (0)
 # define lock_set_subclass(l, s, i)		do { } while (0)
-# define lockdep_set_current_reclaim_state(g)	do { } while (0)
-# define lockdep_clear_current_reclaim_state()	do { } while (0)
-# define lockdep_trace_alloc(g)			do { } while (0)
 # define lockdep_info()				do { } while (0)
 # define lockdep_init_map(lock, name, key, sub) \
 		do { (void)(name); (void)(key); } while (0)
@@ -467,6 +548,58 @@ struct pin_cookie { };
 
 #endif /* !LOCKDEP */
 
+enum xhlock_context_t {
+	XHLOCK_HARD,
+	XHLOCK_SOFT,
+	XHLOCK_CTX_NR,
+};
+
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+extern void lockdep_init_map_crosslock(struct lockdep_map *lock,
+				       const char *name,
+				       struct lock_class_key *key,
+				       int subclass);
+extern void lock_commit_crosslock(struct lockdep_map *lock);
+
+/*
+ * What we essencially have to initialize is 'nr_acquire'. Other members
+ * will be initialized in add_xlock().
+ */
+#define STATIC_CROSS_LOCK_INIT() \
+	{ .nr_acquire = 0,}
+
+#define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \
+	{ .map.name = (_name), .map.key = (void *)(_key), \
+	  .map.cross = 1, .xlock = STATIC_CROSS_LOCK_INIT(), }
+
+/*
+ * To initialize a lockdep_map statically use this macro.
+ * Note that _name must not be NULL.
+ */
+#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
+	{ .name = (_name), .key = (void *)(_key), .cross = 0, }
+
+extern void crossrelease_hist_start(enum xhlock_context_t c);
+extern void crossrelease_hist_end(enum xhlock_context_t c);
+extern void lockdep_invariant_state(bool force);
+extern void lockdep_init_task(struct task_struct *task);
+extern void lockdep_free_task(struct task_struct *task);
+#else /* !CROSSRELEASE */
+#define lockdep_init_map_crosslock(m, n, k, s) do {} while (0)
+/*
+ * To initialize a lockdep_map statically use this macro.
+ * Note that _name must not be NULL.
+ */
+#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
+	{ .name = (_name), .key = (void *)(_key), }
+
+static inline void crossrelease_hist_start(enum xhlock_context_t c) {}
+static inline void crossrelease_hist_end(enum xhlock_context_t c) {}
+static inline void lockdep_invariant_state(bool force) {}
+static inline void lockdep_init_task(struct task_struct *task) {}
+static inline void lockdep_free_task(struct task_struct *task) {}
+#endif /* CROSSRELEASE */
+
 #ifdef CONFIG_LOCK_STAT
 
 extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3cadee0a3508..57378c7cb5f8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -526,26 +526,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 extern void tlb_finish_mmu(struct mmu_gather *tlb,
 				unsigned long start, unsigned long end);
 
-/*
- * Memory barriers to keep this state in sync are graciously provided by
- * the page table locks, outside of which no page table modifications happen.
- * The barriers are used to ensure the order between tlb_flush_pending updates,
- * which happen while the lock is not taken, and the PTE updates, which happen
- * while the lock is taken, are serialized.
- */
-static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
-{
-	return atomic_read(&mm->tlb_flush_pending) > 0;
-}
-
-/*
- * Returns true if there are two above TLB batching threads in parallel.
- */
-static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
-{
-	return atomic_read(&mm->tlb_flush_pending) > 1;
-}
-
 static inline void init_tlb_flush_pending(struct mm_struct *mm)
 {
 	atomic_set(&mm->tlb_flush_pending, 0);
@@ -554,27 +534,82 @@ static inline void init_tlb_flush_pending(struct mm_struct *mm)
 static inline void inc_tlb_flush_pending(struct mm_struct *mm)
 {
 	atomic_inc(&mm->tlb_flush_pending);
-
 	/*
-	 * Guarantee that the tlb_flush_pending increase does not leak into the
-	 * critical section updating the page tables
+	 * The only time this value is relevant is when there are indeed pages
+	 * to flush. And we'll only flush pages after changing them, which
+	 * requires the PTL.
+	 *
+	 * So the ordering here is:
+	 *
+	 *	atomic_inc(&mm->tlb_flush_pending);
+	 *	spin_lock(&ptl);
+	 *	...
+	 *	set_pte_at();
+	 *	spin_unlock(&ptl);
+	 *
+	 *				spin_lock(&ptl)
+	 *				mm_tlb_flush_pending();
+	 *				....
+	 *				spin_unlock(&ptl);
+	 *
+	 *	flush_tlb_range();
+	 *	atomic_dec(&mm->tlb_flush_pending);
+	 *
+	 * Where the increment if constrained by the PTL unlock, it thus
+	 * ensures that the increment is visible if the PTE modification is
+	 * visible. After all, if there is no PTE modification, nobody cares
+	 * about TLB flushes either.
+	 *
+	 * This very much relies on users (mm_tlb_flush_pending() and
+	 * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
+	 * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
+	 * locks (PPC) the unlock of one doesn't order against the lock of
+	 * another PTL.
+	 *
+	 * The decrement is ordered by the flush_tlb_range(), such that
+	 * mm_tlb_flush_pending() will not return false unless all flushes have
+	 * completed.
 	 */
-	smp_mb__before_spinlock();
 }
 
-/* Clearing is done after a TLB flush, which also provides a barrier. */
 static inline void dec_tlb_flush_pending(struct mm_struct *mm)
 {
 	/*
-	 * Guarantee that the tlb_flush_pending does not not leak into the
-	 * critical section, since we must order the PTE change and changes to
-	 * the pending TLB flush indication. We could have relied on TLB flush
-	 * as a memory barrier, but this behavior is not clearly documented.
+	 * See inc_tlb_flush_pending().
+	 *
+	 * This cannot be smp_mb__before_atomic() because smp_mb() simply does
+	 * not order against TLB invalidate completion, which is what we need.
+	 *
+	 * Therefore we must rely on tlb_flush_*() to guarantee order.
 	 */
-	smp_mb__before_atomic();
 	atomic_dec(&mm->tlb_flush_pending);
 }
 
+static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
+{
+	/*
+	 * Must be called after having acquired the PTL; orders against that
+	 * PTLs release and therefore ensures that if we observe the modified
+	 * PTE we must also observe the increment from inc_tlb_flush_pending().
+	 *
+	 * That is, it only guarantees to return true if there is a flush
+	 * pending for _this_ PTL.
+	 */
+	return atomic_read(&mm->tlb_flush_pending);
+}
+
+static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
+{
+	/*
+	 * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
+	 * for which there is a TLB flush pending in order to guarantee
+	 * we've seen both that PTE modification and the increment.
+	 *
+	 * (no requirement on actually still holding the PTL, that is irrelevant)
+	 */
+	return atomic_read(&mm->tlb_flush_pending) > 1;
+}
+
 struct vm_fault;
 
 struct vm_special_mapping {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c99ba7914c0a..461bd5757af6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2774,7 +2774,7 @@ struct softnet_data {
 	unsigned int		input_queue_head ____cacheline_aligned_in_smp;
 
 	/* Elements below can be accessed between CPUs for RPS/RFS */
-	struct call_single_data	csd ____cacheline_aligned_in_smp;
+	call_single_data_t	csd ____cacheline_aligned_in_smp;
 	struct softnet_data	*rps_ipi_next;
 	unsigned int		cpu;
 	unsigned int		input_queue_tail;
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 591792c8e5b0..48b7c9c68c4d 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -53,6 +53,9 @@ extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r);
 extern __must_check bool refcount_dec_and_test(refcount_t *r);
 extern void refcount_dec(refcount_t *r);
 #else
+# ifdef CONFIG_ARCH_HAS_REFCOUNT
+#  include <asm/refcount.h>
+# else
 static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r)
 {
 	return atomic_add_unless(&r->refs, i, 0);
@@ -87,6 +90,7 @@ static inline void refcount_dec(refcount_t *r)
 {
 	atomic_dec(&r->refs);
 }
+# endif /* !CONFIG_ARCH_HAS_REFCOUNT */
 #endif /* CONFIG_REFCOUNT_FULL */
 
 extern __must_check bool refcount_dec_if_one(refcount_t *r);
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index ae0528b834cd..e784761a4443 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -32,6 +32,7 @@ struct rw_semaphore {
 #define RWSEM_UNLOCKED_VALUE		0x00000000
 
 extern void __down_read(struct rw_semaphore *sem);
+extern int __must_check __down_read_killable(struct rw_semaphore *sem);
 extern int __down_read_trylock(struct rw_semaphore *sem);
 extern void __down_write(struct rw_semaphore *sem);
 extern int __must_check __down_write_killable(struct rw_semaphore *sem);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index dd1d14250340..0ad7318ff299 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -44,6 +44,7 @@ struct rw_semaphore {
 };
 
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c05ac5f5aa03..93be319e0cbf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -846,7 +846,17 @@ struct task_struct {
 	int				lockdep_depth;
 	unsigned int			lockdep_recursion;
 	struct held_lock		held_locks[MAX_LOCK_DEPTH];
-	gfp_t				lockdep_reclaim_gfp;
+#endif
+
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+#define MAX_XHLOCKS_NR 64UL
+	struct hist_lock *xhlocks; /* Crossrelease history locks */
+	unsigned int xhlock_idx;
+	/* For restoring at history boundaries */
+	unsigned int xhlock_idx_hist[XHLOCK_CTX_NR];
+	unsigned int hist_id;
+	/* For overwrite check at each context exit */
+	unsigned int hist_id_save[XHLOCK_CTX_NR];
 #endif
 
 #ifdef CONFIG_UBSAN
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2b24a6974847..2b0a281f9d26 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -167,6 +167,14 @@ static inline gfp_t current_gfp_context(gfp_t flags)
 	return flags;
 }
 
+#ifdef CONFIG_LOCKDEP
+extern void fs_reclaim_acquire(gfp_t gfp_mask);
+extern void fs_reclaim_release(gfp_t gfp_mask);
+#else
+static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
+static inline void fs_reclaim_release(gfp_t gfp_mask) { }
+#endif
+
 static inline unsigned int memalloc_noio_save(void)
 {
 	unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 68123c1fe549..98b1fe027fc9 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -14,13 +14,17 @@
 #include <linux/llist.h>
 
 typedef void (*smp_call_func_t)(void *info);
-struct call_single_data {
+struct __call_single_data {
 	struct llist_node llist;
 	smp_call_func_t func;
 	void *info;
 	unsigned int flags;
 };
 
+/* Use __aligned() to avoid to use 2 cache lines for 1 csd */
+typedef struct __call_single_data call_single_data_t
+	__aligned(sizeof(struct __call_single_data));
+
 /* total number of cpus in this system (may exceed NR_CPUS) */
 extern unsigned int total_cpus;
 
@@ -48,7 +52,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
 		smp_call_func_t func, void *info, bool wait,
 		gfp_t gfp_flags);
 
-int smp_call_function_single_async(int cpu, struct call_single_data *csd);
+int smp_call_function_single_async(int cpu, call_single_data_t *csd);
 
 #ifdef CONFIG_SMP
 
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index d9510e8522d4..4e8cce19b507 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -118,16 +118,39 @@ do {								\
 #endif
 
 /*
- * Despite its name it doesn't necessarily has to be a full barrier.
- * It should only guarantee that a STORE before the critical section
- * can not be reordered with LOADs and STOREs inside this section.
- * spin_lock() is the one-way barrier, this LOAD can not escape out
- * of the region. So the default implementation simply ensures that
- * a STORE can not move into the critical section, smp_wmb() should
- * serialize it with another STORE done by spin_lock().
+ * This barrier must provide two things:
+ *
+ *   - it must guarantee a STORE before the spin_lock() is ordered against a
+ *     LOAD after it, see the comments at its two usage sites.
+ *
+ *   - it must ensure the critical section is RCsc.
+ *
+ * The latter is important for cases where we observe values written by other
+ * CPUs in spin-loops, without barriers, while being subject to scheduling.
+ *
+ * CPU0			CPU1			CPU2
+ *
+ *			for (;;) {
+ *			  if (READ_ONCE(X))
+ *			    break;
+ *			}
+ * X=1
+ *			<sched-out>
+ *						<sched-in>
+ *						r = X;
+ *
+ * without transitivity it could be that CPU1 observes X!=0 breaks the loop,
+ * we get migrated and CPU2 sees X==0.
+ *
+ * Since most load-store architectures implement ACQUIRE with an smp_mb() after
+ * the LL/SC loop, they need no further barriers. Similarly all our TSO
+ * architectures imply an smp_mb() for each atomic instruction and equally don't
+ * need more.
+ *
+ * Architectures that can implement ACQUIRE better need to take care.
  */
-#ifndef smp_mb__before_spinlock
-#define smp_mb__before_spinlock()	smp_wmb()
+#ifndef smp_mb__after_spinlock
+#define smp_mb__after_spinlock()	do { } while (0)
 #endif
 
 /**
diff --git a/init/Kconfig b/init/Kconfig
index 8514b25db21c..5f0ef850e808 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1275,12 +1275,17 @@ config BASE_FULL
 config FUTEX
 	bool "Enable futex support" if EXPERT
 	default y
-	select RT_MUTEXES
+	imply RT_MUTEXES
 	help
 	  Disabling this option will cause the kernel to be built without
 	  support for "fast userspace mutexes".  The resulting kernel may not
 	  run glibc-based applications correctly.
 
+config FUTEX_PI
+	bool
+	depends on FUTEX && RT_MUTEXES
+	default y
+
 config HAVE_FUTEX_CMPXCHG
 	bool
 	depends on FUTEX
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 87a1213dd326..39ca7e976fbb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -577,6 +577,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
 	rcu_read_unlock();
 }
 
+/* Must be called with cpuset_mutex held.  */
+static inline int nr_cpusets(void)
+{
+	/* jump label reference count + the top-level cpuset */
+	return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
 /*
  * generate_sched_domains()
  *
diff --git a/kernel/exit.c b/kernel/exit.c
index c5548faa9f37..fa72d57db747 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -920,6 +920,7 @@ void __noreturn do_exit(long code)
 	exit_rcu();
 	TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
 
+	lockdep_free_task(tsk);
 	do_task_dead();
 }
 EXPORT_SYMBOL_GPL(do_exit);
diff --git a/kernel/fork.c b/kernel/fork.c
index b7e9e57b71ea..dab73d18bc4d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -484,6 +484,8 @@ void __init fork_init(void)
 	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
 			  NULL, free_vm_stack_cache);
 #endif
+
+	lockdep_init_task(&init_task);
 }
 
 int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -1700,6 +1702,7 @@ static __latent_entropy struct task_struct *copy_process(
 	p->lockdep_depth = 0; /* no locks held yet */
 	p->curr_chain_key = 0;
 	p->lockdep_recursion = 0;
+	lockdep_init_task(p);
 #endif
 
 #ifdef CONFIG_DEBUG_MUTEXES
@@ -1958,6 +1961,7 @@ bad_fork_cleanup_audit:
 bad_fork_cleanup_perf:
 	perf_event_free_task(p);
 bad_fork_cleanup_policy:
+	lockdep_free_task(p);
 #ifdef CONFIG_NUMA
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_threadgroup_lock:
diff --git a/kernel/futex.c b/kernel/futex.c
index f50b434756c1..3d38eaf05492 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -876,6 +876,8 @@ static struct task_struct *futex_find_get_task(pid_t pid)
 	return p;
 }
 
+#ifdef CONFIG_FUTEX_PI
+
 /*
  * This task is holding PI mutexes at exit time => bad.
  * Kernel cleans up PI-state, but userspace is likely hosed.
@@ -933,6 +935,8 @@ void exit_pi_state_list(struct task_struct *curr)
 	raw_spin_unlock_irq(&curr->pi_lock);
 }
 
+#endif
+
 /*
  * We need to check the following states:
  *
@@ -1547,6 +1551,45 @@ out:
 	return ret;
 }
 
+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
+{
+	unsigned int op =	  (encoded_op & 0x70000000) >> 28;
+	unsigned int cmp =	  (encoded_op & 0x0f000000) >> 24;
+	int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12);
+	int cmparg = sign_extend32(encoded_op & 0x00000fff, 12);
+	int oldval, ret;
+
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
+		if (oparg < 0 || oparg > 31)
+			return -EINVAL;
+		oparg = 1 << oparg;
+	}
+
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+		return -EFAULT;
+
+	ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
+	if (ret)
+		return ret;
+
+	switch (cmp) {
+	case FUTEX_OP_CMP_EQ:
+		return oldval == cmparg;
+	case FUTEX_OP_CMP_NE:
+		return oldval != cmparg;
+	case FUTEX_OP_CMP_LT:
+		return oldval < cmparg;
+	case FUTEX_OP_CMP_GE:
+		return oldval >= cmparg;
+	case FUTEX_OP_CMP_LE:
+		return oldval <= cmparg;
+	case FUTEX_OP_CMP_GT:
+		return oldval > cmparg;
+	default:
+		return -ENOSYS;
+	}
+}
+
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
@@ -1800,6 +1843,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	struct futex_q *this, *next;
 	DEFINE_WAKE_Q(wake_q);
 
+	/*
+	 * When PI not supported: return -ENOSYS if requeue_pi is true,
+	 * consequently the compiler knows requeue_pi is always false past
+	 * this point which will optimize away all the conditional code
+	 * further down.
+	 */
+	if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
+		return -ENOSYS;
+
 	if (requeue_pi) {
 		/*
 		 * Requeue PI only works on two distinct uaddrs. This
@@ -2595,6 +2647,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	struct futex_q q = futex_q_init;
 	int res, ret;
 
+	if (!IS_ENABLED(CONFIG_FUTEX_PI))
+		return -ENOSYS;
+
 	if (refill_pi_state_cache())
 		return -ENOMEM;
 
@@ -2774,6 +2829,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 	struct futex_q *top_waiter;
 	int ret;
 
+	if (!IS_ENABLED(CONFIG_FUTEX_PI))
+		return -ENOSYS;
+
 retry:
 	if (get_user(uval, uaddr))
 		return -EFAULT;
@@ -2984,6 +3042,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	struct futex_q q = futex_q_init;
 	int res, ret;
 
+	if (!IS_ENABLED(CONFIG_FUTEX_PI))
+		return -ENOSYS;
+
 	if (uaddr == uaddr2)
 		return -EINVAL;
 
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index d11c506a6ac3..0bf2e8f5244a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,29 +79,7 @@ int static_key_count(struct static_key *key)
 }
 EXPORT_SYMBOL_GPL(static_key_count);
 
-void static_key_enable(struct static_key *key)
-{
-	int count = static_key_count(key);
-
-	WARN_ON_ONCE(count < 0 || count > 1);
-
-	if (!count)
-		static_key_slow_inc(key);
-}
-EXPORT_SYMBOL_GPL(static_key_enable);
-
-void static_key_disable(struct static_key *key)
-{
-	int count = static_key_count(key);
-
-	WARN_ON_ONCE(count < 0 || count > 1);
-
-	if (count)
-		static_key_slow_dec(key);
-}
-EXPORT_SYMBOL_GPL(static_key_disable);
-
-void static_key_slow_inc(struct static_key *key)
+static void static_key_slow_inc_cpuslocked(struct static_key *key)
 {
 	int v, v1;
 
@@ -125,24 +103,87 @@ void static_key_slow_inc(struct static_key *key)
 			return;
 	}
 
-	cpus_read_lock();
 	jump_label_lock();
 	if (atomic_read(&key->enabled) == 0) {
 		atomic_set(&key->enabled, -1);
 		jump_label_update(key);
-		atomic_set(&key->enabled, 1);
+		/*
+		 * Ensure that if the above cmpxchg loop observes our positive
+		 * value, it must also observe all the text changes.
+		 */
+		atomic_set_release(&key->enabled, 1);
 	} else {
 		atomic_inc(&key->enabled);
 	}
 	jump_label_unlock();
+}
+
+void static_key_slow_inc(struct static_key *key)
+{
+	cpus_read_lock();
+	static_key_slow_inc_cpuslocked(key);
 	cpus_read_unlock();
 }
 EXPORT_SYMBOL_GPL(static_key_slow_inc);
 
-static void __static_key_slow_dec(struct static_key *key,
-		unsigned long rate_limit, struct delayed_work *work)
+void static_key_enable_cpuslocked(struct static_key *key)
+{
+	STATIC_KEY_CHECK_USE();
+
+	if (atomic_read(&key->enabled) > 0) {
+		WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
+		return;
+	}
+
+	jump_label_lock();
+	if (atomic_read(&key->enabled) == 0) {
+		atomic_set(&key->enabled, -1);
+		jump_label_update(key);
+		/*
+		 * See static_key_slow_inc().
+		 */
+		atomic_set_release(&key->enabled, 1);
+	}
+	jump_label_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked);
+
+void static_key_enable(struct static_key *key)
+{
+	cpus_read_lock();
+	static_key_enable_cpuslocked(key);
+	cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_enable);
+
+void static_key_disable_cpuslocked(struct static_key *key)
+{
+	STATIC_KEY_CHECK_USE();
+
+	if (atomic_read(&key->enabled) != 1) {
+		WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
+		return;
+	}
+
+	jump_label_lock();
+	if (atomic_cmpxchg(&key->enabled, 1, 0))
+		jump_label_update(key);
+	jump_label_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked);
+
+void static_key_disable(struct static_key *key)
 {
 	cpus_read_lock();
+	static_key_disable_cpuslocked(key);
+	cpus_read_unlock();
+}
+EXPORT_SYMBOL_GPL(static_key_disable);
+
+static void static_key_slow_dec_cpuslocked(struct static_key *key,
+					   unsigned long rate_limit,
+					   struct delayed_work *work)
+{
 	/*
 	 * The negative count check is valid even when a negative
 	 * key->enabled is in use by static_key_slow_inc(); a
@@ -153,7 +194,6 @@ static void __static_key_slow_dec(struct static_key *key,
 	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
 		WARN(atomic_read(&key->enabled) < 0,
 		     "jump label: negative count!\n");
-		cpus_read_unlock();
 		return;
 	}
 
@@ -164,6 +204,14 @@ static void __static_key_slow_dec(struct static_key *key,
 		jump_label_update(key);
 	}
 	jump_label_unlock();
+}
+
+static void __static_key_slow_dec(struct static_key *key,
+				  unsigned long rate_limit,
+				  struct delayed_work *work)
+{
+	cpus_read_lock();
+	static_key_slow_dec_cpuslocked(key, rate_limit, work);
 	cpus_read_unlock();
 }
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7d2499bec5fe..44c8d0d17170 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -58,6 +58,10 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/lock.h>
 
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+#include <linux/slab.h>
+#endif
+
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
 module_param(prove_locking, int, 0644);
@@ -344,14 +348,12 @@ EXPORT_SYMBOL(lockdep_on);
 #if VERBOSE
 # define HARDIRQ_VERBOSE	1
 # define SOFTIRQ_VERBOSE	1
-# define RECLAIM_VERBOSE	1
 #else
 # define HARDIRQ_VERBOSE	0
 # define SOFTIRQ_VERBOSE	0
-# define RECLAIM_VERBOSE	0
 #endif
 
-#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
 /*
  * Quick filtering for interesting events:
  */
@@ -726,6 +728,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
 }
 
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+static void cross_init(struct lockdep_map *lock, int cross);
+static int cross_lock(struct lockdep_map *lock);
+static int lock_acquire_crosslock(struct held_lock *hlock);
+static int lock_release_crosslock(struct lockdep_map *lock);
+#else
+static inline void cross_init(struct lockdep_map *lock, int cross) {}
+static inline int cross_lock(struct lockdep_map *lock) { return 0; }
+static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
+static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
+#endif
+
 /*
  * Register a lock's class in the hash-table, if the class is not present
  * yet. Otherwise we look it up. We cache the result in the lock object
@@ -1125,22 +1139,41 @@ print_circular_lock_scenario(struct held_lock *src,
 		printk(KERN_CONT "\n\n");
 	}
 
-	printk(" Possible unsafe locking scenario:\n\n");
-	printk("       CPU0                    CPU1\n");
-	printk("       ----                    ----\n");
-	printk("  lock(");
-	__print_lock_name(target);
-	printk(KERN_CONT ");\n");
-	printk("                               lock(");
-	__print_lock_name(parent);
-	printk(KERN_CONT ");\n");
-	printk("                               lock(");
-	__print_lock_name(target);
-	printk(KERN_CONT ");\n");
-	printk("  lock(");
-	__print_lock_name(source);
-	printk(KERN_CONT ");\n");
-	printk("\n *** DEADLOCK ***\n\n");
+	if (cross_lock(tgt->instance)) {
+		printk(" Possible unsafe locking scenario by crosslock:\n\n");
+		printk("       CPU0                    CPU1\n");
+		printk("       ----                    ----\n");
+		printk("  lock(");
+		__print_lock_name(parent);
+		printk(KERN_CONT ");\n");
+		printk("  lock(");
+		__print_lock_name(target);
+		printk(KERN_CONT ");\n");
+		printk("                               lock(");
+		__print_lock_name(source);
+		printk(KERN_CONT ");\n");
+		printk("                               unlock(");
+		__print_lock_name(target);
+		printk(KERN_CONT ");\n");
+		printk("\n *** DEADLOCK ***\n\n");
+	} else {
+		printk(" Possible unsafe locking scenario:\n\n");
+		printk("       CPU0                    CPU1\n");
+		printk("       ----                    ----\n");
+		printk("  lock(");
+		__print_lock_name(target);
+		printk(KERN_CONT ");\n");
+		printk("                               lock(");
+		__print_lock_name(parent);
+		printk(KERN_CONT ");\n");
+		printk("                               lock(");
+		__print_lock_name(target);
+		printk(KERN_CONT ");\n");
+		printk("  lock(");
+		__print_lock_name(source);
+		printk(KERN_CONT ");\n");
+		printk("\n *** DEADLOCK ***\n\n");
+	}
 }
 
 /*
@@ -1165,7 +1198,12 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
 	pr_warn("%s/%d is trying to acquire lock:\n",
 		curr->comm, task_pid_nr(curr));
 	print_lock(check_src);
-	pr_warn("\nbut task is already holding lock:\n");
+
+	if (cross_lock(check_tgt->instance))
+		pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
+	else
+		pr_warn("\nbut task is already holding lock:\n");
+
 	print_lock(check_tgt);
 	pr_warn("\nwhich lock already depends on the new lock.\n\n");
 	pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
@@ -1183,7 +1221,8 @@ static inline int class_equal(struct lock_list *entry, void *data)
 static noinline int print_circular_bug(struct lock_list *this,
 				struct lock_list *target,
 				struct held_lock *check_src,
-				struct held_lock *check_tgt)
+				struct held_lock *check_tgt,
+				struct stack_trace *trace)
 {
 	struct task_struct *curr = current;
 	struct lock_list *parent;
@@ -1193,7 +1232,9 @@ static noinline int print_circular_bug(struct lock_list *this,
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	if (!save_trace(&this->trace))
+	if (cross_lock(check_tgt->instance))
+		this->trace = *trace;
+	else if (!save_trace(&this->trace))
 		return 0;
 
 	depth = get_lock_depth(target);
@@ -1309,6 +1350,19 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
 	return result;
 }
 
+static noinline int
+check_redundant(struct lock_list *root, struct lock_class *target,
+		struct lock_list **target_entry)
+{
+	int result;
+
+	debug_atomic_inc(nr_redundant_checks);
+
+	result = __bfs_forwards(root, target, class_equal, target_entry);
+
+	return result;
+}
+
 #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 /*
  * Forwards and backwards subgraph searching, for the purposes of
@@ -1784,6 +1838,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
 		if (nest)
 			return 2;
 
+		if (cross_lock(prev->instance))
+			continue;
+
 		return print_deadlock_bug(curr, prev, next);
 	}
 	return 1;
@@ -1813,20 +1870,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
  */
 static int
 check_prev_add(struct task_struct *curr, struct held_lock *prev,
-	       struct held_lock *next, int distance, int *stack_saved)
+	       struct held_lock *next, int distance, struct stack_trace *trace,
+	       int (*save)(struct stack_trace *trace))
 {
 	struct lock_list *entry;
 	int ret;
 	struct lock_list this;
 	struct lock_list *uninitialized_var(target_entry);
-	/*
-	 * Static variable, serialized by the graph_lock().
-	 *
-	 * We use this static variable to save the stack trace in case
-	 * we call into this function multiple times due to encountering
-	 * trylocks in the held lock stack.
-	 */
-	static struct stack_trace trace;
 
 	/*
 	 * Prove that the new <prev> -> <next> dependency would not
@@ -1841,7 +1891,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	this.parent = NULL;
 	ret = check_noncircular(&this, hlock_class(prev), &target_entry);
 	if (unlikely(!ret))
-		return print_circular_bug(&this, target_entry, next, prev);
+		return print_circular_bug(&this, target_entry, next, prev, trace);
 	else if (unlikely(ret < 0))
 		return print_bfs_bug(ret);
 
@@ -1870,15 +1920,26 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		if (entry->class == hlock_class(next)) {
 			if (distance == 1)
 				entry->distance = 1;
-			return 2;
+			return 1;
 		}
 	}
 
-	if (!*stack_saved) {
-		if (!save_trace(&trace))
-			return 0;
-		*stack_saved = 1;
+	/*
+	 * Is the <prev> -> <next> link redundant?
+	 */
+	this.class = hlock_class(prev);
+	this.parent = NULL;
+	ret = check_redundant(&this, hlock_class(next), &target_entry);
+	if (!ret) {
+		debug_atomic_inc(nr_redundant);
+		return 2;
 	}
+	if (ret < 0)
+		return print_bfs_bug(ret);
+
+
+	if (save && !save(trace))
+		return 0;
 
 	/*
 	 * Ok, all validations passed, add the new lock
@@ -1886,14 +1947,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 */
 	ret = add_lock_to_list(hlock_class(next),
 			       &hlock_class(prev)->locks_after,
-			       next->acquire_ip, distance, &trace);
+			       next->acquire_ip, distance, trace);
 
 	if (!ret)
 		return 0;
 
 	ret = add_lock_to_list(hlock_class(prev),
 			       &hlock_class(next)->locks_before,
-			       next->acquire_ip, distance, &trace);
+			       next->acquire_ip, distance, trace);
 	if (!ret)
 		return 0;
 
@@ -1901,8 +1962,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 * Debugging printouts:
 	 */
 	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
-		/* We drop graph lock, so another thread can overwrite trace. */
-		*stack_saved = 0;
 		graph_unlock();
 		printk("\n new dependency: ");
 		print_lock_name(hlock_class(prev));
@@ -1910,9 +1969,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		print_lock_name(hlock_class(next));
 		printk(KERN_CONT "\n");
 		dump_stack();
-		return graph_lock();
+		if (!graph_lock())
+			return 0;
 	}
-	return 1;
+	return 2;
 }
 
 /*
@@ -1925,8 +1985,9 @@ static int
 check_prevs_add(struct task_struct *curr, struct held_lock *next)
 {
 	int depth = curr->lockdep_depth;
-	int stack_saved = 0;
 	struct held_lock *hlock;
+	struct stack_trace trace;
+	int (*save)(struct stack_trace *trace) = save_trace;
 
 	/*
 	 * Debugging checks.
@@ -1947,21 +2008,36 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		int distance = curr->lockdep_depth - depth + 1;
 		hlock = curr->held_locks + depth - 1;
 		/*
-		 * Only non-recursive-read entries get new dependencies
-		 * added:
+		 * Only non-crosslock entries get new dependencies added.
+		 * Crosslock entries will be added by commit later:
 		 */
-		if (hlock->read != 2 && hlock->check) {
-			if (!check_prev_add(curr, hlock, next,
-						distance, &stack_saved))
-				return 0;
+		if (!cross_lock(hlock->instance)) {
 			/*
-			 * Stop after the first non-trylock entry,
-			 * as non-trylock entries have added their
-			 * own direct dependencies already, so this
-			 * lock is connected to them indirectly:
+			 * Only non-recursive-read entries get new dependencies
+			 * added:
 			 */
-			if (!hlock->trylock)
-				break;
+			if (hlock->read != 2 && hlock->check) {
+				int ret = check_prev_add(curr, hlock, next,
+							 distance, &trace, save);
+				if (!ret)
+					return 0;
+
+				/*
+				 * Stop saving stack_trace if save_trace() was
+				 * called at least once:
+				 */
+				if (save && ret == 2)
+					save = NULL;
+
+				/*
+				 * Stop after the first non-trylock entry,
+				 * as non-trylock entries have added their
+				 * own direct dependencies already, so this
+				 * lock is connected to them indirectly:
+				 */
+				if (!hlock->trylock)
+					break;
+			}
 		}
 		depth--;
 		/*
@@ -2126,19 +2202,26 @@ static int check_no_collision(struct task_struct *curr,
 }
 
 /*
- * Look up a dependency chain. If the key is not present yet then
- * add it and return 1 - in this case the new dependency chain is
- * validated. If the key is already hashed, return 0.
- * (On return with 1 graph_lock is held.)
+ * This is for building a chain between just two different classes,
+ * instead of adding a new hlock upon current, which is done by
+ * add_chain_cache().
+ *
+ * This can be called in any context with two classes, while
+ * add_chain_cache() must be done within the lock owener's context
+ * since it uses hlock which might be racy in another context.
  */
-static inline int lookup_chain_cache(struct task_struct *curr,
-				     struct held_lock *hlock,
-				     u64 chain_key)
+static inline int add_chain_cache_classes(unsigned int prev,
+					  unsigned int next,
+					  unsigned int irq_context,
+					  u64 chain_key)
 {
-	struct lock_class *class = hlock_class(hlock);
 	struct hlist_head *hash_head = chainhashentry(chain_key);
 	struct lock_chain *chain;
-	int i, j;
+
+	/*
+	 * Allocate a new chain entry from the static array, and add
+	 * it to the hash:
+	 */
 
 	/*
 	 * We might need to take the graph lock, ensure we've got IRQs
@@ -2147,43 +2230,76 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 	 */
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return 0;
+
+	if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+		if (!debug_locks_off_graph_unlock())
+			return 0;
+
+		print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
+		dump_stack();
+		return 0;
+	}
+
+	chain = lock_chains + nr_lock_chains++;
+	chain->chain_key = chain_key;
+	chain->irq_context = irq_context;
+	chain->depth = 2;
+	if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+		chain->base = nr_chain_hlocks;
+		nr_chain_hlocks += chain->depth;
+		chain_hlocks[chain->base] = prev - 1;
+		chain_hlocks[chain->base + 1] = next -1;
+	}
+#ifdef CONFIG_DEBUG_LOCKDEP
 	/*
-	 * We can walk it lock-free, because entries only get added
-	 * to the hash:
+	 * Important for check_no_collision().
 	 */
-	hlist_for_each_entry_rcu(chain, hash_head, entry) {
-		if (chain->chain_key == chain_key) {
-cache_hit:
-			debug_atomic_inc(chain_lookup_hits);
-			if (!check_no_collision(curr, hlock, chain))
-				return 0;
-
-			if (very_verbose(class))
-				printk("\nhash chain already cached, key: "
-					"%016Lx tail class: [%p] %s\n",
-					(unsigned long long)chain_key,
-					class->key, class->name);
+	else {
+		if (!debug_locks_off_graph_unlock())
 			return 0;
-		}
+
+		print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+		dump_stack();
+		return 0;
 	}
-	if (very_verbose(class))
-		printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
-			(unsigned long long)chain_key, class->key, class->name);
+#endif
+
+	hlist_add_head_rcu(&chain->entry, hash_head);
+	debug_atomic_inc(chain_lookup_misses);
+	inc_chains();
+
+	return 1;
+}
+
+/*
+ * Adds a dependency chain into chain hashtable. And must be called with
+ * graph_lock held.
+ *
+ * Return 0 if fail, and graph_lock is released.
+ * Return 1 if succeed, with graph_lock held.
+ */
+static inline int add_chain_cache(struct task_struct *curr,
+				  struct held_lock *hlock,
+				  u64 chain_key)
+{
+	struct lock_class *class = hlock_class(hlock);
+	struct hlist_head *hash_head = chainhashentry(chain_key);
+	struct lock_chain *chain;
+	int i, j;
+
 	/*
 	 * Allocate a new chain entry from the static array, and add
 	 * it to the hash:
 	 */
-	if (!graph_lock())
-		return 0;
+
 	/*
-	 * We have to walk the chain again locked - to avoid duplicates:
+	 * We might need to take the graph lock, ensure we've got IRQs
+	 * disabled to make this an IRQ-safe lock.. for recursion reasons
+	 * lockdep won't complain about its own locking errors.
 	 */
-	hlist_for_each_entry(chain, hash_head, entry) {
-		if (chain->chain_key == chain_key) {
-			graph_unlock();
-			goto cache_hit;
-		}
-	}
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return 0;
+
 	if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
 		if (!debug_locks_off_graph_unlock())
 			return 0;
@@ -2235,6 +2351,78 @@ cache_hit:
 	return 1;
 }
 
+/*
+ * Look up a dependency chain.
+ */
+static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
+{
+	struct hlist_head *hash_head = chainhashentry(chain_key);
+	struct lock_chain *chain;
+
+	/*
+	 * We can walk it lock-free, because entries only get added
+	 * to the hash:
+	 */
+	hlist_for_each_entry_rcu(chain, hash_head, entry) {
+		if (chain->chain_key == chain_key) {
+			debug_atomic_inc(chain_lookup_hits);
+			return chain;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * If the key is not present yet in dependency chain cache then
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
+ */
+static inline int lookup_chain_cache_add(struct task_struct *curr,
+					 struct held_lock *hlock,
+					 u64 chain_key)
+{
+	struct lock_class *class = hlock_class(hlock);
+	struct lock_chain *chain = lookup_chain_cache(chain_key);
+
+	if (chain) {
+cache_hit:
+		if (!check_no_collision(curr, hlock, chain))
+			return 0;
+
+		if (very_verbose(class)) {
+			printk("\nhash chain already cached, key: "
+					"%016Lx tail class: [%p] %s\n",
+					(unsigned long long)chain_key,
+					class->key, class->name);
+		}
+
+		return 0;
+	}
+
+	if (very_verbose(class)) {
+		printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
+			(unsigned long long)chain_key, class->key, class->name);
+	}
+
+	if (!graph_lock())
+		return 0;
+
+	/*
+	 * We have to walk the chain again locked - to avoid duplicates:
+	 */
+	chain = lookup_chain_cache(chain_key);
+	if (chain) {
+		graph_unlock();
+		goto cache_hit;
+	}
+
+	if (!add_chain_cache(curr, hlock, chain_key))
+		return 0;
+
+	return 1;
+}
+
 static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
 		struct held_lock *hlock, int chain_head, u64 chain_key)
 {
@@ -2245,11 +2433,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
 	 *
 	 * We look up the chain_key and do the O(N^2) check and update of
 	 * the dependencies only if this is a new dependency chain.
-	 * (If lookup_chain_cache() returns with 1 it acquires
+	 * (If lookup_chain_cache_add() return with 1 it acquires
 	 * graph_lock for us)
 	 */
 	if (!hlock->trylock && hlock->check &&
-	    lookup_chain_cache(curr, hlock, chain_key)) {
+	    lookup_chain_cache_add(curr, hlock, chain_key)) {
 		/*
 		 * Check whether last held lock:
 		 *
@@ -2277,14 +2465,17 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
 		 * Add dependency only if this lock is not the head
 		 * of the chain, and if it's not a secondary read-lock:
 		 */
-		if (!chain_head && ret != 2)
+		if (!chain_head && ret != 2) {
 			if (!check_prevs_add(curr, hlock))
 				return 0;
+		}
+
 		graph_unlock();
-	} else
-		/* after lookup_chain_cache(): */
+	} else {
+		/* after lookup_chain_cache_add(): */
 		if (unlikely(!debug_locks))
 			return 0;
+	}
 
 	return 1;
 }
@@ -2567,14 +2758,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
 	return 0;
 }
 
-static int RECLAIM_FS_verbose(struct lock_class *class)
-{
-#if RECLAIM_VERBOSE
-	return class_filter(class);
-#endif
-	return 0;
-}
-
 #define STRICT_READ_CHECKS	1
 
 static int (*state_verbose_f[])(struct lock_class *class) = {
@@ -2870,57 +3053,6 @@ void trace_softirqs_off(unsigned long ip)
 		debug_atomic_inc(redundant_softirqs_off);
 }
 
-static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
-{
-	struct task_struct *curr = current;
-
-	if (unlikely(!debug_locks))
-		return;
-
-	gfp_mask = current_gfp_context(gfp_mask);
-
-	/* no reclaim without waiting on it */
-	if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
-		return;
-
-	/* this guy won't enter reclaim */
-	if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
-		return;
-
-	/* We're only interested __GFP_FS allocations for now */
-	if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
-		return;
-
-	/*
-	 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
-	 */
-	if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
-		return;
-
-	/* Disable lockdep if explicitly requested */
-	if (gfp_mask & __GFP_NOLOCKDEP)
-		return;
-
-	mark_held_locks(curr, RECLAIM_FS);
-}
-
-static void check_flags(unsigned long flags);
-
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
-	unsigned long flags;
-
-	if (unlikely(current->lockdep_recursion))
-		return;
-
-	raw_local_irq_save(flags);
-	check_flags(flags);
-	current->lockdep_recursion = 1;
-	__lockdep_trace_alloc(gfp_mask, flags);
-	current->lockdep_recursion = 0;
-	raw_local_irq_restore(flags);
-}
-
 static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 {
 	/*
@@ -2966,22 +3098,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 		}
 	}
 
-	/*
-	 * We reuse the irq context infrastructure more broadly as a general
-	 * context checking code. This tests GFP_FS recursion (a lock taken
-	 * during reclaim for a GFP_FS allocation is held over a GFP_FS
-	 * allocation).
-	 */
-	if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
-		if (hlock->read) {
-			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
-					return 0;
-		} else {
-			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
-					return 0;
-		}
-	}
-
 	return 1;
 }
 
@@ -3040,10 +3156,6 @@ static inline int separate_irq_context(struct task_struct *curr,
 	return 0;
 }
 
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
-}
-
 #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
 
 /*
@@ -3116,7 +3228,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 /*
  * Initialize a lock instance's lock-class mapping info:
  */
-void lockdep_init_map(struct lockdep_map *lock, const char *name,
+static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
 		      struct lock_class_key *key, int subclass)
 {
 	int i;
@@ -3174,8 +3286,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 		raw_local_irq_restore(flags);
 	}
 }
+
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+		      struct lock_class_key *key, int subclass)
+{
+	cross_init(lock, 0);
+	__lockdep_init_map(lock, name, key, subclass);
+}
 EXPORT_SYMBOL_GPL(lockdep_init_map);
 
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
+		      struct lock_class_key *key, int subclass)
+{
+	cross_init(lock, 1);
+	__lockdep_init_map(lock, name, key, subclass);
+}
+EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
+#endif
+
 struct lock_class_key __lockdep_no_validate__;
 EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
 
@@ -3231,6 +3360,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	int chain_head = 0;
 	int class_idx;
 	u64 chain_key;
+	int ret;
 
 	if (unlikely(!debug_locks))
 		return 0;
@@ -3279,7 +3409,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 	class_idx = class - lock_classes + 1;
 
-	if (depth) {
+	/* TODO: nest_lock is not implemented for crosslock yet. */
+	if (depth && !cross_lock(lock)) {
 		hlock = curr->held_locks + depth - 1;
 		if (hlock->class_idx == class_idx && nest_lock) {
 			if (hlock->references) {
@@ -3367,6 +3498,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
 		return 0;
 
+	ret = lock_acquire_crosslock(hlock);
+	/*
+	 * 2 means normal acquire operations are needed. Otherwise, it's
+	 * ok just to return with '0:fail, 1:success'.
+	 */
+	if (ret != 2)
+		return ret;
+
 	curr->curr_chain_key = chain_key;
 	curr->lockdep_depth++;
 	check_chain_key(curr);
@@ -3604,11 +3743,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 	struct task_struct *curr = current;
 	struct held_lock *hlock;
 	unsigned int depth;
-	int i;
+	int ret, i;
 
 	if (unlikely(!debug_locks))
 		return 0;
 
+	ret = lock_release_crosslock(lock);
+	/*
+	 * 2 means normal release operations are needed. Otherwise, it's
+	 * ok just to return with '0:fail, 1:success'.
+	 */
+	if (ret != 2)
+		return ret;
+
 	depth = curr->lockdep_depth;
 	/*
 	 * So we're all set to release this lock.. wait what lock? We don't
@@ -3952,18 +4099,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
 }
 EXPORT_SYMBOL_GPL(lock_unpin_lock);
 
-void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
-{
-	current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
-}
-EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
-
-void lockdep_clear_current_reclaim_state(void)
-{
-	current->lockdep_reclaim_gfp = 0;
-}
-EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state);
-
 #ifdef CONFIG_LOCK_STAT
 static int
 print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
@@ -4484,6 +4619,12 @@ asmlinkage __visible void lockdep_sys_exit(void)
 				curr->comm, curr->pid);
 		lockdep_print_held_locks(curr);
 	}
+
+	/*
+	 * The lock history for each syscall should be independent. So wipe the
+	 * slate clean on return to userspace.
+	 */
+	lockdep_invariant_state(false);
 }
 
 void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
@@ -4532,3 +4673,488 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 	dump_stack();
 }
 EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
+
+#ifdef CONFIG_LOCKDEP_CROSSRELEASE
+
+/*
+ * Crossrelease works by recording a lock history for each thread and
+ * connecting those historic locks that were taken after the
+ * wait_for_completion() in the complete() context.
+ *
+ * Task-A				Task-B
+ *
+ *					mutex_lock(&A);
+ *					mutex_unlock(&A);
+ *
+ * wait_for_completion(&C);
+ *   lock_acquire_crosslock();
+ *     atomic_inc_return(&cross_gen_id);
+ *                                |
+ *				  |	mutex_lock(&B);
+ *				  |	mutex_unlock(&B);
+ *                                |
+ *				  |	complete(&C);
+ *				  `--	  lock_commit_crosslock();
+ *
+ * Which will then add a dependency between B and C.
+ */
+
+#define xhlock(i)         (current->xhlocks[(i) % MAX_XHLOCKS_NR])
+
+/*
+ * Whenever a crosslock is held, cross_gen_id will be increased.
+ */
+static atomic_t cross_gen_id; /* Can be wrapped */
+
+/*
+ * Make an entry of the ring buffer invalid.
+ */
+static inline void invalidate_xhlock(struct hist_lock *xhlock)
+{
+	/*
+	 * Normally, xhlock->hlock.instance must be !NULL.
+	 */
+	xhlock->hlock.instance = NULL;
+}
+
+/*
+ * Lock history stacks; we have 2 nested lock history stacks:
+ *
+ *   HARD(IRQ)
+ *   SOFT(IRQ)
+ *
+ * The thing is that once we complete a HARD/SOFT IRQ the future task locks
+ * should not depend on any of the locks observed while running the IRQ.  So
+ * what we do is rewind the history buffer and erase all our knowledge of that
+ * temporal event.
+ */
+
+void crossrelease_hist_start(enum xhlock_context_t c)
+{
+	struct task_struct *cur = current;
+
+	if (!cur->xhlocks)
+		return;
+
+	cur->xhlock_idx_hist[c] = cur->xhlock_idx;
+	cur->hist_id_save[c]    = cur->hist_id;
+}
+
+void crossrelease_hist_end(enum xhlock_context_t c)
+{
+	struct task_struct *cur = current;
+
+	if (cur->xhlocks) {
+		unsigned int idx = cur->xhlock_idx_hist[c];
+		struct hist_lock *h = &xhlock(idx);
+
+		cur->xhlock_idx = idx;
+
+		/* Check if the ring was overwritten. */
+		if (h->hist_id != cur->hist_id_save[c])
+			invalidate_xhlock(h);
+	}
+}
+
+/*
+ * lockdep_invariant_state() is used to annotate independence inside a task, to
+ * make one task look like multiple independent 'tasks'.
+ *
+ * Take for instance workqueues; each work is independent of the last. The
+ * completion of a future work does not depend on the completion of a past work
+ * (in general). Therefore we must not carry that (lock) dependency across
+ * works.
+ *
+ * This is true for many things; pretty much all kthreads fall into this
+ * pattern, where they have an invariant state and future completions do not
+ * depend on past completions. Its just that since they all have the 'same'
+ * form -- the kthread does the same over and over -- it doesn't typically
+ * matter.
+ *
+ * The same is true for system-calls, once a system call is completed (we've
+ * returned to userspace) the next system call does not depend on the lock
+ * history of the previous system call.
+ *
+ * They key property for independence, this invariant state, is that it must be
+ * a point where we hold no locks and have no history. Because if we were to
+ * hold locks, the restore at _end() would not necessarily recover it's history
+ * entry. Similarly, independence per-definition means it does not depend on
+ * prior state.
+ */
+void lockdep_invariant_state(bool force)
+{
+	/*
+	 * We call this at an invariant point, no current state, no history.
+	 * Verify the former, enforce the latter.
+	 */
+	WARN_ON_ONCE(!force && current->lockdep_depth);
+	invalidate_xhlock(&xhlock(current->xhlock_idx));
+}
+
+static int cross_lock(struct lockdep_map *lock)
+{
+	return lock ? lock->cross : 0;
+}
+
+/*
+ * This is needed to decide the relationship between wrapable variables.
+ */
+static inline int before(unsigned int a, unsigned int b)
+{
+	return (int)(a - b) < 0;
+}
+
+static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
+{
+	return hlock_class(&xhlock->hlock);
+}
+
+static inline struct lock_class *xlock_class(struct cross_lock *xlock)
+{
+	return hlock_class(&xlock->hlock);
+}
+
+/*
+ * Should we check a dependency with previous one?
+ */
+static inline int depend_before(struct held_lock *hlock)
+{
+	return hlock->read != 2 && hlock->check && !hlock->trylock;
+}
+
+/*
+ * Should we check a dependency with next one?
+ */
+static inline int depend_after(struct held_lock *hlock)
+{
+	return hlock->read != 2 && hlock->check;
+}
+
+/*
+ * Check if the xhlock is valid, which would be false if,
+ *
+ *    1. Has not used after initializaion yet.
+ *    2. Got invalidated.
+ *
+ * Remind hist_lock is implemented as a ring buffer.
+ */
+static inline int xhlock_valid(struct hist_lock *xhlock)
+{
+	/*
+	 * xhlock->hlock.instance must be !NULL.
+	 */
+	return !!xhlock->hlock.instance;
+}
+
+/*
+ * Record a hist_lock entry.
+ *
+ * Irq disable is only required.
+ */
+static void add_xhlock(struct held_lock *hlock)
+{
+	unsigned int idx = ++current->xhlock_idx;
+	struct hist_lock *xhlock = &xhlock(idx);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+	/*
+	 * This can be done locklessly because they are all task-local
+	 * state, we must however ensure IRQs are disabled.
+	 */
+	WARN_ON_ONCE(!irqs_disabled());
+#endif
+
+	/* Initialize hist_lock's members */
+	xhlock->hlock = *hlock;
+	xhlock->hist_id = ++current->hist_id;
+
+	xhlock->trace.nr_entries = 0;
+	xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
+	xhlock->trace.entries = xhlock->trace_entries;
+	xhlock->trace.skip = 3;
+	save_stack_trace(&xhlock->trace);
+}
+
+static inline int same_context_xhlock(struct hist_lock *xhlock)
+{
+	return xhlock->hlock.irq_context == task_irq_context(current);
+}
+
+/*
+ * This should be lockless as far as possible because this would be
+ * called very frequently.
+ */
+static void check_add_xhlock(struct held_lock *hlock)
+{
+	/*
+	 * Record a hist_lock, only in case that acquisitions ahead
+	 * could depend on the held_lock. For example, if the held_lock
+	 * is trylock then acquisitions ahead never depends on that.
+	 * In that case, we don't need to record it. Just return.
+	 */
+	if (!current->xhlocks || !depend_before(hlock))
+		return;
+
+	add_xhlock(hlock);
+}
+
+/*
+ * For crosslock.
+ */
+static int add_xlock(struct held_lock *hlock)
+{
+	struct cross_lock *xlock;
+	unsigned int gen_id;
+
+	if (!graph_lock())
+		return 0;
+
+	xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
+
+	/*
+	 * When acquisitions for a crosslock are overlapped, we use
+	 * nr_acquire to perform commit for them, based on cross_gen_id
+	 * of the first acquisition, which allows to add additional
+	 * dependencies.
+	 *
+	 * Moreover, when no acquisition of a crosslock is in progress,
+	 * we should not perform commit because the lock might not exist
+	 * any more, which might cause incorrect memory access. So we
+	 * have to track the number of acquisitions of a crosslock.
+	 *
+	 * depend_after() is necessary to initialize only the first
+	 * valid xlock so that the xlock can be used on its commit.
+	 */
+	if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
+		goto unlock;
+
+	gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
+	xlock->hlock = *hlock;
+	xlock->hlock.gen_id = gen_id;
+unlock:
+	graph_unlock();
+	return 1;
+}
+
+/*
+ * Called for both normal and crosslock acquires. Normal locks will be
+ * pushed on the hist_lock queue. Cross locks will record state and
+ * stop regular lock_acquire() to avoid being placed on the held_lock
+ * stack.
+ *
+ * Return: 0 - failure;
+ *         1 - crosslock, done;
+ *         2 - normal lock, continue to held_lock[] ops.
+ */
+static int lock_acquire_crosslock(struct held_lock *hlock)
+{
+	/*
+	 *	CONTEXT 1		CONTEXT 2
+	 *	---------		---------
+	 *	lock A (cross)
+	 *	X = atomic_inc_return(&cross_gen_id)
+	 *	~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+	 *				Y = atomic_read_acquire(&cross_gen_id)
+	 *				lock B
+	 *
+	 * atomic_read_acquire() is for ordering between A and B,
+	 * IOW, A happens before B, when CONTEXT 2 see Y >= X.
+	 *
+	 * Pairs with atomic_inc_return() in add_xlock().
+	 */
+	hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
+
+	if (cross_lock(hlock->instance))
+		return add_xlock(hlock);
+
+	check_add_xhlock(hlock);
+	return 2;
+}
+
+static int copy_trace(struct stack_trace *trace)
+{
+	unsigned long *buf = stack_trace + nr_stack_trace_entries;
+	unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+	unsigned int nr = min(max_nr, trace->nr_entries);
+
+	trace->nr_entries = nr;
+	memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
+	trace->entries = buf;
+	nr_stack_trace_entries += nr;
+
+	if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+		if (!debug_locks_off_graph_unlock())
+			return 0;
+
+		print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
+		dump_stack();
+
+		return 0;
+	}
+
+	return 1;
+}
+
+static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
+{
+	unsigned int xid, pid;
+	u64 chain_key;
+
+	xid = xlock_class(xlock) - lock_classes;
+	chain_key = iterate_chain_key((u64)0, xid);
+	pid = xhlock_class(xhlock) - lock_classes;
+	chain_key = iterate_chain_key(chain_key, pid);
+
+	if (lookup_chain_cache(chain_key))
+		return 1;
+
+	if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
+				chain_key))
+		return 0;
+
+	if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
+			    &xhlock->trace, copy_trace))
+		return 0;
+
+	return 1;
+}
+
+static void commit_xhlocks(struct cross_lock *xlock)
+{
+	unsigned int cur = current->xhlock_idx;
+	unsigned int prev_hist_id = xhlock(cur).hist_id;
+	unsigned int i;
+
+	if (!graph_lock())
+		return;
+
+	if (xlock->nr_acquire) {
+		for (i = 0; i < MAX_XHLOCKS_NR; i++) {
+			struct hist_lock *xhlock = &xhlock(cur - i);
+
+			if (!xhlock_valid(xhlock))
+				break;
+
+			if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
+				break;
+
+			if (!same_context_xhlock(xhlock))
+				break;
+
+			/*
+			 * Filter out the cases where the ring buffer was
+			 * overwritten and the current entry has a bigger
+			 * hist_id than the previous one, which is impossible
+			 * otherwise:
+			 */
+			if (unlikely(before(prev_hist_id, xhlock->hist_id)))
+				break;
+
+			prev_hist_id = xhlock->hist_id;
+
+			/*
+			 * commit_xhlock() returns 0 with graph_lock already
+			 * released if fail.
+			 */
+			if (!commit_xhlock(xlock, xhlock))
+				return;
+		}
+	}
+
+	graph_unlock();
+}
+
+void lock_commit_crosslock(struct lockdep_map *lock)
+{
+	struct cross_lock *xlock;
+	unsigned long flags;
+
+	if (unlikely(!debug_locks || current->lockdep_recursion))
+		return;
+
+	if (!current->xhlocks)
+		return;
+
+	/*
+	 * Do commit hist_locks with the cross_lock, only in case that
+	 * the cross_lock could depend on acquisitions after that.
+	 *
+	 * For example, if the cross_lock does not have the 'check' flag
+	 * then we don't need to check dependencies and commit for that.
+	 * Just skip it. In that case, of course, the cross_lock does
+	 * not depend on acquisitions ahead, either.
+	 *
+	 * WARNING: Don't do that in add_xlock() in advance. When an
+	 * acquisition context is different from the commit context,
+	 * invalid(skipped) cross_lock might be accessed.
+	 */
+	if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+	current->lockdep_recursion = 1;
+	xlock = &((struct lockdep_map_cross *)lock)->xlock;
+	commit_xhlocks(xlock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_commit_crosslock);
+
+/*
+ * Return: 0 - failure;
+ *         1 - crosslock, done;
+ *         2 - normal lock, continue to held_lock[] ops.
+ */
+static int lock_release_crosslock(struct lockdep_map *lock)
+{
+	if (cross_lock(lock)) {
+		if (!graph_lock())
+			return 0;
+		((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
+		graph_unlock();
+		return 1;
+	}
+	return 2;
+}
+
+static void cross_init(struct lockdep_map *lock, int cross)
+{
+	if (cross)
+		((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
+
+	lock->cross = cross;
+
+	/*
+	 * Crossrelease assumes that the ring buffer size of xhlocks
+	 * is aligned with power of 2. So force it on build.
+	 */
+	BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
+}
+
+void lockdep_init_task(struct task_struct *task)
+{
+	int i;
+
+	task->xhlock_idx = UINT_MAX;
+	task->hist_id = 0;
+
+	for (i = 0; i < XHLOCK_CTX_NR; i++) {
+		task->xhlock_idx_hist[i] = UINT_MAX;
+		task->hist_id_save[i] = 0;
+	}
+
+	task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
+				GFP_KERNEL);
+}
+
+void lockdep_free_task(struct task_struct *task)
+{
+	if (task->xhlocks) {
+		void *tmp = task->xhlocks;
+		/* Diable crossrelease for current */
+		task->xhlocks = NULL;
+		kfree(tmp);
+	}
+}
+#endif
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index c08fbd2f5ba9..1da4669d57a7 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -143,6 +143,8 @@ struct lockdep_stats {
 	int	redundant_softirqs_on;
 	int	redundant_softirqs_off;
 	int	nr_unused_locks;
+	int	nr_redundant_checks;
+	int	nr_redundant;
 	int	nr_cyclic_checks;
 	int	nr_cyclic_check_recursions;
 	int	nr_find_usage_forwards_checks;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 6d1fcc786081..68d9e267ccd4 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -201,6 +201,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
 		debug_atomic_read(chain_lookup_hits));
 	seq_printf(m, " cyclic checks:                 %11llu\n",
 		debug_atomic_read(nr_cyclic_checks));
+	seq_printf(m, " redundant checks:              %11llu\n",
+		debug_atomic_read(nr_redundant_checks));
+	seq_printf(m, " redundant links:               %11llu\n",
+		debug_atomic_read(nr_redundant));
 	seq_printf(m, " find-mask forwards checks:     %11llu\n",
 		debug_atomic_read(nr_find_usage_forwards_checks));
 	seq_printf(m, " find-mask backwards checks:    %11llu\n",
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc2b84c..35ca09f2ed0b 100644
--- a/kernel/locking/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
@@ -6,4 +6,3 @@
  */
 LOCKDEP_STATE(HARDIRQ)
 LOCKDEP_STATE(SOFTIRQ)
-LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index a3167941093b..a74ee6abd039 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -109,6 +109,19 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 
 	prev = decode_cpu(old);
 	node->prev = prev;
+
+	/*
+	 * osq_lock()			unqueue
+	 *
+	 * node->prev = prev		osq_wait_next()
+	 * WMB				MB
+	 * prev->next = node		next->prev = prev // unqueue-C
+	 *
+	 * Here 'node->prev' and 'next->prev' are the same variable and we need
+	 * to ensure these stores happen in-order to avoid corrupting the list.
+	 */
+	smp_wmb();
+
 	WRITE_ONCE(prev->next, node);
 
 	/*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 4ccfcaae5b89..43555681c40b 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
 	struct __qspinlock *l = (void *)lock;
 
 	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
-	    (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+	    (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
 		qstat_inc(qstat_pv_lock_stealing, true);
 		return true;
 	}
@@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock)
 
 /*
  * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
- * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
- * just to be sure that it will get it.
+ * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the
+ * lock just to be sure that it will get it.
  */
 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 {
 	struct __qspinlock *l = (void *)lock;
 
 	return !READ_ONCE(l->locked) &&
-	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
-			== _Q_PENDING_VAL);
+	       (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
+				_Q_LOCKED_VAL) == _Q_PENDING_VAL);
 }
 #else /* _Q_PENDING_BITS == 8 */
 static __always_inline void set_pending(struct qspinlock *lock)
@@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 		 */
 		old = val;
 		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
-		val = atomic_cmpxchg(&lock->val, old, new);
+		val = atomic_cmpxchg_acquire(&lock->val, old, new);
 
 		if (val == old)
 			return 1;
@@ -362,8 +362,18 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	 * observe its next->locked value and advance itself.
 	 *
 	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+	 *
+	 * The write to next->locked in arch_mcs_spin_unlock_contended()
+	 * must be ordered before the read of pn->state in the cmpxchg()
+	 * below for the code to work correctly. To guarantee full ordering
+	 * irrespective of the success or failure of the cmpxchg(),
+	 * a relaxed version with explicit barrier is used. The control
+	 * dependency will order the reading of pn->state before any
+	 * subsequent writes.
 	 */
-	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+	smp_mb__before_atomic();
+	if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed)
+	    != vcpu_halted)
 		return;
 
 	/*
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 72ad45a9a794..8d039b928d61 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -40,6 +40,9 @@ struct rt_mutex_waiter {
 /*
  * Various helpers to access the waiters-tree:
  */
+
+#ifdef CONFIG_RT_MUTEXES
+
 static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
 {
 	return !RB_EMPTY_ROOT(&lock->waiters);
@@ -69,6 +72,32 @@ task_top_pi_waiter(struct task_struct *p)
 			pi_tree_entry);
 }
 
+#else
+
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+	return false;
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+	return NULL;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+	return false;
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+	return NULL;
+}
+
+#endif
+
 /*
  * lock->owner state tracking:
  */
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 20819df98125..0848634c5512 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -126,7 +126,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem)
 /*
  * get a read lock on the semaphore
  */
-void __sched __down_read(struct rw_semaphore *sem)
+int __sched __down_read_common(struct rw_semaphore *sem, int state)
 {
 	struct rwsem_waiter waiter;
 	unsigned long flags;
@@ -140,8 +140,6 @@ void __sched __down_read(struct rw_semaphore *sem)
 		goto out;
 	}
 
-	set_current_state(TASK_UNINTERRUPTIBLE);
-
 	/* set up my own style of waitqueue */
 	waiter.task = current;
 	waiter.type = RWSEM_WAITING_FOR_READ;
@@ -149,20 +147,41 @@ void __sched __down_read(struct rw_semaphore *sem)
 
 	list_add_tail(&waiter.list, &sem->wait_list);
 
-	/* we don't need to touch the semaphore struct anymore */
-	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
 	/* wait to be given the lock */
 	for (;;) {
 		if (!waiter.task)
 			break;
+		if (signal_pending_state(state, current))
+			goto out_nolock;
+		set_current_state(state);
+		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 		schedule();
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		raw_spin_lock_irqsave(&sem->wait_lock, flags);
 	}
 
-	__set_current_state(TASK_RUNNING);
+	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
  out:
-	;
+	return 0;
+
+out_nolock:
+	/*
+	 * We didn't take the lock, so that there is a writer, which
+	 * is owner or the first waiter of the sem. If it's a waiter,
+	 * it will be woken by current owner. Not need to wake anybody.
+	 */
+	list_del(&waiter.list);
+	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+	return -EINTR;
+}
+
+void __sched __down_read(struct rw_semaphore *sem)
+{
+	__down_read_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+int __sched __down_read_killable(struct rw_semaphore *sem)
+{
+	return __down_read_common(sem, TASK_KILLABLE);
 }
 
 /*
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 34e727f18e49..02f660666ab8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -221,8 +221,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
 /*
  * Wait for the read lock to be granted
  */
-__visible
-struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
+static inline struct rw_semaphore __sched *
+__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
 {
 	long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
 	struct rwsem_waiter waiter;
@@ -255,17 +255,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 
 	/* wait to be given the lock */
 	while (true) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		set_current_state(state);
 		if (!waiter.task)
 			break;
+		if (signal_pending_state(state, current)) {
+			raw_spin_lock_irq(&sem->wait_lock);
+			if (waiter.task)
+				goto out_nolock;
+			raw_spin_unlock_irq(&sem->wait_lock);
+			break;
+		}
 		schedule();
 	}
 
 	__set_current_state(TASK_RUNNING);
 	return sem;
+out_nolock:
+	list_del(&waiter.list);
+	if (list_empty(&sem->wait_list))
+		atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
+	raw_spin_unlock_irq(&sem->wait_lock);
+	__set_current_state(TASK_RUNNING);
+	return ERR_PTR(-EINTR);
+}
+
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed(struct rw_semaphore *sem)
+{
+	return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(rwsem_down_read_failed);
 
+__visible struct rw_semaphore * __sched
+rwsem_down_read_failed_killable(struct rw_semaphore *sem)
+{
+	return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_down_read_failed_killable);
+
 /*
  * This function must be called with the sem->wait_lock held to prevent
  * race conditions between checking the rwsem wait list and setting the
diff --git a/kernel/panic.c b/kernel/panic.c
index a58932b41700..bdd18afa19a4 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -26,6 +26,7 @@
 #include <linux/nmi.h>
 #include <linux/console.h>
 #include <linux/bug.h>
+#include <linux/ratelimit.h>
 
 #define PANIC_TIMER_STEP 100
 #define PANIC_BLINK_SPD 18
@@ -601,6 +602,17 @@ EXPORT_SYMBOL(__stack_chk_fail);
 
 #endif
 
+#ifdef CONFIG_ARCH_HAS_REFCOUNT
+void refcount_error_report(struct pt_regs *regs, const char *err)
+{
+	WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n",
+		err, (void *)instruction_pointer(regs),
+		current->comm, task_pid_nr(current),
+		from_kuid_munged(&init_user_ns, current_uid()),
+		from_kuid_munged(&init_user_ns, current_euid()));
+}
+#endif
+
 core_param(panic, panic_timeout, int, 0644);
 core_param(pause_on_oops, pause_on_oops, int, 0644);
 core_param(panic_on_warn, panic_on_warn, int, 0644);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 13fc5ae9bf2f..566b6ec7b6fe 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -32,6 +32,12 @@ void complete(struct completion *x)
 	unsigned long flags;
 
 	spin_lock_irqsave(&x->wait.lock, flags);
+
+	/*
+	 * Perform commit of crossrelease here.
+	 */
+	complete_release_commit(x);
+
 	if (x->done != UINT_MAX)
 		x->done++;
 	__wake_up_locked(&x->wait, TASK_NORMAL, 1);
@@ -92,9 +98,14 @@ __wait_for_common(struct completion *x,
 {
 	might_sleep();
 
+	complete_acquire(x);
+
 	spin_lock_irq(&x->wait.lock);
 	timeout = do_wait_for_common(x, action, timeout, state);
 	spin_unlock_irq(&x->wait.lock);
+
+	complete_release(x);
+
 	return timeout;
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0869b20fba81..9fece583a1f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1967,8 +1967,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * reordered with p->state check below. This pairs with mb() in
 	 * set_current_state() the waiting thread does.
 	 */
-	smp_mb__before_spinlock();
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	smp_mb__after_spinlock();
 	if (!(p->state & state))
 		goto out;
 
@@ -3281,8 +3281,8 @@ static void __sched notrace __schedule(bool preempt)
 	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
 	 * done by the caller to avoid the race with signal_wake_up().
 	 */
-	smp_mb__before_spinlock();
 	rq_lock(rq, &rf);
+	smp_mb__after_spinlock();
 
 	/* Promote REQ to ACT */
 	rq->clock_update_flags <<= 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eeef1a3086d1..f29a7d2b57e1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -769,7 +769,7 @@ struct rq {
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
-	struct call_single_data hrtick_csd;
+	call_single_data_t hrtick_csd;
 #endif
 	struct hrtimer hrtick_timer;
 #endif
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 3d5610dcce11..2227e183e202 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -33,9 +33,6 @@ void swake_up(struct swait_queue_head *q)
 {
 	unsigned long flags;
 
-	if (!swait_active(q))
-		return;
-
 	raw_spin_lock_irqsave(&q->lock, flags);
 	swake_up_locked(q);
 	raw_spin_unlock_irqrestore(&q->lock, flags);
@@ -51,9 +48,6 @@ void swake_up_all(struct swait_queue_head *q)
 	struct swait_queue *curr;
 	LIST_HEAD(tmp);
 
-	if (!swait_active(q))
-		return;
-
 	raw_spin_lock_irq(&q->lock);
 	list_splice_init(&q->task_list, &tmp);
 	while (!list_empty(&tmp)) {
diff --git a/kernel/smp.c b/kernel/smp.c
index 3061483cb3ad..81cfca9b4cc3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -28,7 +28,7 @@ enum {
 };
 
 struct call_function_data {
-	struct call_single_data	__percpu *csd;
+	call_single_data_t	__percpu *csd;
 	cpumask_var_t		cpumask;
 	cpumask_var_t		cpumask_ipi;
 };
@@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu)
 		free_cpumask_var(cfd->cpumask);
 		return -ENOMEM;
 	}
-	cfd->csd = alloc_percpu(struct call_single_data);
+	cfd->csd = alloc_percpu(call_single_data_t);
 	if (!cfd->csd) {
 		free_cpumask_var(cfd->cpumask);
 		free_cpumask_var(cfd->cpumask_ipi);
@@ -103,12 +103,12 @@ void __init call_function_init(void)
  * previous function call. For multi-cpu calls its even more interesting
  * as we'll have to ensure no other cpu is observing our csd.
  */
-static __always_inline void csd_lock_wait(struct call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
 {
 	smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
 }
 
-static __always_inline void csd_lock(struct call_single_data *csd)
+static __always_inline void csd_lock(call_single_data_t *csd)
 {
 	csd_lock_wait(csd);
 	csd->flags |= CSD_FLAG_LOCK;
@@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd)
 	/*
 	 * prevent CPU from reordering the above assignment
 	 * to ->flags with any subsequent assignments to other
-	 * fields of the specified call_single_data structure:
+	 * fields of the specified call_single_data_t structure:
 	 */
 	smp_wmb();
 }
 
-static __always_inline void csd_unlock(struct call_single_data *csd)
+static __always_inline void csd_unlock(call_single_data_t *csd)
 {
 	WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
 
@@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd)
 	smp_store_release(&csd->flags, 0);
 }
 
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
+static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
 
 /*
- * Insert a previously allocated call_single_data element
+ * Insert a previously allocated call_single_data_t element
  * for execution on the given CPU. data must already have
  * ->func, ->info, and ->flags set.
  */
-static int generic_exec_single(int cpu, struct call_single_data *csd,
+static int generic_exec_single(int cpu, call_single_data_t *csd,
 			       smp_call_func_t func, void *info)
 {
 	if (cpu == smp_processor_id()) {
@@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 {
 	struct llist_head *head;
 	struct llist_node *entry;
-	struct call_single_data *csd, *csd_next;
+	call_single_data_t *csd, *csd_next;
 	static bool warned;
 
 	WARN_ON(!irqs_disabled());
@@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
 			     int wait)
 {
-	struct call_single_data *csd;
-	struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS };
+	call_single_data_t *csd;
+	call_single_data_t csd_stack = {
+		.flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS,
+	};
 	int this_cpu;
 	int err;
 
@@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single);
  * NOTE: Be careful, there is unfortunately no current debugging facility to
  * validate the correctness of this serialization.
  */
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 {
 	int err = 0;
 
@@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	cpumask_clear(cfd->cpumask_ipi);
 	for_each_cpu(cpu, cfd->cpumask) {
-		struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
+		call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
 
 		csd_lock(csd);
 		if (wait)
@@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	if (wait) {
 		for_each_cpu(cpu, cfd->cpumask) {
-			struct call_single_data *csd;
+			call_single_data_t *csd;
 
 			csd = per_cpu_ptr(cfd->csd, cpu);
 			csd_lock_wait(csd);
diff --git a/kernel/up.c b/kernel/up.c
index ee81ac9af4ca..42c46bf3e0a5 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
-int smp_call_function_single_async(int cpu, struct call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 {
 	unsigned long flags;
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca937b0c3a96..ab3c0dc8c7ed 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2091,8 +2091,30 @@ __acquires(&pool->lock)
 
 	spin_unlock_irq(&pool->lock);
 
-	lock_map_acquire_read(&pwq->wq->lockdep_map);
+	lock_map_acquire(&pwq->wq->lockdep_map);
 	lock_map_acquire(&lockdep_map);
+	/*
+	 * Strictly speaking we should mark the invariant state without holding
+	 * any locks, that is, before these two lock_map_acquire()'s.
+	 *
+	 * However, that would result in:
+	 *
+	 *   A(W1)
+	 *   WFC(C)
+	 *		A(W1)
+	 *		C(C)
+	 *
+	 * Which would create W1->C->W1 dependencies, even though there is no
+	 * actual deadlock possible. There are two solutions, using a
+	 * read-recursive acquire on the work(queue) 'locks', but this will then
+	 * hit the lockdep limitation on recursive locks, or simply discard
+	 * these locks.
+	 *
+	 * AFAICT there is no possible deadlock scenario between the
+	 * flush_work() and complete() primitives (except for single-threaded
+	 * workqueues), so hiding them isn't a problem.
+	 */
+	lockdep_invariant_state(true);
 	trace_workqueue_execute_start(work);
 	worker->current_func(work);
 	/*
@@ -2474,7 +2496,16 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
 	 */
 	INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
 	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
-	init_completion(&barr->done);
+
+	/*
+	 * Explicitly init the crosslock for wq_barrier::done, make its lock
+	 * key a subkey of the corresponding work. As a result we won't
+	 * build a dependency between wq_barrier::done and unrelated work.
+	 */
+	lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map,
+				   "(complete)wq_barr::done",
+				   target->lockdep_map.key, 1);
+	__init_completion(&barr->done);
 	barr->task = current;
 
 	/*
@@ -2815,16 +2846,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
 	spin_unlock_irq(&pool->lock);
 
 	/*
-	 * If @max_active is 1 or rescuer is in use, flushing another work
-	 * item on the same workqueue may lead to deadlock.  Make sure the
-	 * flusher is not running on the same workqueue by verifying write
-	 * access.
+	 * Force a lock recursion deadlock when using flush_work() inside a
+	 * single-threaded or rescuer equipped workqueue.
+	 *
+	 * For single threaded workqueues the deadlock happens when the work
+	 * is after the work issuing the flush_work(). For rescuer equipped
+	 * workqueues the deadlock happens when the rescuer stalls, blocking
+	 * forward progress.
 	 */
-	if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
+	if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) {
 		lock_map_acquire(&pwq->wq->lockdep_map);
-	else
-		lock_map_acquire_read(&pwq->wq->lockdep_map);
-	lock_map_release(&pwq->wq->lockdep_map);
+		lock_map_release(&pwq->wq->lockdep_map);
+	}
 
 	return true;
 already_gone:
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c617b9d1d6cb..e718df3cbd46 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1088,6 +1088,8 @@ config PROVE_LOCKING
 	select DEBUG_MUTEXES
 	select DEBUG_RT_MUTEXES if RT_MUTEXES
 	select DEBUG_LOCK_ALLOC
+	select LOCKDEP_CROSSRELEASE
+	select LOCKDEP_COMPLETIONS
 	select TRACE_IRQFLAGS
 	default n
 	help
@@ -1157,6 +1159,22 @@ config LOCK_STAT
 	 CONFIG_LOCK_STAT defines "contended" and "acquired" lock events.
 	 (CONFIG_LOCKDEP defines "acquire" and "release" events.)
 
+config LOCKDEP_CROSSRELEASE
+	bool
+	help
+	 This makes lockdep work for crosslock which is a lock allowed to
+	 be released in a different context from the acquisition context.
+	 Normally a lock must be released in the context acquiring the lock.
+	 However, relexing this constraint helps synchronization primitives
+	 such as page locks or completions can use the lock correctness
+	 detector, lockdep.
+
+config LOCKDEP_COMPLETIONS
+	bool
+	help
+	 A deadlock caused by wait_for_completion() and complete() can be
+	 detected by lockdep using crossrelease feature.
+
 config DEBUG_LOCKDEP
 	bool "Lock dependency engine debugging"
 	depends on DEBUG_KERNEL && LOCKDEP
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 6f2b135dc5e8..cd0b5c964bd0 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -363,6 +363,103 @@ static void rsem_AA3(void)
 }
 
 /*
+ * read_lock(A)
+ * spin_lock(B)
+ *		spin_lock(B)
+ *		write_lock(A)
+ */
+static void rlock_ABBA1(void)
+{
+	RL(X1);
+	L(Y1);
+	U(Y1);
+	RU(X1);
+
+	L(Y1);
+	WL(X1);
+	WU(X1);
+	U(Y1); // should fail
+}
+
+static void rwsem_ABBA1(void)
+{
+	RSL(X1);
+	ML(Y1);
+	MU(Y1);
+	RSU(X1);
+
+	ML(Y1);
+	WSL(X1);
+	WSU(X1);
+	MU(Y1); // should fail
+}
+
+/*
+ * read_lock(A)
+ * spin_lock(B)
+ *		spin_lock(B)
+ *		read_lock(A)
+ */
+static void rlock_ABBA2(void)
+{
+	RL(X1);
+	L(Y1);
+	U(Y1);
+	RU(X1);
+
+	L(Y1);
+	RL(X1);
+	RU(X1);
+	U(Y1); // should NOT fail
+}
+
+static void rwsem_ABBA2(void)
+{
+	RSL(X1);
+	ML(Y1);
+	MU(Y1);
+	RSU(X1);
+
+	ML(Y1);
+	RSL(X1);
+	RSU(X1);
+	MU(Y1); // should fail
+}
+
+
+/*
+ * write_lock(A)
+ * spin_lock(B)
+ *		spin_lock(B)
+ *		write_lock(A)
+ */
+static void rlock_ABBA3(void)
+{
+	WL(X1);
+	L(Y1);
+	U(Y1);
+	WU(X1);
+
+	L(Y1);
+	WL(X1);
+	WU(X1);
+	U(Y1); // should fail
+}
+
+static void rwsem_ABBA3(void)
+{
+	WSL(X1);
+	ML(Y1);
+	MU(Y1);
+	WSU(X1);
+
+	ML(Y1);
+	WSL(X1);
+	WSU(X1);
+	MU(Y1); // should fail
+}
+
+/*
  * ABBA deadlock:
  */
 
@@ -1056,8 +1153,6 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
 	if (debug_locks != expected) {
 		unexpected_testcase_failures++;
 		pr_cont("FAILED|");
-
-		dump_stack();
 	} else {
 		testcase_successes++;
 		pr_cont("  ok  |");
@@ -1933,6 +2028,30 @@ void locking_selftest(void)
 	dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM);
 	pr_cont("\n");
 
+	print_testname("mixed read-lock/lock-write ABBA");
+	pr_cont("             |");
+	dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK);
+	/*
+	 * Lockdep does indeed fail here, but there's nothing we can do about
+	 * that now.  Don't kill lockdep for it.
+	 */
+	unexpected_testcase_failures--;
+
+	pr_cont("             |");
+	dotest(rwsem_ABBA1, FAILURE, LOCKTYPE_RWSEM);
+
+	print_testname("mixed read-lock/lock-read ABBA");
+	pr_cont("             |");
+	dotest(rlock_ABBA2, SUCCESS, LOCKTYPE_RWLOCK);
+	pr_cont("             |");
+	dotest(rwsem_ABBA2, FAILURE, LOCKTYPE_RWSEM);
+
+	print_testname("mixed write-lock/lock-write ABBA");
+	pr_cont("             |");
+	dotest(rlock_ABBA3, FAILURE, LOCKTYPE_RWLOCK);
+	pr_cont("             |");
+	dotest(rwsem_ABBA3, FAILURE, LOCKTYPE_RWSEM);
+
 	printk("  --------------------------------------------------------------------------\n");
 
 	/*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 90731e3b7e58..3644ff918434 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1510,8 +1510,15 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 	}
 
 	/*
-	 * The page_table_lock above provides a memory barrier
-	 * with change_protection_range.
+	 * Since we took the NUMA fault, we must have observed the !accessible
+	 * bit. Make sure all other CPUs agree with that, to avoid them
+	 * modifying the page we're about to migrate.
+	 *
+	 * Must be done under PTL such that we'll observe the relevant
+	 * inc_tlb_flush_pending().
+	 *
+	 * We are not sure a pending tlb flush here is for a huge page
+	 * mapping or not. Hence use the tlb range variant
 	 */
 	if (mm_tlb_flush_pending(vma->vm_mm))
 		flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
@@ -1521,6 +1528,7 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 	 * and access rights restored.
 	 */
 	spin_unlock(vmf->ptl);
+
 	migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
 				vmf->pmd, pmd, vmf->address, page, target_nid);
 	if (migrated) {
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index ca11bc4ce205..6f319fb81718 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -267,13 +267,13 @@ static void check_memory_region(unsigned long addr,
 	check_memory_region_inline(addr, size, write, ret_ip);
 }
 
-void kasan_check_read(const void *p, unsigned int size)
+void kasan_check_read(const volatile void *p, unsigned int size)
 {
 	check_memory_region((unsigned long)p, size, false, _RET_IP_);
 }
 EXPORT_SYMBOL(kasan_check_read);
 
-void kasan_check_write(const void *p, unsigned int size)
+void kasan_check_write(const volatile void *p, unsigned int size)
 {
 	check_memory_region((unsigned long)p, size, true, _RET_IP_);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1423da8dd16f..9327a940e373 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,7 @@
 #include <linux/kthread.h>
 #include <linux/memcontrol.h>
 #include <linux/ftrace.h>
+#include <linux/lockdep.h>
 #include <linux/nmi.h>
 
 #include <asm/sections.h>
@@ -3513,6 +3514,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
 }
 #endif /* CONFIG_COMPACTION */
 
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __fs_reclaim_map =
+	STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
+
+static bool __need_fs_reclaim(gfp_t gfp_mask)
+{
+	gfp_mask = current_gfp_context(gfp_mask);
+
+	/* no reclaim without waiting on it */
+	if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
+		return false;
+
+	/* this guy won't enter reclaim */
+	if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+		return false;
+
+	/* We're only interested __GFP_FS allocations for now */
+	if (!(gfp_mask & __GFP_FS))
+		return false;
+
+	if (gfp_mask & __GFP_NOLOCKDEP)
+		return false;
+
+	return true;
+}
+
+void fs_reclaim_acquire(gfp_t gfp_mask)
+{
+	if (__need_fs_reclaim(gfp_mask))
+		lock_map_acquire(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
+
+void fs_reclaim_release(gfp_t gfp_mask)
+{
+	if (__need_fs_reclaim(gfp_mask))
+		lock_map_release(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_release);
+#endif
+
 /* Perform direct synchronous page reclaim */
 static int
 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -3527,7 +3569,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
 	noreclaim_flag = memalloc_noreclaim_save();
-	lockdep_set_current_reclaim_state(gfp_mask);
+	fs_reclaim_acquire(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
 	current->reclaim_state = &reclaim_state;
 
@@ -3535,7 +3577,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 								ac->nodemask);
 
 	current->reclaim_state = NULL;
-	lockdep_clear_current_reclaim_state();
+	fs_reclaim_release(gfp_mask);
 	memalloc_noreclaim_restore(noreclaim_flag);
 
 	cond_resched();
@@ -4064,7 +4106,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 			*alloc_flags |= ALLOC_CPUSET;
 	}
 
-	lockdep_trace_alloc(gfp_mask);
+	fs_reclaim_acquire(gfp_mask);
+	fs_reclaim_release(gfp_mask);
 
 	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
diff --git a/mm/slab.h b/mm/slab.h
index 6885e1192ec5..073362816acc 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -43,6 +43,7 @@ struct kmem_cache {
 #include <linux/kasan.h>
 #include <linux/kmemleak.h>
 #include <linux/random.h>
+#include <linux/sched/mm.h>
 
 /*
  * State of the slab allocator.
@@ -412,7 +413,10 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
 						     gfp_t flags)
 {
 	flags &= gfp_allowed_mask;
-	lockdep_trace_alloc(flags);
+
+	fs_reclaim_acquire(flags);
+	fs_reclaim_release(flags);
+
 	might_sleep_if(gfpflags_allow_blocking(flags));
 
 	if (should_failslab(s, flags))
diff --git a/mm/slob.c b/mm/slob.c
index 1bae78d71096..a8bd6fa11a66 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -432,7 +432,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
 
 	gfp &= gfp_allowed_mask;
 
-	lockdep_trace_alloc(gfp);
+	fs_reclaim_acquire(gfp);
+	fs_reclaim_release(gfp);
 
 	if (size < PAGE_SIZE - align) {
 		if (!size)
@@ -538,7 +539,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 
 	flags &= gfp_allowed_mask;
 
-	lockdep_trace_alloc(flags);
+	fs_reclaim_acquire(flags);
+	fs_reclaim_release(flags);
 
 	if (c->size < PAGE_SIZE) {
 		b = slob_alloc(c->size, flags, c->align, node);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a1af041930a6..f957afe900ec 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3525,8 +3525,6 @@ static int kswapd(void *p)
 	};
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
-	lockdep_set_current_reclaim_state(GFP_KERNEL);
-
 	if (!cpumask_empty(cpumask))
 		set_cpus_allowed_ptr(tsk, cpumask);
 	current->reclaim_state = &reclaim_state;
@@ -3585,14 +3583,15 @@ kswapd_try_sleep:
 		 */
 		trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
 						alloc_order);
+		fs_reclaim_acquire(GFP_KERNEL);
 		reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
+		fs_reclaim_release(GFP_KERNEL);
 		if (reclaim_order < alloc_order)
 			goto kswapd_try_sleep;
 	}
 
 	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
 	current->reclaim_state = NULL;
-	lockdep_clear_current_reclaim_state();
 
 	return 0;
 }
@@ -3655,14 +3654,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 	unsigned int noreclaim_flag;
 
 	noreclaim_flag = memalloc_noreclaim_save();
-	lockdep_set_current_reclaim_state(sc.gfp_mask);
+	fs_reclaim_acquire(sc.gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
 	p->reclaim_state = NULL;
-	lockdep_clear_current_reclaim_state();
+	fs_reclaim_release(sc.gfp_mask);
 	memalloc_noreclaim_restore(noreclaim_flag);
 
 	return nr_reclaimed;
@@ -3847,7 +3846,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
 	p->flags |= PF_SWAPWRITE;
-	lockdep_set_current_reclaim_state(sc.gfp_mask);
+	fs_reclaim_acquire(sc.gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
@@ -3862,9 +3861,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	}
 
 	p->reclaim_state = NULL;
+	fs_reclaim_release(gfp_mask);
 	current->flags &= ~PF_SWAPWRITE;
 	memalloc_noreclaim_restore(noreclaim_flag);
-	lockdep_clear_current_reclaim_state();
 	return sc.nr_reclaimed >= nr_pages;
 }
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 62344804baae..38e795e0c4bf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1810,8 +1810,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 static struct static_key udp_encap_needed __read_mostly;
 void udp_encap_enable(void)
 {
-	if (!static_key_enabled(&udp_encap_needed))
-		static_key_slow_inc(&udp_encap_needed);
+	static_key_enable(&udp_encap_needed);
 }
 EXPORT_SYMBOL(udp_encap_enable);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d6886228e1d0..56030d45823a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -575,8 +575,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
 static struct static_key udpv6_encap_needed __read_mostly;
 void udpv6_encap_enable(void)
 {
-	if (!static_key_enabled(&udpv6_encap_needed))
-		static_key_slow_inc(&udpv6_encap_needed);
+	static_key_enable(&udpv6_encap_needed);
 }
 EXPORT_SYMBOL(udpv6_encap_enable);
 
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index f6152c70f7f4..a18cb4496e1e 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -262,6 +262,9 @@ objtool_args = check
 ifndef CONFIG_FRAME_POINTER
 objtool_args += --no-fp
 endif
+ifdef CONFIG_GCOV_KERNEL
+objtool_args += --no-unreachable
+endif
 
 # 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory
 # 'OBJECT_FILES_NON_STANDARD_foo.o := 'y': skip objtool checking for a file
diff --git a/tools/objtool/Build b/tools/objtool/Build
index 6f2e1987c4d9..749becdf5b90 100644
--- a/tools/objtool/Build
+++ b/tools/objtool/Build
@@ -1,6 +1,9 @@
 objtool-y += arch/$(SRCARCH)/
 objtool-y += builtin-check.o
+objtool-y += builtin-orc.o
 objtool-y += check.o
+objtool-y += orc_gen.o
+objtool-y += orc_dump.o
 objtool-y += elf.o
 objtool-y += special.o
 objtool-y += objtool.o
diff --git a/tools/objtool/Documentation/stack-validation.txt b/tools/objtool/Documentation/stack-validation.txt
index 17c1195f11f4..6a1af43862df 100644
--- a/tools/objtool/Documentation/stack-validation.txt
+++ b/tools/objtool/Documentation/stack-validation.txt
@@ -11,9 +11,6 @@ analyzes every .o file and ensures the validity of its stack metadata.
 It enforces a set of rules on asm code and C inline assembly code so
 that stack traces can be reliable.
 
-Currently it only checks frame pointer usage, but there are plans to add
-CFI validation for C files and CFI generation for asm files.
-
 For each function, it recursively follows all possible code paths and
 validates the correct frame pointer state at each instruction.
 
@@ -23,6 +20,10 @@ alternative execution paths to a given instruction (or set of
 instructions).  Similarly, it knows how to follow switch statements, for
 which gcc sometimes uses jump tables.
 
+(Objtool also has an 'orc generate' subcommand which generates debuginfo
+for the ORC unwinder.  See Documentation/x86/orc-unwinder.txt in the
+kernel tree for more details.)
+
 
 Why do we need stack metadata validation?
 -----------------------------------------
@@ -93,37 +94,14 @@ a) More reliable stack traces for frame pointer enabled kernels
        or at the very end of the function after the stack frame has been
        destroyed.  This is an inherent limitation of frame pointers.
 
-b) 100% reliable stack traces for DWARF enabled kernels
-
-   (NOTE: This is not yet implemented)
-
-   As an alternative to frame pointers, DWARF Call Frame Information
-   (CFI) metadata can be used to walk the stack.  Unlike frame pointers,
-   CFI metadata is out of band.  So it doesn't affect runtime
-   performance and it can be reliable even when interrupts or exceptions
-   are involved.
-
-   For C code, gcc automatically generates DWARF CFI metadata.  But for
-   asm code, generating CFI is a tedious manual approach which requires
-   manually placed .cfi assembler macros to be scattered throughout the
-   code.  It's clumsy and very easy to get wrong, and it makes the real
-   code harder to read.
-
-   Stacktool will improve this situation in several ways.  For code
-   which already has CFI annotations, it will validate them.  For code
-   which doesn't have CFI annotations, it will generate them.  So an
-   architecture can opt to strip out all the manual .cfi annotations
-   from their asm code and have objtool generate them instead.
+b) ORC (Oops Rewind Capability) unwind table generation
 
-   We might also add a runtime stack validation debug option where we
-   periodically walk the stack from schedule() and/or an NMI to ensure
-   that the stack metadata is sane and that we reach the bottom of the
-   stack.
+   An alternative to frame pointers and DWARF, ORC unwind data can be
+   used to walk the stack.  Unlike frame pointers, ORC data is out of
+   band.  So it doesn't affect runtime performance and it can be
+   reliable even when interrupts or exceptions are involved.
 
-   So the benefit of objtool here will be that external tooling should
-   always show perfect stack traces.  And the same will be true for
-   kernel warning/oops traces if the architecture has a runtime DWARF
-   unwinder.
+   For more details, see Documentation/x86/orc-unwinder.txt.
 
 c) Higher live patching compatibility rate
 
@@ -211,7 +189,7 @@ they mean, and suggestions for how to fix them.
    function, add proper frame pointer logic using the FRAME_BEGIN and
    FRAME_END macros.  Otherwise, if it's not a callable function, remove
    its ELF function annotation by changing ENDPROC to END, and instead
-   use the manual CFI hint macros in asm/undwarf.h.
+   use the manual unwind hint macros in asm/unwind_hints.h.
 
    If it's a GCC-compiled .c file, the error may be because the function
    uses an inline asm() statement which has a "call" instruction.  An
@@ -231,8 +209,8 @@ they mean, and suggestions for how to fix them.
    If the error is for an asm file, and the instruction is inside (or
    reachable from) a callable function, the function should be annotated
    with the ENTRY/ENDPROC macros (ENDPROC is the important one).
-   Otherwise, the code should probably be annotated with the CFI hint
-   macros in asm/undwarf.h so objtool and the unwinder can know the
+   Otherwise, the code should probably be annotated with the unwind hint
+   macros in asm/unwind_hints.h so objtool and the unwinder can know the
    stack state associated with the code.
 
    If you're 100% sure the code won't affect stack traces, or if you're
@@ -258,7 +236,7 @@ they mean, and suggestions for how to fix them.
    instructions aren't allowed in a callable function, and are most
    likely part of the kernel entry code.  They should usually not have
    the callable function annotation (ENDPROC) and should always be
-   annotated with the CFI hint macros in asm/undwarf.h.
+   annotated with the unwind hint macros in asm/unwind_hints.h.
 
 
 6. file.o: warning: objtool: func()+0x26: sibling call from callable instruction with modified stack frame
@@ -272,7 +250,7 @@ they mean, and suggestions for how to fix them.
 
    If the instruction is not actually in a callable function (e.g.
    kernel entry code), change ENDPROC to END and annotate manually with
-   the CFI hint macros in asm/undwarf.h.
+   the unwind hint macros in asm/unwind_hints.h.
 
 
 7. file: warning: objtool: func()+0x5c: stack state mismatch
@@ -288,8 +266,8 @@ they mean, and suggestions for how to fix them.
 
    Another possibility is that the code has some asm or inline asm which
    does some unusual things to the stack or the frame pointer.  In such
-   cases it's probably appropriate to use the CFI hint macros in
-   asm/undwarf.h.
+   cases it's probably appropriate to use the unwind hint macros in
+   asm/unwind_hints.h.
 
 
 8. file.o: warning: objtool: funcA() falls through to next function funcB()
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 0e2765e243c0..3a6425fefc43 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -52,6 +52,9 @@ $(OBJTOOL): $(LIBSUBCMD) $(OBJTOOL_IN)
 	diff -I'^#include' arch/x86/insn/inat.h ../../arch/x86/include/asm/inat.h >/dev/null && \
 	diff -I'^#include' arch/x86/insn/inat_types.h ../../arch/x86/include/asm/inat_types.h >/dev/null) \
 	|| echo "warning: objtool: x86 instruction decoder differs from kernel" >&2 )) || true
+	@(test -d ../../kernel -a -d ../../tools -a -d ../objtool && (( \
+	diff ../../arch/x86/include/asm/orc_types.h orc_types.h >/dev/null) \
+	|| echo "warning: objtool: orc_types.h differs from kernel" >&2 )) || true
 	$(QUIET_LINK)$(CC) $(OBJTOOL_IN) $(LDFLAGS) -o $@
 
 
diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c
index 365c34ecab26..57254f5b2779 100644
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -29,7 +29,7 @@
 #include "builtin.h"
 #include "check.h"
 
-bool nofp;
+bool no_fp, no_unreachable;
 
 static const char * const check_usage[] = {
 	"objtool check [<options>] file.o",
@@ -37,7 +37,8 @@ static const char * const check_usage[] = {
 };
 
 const struct option check_options[] = {
-	OPT_BOOLEAN('f', "no-fp", &nofp, "Skip frame pointer validation"),
+	OPT_BOOLEAN('f', "no-fp", &no_fp, "Skip frame pointer validation"),
+	OPT_BOOLEAN('u', "no-unreachable", &no_unreachable, "Skip 'unreachable instruction' warnings"),
 	OPT_END(),
 };
 
@@ -52,5 +53,5 @@ int cmd_check(int argc, const char **argv)
 
 	objname = argv[0];
 
-	return check(objname, nofp);
+	return check(objname, no_fp, no_unreachable, false);
 }
diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c
new file mode 100644
index 000000000000..4c6b5c9ef073
--- /dev/null
+++ b/tools/objtool/builtin-orc.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * objtool orc:
+ *
+ * This command analyzes a .o file and adds .orc_unwind and .orc_unwind_ip
+ * sections to it, which is used by the in-kernel ORC unwinder.
+ *
+ * This command is a superset of "objtool check".
+ */
+
+#include <string.h>
+#include <subcmd/parse-options.h>
+#include "builtin.h"
+#include "check.h"
+
+
+static const char *orc_usage[] = {
+	"objtool orc generate [<options>] file.o",
+	"objtool orc dump file.o",
+	NULL,
+};
+
+extern const struct option check_options[];
+extern bool no_fp, no_unreachable;
+
+int cmd_orc(int argc, const char **argv)
+{
+	const char *objname;
+
+	argc--; argv++;
+	if (!strncmp(argv[0], "gen", 3)) {
+		argc = parse_options(argc, argv, check_options, orc_usage, 0);
+		if (argc != 1)
+			usage_with_options(orc_usage, check_options);
+
+		objname = argv[0];
+
+		return check(objname, no_fp, no_unreachable, true);
+
+	}
+
+	if (!strcmp(argv[0], "dump")) {
+		if (argc != 2)
+			usage_with_options(orc_usage, check_options);
+
+		objname = argv[1];
+
+		return orc_dump(objname);
+	}
+
+	usage_with_options(orc_usage, check_options);
+
+	return 0;
+}
diff --git a/tools/objtool/builtin.h b/tools/objtool/builtin.h
index 34d2ba78a616..dd526067fed5 100644
--- a/tools/objtool/builtin.h
+++ b/tools/objtool/builtin.h
@@ -18,5 +18,6 @@
 #define _BUILTIN_H
 
 extern int cmd_check(int argc, const char **argv);
+extern int cmd_orc(int argc, const char **argv);
 
 #endif /* _BUILTIN_H */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 2c6d74880403..3436a942b606 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -33,11 +33,11 @@ struct alternative {
 };
 
 const char *objname;
-static bool nofp;
+static bool no_fp;
 struct cfi_state initial_func_cfi;
 
-static struct instruction *find_insn(struct objtool_file *file,
-				     struct section *sec, unsigned long offset)
+struct instruction *find_insn(struct objtool_file *file,
+			      struct section *sec, unsigned long offset)
 {
 	struct instruction *insn;
 
@@ -59,19 +59,6 @@ static struct instruction *next_insn_same_sec(struct objtool_file *file,
 	return next;
 }
 
-static bool gcov_enabled(struct objtool_file *file)
-{
-	struct section *sec;
-	struct symbol *sym;
-
-	for_each_sec(file, sec)
-		list_for_each_entry(sym, &sec->symbol_list, list)
-			if (!strncmp(sym->name, "__gcov_.", 8))
-				return true;
-
-	return false;
-}
-
 #define func_for_each_insn(file, func, insn)				\
 	for (insn = find_insn(file, func->sec, func->offset);		\
 	     insn && &insn->list != &file->insn_list &&			\
@@ -100,7 +87,6 @@ static bool gcov_enabled(struct objtool_file *file)
 static bool ignore_func(struct objtool_file *file, struct symbol *func)
 {
 	struct rela *rela;
-	struct instruction *insn;
 
 	/* check for STACK_FRAME_NON_STANDARD */
 	if (file->whitelist && file->whitelist->rela)
@@ -113,11 +99,6 @@ static bool ignore_func(struct objtool_file *file, struct symbol *func)
 				return true;
 		}
 
-	/* check if it has a context switching instruction */
-	func_for_each_insn(file, func, insn)
-		if (insn->type == INSN_CONTEXT_SWITCH)
-			return true;
-
 	return false;
 }
 
@@ -259,6 +240,11 @@ static int decode_instructions(struct objtool_file *file)
 		if (!(sec->sh.sh_flags & SHF_EXECINSTR))
 			continue;
 
+		if (strcmp(sec->name, ".altinstr_replacement") &&
+		    strcmp(sec->name, ".altinstr_aux") &&
+		    strncmp(sec->name, ".discard.", 9))
+			sec->text = true;
+
 		for (offset = 0; offset < sec->len; offset += insn->len) {
 			insn = malloc(sizeof(*insn));
 			if (!insn) {
@@ -874,6 +860,99 @@ static int add_switch_table_alts(struct objtool_file *file)
 	return 0;
 }
 
+static int read_unwind_hints(struct objtool_file *file)
+{
+	struct section *sec, *relasec;
+	struct rela *rela;
+	struct unwind_hint *hint;
+	struct instruction *insn;
+	struct cfi_reg *cfa;
+	int i;
+
+	sec = find_section_by_name(file->elf, ".discard.unwind_hints");
+	if (!sec)
+		return 0;
+
+	relasec = sec->rela;
+	if (!relasec) {
+		WARN("missing .rela.discard.unwind_hints section");
+		return -1;
+	}
+
+	if (sec->len % sizeof(struct unwind_hint)) {
+		WARN("struct unwind_hint size mismatch");
+		return -1;
+	}
+
+	file->hints = true;
+
+	for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) {
+		hint = (struct unwind_hint *)sec->data->d_buf + i;
+
+		rela = find_rela_by_dest(sec, i * sizeof(*hint));
+		if (!rela) {
+			WARN("can't find rela for unwind_hints[%d]", i);
+			return -1;
+		}
+
+		insn = find_insn(file, rela->sym->sec, rela->addend);
+		if (!insn) {
+			WARN("can't find insn for unwind_hints[%d]", i);
+			return -1;
+		}
+
+		cfa = &insn->state.cfa;
+
+		if (hint->type == UNWIND_HINT_TYPE_SAVE) {
+			insn->save = true;
+			continue;
+
+		} else if (hint->type == UNWIND_HINT_TYPE_RESTORE) {
+			insn->restore = true;
+			insn->hint = true;
+			continue;
+		}
+
+		insn->hint = true;
+
+		switch (hint->sp_reg) {
+		case ORC_REG_UNDEFINED:
+			cfa->base = CFI_UNDEFINED;
+			break;
+		case ORC_REG_SP:
+			cfa->base = CFI_SP;
+			break;
+		case ORC_REG_BP:
+			cfa->base = CFI_BP;
+			break;
+		case ORC_REG_SP_INDIRECT:
+			cfa->base = CFI_SP_INDIRECT;
+			break;
+		case ORC_REG_R10:
+			cfa->base = CFI_R10;
+			break;
+		case ORC_REG_R13:
+			cfa->base = CFI_R13;
+			break;
+		case ORC_REG_DI:
+			cfa->base = CFI_DI;
+			break;
+		case ORC_REG_DX:
+			cfa->base = CFI_DX;
+			break;
+		default:
+			WARN_FUNC("unsupported unwind_hint sp base reg %d",
+				  insn->sec, insn->offset, hint->sp_reg);
+			return -1;
+		}
+
+		cfa->offset = hint->sp_offset;
+		insn->state.type = hint->type;
+	}
+
+	return 0;
+}
+
 static int decode_sections(struct objtool_file *file)
 {
 	int ret;
@@ -904,6 +983,10 @@ static int decode_sections(struct objtool_file *file)
 	if (ret)
 		return ret;
 
+	ret = read_unwind_hints(file);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 
@@ -947,6 +1030,30 @@ static bool has_valid_stack_frame(struct insn_state *state)
 	return false;
 }
 
+static int update_insn_state_regs(struct instruction *insn, struct insn_state *state)
+{
+	struct cfi_reg *cfa = &state->cfa;
+	struct stack_op *op = &insn->stack_op;
+
+	if (cfa->base != CFI_SP)
+		return 0;
+
+	/* push */
+	if (op->dest.type == OP_DEST_PUSH)
+		cfa->offset += 8;
+
+	/* pop */
+	if (op->src.type == OP_SRC_POP)
+		cfa->offset -= 8;
+
+	/* add immediate to sp */
+	if (op->dest.type == OP_DEST_REG && op->src.type == OP_SRC_ADD &&
+	    op->dest.reg == CFI_SP && op->src.reg == CFI_SP)
+		cfa->offset -= op->src.offset;
+
+	return 0;
+}
+
 static void save_reg(struct insn_state *state, unsigned char reg, int base,
 		     int offset)
 {
@@ -1032,6 +1139,9 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
 		return 0;
 	}
 
+	if (state->type == ORC_TYPE_REGS || state->type == ORC_TYPE_REGS_IRET)
+		return update_insn_state_regs(insn, state);
+
 	switch (op->dest.type) {
 
 	case OP_DEST_REG:
@@ -1051,7 +1161,7 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
 				regs[CFI_BP].base = CFI_BP;
 				regs[CFI_BP].offset = -state->stack_size;
 				state->bp_scratch = false;
-			} else if (!nofp) {
+			} else if (!no_fp) {
 
 				WARN_FUNC("unknown stack-related register move",
 					  insn->sec, insn->offset);
@@ -1222,7 +1332,7 @@ static int update_insn_state(struct instruction *insn, struct insn_state *state)
 		}
 
 		/* detect when asm code uses rbp as a scratch register */
-		if (!nofp && insn->func && op->src.reg == CFI_BP &&
+		if (!no_fp && insn->func && op->src.reg == CFI_BP &&
 		    cfa->base != CFI_BP)
 			state->bp_scratch = true;
 		break;
@@ -1323,6 +1433,10 @@ static bool insn_state_match(struct instruction *insn, struct insn_state *state)
 			break;
 		}
 
+	} else if (state1->type != state2->type) {
+		WARN_FUNC("stack state mismatch: type1=%d type2=%d",
+			  insn->sec, insn->offset, state1->type, state2->type);
+
 	} else if (state1->drap != state2->drap ||
 		 (state1->drap && state1->drap_reg != state2->drap_reg)) {
 		WARN_FUNC("stack state mismatch: drap1=%d(%d) drap2=%d(%d)",
@@ -1346,7 +1460,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 			   struct insn_state state)
 {
 	struct alternative *alt;
-	struct instruction *insn;
+	struct instruction *insn, *next_insn;
 	struct section *sec;
 	struct symbol *func = NULL;
 	int ret;
@@ -1361,6 +1475,8 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 	}
 
 	while (1) {
+		next_insn = next_insn_same_sec(file, insn);
+
 		if (file->c_file && insn->func) {
 			if (func && func != insn->func) {
 				WARN("%s() falls through to next function %s()",
@@ -1378,13 +1494,54 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 		}
 
 		if (insn->visited) {
-			if (!!insn_state_match(insn, &state))
+			if (!insn->hint && !insn_state_match(insn, &state))
 				return 1;
 
 			return 0;
 		}
 
-		insn->state = state;
+		if (insn->hint) {
+			if (insn->restore) {
+				struct instruction *save_insn, *i;
+
+				i = insn;
+				save_insn = NULL;
+				func_for_each_insn_continue_reverse(file, func, i) {
+					if (i->save) {
+						save_insn = i;
+						break;
+					}
+				}
+
+				if (!save_insn) {
+					WARN_FUNC("no corresponding CFI save for CFI restore",
+						  sec, insn->offset);
+					return 1;
+				}
+
+				if (!save_insn->visited) {
+					/*
+					 * Oops, no state to copy yet.
+					 * Hopefully we can reach this
+					 * instruction from another branch
+					 * after the save insn has been
+					 * visited.
+					 */
+					if (insn == first)
+						return 0;
+
+					WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo",
+						  sec, insn->offset);
+					return 1;
+				}
+
+				insn->state = save_insn->state;
+			}
+
+			state = insn->state;
+
+		} else
+			insn->state = state;
 
 		insn->visited = true;
 
@@ -1423,7 +1580,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 
 			/* fallthrough */
 		case INSN_CALL_DYNAMIC:
-			if (!nofp && func && !has_valid_stack_frame(&state)) {
+			if (!no_fp && func && !has_valid_stack_frame(&state)) {
 				WARN_FUNC("call without frame pointer save/setup",
 					  sec, insn->offset);
 				return 1;
@@ -1461,6 +1618,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 
 			return 0;
 
+		case INSN_CONTEXT_SWITCH:
+			if (func && (!next_insn || !next_insn->hint)) {
+				WARN_FUNC("unsupported instruction in callable function",
+					  sec, insn->offset);
+				return 1;
+			}
+			return 0;
+
 		case INSN_STACK:
 			if (update_insn_state(insn, &state))
 				return -1;
@@ -1474,7 +1639,7 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 		if (insn->dead_end)
 			return 0;
 
-		insn = next_insn_same_sec(file, insn);
+		insn = next_insn;
 		if (!insn) {
 			WARN("%s: unexpected end of section", sec->name);
 			return 1;
@@ -1484,6 +1649,27 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 	return 0;
 }
 
+static int validate_unwind_hints(struct objtool_file *file)
+{
+	struct instruction *insn;
+	int ret, warnings = 0;
+	struct insn_state state;
+
+	if (!file->hints)
+		return 0;
+
+	clear_insn_state(&state);
+
+	for_each_insn(file, insn) {
+		if (insn->hint && !insn->visited) {
+			ret = validate_branch(file, insn, state);
+			warnings += ret;
+		}
+	}
+
+	return warnings;
+}
+
 static bool is_kasan_insn(struct instruction *insn)
 {
 	return (insn->type == INSN_CALL &&
@@ -1580,15 +1766,6 @@ static int validate_reachable_instructions(struct objtool_file *file)
 		if (insn->visited || ignore_unreachable_insn(insn))
 			continue;
 
-		/*
-		 * gcov produces a lot of unreachable instructions.  If we get
-		 * an unreachable warning and the file has gcov enabled, just
-		 * ignore it, and all other such warnings for the file.  Do
-		 * this here because this is an expensive function.
-		 */
-		if (gcov_enabled(file))
-			return 0;
-
 		WARN_FUNC("unreachable instruction", insn->sec, insn->offset);
 		return 1;
 	}
@@ -1613,15 +1790,15 @@ static void cleanup(struct objtool_file *file)
 	elf_close(file->elf);
 }
 
-int check(const char *_objname, bool _nofp)
+int check(const char *_objname, bool _no_fp, bool no_unreachable, bool orc)
 {
 	struct objtool_file file;
 	int ret, warnings = 0;
 
 	objname = _objname;
-	nofp = _nofp;
+	no_fp = _no_fp;
 
-	file.elf = elf_open(objname);
+	file.elf = elf_open(objname, orc ? O_RDWR : O_RDONLY);
 	if (!file.elf)
 		return 1;
 
@@ -1629,8 +1806,9 @@ int check(const char *_objname, bool _nofp)
 	hash_init(file.insn_hash);
 	file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard");
 	file.rodata = find_section_by_name(file.elf, ".rodata");
-	file.ignore_unreachables = false;
 	file.c_file = find_section_by_name(file.elf, ".comment");
+	file.ignore_unreachables = no_unreachable;
+	file.hints = false;
 
 	arch_initial_func_cfi_state(&initial_func_cfi);
 
@@ -1647,6 +1825,11 @@ int check(const char *_objname, bool _nofp)
 		goto out;
 	warnings += ret;
 
+	ret = validate_unwind_hints(&file);
+	if (ret < 0)
+		goto out;
+	warnings += ret;
+
 	if (!warnings) {
 		ret = validate_reachable_instructions(&file);
 		if (ret < 0)
@@ -1654,6 +1837,20 @@ int check(const char *_objname, bool _nofp)
 		warnings += ret;
 	}
 
+	if (orc) {
+		ret = create_orc(&file);
+		if (ret < 0)
+			goto out;
+
+		ret = create_orc_sections(&file);
+		if (ret < 0)
+			goto out;
+
+		ret = elf_write(file.elf);
+		if (ret < 0)
+			goto out;
+	}
+
 out:
 	cleanup(&file);
 
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index da85f5b00ec6..c9af11f0c8af 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -22,12 +22,14 @@
 #include "elf.h"
 #include "cfi.h"
 #include "arch.h"
+#include "orc.h"
 #include <linux/hashtable.h>
 
 struct insn_state {
 	struct cfi_reg cfa;
 	struct cfi_reg regs[CFI_NUM_REGS];
 	int stack_size;
+	unsigned char type;
 	bool bp_scratch;
 	bool drap;
 	int drap_reg;
@@ -41,13 +43,14 @@ struct instruction {
 	unsigned int len;
 	unsigned char type;
 	unsigned long immediate;
-	bool alt_group, visited, dead_end, ignore;
+	bool alt_group, visited, dead_end, ignore, hint, save, restore;
 	struct symbol *call_dest;
 	struct instruction *jump_dest;
 	struct list_head alts;
 	struct symbol *func;
 	struct stack_op stack_op;
 	struct insn_state state;
+	struct orc_entry orc;
 };
 
 struct objtool_file {
@@ -55,12 +58,22 @@ struct objtool_file {
 	struct list_head insn_list;
 	DECLARE_HASHTABLE(insn_hash, 16);
 	struct section *rodata, *whitelist;
-	bool ignore_unreachables, c_file;
+	bool ignore_unreachables, c_file, hints;
 };
 
-int check(const char *objname, bool nofp);
+int check(const char *objname, bool no_fp, bool no_unreachable, bool orc);
+
+struct instruction *find_insn(struct objtool_file *file,
+			      struct section *sec, unsigned long offset);
 
 #define for_each_insn(file, insn)					\
 	list_for_each_entry(insn, &file->insn_list, list)
 
+#define sec_for_each_insn(file, sec, insn)				\
+	for (insn = find_insn(file, sec, 0);				\
+	     insn && &insn->list != &file->insn_list &&			\
+			insn->sec == sec;				\
+	     insn = list_next_entry(insn, list))
+
+
 #endif /* _CHECK_H */
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 1a7e8aa2af58..6e9f980a7d26 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -30,16 +30,6 @@
 #include "elf.h"
 #include "warn.h"
 
-/*
- * Fallback for systems without this "read, mmaping if possible" cmd.
- */
-#ifndef ELF_C_READ_MMAP
-#define ELF_C_READ_MMAP ELF_C_READ
-#endif
-
-#define WARN_ELF(format, ...)					\
-	WARN(format ": %s", ##__VA_ARGS__, elf_errmsg(-1))
-
 struct section *find_section_by_name(struct elf *elf, const char *name)
 {
 	struct section *sec;
@@ -349,9 +339,10 @@ static int read_relas(struct elf *elf)
 	return 0;
 }
 
-struct elf *elf_open(const char *name)
+struct elf *elf_open(const char *name, int flags)
 {
 	struct elf *elf;
+	Elf_Cmd cmd;
 
 	elf_version(EV_CURRENT);
 
@@ -364,13 +355,20 @@ struct elf *elf_open(const char *name)
 
 	INIT_LIST_HEAD(&elf->sections);
 
-	elf->fd = open(name, O_RDONLY);
+	elf->fd = open(name, flags);
 	if (elf->fd == -1) {
 		perror("open");
 		goto err;
 	}
 
-	elf->elf = elf_begin(elf->fd, ELF_C_READ_MMAP, NULL);
+	if ((flags & O_ACCMODE) == O_RDONLY)
+		cmd = ELF_C_READ_MMAP;
+	else if ((flags & O_ACCMODE) == O_RDWR)
+		cmd = ELF_C_RDWR;
+	else /* O_WRONLY */
+		cmd = ELF_C_WRITE;
+
+	elf->elf = elf_begin(elf->fd, cmd, NULL);
 	if (!elf->elf) {
 		WARN_ELF("elf_begin");
 		goto err;
@@ -397,6 +395,194 @@ err:
 	return NULL;
 }
 
+struct section *elf_create_section(struct elf *elf, const char *name,
+				   size_t entsize, int nr)
+{
+	struct section *sec, *shstrtab;
+	size_t size = entsize * nr;
+	struct Elf_Scn *s;
+	Elf_Data *data;
+
+	sec = malloc(sizeof(*sec));
+	if (!sec) {
+		perror("malloc");
+		return NULL;
+	}
+	memset(sec, 0, sizeof(*sec));
+
+	INIT_LIST_HEAD(&sec->symbol_list);
+	INIT_LIST_HEAD(&sec->rela_list);
+	hash_init(sec->rela_hash);
+	hash_init(sec->symbol_hash);
+
+	list_add_tail(&sec->list, &elf->sections);
+
+	s = elf_newscn(elf->elf);
+	if (!s) {
+		WARN_ELF("elf_newscn");
+		return NULL;
+	}
+
+	sec->name = strdup(name);
+	if (!sec->name) {
+		perror("strdup");
+		return NULL;
+	}
+
+	sec->idx = elf_ndxscn(s);
+	sec->len = size;
+	sec->changed = true;
+
+	sec->data = elf_newdata(s);
+	if (!sec->data) {
+		WARN_ELF("elf_newdata");
+		return NULL;
+	}
+
+	sec->data->d_size = size;
+	sec->data->d_align = 1;
+
+	if (size) {
+		sec->data->d_buf = malloc(size);
+		if (!sec->data->d_buf) {
+			perror("malloc");
+			return NULL;
+		}
+		memset(sec->data->d_buf, 0, size);
+	}
+
+	if (!gelf_getshdr(s, &sec->sh)) {
+		WARN_ELF("gelf_getshdr");
+		return NULL;
+	}
+
+	sec->sh.sh_size = size;
+	sec->sh.sh_entsize = entsize;
+	sec->sh.sh_type = SHT_PROGBITS;
+	sec->sh.sh_addralign = 1;
+	sec->sh.sh_flags = SHF_ALLOC;
+
+
+	/* Add section name to .shstrtab */
+	shstrtab = find_section_by_name(elf, ".shstrtab");
+	if (!shstrtab) {
+		WARN("can't find .shstrtab section");
+		return NULL;
+	}
+
+	s = elf_getscn(elf->elf, shstrtab->idx);
+	if (!s) {
+		WARN_ELF("elf_getscn");
+		return NULL;
+	}
+
+	data = elf_newdata(s);
+	if (!data) {
+		WARN_ELF("elf_newdata");
+		return NULL;
+	}
+
+	data->d_buf = sec->name;
+	data->d_size = strlen(name) + 1;
+	data->d_align = 1;
+
+	sec->sh.sh_name = shstrtab->len;
+
+	shstrtab->len += strlen(name) + 1;
+	shstrtab->changed = true;
+
+	return sec;
+}
+
+struct section *elf_create_rela_section(struct elf *elf, struct section *base)
+{
+	char *relaname;
+	struct section *sec;
+
+	relaname = malloc(strlen(base->name) + strlen(".rela") + 1);
+	if (!relaname) {
+		perror("malloc");
+		return NULL;
+	}
+	strcpy(relaname, ".rela");
+	strcat(relaname, base->name);
+
+	sec = elf_create_section(elf, relaname, sizeof(GElf_Rela), 0);
+	if (!sec)
+		return NULL;
+
+	base->rela = sec;
+	sec->base = base;
+
+	sec->sh.sh_type = SHT_RELA;
+	sec->sh.sh_addralign = 8;
+	sec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx;
+	sec->sh.sh_info = base->idx;
+	sec->sh.sh_flags = SHF_INFO_LINK;
+
+	return sec;
+}
+
+int elf_rebuild_rela_section(struct section *sec)
+{
+	struct rela *rela;
+	int nr, idx = 0, size;
+	GElf_Rela *relas;
+
+	nr = 0;
+	list_for_each_entry(rela, &sec->rela_list, list)
+		nr++;
+
+	size = nr * sizeof(*relas);
+	relas = malloc(size);
+	if (!relas) {
+		perror("malloc");
+		return -1;
+	}
+
+	sec->data->d_buf = relas;
+	sec->data->d_size = size;
+
+	sec->sh.sh_size = size;
+
+	idx = 0;
+	list_for_each_entry(rela, &sec->rela_list, list) {
+		relas[idx].r_offset = rela->offset;
+		relas[idx].r_addend = rela->addend;
+		relas[idx].r_info = GELF_R_INFO(rela->sym->idx, rela->type);
+		idx++;
+	}
+
+	return 0;
+}
+
+int elf_write(struct elf *elf)
+{
+	struct section *sec;
+	Elf_Scn *s;
+
+	list_for_each_entry(sec, &elf->sections, list) {
+		if (sec->changed) {
+			s = elf_getscn(elf->elf, sec->idx);
+			if (!s) {
+				WARN_ELF("elf_getscn");
+				return -1;
+			}
+			if (!gelf_update_shdr (s, &sec->sh)) {
+				WARN_ELF("gelf_update_shdr");
+				return -1;
+			}
+		}
+	}
+
+	if (elf_update(elf->elf, ELF_C_WRITE) < 0) {
+		WARN_ELF("elf_update");
+		return -1;
+	}
+
+	return 0;
+}
+
 void elf_close(struct elf *elf)
 {
 	struct section *sec, *tmpsec;
diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
index 343968b778cb..d86e2ff14466 100644
--- a/tools/objtool/elf.h
+++ b/tools/objtool/elf.h
@@ -28,6 +28,13 @@
 # define elf_getshdrstrndx elf_getshstrndx
 #endif
 
+/*
+ * Fallback for systems without this "read, mmaping if possible" cmd.
+ */
+#ifndef ELF_C_READ_MMAP
+#define ELF_C_READ_MMAP ELF_C_READ
+#endif
+
 struct section {
 	struct list_head list;
 	GElf_Shdr sh;
@@ -41,6 +48,7 @@ struct section {
 	char *name;
 	int idx;
 	unsigned int len;
+	bool changed, text;
 };
 
 struct symbol {
@@ -75,7 +83,7 @@ struct elf {
 };
 
 
-struct elf *elf_open(const char *name);
+struct elf *elf_open(const char *name, int flags);
 struct section *find_section_by_name(struct elf *elf, const char *name);
 struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset);
 struct symbol *find_symbol_containing(struct section *sec, unsigned long offset);
@@ -83,6 +91,11 @@ struct rela *find_rela_by_dest(struct section *sec, unsigned long offset);
 struct rela *find_rela_by_dest_range(struct section *sec, unsigned long offset,
 				     unsigned int len);
 struct symbol *find_containing_func(struct section *sec, unsigned long offset);
+struct section *elf_create_section(struct elf *elf, const char *name, size_t
+				   entsize, int nr);
+struct section *elf_create_rela_section(struct elf *elf, struct section *base);
+int elf_rebuild_rela_section(struct section *sec);
+int elf_write(struct elf *elf);
 void elf_close(struct elf *elf);
 
 #define for_each_sec(file, sec)						\
diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c
index ecc5b1b5d15d..31e0f9143840 100644
--- a/tools/objtool/objtool.c
+++ b/tools/objtool/objtool.c
@@ -42,10 +42,11 @@ struct cmd_struct {
 };
 
 static const char objtool_usage_string[] =
-	"objtool [OPTIONS] COMMAND [ARGS]";
+	"objtool COMMAND [ARGS]";
 
 static struct cmd_struct objtool_cmds[] = {
 	{"check",	cmd_check,	"Perform stack metadata validation on an object file" },
+	{"orc",		cmd_orc,	"Generate in-place ORC unwind tables for an object file" },
 };
 
 bool help;
diff --git a/tools/objtool/orc.h b/tools/objtool/orc.h
new file mode 100644
index 000000000000..a4139e386ef3
--- /dev/null
+++ b/tools/objtool/orc.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ORC_H
+#define _ORC_H
+
+#include "orc_types.h"
+
+struct objtool_file;
+
+int create_orc(struct objtool_file *file);
+int create_orc_sections(struct objtool_file *file);
+
+int orc_dump(const char *objname);
+
+#endif /* _ORC_H */
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
new file mode 100644
index 000000000000..36c5bf6a2675
--- /dev/null
+++ b/tools/objtool/orc_dump.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <unistd.h>
+#include "orc.h"
+#include "warn.h"
+
+static const char *reg_name(unsigned int reg)
+{
+	switch (reg) {
+	case ORC_REG_PREV_SP:
+		return "prevsp";
+	case ORC_REG_DX:
+		return "dx";
+	case ORC_REG_DI:
+		return "di";
+	case ORC_REG_BP:
+		return "bp";
+	case ORC_REG_SP:
+		return "sp";
+	case ORC_REG_R10:
+		return "r10";
+	case ORC_REG_R13:
+		return "r13";
+	case ORC_REG_BP_INDIRECT:
+		return "bp(ind)";
+	case ORC_REG_SP_INDIRECT:
+		return "sp(ind)";
+	default:
+		return "?";
+	}
+}
+
+static const char *orc_type_name(unsigned int type)
+{
+	switch (type) {
+	case ORC_TYPE_CALL:
+		return "call";
+	case ORC_TYPE_REGS:
+		return "regs";
+	case ORC_TYPE_REGS_IRET:
+		return "iret";
+	default:
+		return "?";
+	}
+}
+
+static void print_reg(unsigned int reg, int offset)
+{
+	if (reg == ORC_REG_BP_INDIRECT)
+		printf("(bp%+d)", offset);
+	else if (reg == ORC_REG_SP_INDIRECT)
+		printf("(sp%+d)", offset);
+	else if (reg == ORC_REG_UNDEFINED)
+		printf("(und)");
+	else
+		printf("%s%+d", reg_name(reg), offset);
+}
+
+int orc_dump(const char *_objname)
+{
+	int fd, nr_entries, i, *orc_ip = NULL, orc_size = 0;
+	struct orc_entry *orc = NULL;
+	char *name;
+	unsigned long nr_sections, orc_ip_addr = 0;
+	size_t shstrtab_idx;
+	Elf *elf;
+	Elf_Scn *scn;
+	GElf_Shdr sh;
+	GElf_Rela rela;
+	GElf_Sym sym;
+	Elf_Data *data, *symtab = NULL, *rela_orc_ip = NULL;
+
+
+	objname = _objname;
+
+	elf_version(EV_CURRENT);
+
+	fd = open(objname, O_RDONLY);
+	if (fd == -1) {
+		perror("open");
+		return -1;
+	}
+
+	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+	if (!elf) {
+		WARN_ELF("elf_begin");
+		return -1;
+	}
+
+	if (elf_getshdrnum(elf, &nr_sections)) {
+		WARN_ELF("elf_getshdrnum");
+		return -1;
+	}
+
+	if (elf_getshdrstrndx(elf, &shstrtab_idx)) {
+		WARN_ELF("elf_getshdrstrndx");
+		return -1;
+	}
+
+	for (i = 0; i < nr_sections; i++) {
+		scn = elf_getscn(elf, i);
+		if (!scn) {
+			WARN_ELF("elf_getscn");
+			return -1;
+		}
+
+		if (!gelf_getshdr(scn, &sh)) {
+			WARN_ELF("gelf_getshdr");
+			return -1;
+		}
+
+		name = elf_strptr(elf, shstrtab_idx, sh.sh_name);
+		if (!name) {
+			WARN_ELF("elf_strptr");
+			return -1;
+		}
+
+		data = elf_getdata(scn, NULL);
+		if (!data) {
+			WARN_ELF("elf_getdata");
+			return -1;
+		}
+
+		if (!strcmp(name, ".symtab")) {
+			symtab = data;
+		} else if (!strcmp(name, ".orc_unwind")) {
+			orc = data->d_buf;
+			orc_size = sh.sh_size;
+		} else if (!strcmp(name, ".orc_unwind_ip")) {
+			orc_ip = data->d_buf;
+			orc_ip_addr = sh.sh_addr;
+		} else if (!strcmp(name, ".rela.orc_unwind_ip")) {
+			rela_orc_ip = data;
+		}
+	}
+
+	if (!symtab || !orc || !orc_ip)
+		return 0;
+
+	if (orc_size % sizeof(*orc) != 0) {
+		WARN("bad .orc_unwind section size");
+		return -1;
+	}
+
+	nr_entries = orc_size / sizeof(*orc);
+	for (i = 0; i < nr_entries; i++) {
+		if (rela_orc_ip) {
+			if (!gelf_getrela(rela_orc_ip, i, &rela)) {
+				WARN_ELF("gelf_getrela");
+				return -1;
+			}
+
+			if (!gelf_getsym(symtab, GELF_R_SYM(rela.r_info), &sym)) {
+				WARN_ELF("gelf_getsym");
+				return -1;
+			}
+
+			scn = elf_getscn(elf, sym.st_shndx);
+			if (!scn) {
+				WARN_ELF("elf_getscn");
+				return -1;
+			}
+
+			if (!gelf_getshdr(scn, &sh)) {
+				WARN_ELF("gelf_getshdr");
+				return -1;
+			}
+
+			name = elf_strptr(elf, shstrtab_idx, sh.sh_name);
+			if (!name || !*name) {
+				WARN_ELF("elf_strptr");
+				return -1;
+			}
+
+			printf("%s+%lx:", name, rela.r_addend);
+
+		} else {
+			printf("%lx:", orc_ip_addr + (i * sizeof(int)) + orc_ip[i]);
+		}
+
+
+		printf(" sp:");
+
+		print_reg(orc[i].sp_reg, orc[i].sp_offset);
+
+		printf(" bp:");
+
+		print_reg(orc[i].bp_reg, orc[i].bp_offset);
+
+		printf(" type:%s\n", orc_type_name(orc[i].type));
+	}
+
+	elf_end(elf);
+	close(fd);
+
+	return 0;
+}
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
new file mode 100644
index 000000000000..e5ca31429c9b
--- /dev/null
+++ b/tools/objtool/orc_gen.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "orc.h"
+#include "check.h"
+#include "warn.h"
+
+int create_orc(struct objtool_file *file)
+{
+	struct instruction *insn;
+
+	for_each_insn(file, insn) {
+		struct orc_entry *orc = &insn->orc;
+		struct cfi_reg *cfa = &insn->state.cfa;
+		struct cfi_reg *bp = &insn->state.regs[CFI_BP];
+
+		if (cfa->base == CFI_UNDEFINED) {
+			orc->sp_reg = ORC_REG_UNDEFINED;
+			continue;
+		}
+
+		switch (cfa->base) {
+		case CFI_SP:
+			orc->sp_reg = ORC_REG_SP;
+			break;
+		case CFI_SP_INDIRECT:
+			orc->sp_reg = ORC_REG_SP_INDIRECT;
+			break;
+		case CFI_BP:
+			orc->sp_reg = ORC_REG_BP;
+			break;
+		case CFI_BP_INDIRECT:
+			orc->sp_reg = ORC_REG_BP_INDIRECT;
+			break;
+		case CFI_R10:
+			orc->sp_reg = ORC_REG_R10;
+			break;
+		case CFI_R13:
+			orc->sp_reg = ORC_REG_R13;
+			break;
+		case CFI_DI:
+			orc->sp_reg = ORC_REG_DI;
+			break;
+		case CFI_DX:
+			orc->sp_reg = ORC_REG_DX;
+			break;
+		default:
+			WARN_FUNC("unknown CFA base reg %d",
+				  insn->sec, insn->offset, cfa->base);
+			return -1;
+		}
+
+		switch(bp->base) {
+		case CFI_UNDEFINED:
+			orc->bp_reg = ORC_REG_UNDEFINED;
+			break;
+		case CFI_CFA:
+			orc->bp_reg = ORC_REG_PREV_SP;
+			break;
+		case CFI_BP:
+			orc->bp_reg = ORC_REG_BP;
+			break;
+		default:
+			WARN_FUNC("unknown BP base reg %d",
+				  insn->sec, insn->offset, bp->base);
+			return -1;
+		}
+
+		orc->sp_offset = cfa->offset;
+		orc->bp_offset = bp->offset;
+		orc->type = insn->state.type;
+	}
+
+	return 0;
+}
+
+static int create_orc_entry(struct section *u_sec, struct section *ip_relasec,
+				unsigned int idx, struct section *insn_sec,
+				unsigned long insn_off, struct orc_entry *o)
+{
+	struct orc_entry *orc;
+	struct rela *rela;
+
+	/* populate ORC data */
+	orc = (struct orc_entry *)u_sec->data->d_buf + idx;
+	memcpy(orc, o, sizeof(*orc));
+
+	/* populate rela for ip */
+	rela = malloc(sizeof(*rela));
+	if (!rela) {
+		perror("malloc");
+		return -1;
+	}
+	memset(rela, 0, sizeof(*rela));
+
+	rela->sym = insn_sec->sym;
+	rela->addend = insn_off;
+	rela->type = R_X86_64_PC32;
+	rela->offset = idx * sizeof(int);
+
+	list_add_tail(&rela->list, &ip_relasec->rela_list);
+	hash_add(ip_relasec->rela_hash, &rela->hash, rela->offset);
+
+	return 0;
+}
+
+int create_orc_sections(struct objtool_file *file)
+{
+	struct instruction *insn, *prev_insn;
+	struct section *sec, *u_sec, *ip_relasec;
+	unsigned int idx;
+
+	struct orc_entry empty = {
+		.sp_reg = ORC_REG_UNDEFINED,
+		.bp_reg  = ORC_REG_UNDEFINED,
+		.type    = ORC_TYPE_CALL,
+	};
+
+	sec = find_section_by_name(file->elf, ".orc_unwind");
+	if (sec) {
+		WARN("file already has .orc_unwind section, skipping");
+		return -1;
+	}
+
+	/* count the number of needed orcs */
+	idx = 0;
+	for_each_sec(file, sec) {
+		if (!sec->text)
+			continue;
+
+		prev_insn = NULL;
+		sec_for_each_insn(file, sec, insn) {
+			if (!prev_insn ||
+			    memcmp(&insn->orc, &prev_insn->orc,
+				   sizeof(struct orc_entry))) {
+				idx++;
+			}
+			prev_insn = insn;
+		}
+
+		/* section terminator */
+		if (prev_insn)
+			idx++;
+	}
+	if (!idx)
+		return -1;
+
+
+	/* create .orc_unwind_ip and .rela.orc_unwind_ip sections */
+	sec = elf_create_section(file->elf, ".orc_unwind_ip", sizeof(int), idx);
+
+	ip_relasec = elf_create_rela_section(file->elf, sec);
+	if (!ip_relasec)
+		return -1;
+
+	/* create .orc_unwind section */
+	u_sec = elf_create_section(file->elf, ".orc_unwind",
+				   sizeof(struct orc_entry), idx);
+
+	/* populate sections */
+	idx = 0;
+	for_each_sec(file, sec) {
+		if (!sec->text)
+			continue;
+
+		prev_insn = NULL;
+		sec_for_each_insn(file, sec, insn) {
+			if (!prev_insn || memcmp(&insn->orc, &prev_insn->orc,
+						 sizeof(struct orc_entry))) {
+
+				if (create_orc_entry(u_sec, ip_relasec, idx,
+						     insn->sec, insn->offset,
+						     &insn->orc))
+					return -1;
+
+				idx++;
+			}
+			prev_insn = insn;
+		}
+
+		/* section terminator */
+		if (prev_insn) {
+			if (create_orc_entry(u_sec, ip_relasec, idx,
+					     prev_insn->sec,
+					     prev_insn->offset + prev_insn->len,
+					     &empty))
+				return -1;
+
+			idx++;
+		}
+	}
+
+	if (elf_rebuild_rela_section(ip_relasec))
+		return -1;
+
+	return 0;
+}
diff --git a/tools/objtool/orc_types.h b/tools/objtool/orc_types.h
new file mode 100644
index 000000000000..9c9dc579bd7d
--- /dev/null
+++ b/tools/objtool/orc_types.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ORC_TYPES_H
+#define _ORC_TYPES_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+/*
+ * The ORC_REG_* registers are base registers which are used to find other
+ * registers on the stack.
+ *
+ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
+ * address of the previous frame: the caller's SP before it called the current
+ * function.
+ *
+ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in
+ * the current frame.
+ *
+ * The most commonly used base registers are SP and BP -- which the previous SP
+ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is
+ * usually based on.
+ *
+ * The rest of the base registers are needed for special cases like entry code
+ * and GCC realigned stacks.
+ */
+#define ORC_REG_UNDEFINED		0
+#define ORC_REG_PREV_SP			1
+#define ORC_REG_DX			2
+#define ORC_REG_DI			3
+#define ORC_REG_BP			4
+#define ORC_REG_SP			5
+#define ORC_REG_R10			6
+#define ORC_REG_R13			7
+#define ORC_REG_BP_INDIRECT		8
+#define ORC_REG_SP_INDIRECT		9
+#define ORC_REG_MAX			15
+
+/*
+ * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the
+ * caller's SP right before it made the call).  Used for all callable
+ * functions, i.e. all C code and all callable asm functions.
+ *
+ * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points
+ * to a fully populated pt_regs from a syscall, interrupt, or exception.
+ *
+ * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset
+ * points to the iret return frame.
+ *
+ * The UNWIND_HINT macros are used only for the unwind_hint struct.  They
+ * aren't used in struct orc_entry due to size and complexity constraints.
+ * Objtool converts them to real types when it converts the hints to orc
+ * entries.
+ */
+#define ORC_TYPE_CALL			0
+#define ORC_TYPE_REGS			1
+#define ORC_TYPE_REGS_IRET		2
+#define UNWIND_HINT_TYPE_SAVE		3
+#define UNWIND_HINT_TYPE_RESTORE	4
+
+#ifndef __ASSEMBLY__
+/*
+ * This struct is more or less a vastly simplified version of the DWARF Call
+ * Frame Information standard.  It contains only the necessary parts of DWARF
+ * CFI, simplified for ease of access by the in-kernel unwinder.  It tells the
+ * unwinder how to find the previous SP and BP (and sometimes entry regs) on
+ * the stack for a given code address.  Each instance of the struct corresponds
+ * to one or more code locations.
+ */
+struct orc_entry {
+	s16		sp_offset;
+	s16		bp_offset;
+	unsigned	sp_reg:4;
+	unsigned	bp_reg:4;
+	unsigned	type:2;
+} __packed;
+
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack for the ORC unwinder.
+ *
+ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*.
+ */
+struct unwind_hint {
+	u32		ip;
+	s16		sp_offset;
+	u8		sp_reg;
+	u8		type;
+};
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ORC_TYPES_H */