From 64d816cba06c67eeee455b8c78ebcda349d49c24 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Mon, 9 Nov 2015 19:09:21 -0500
Subject: locking/qspinlock: Use _acquire/_release() versions of cmpxchg() &
 xchg()

This patch replaces the cmpxchg() and xchg() calls in the native
qspinlock code with the more relaxed _acquire or _release versions of
those calls to enable other architectures to adopt queued spinlocks
with less memory barrier performance overhead.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1447114167-47185-2-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

(limited to 'kernel/locking/qspinlock.c')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a63c5..7868418ea586 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
  * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
  * (C) Copyright 2013-2014 Red Hat, Inc.
  * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
  *
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
  *          Peter Zijlstra <peterz@infradead.org>
  */
 
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 {
 	struct __qspinlock *l = (void *)lock;
 
-	return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+	/*
+	 * Use release semantics to make sure that the MCS node is properly
+	 * initialized before changing the tail code.
+	 */
+	return (u32)xchg_release(&l->tail,
+				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
 }
 
 #else /* _Q_PENDING_BITS == 8 */
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 
 	for (;;) {
 		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
-		old = atomic_cmpxchg(&lock->val, val, new);
+		/*
+		 * Use release semantics to make sure that the MCS node is
+		 * properly initialized before changing the tail code.
+		 */
+		old = atomic_cmpxchg_release(&lock->val, val, new);
 		if (old == val)
 			break;
 
@@ -319,7 +329,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 		if (val == new)
 			new |= _Q_PENDING_VAL;
 
-		old = atomic_cmpxchg(&lock->val, val, new);
+		/*
+		 * Acquire semantic is required here as the function may
+		 * return immediately if the lock was free.
+		 */
+		old = atomic_cmpxchg_acquire(&lock->val, val, new);
 		if (old == val)
 			break;
 
@@ -426,7 +440,12 @@ queue:
 			set_locked(lock);
 			break;
 		}
-		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+		/*
+		 * The smp_load_acquire() call above has provided the necessary
+		 * acquire semantics required for locking. At most two
+		 * iterations of this loop may be ran.
+		 */
+		old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
 		if (old == val)
 			goto release;	/* No contention */
 
-- 
cgit 


From 81b5598665a24083dd889fbd8cb08b0d8de4b8ad Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Mon, 9 Nov 2015 19:09:22 -0500
Subject: locking/qspinlock: Prefetch the next node cacheline

A queue head CPU, after acquiring the lock, will have to notify
the next CPU in the wait queue that it has became the new queue
head. This involves loading a new cacheline from the MCS node of the
next CPU. That operation can be expensive and add to the latency of
locking operation.

This patch addes code to optmistically prefetch the next MCS node
cacheline if the next pointer is defined and it has been spinning
for the MCS lock for a while. This reduces the locking latency and
improves the system throughput.

The performance change will depend on whether the prefetch overhead
can be hidden within the latency of the lock spin loop. On really
short critical section, there may not be performance gain at all. With
longer critical section, however, it was found to have a performance
boost of 5-10% over a range of different queue depths with a spinlock
loop microbenchmark.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1447114167-47185-3-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel/locking/qspinlock.c')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 7868418ea586..365b2033f55e 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -407,6 +407,16 @@ queue:
 
 		pv_wait_node(node);
 		arch_mcs_spin_lock_contended(&node->locked);
+
+		/*
+		 * While waiting for the MCS lock, the next pointer may have
+		 * been set by another lock waiter. We optimistically load
+		 * the next pointer & prefetch the cacheline for writing
+		 * to reduce latency in the upcoming MCS unlock operation.
+		 */
+		next = READ_ONCE(node->next);
+		if (next)
+			prefetchw(next);
 	}
 
 	/*
-- 
cgit 


From aa68744f80bfb6f26fbe7f10e42876066f7dac1b Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Mon, 9 Nov 2015 19:09:23 -0500
Subject: locking/qspinlock: Avoid redundant read of next pointer

With optimistic prefetch of the next node cacheline, the next pointer
may have been properly inititalized. As a result, the reading
of node->next in the contended path may be redundant. This patch
eliminates the redundant read if the next pointer value is not NULL.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1447114167-47185-4-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel/locking/qspinlock.c')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 365b2033f55e..986207887def 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -396,6 +396,7 @@ queue:
 	 * p,*,* -> n,*,*
 	 */
 	old = xchg_tail(lock, tail);
+	next = NULL;
 
 	/*
 	 * if there was a previous node; link it and wait until reaching the
@@ -463,10 +464,12 @@ queue:
 	}
 
 	/*
-	 * contended path; wait for next, release.
+	 * contended path; wait for next if not observed yet, release.
 	 */
-	while (!(next = READ_ONCE(node->next)))
-		cpu_relax();
+	if (!next) {
+		while (!(next = READ_ONCE(node->next)))
+			cpu_relax();
+	}
 
 	arch_mcs_spin_unlock_contended(&next->locked);
 	pv_kick_node(lock, next);
-- 
cgit 


From b3e0b1b6d841a4b2f64fc09ea728913da8218424 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 16 Oct 2015 14:39:38 +0200
Subject: locking, sched: Introduce smp_cond_acquire() and use it

Introduce smp_cond_acquire() which combines a control dependency and a
read barrier to form acquire semantics.

This primitive has two benefits:

 - it documents control dependencies,
 - its typically cheaper than using smp_load_acquire() in a loop.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/locking/qspinlock.c')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 986207887def..ed9d96708f93 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -433,8 +433,7 @@ queue:
 	 *
 	 */
 	pv_wait_head(lock, node);
-	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
-		cpu_relax();
+	smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
 
 	/*
 	 * claim the lock:
-- 
cgit 


From 1c4941fd53afb46ab15826628e4819866d008a28 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Tue, 10 Nov 2015 16:18:56 -0500
Subject: locking/pvqspinlock: Allow limited lock stealing

This patch allows one attempt for the lock waiter to steal the lock
when entering the PV slowpath. To prevent lock starvation, the pending
bit will be set by the queue head vCPU when it is in the active lock
spinning loop to disable any lock stealing attempt.  This helps to
reduce the performance penalty caused by lock waiter preemption while
not having much of the downsides of a real unfair lock.

The pv_wait_head() function was renamed as pv_wait_head_or_lock()
as it was modified to acquire the lock before returning. This is
necessary because of possible lock stealing attempts from other tasks.

Linux kernel builds were run in KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems are configured to have 32 physical
CPUs. The kernel build times before and after the patch were:

                    Westmere                    Haswell
  Patch         32 vCPUs    48 vCPUs    32 vCPUs    48 vCPUs
  -----         --------    --------    --------    --------
  Before patch   3m15.6s    10m56.1s     1m44.1s     5m29.1s
  After patch    3m02.3s     5m00.2s     1m43.7s     3m03.5s

For the overcommited case (48 vCPUs), this patch is able to reduce
kernel build time by more than 54% for Westmere and 44% for Haswell.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1447190336-53317-1-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'kernel/locking/qspinlock.c')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ed9d96708f93..2ea42999d2d8 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -251,15 +251,16 @@ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
 static __always_inline void __pv_kick_node(struct qspinlock *lock,
 					   struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
-					   struct mcs_spinlock *node) { }
+static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
+						   struct mcs_spinlock *node)
+						   { return 0; }
 
 #define pv_enabled()		false
 
 #define pv_init_node		__pv_init_node
 #define pv_wait_node		__pv_wait_node
 #define pv_kick_node		__pv_kick_node
-#define pv_wait_head		__pv_wait_head
+#define pv_wait_head_or_lock	__pv_wait_head_or_lock
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
@@ -431,10 +432,22 @@ queue:
 	 * sequentiality; this is because the set_locked() function below
 	 * does not imply a full barrier.
 	 *
+	 * The PV pv_wait_head_or_lock function, if active, will acquire
+	 * the lock and return a non-zero value. So we have to skip the
+	 * smp_load_acquire() call. As the next PV queue head hasn't been
+	 * designated yet, there is no way for the locked value to become
+	 * _Q_SLOW_VAL. So both the set_locked() and the
+	 * atomic_cmpxchg_relaxed() calls will be safe.
+	 *
+	 * If PV isn't active, 0 will be returned instead.
+	 *
 	 */
-	pv_wait_head(lock, node);
+	if ((val = pv_wait_head_or_lock(lock, node)))
+		goto locked;
+
 	smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
 
+locked:
 	/*
 	 * claim the lock:
 	 *
@@ -446,7 +459,8 @@ queue:
 	 * to grab the lock.
 	 */
 	for (;;) {
-		if (val != tail) {
+		/* In the PV case we might already have _Q_LOCKED_VAL set */
+		if ((val & _Q_TAIL_MASK) != tail) {
 			set_locked(lock);
 			break;
 		}
@@ -493,7 +507,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 #undef pv_init_node
 #undef pv_wait_node
 #undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
 
 #undef  queued_spin_lock_slowpath
 #define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath
-- 
cgit 


From cd0272fab785077c121aa91ec2401090965bbc37 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hpe.com>
Date: Mon, 9 Nov 2015 19:09:27 -0500
Subject: locking/pvqspinlock: Queue node adaptive spinning

In an overcommitted guest where some vCPUs have to be halted to make
forward progress in other areas, it is highly likely that a vCPU later
in the spinlock queue will be spinning while the ones earlier in the
queue would have been halted. The spinning in the later vCPUs is then
just a waste of precious CPU cycles because they are not going to
get the lock soon as the earlier ones have to be woken up and take
their turn to get the lock.

This patch implements an adaptive spinning mechanism where the vCPU
will call pv_wait() if the previous vCPU is not running.

Linux kernel builds were run in KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems are configured to have 32 physical
CPUs. The kernel build times before and after the patch were:

		    Westmere			Haswell
  Patch		32 vCPUs    48 vCPUs	32 vCPUs    48 vCPUs
  -----		--------    --------    --------    --------
  Before patch   3m02.3s     5m00.2s     1m43.7s     3m03.5s
  After patch    3m03.0s     4m37.5s	 1m43.0s     2m47.2s

For 32 vCPUs, this patch doesn't cause any noticeable change in
performance. For 48 vCPUs (over-committed), there is about 8%
performance improvement.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1447114167-47185-8-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel/locking/qspinlock.c')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 2ea42999d2d8..393d1874b9e0 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
  */
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+					   struct mcs_spinlock *prev) { }
 static __always_inline void __pv_kick_node(struct qspinlock *lock,
 					   struct mcs_spinlock *node) { }
 static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
@@ -407,7 +408,7 @@ queue:
 		prev = decode_tail(old);
 		WRITE_ONCE(prev->next, node);
 
-		pv_wait_node(node);
+		pv_wait_node(node, prev);
 		arch_mcs_spin_lock_contended(&node->locked);
 
 		/*
-- 
cgit