Diffstat (limited to 'arch/powerpc/lib/qspinlock.c')
-rw-r--r-- | arch/powerpc/lib/qspinlock.c | 159
1 file changed, 80 insertions, 79 deletions
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index e4bd145255d0..95ab4cdf582e 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -9,6 +9,7 @@
 #include <linux/sched/clock.h>
 #include <asm/qspinlock.h>
 #include <asm/paravirt.h>
+#include <trace/events/lock.h>
 
 #define MAX_NODES	4
 
@@ -16,7 +17,8 @@ struct qnode {
 	struct qnode	*next;
 	struct qspinlock *lock;
 	int		cpu;
-	int		yield_cpu;
+	u8		sleepy; /* 1 if the previous vCPU was preempted or
+				 * if the previous node was sleepy */
 	u8		locked; /* 1 if lock acquired */
 };
 
@@ -43,7 +45,7 @@ static bool pv_sleepy_lock_sticky __read_mostly = false;
 static u64 pv_sleepy_lock_interval_ns __read_mostly = 0;
 static int pv_sleepy_lock_factor __read_mostly = 256;
 static bool pv_yield_prev __read_mostly = true;
-static bool pv_yield_propagate_owner __read_mostly = true;
+static bool pv_yield_sleepy_owner __read_mostly = true;
 static bool pv_prod_head __read_mostly = false;
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
@@ -161,6 +163,8 @@ static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail)
 {
 	u32 prev, tmp;
 
+	kcsan_release();
+
 	asm volatile(
 "\t"	PPC_RELEASE_BARRIER "					\n"
 "1:	lwarx	%0,0,%2		# publish_tail_cpu		\n"
@@ -245,22 +249,18 @@ static __always_inline void seen_sleepy_lock(void)
 		this_cpu_write(sleepy_lock_seen_clock, sched_clock());
 }
 
-static __always_inline void seen_sleepy_node(struct qspinlock *lock, u32 val)
+static __always_inline void seen_sleepy_node(void)
 {
 	if (pv_sleepy_lock) {
 		if (pv_sleepy_lock_interval_ns)
 			this_cpu_write(sleepy_lock_seen_clock, sched_clock());
-		if (val & _Q_LOCKED_VAL) {
-			if (!(val & _Q_SLEEPY_VAL))
-				try_set_sleepy(lock, val);
-		}
+		/* Don't set sleepy because we likely have a stale val */
 	}
 }
 
-static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
+static struct qnode *get_tail_qnode(struct qspinlock *lock, int prev_cpu)
 {
-	int cpu = decode_tail_cpu(val);
-	struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu);
+	struct qnodes *qnodesp = per_cpu_ptr(&qnodes, prev_cpu);
 	int idx;
 
 	/*
@@ -351,74 +351,66 @@ static __always_inline bool yield_head_to_locked_owner(struct qspinlock *lock, u
 	return __yield_to_locked_owner(lock, val, paravirt, mustq);
 }
 
-static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt)
+static __always_inline void propagate_sleepy(struct qnode *node, u32 val, bool paravirt)
 {
 	struct qnode *next;
 	int owner;
 
 	if (!paravirt)
 		return;
-	if (!pv_yield_propagate_owner)
-		return;
-
-	owner = get_owner_cpu(val);
-	if (*set_yield_cpu == owner)
+	if (!pv_yield_sleepy_owner)
 		return;
 
 	next = READ_ONCE(node->next);
 	if (!next)
 		return;
 
-	if (vcpu_is_preempted(owner)) {
-		next->yield_cpu = owner;
-		*set_yield_cpu = owner;
-	} else if (*set_yield_cpu != -1) {
-		next->yield_cpu = owner;
-		*set_yield_cpu = owner;
-	}
+	if (next->sleepy)
+		return;
+
+	owner = get_owner_cpu(val);
+	if (vcpu_is_preempted(owner))
+		next->sleepy = 1;
 }
 
 /* Called inside spin_begin() */
-static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt)
+static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt)
 {
-	int prev_cpu = decode_tail_cpu(val);
 	u32 yield_count;
-	int yield_cpu;
 	bool preempted = false;
 
 	if (!paravirt)
 		goto relax;
 
-	if (!pv_yield_propagate_owner)
+	if (!pv_yield_sleepy_owner)
 		goto yield_prev;
 
-	yield_cpu = READ_ONCE(node->yield_cpu);
-	if (yield_cpu == -1) {
-		/* Propagate back the -1 CPU */
-		if (node->next && node->next->yield_cpu != -1)
-			node->next->yield_cpu = yield_cpu;
-		goto yield_prev;
-	}
-
-	yield_count = yield_count_of(yield_cpu);
-	if ((yield_count & 1) == 0)
-		goto yield_prev; /* owner vcpu is running */
-
-	spin_end();
-
-	preempted = true;
-	seen_sleepy_node(lock, val);
+	/*
+	 * If the previous waiter was preempted it might not be able to
+	 * propagate sleepy to us, so check the lock in that case too.
+	 */
+	if (node->sleepy || vcpu_is_preempted(prev_cpu)) {
+		u32 val = READ_ONCE(lock->val);
 
-	smp_rmb();
+		if (val & _Q_LOCKED_VAL) {
+			if (node->next && !node->next->sleepy) {
+				/*
+				 * Propagate sleepy to next waiter. Only if
+				 * owner is preempted, which allows the queue
+				 * to become "non-sleepy" if vCPU preemption
+				 * ceases to occur, even if the lock remains
+				 * highly contended.
+				 */
+				if (vcpu_is_preempted(get_owner_cpu(val)))
+					node->next->sleepy = 1;
			}
 
-	if (yield_cpu == node->yield_cpu) {
-		if (node->next && node->next->yield_cpu != yield_cpu)
-			node->next->yield_cpu = yield_cpu;
-		yield_to_preempted(yield_cpu, yield_count);
-		spin_begin();
-		return preempted;
+			preempted = yield_to_locked_owner(lock, val, paravirt);
+			if (preempted)
+				return preempted;
+		}
+		node->sleepy = false;
 	}
-	spin_begin();
 
 yield_prev:
 	if (!pv_yield_prev)
@@ -431,11 +423,11 @@ yield_prev:
 	spin_end();
 
 	preempted = true;
-	seen_sleepy_node(lock, val);
+	seen_sleepy_node();
 
 	smp_rmb(); /* See __yield_to_locked_owner comment */
 
-	if (!node->locked) {
+	if (!READ_ONCE(node->locked)) {
 		yield_to_preempted(prev_cpu, yield_count);
 		spin_begin();
 		return preempted;
@@ -541,7 +533,6 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 	bool sleepy = false;
 	bool mustq = false;
 	int idx;
-	int set_yield_cpu = -1;
 	int iters = 0;
 
 	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
@@ -565,11 +556,16 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 	node->next = NULL;
 	node->lock = lock;
 	node->cpu = smp_processor_id();
-	node->yield_cpu = -1;
+	node->sleepy = 0;
 	node->locked = 0;
 
 	tail = encode_tail_cpu(node->cpu);
 
+	/*
+	 * Assign all attributes of a node before it can be published.
+	 * Issues an lwsync, serving as a release barrier, as well as a
+	 * compiler barrier.
+	 */
 	old = publish_tail_cpu(lock, tail);
 
 	/*
@@ -577,26 +573,23 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 	 * head of the waitqueue.
 	 */
 	if (old & _Q_TAIL_CPU_MASK) {
-		struct qnode *prev = get_tail_qnode(lock, old);
+		int prev_cpu = decode_tail_cpu(old);
+		struct qnode *prev = get_tail_qnode(lock, prev_cpu);
 
 		/* Link @node into the waitqueue. */
 		WRITE_ONCE(prev->next, node);
 
 		/* Wait for mcs node lock to be released */
 		spin_begin();
-		while (!node->locked) {
+		while (!READ_ONCE(node->locked)) {
 			spec_barrier();
 
-			if (yield_to_prev(lock, node, old, paravirt))
+			if (yield_to_prev(lock, node, prev_cpu, paravirt))
 				seen_preempted = true;
 		}
 		spec_barrier();
 		spin_end();
 
-		/* Clear out stale propagated yield_cpu */
-		if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1)
-			node->yield_cpu = -1;
-
 		smp_rmb(); /* acquire barrier for the mcs lock */
 
 		/*
@@ -638,7 +631,7 @@ again:
 			}
 		}
 
-		propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
+		propagate_sleepy(node, val, paravirt);
 		preempted = yield_head_to_locked_owner(lock, val, paravirt);
 		if (!maybe_stealers)
 			continue;
@@ -705,29 +698,37 @@ again:
 	}
 
 release:
-	qnodesp->count--; /* release the node */
+	/*
+	 * Clear the lock before releasing the node, as another CPU might see stale
+	 * values if an interrupt occurs after we increment qnodesp->count
+	 * but before node->lock is initialized. The barrier ensures that
+	 * there are no further stores to the node after it has been released.
+	 */
+	node->lock = NULL;
+	barrier();
+	qnodesp->count--;
 }
 
-void queued_spin_lock_slowpath(struct qspinlock *lock)
+void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock)
 {
+	trace_contention_begin(lock, LCB_F_SPIN);
 	/*
 	 * This looks funny, but it induces the compiler to inline both
 	 * sides of the branch rather than share code as when the condition
 	 * is passed as the paravirt argument to the functions.
 	 */
 	if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) {
-		if (try_to_steal_lock(lock, true)) {
+		if (try_to_steal_lock(lock, true))
 			spec_barrier();
-			return;
-		}
-		queued_spin_lock_mcs_queue(lock, true);
+		else
+			queued_spin_lock_mcs_queue(lock, true);
 	} else {
-		if (try_to_steal_lock(lock, false)) {
+		if (try_to_steal_lock(lock, false))
 			spec_barrier();
-			return;
-		}
-		queued_spin_lock_mcs_queue(lock, false);
+		else
+			queued_spin_lock_mcs_queue(lock, false);
 	}
+	trace_contention_end(lock, 0);
 }
 EXPORT_SYMBOL(queued_spin_lock_slowpath);
 
@@ -942,21 +943,21 @@ static int pv_yield_prev_get(void *data, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n");
 
-static int pv_yield_propagate_owner_set(void *data, u64 val)
+static int pv_yield_sleepy_owner_set(void *data, u64 val)
 {
-	pv_yield_propagate_owner = !!val;
+	pv_yield_sleepy_owner = !!val;
 
 	return 0;
 }
 
-static int pv_yield_propagate_owner_get(void *data, u64 *val)
+static int pv_yield_sleepy_owner_get(void *data, u64 *val)
 {
-	*val = pv_yield_propagate_owner;
+	*val = pv_yield_sleepy_owner;
 
 	return 0;
 }
 
-DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_sleepy_owner, pv_yield_sleepy_owner_get, pv_yield_sleepy_owner_set, "%llu\n");
 
 static int pv_prod_head_set(void *data, u64 val)
 {
@@ -988,7 +989,7 @@ static __init int spinlock_debugfs_init(void)
 		debugfs_create_file("qspl_pv_sleepy_lock_interval_ns", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_interval_ns);
 		debugfs_create_file("qspl_pv_sleepy_lock_factor", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_factor);
 		debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
-		debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
+		debugfs_create_file("qspl_pv_yield_sleepy_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_sleepy_owner);
		debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head);
	}
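
The new comment above publish_tail_cpu() spells out the ordering rule the queue relies on: fully initialise the qnode first, then publish the tail, so the release barrier orders those stores before the node becomes reachable, paired with an acquire on the reader side. Below is a minimal userspace sketch of that pattern using C11 atomics. The struct, the tail variable and the function names are invented for illustration only; the kernel itself uses lwsync, larx/stcx. and smp_rmb() rather than these primitives.

	/* Illustrative analogue of the "initialise, then publish" ordering; not kernel code. */
	#include <stdatomic.h>
	#include <stddef.h>

	struct qnode_model {
		struct qnode_model *next;
		int cpu;
		_Atomic int locked;
	};

	static _Atomic(struct qnode_model *) tail_model;

	/* "Assign all attributes of a node before it can be published": write every
	 * field, then make the node reachable with a release operation. */
	static struct qnode_model *publish_node(struct qnode_model *node, int cpu)
	{
		node->next = NULL;
		node->cpu = cpu;
		atomic_store_explicit(&node->locked, 0, memory_order_relaxed);

		/* Pairs with any acquire that later observes this node via the tail. */
		return atomic_exchange_explicit(&tail_model, node, memory_order_release);
	}

	/* Waiter: spin until the MCS lock is handed over. The acquire load stands in
	 * for the smp_rmb() "acquire barrier for the mcs lock" in the queue code. */
	static void wait_for_handover(struct qnode_model *node)
	{
		while (!atomic_load_explicit(&node->locked, memory_order_acquire))
			;
	}

	/* Previous owner: hand the MCS lock to the next waiter with release semantics. */
	static void hand_over(struct qnode_model *next)
	{
		atomic_store_explicit(&next->locked, 1, memory_order_release);
	}

	int main(void)
	{
		struct qnode_model n = { 0 };

		publish_node(&n, 0);
		hand_over(&n);		/* normally done by the previous lock holder */
		wait_for_handover(&n);	/* returns immediately in this single-threaded demo */
		return 0;
	}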

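The rename from pv_yield_propagate_owner to pv_yield_sleepy_owner also renames the runtime knob to qspl_pv_yield_sleepy_owner. The sketch below reads and toggles it from userspace; it assumes debugfs is mounted at /sys/kernel/debug and that arch_debugfs_dir maps to the powerpc/ subdirectory, so the path may differ on a given system.

	/* Sketch: query and flip qspl_pv_yield_sleepy_owner (path is an assumption). */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	#define TUNABLE "/sys/kernel/debug/powerpc/qspl_pv_yield_sleepy_owner"

	static int read_tunable(void)
	{
		char buf[32] = { 0 };
		int fd = open(TUNABLE, O_RDONLY);

		if (fd < 0)
			return -1;
		if (read(fd, buf, sizeof(buf) - 1) > 0)
			printf("qspl_pv_yield_sleepy_owner = %s", buf); /* fops print "%llu\n" */
		close(fd);
		return 0;
	}

	static int write_tunable(const char *val)
	{
		int fd = open(TUNABLE, O_WRONLY);
		int ret = 0;

		if (fd < 0)
			return -1;
		if (write(fd, val, 1) != 1)
			ret = -1;
		close(fd);
		return ret;
	}

	int main(void)
	{
		if (read_tunable() < 0 || write_tunable("0") < 0) {
			perror(TUNABLE);
			return 1;
		}
		return read_tunable() < 0;	/* should now report 0 */
	}

Writing 0 disables the sleepy-owner propagation heuristic at runtime; writing 1 restores the default, since pv_yield_sleepy_owner defaults to true.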