-rw-r--r--  include/asm-generic/rqspinlock.h                               60
-rw-r--r--  kernel/bpf/rqspinlock.c                                        69
-rw-r--r--  tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c   55

3 files changed, 108 insertions, 76 deletions
diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h
index 6d4244d643df..0f2dcbbfee2f 100644
--- a/include/asm-generic/rqspinlock.h
+++ b/include/asm-generic/rqspinlock.h
@@ -129,8 +129,8 @@ dec:
 	 * <error> for lock B
 	 * release_held_lock_entry
 	 *
-	 * try_cmpxchg_acquire for lock A
 	 * grab_held_lock_entry
+	 * try_cmpxchg_acquire for lock A
 	 *
 	 * Lack of any ordering means reordering may occur such that dec, inc
 	 * are done before entry is overwritten. This permits a remote lock
@@ -139,13 +139,8 @@ dec:
 	 * CPU holds a lock it is attempting to acquire, leading to false ABBA
 	 * diagnosis).
 	 *
-	 * In case of unlock, we will always do a release on the lock word after
-	 * releasing the entry, ensuring that other CPUs cannot hold the lock
-	 * (and make conclusions about deadlocks) until the entry has been
-	 * cleared on the local CPU, preventing any anomalies. Reordering is
-	 * still possible there, but a remote CPU cannot observe a lock in our
-	 * table which it is already holding, since visibility entails our
-	 * release store for the said lock has not retired.
+	 * The case of unlock is treated differently due to NMI reentrancy, see
+	 * comments in res_spin_unlock.
 	 *
 	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
 	 * reordered with each other, we either notice an empty NULL entry on
@@ -175,10 +170,22 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock)
 {
 	int val = 0;
 
-	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
-		grab_held_lock_entry(lock);
+	/*
+	 * Grab the deadlock detection entry before doing the cmpxchg, so that
+	 * reentrancy due to NMIs between the succeeding cmpxchg and creation of
+	 * held lock entry can correctly detect an acquisition attempt in the
+	 * interrupted context.
+	 *
+	 * cmpxchg lock A
+	 * <NMI>
+	 * res_spin_lock(A) --> missed AA, leads to timeout
+	 * </NMI>
+	 * grab_held_lock_entry(A)
+	 */
+	grab_held_lock_entry(lock);
+
+	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
 		return 0;
-	}
 	return resilient_queued_spin_lock_slowpath(lock, val);
 }
 
@@ -192,28 +199,25 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock)
 {
 	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
 
-	if (unlikely(rqh->cnt > RES_NR_HELD))
-		goto unlock;
-	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
-unlock:
 	/*
-	 * Release barrier, ensures correct ordering. See release_held_lock_entry
-	 * for details. Perform release store instead of queued_spin_unlock,
-	 * since we use this function for test-and-set fallback as well. When we
-	 * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
+	 * Release barrier, ensures correct ordering. Perform release store
+	 * instead of queued_spin_unlock, since we use this function for the TAS
+	 * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear
+	 * the full 4-byte lockword.
 	 *
-	 * Like release_held_lock_entry, we can do the release before the dec.
-	 * We simply care about not seeing the 'lock' in our table from a remote
-	 * CPU once the lock has been released, which doesn't rely on the dec.
+	 * Perform the smp_store_release before clearing the lock entry so that
+	 * NMIs landing in the unlock path can correctly detect AA issues. The
+	 * opposite order shown below may lead to missed AA checks:
 	 *
-	 * Unlike smp_wmb(), release is not a two way fence, hence it is
-	 * possible for a inc to move up and reorder with our clearing of the
-	 * entry. This isn't a problem however, as for a misdiagnosis of ABBA,
-	 * the remote CPU needs to hold this lock, which won't be released until
-	 * the store below is done, which would ensure the entry is overwritten
-	 * to NULL, etc.
+	 * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
+	 * <NMI>
+	 * res_spin_lock(A) --> missed AA, leads to timeout
+	 * </NMI>
+	 * smp_store_release(A->locked, 0)
 	 */
 	smp_store_release(&lock->locked, 0);
+	if (likely(rqh->cnt <= RES_NR_HELD))
+		WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
 	this_cpu_dec(rqspinlock_held_locks.cnt);
 }
 
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 3cc23d79a9fc..f7d0c8d4644e 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -196,32 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
 	return 0;
 }
 
-static noinline int check_deadlock(rqspinlock_t *lock, u32 mask)
-{
-	int ret;
-
-	ret = check_deadlock_AA(lock);
-	if (ret)
-		return ret;
-	ret = check_deadlock_ABBA(lock, mask);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
 static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 				  struct rqspinlock_timeout *ts)
 {
-	u64 time = ktime_get_mono_fast_ns();
 	u64 prev = ts->cur;
+	u64 time;
 
 	if (!ts->timeout_end) {
-		ts->cur = time;
-		ts->timeout_end = time + ts->duration;
+		if (check_deadlock_AA(lock))
+			return -EDEADLK;
+		ts->cur = ktime_get_mono_fast_ns();
+		ts->timeout_end = ts->cur + ts->duration;
 		return 0;
 	}
 
+	time = ktime_get_mono_fast_ns();
 	if (time > ts->timeout_end)
 		return -ETIMEDOUT;
 
@@ -231,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 	 */
 	if (prev + NSEC_PER_MSEC < time) {
 		ts->cur = time;
-		return check_deadlock(lock, mask);
+		return check_deadlock_ABBA(lock, mask);
 	}
 
 	return 0;
@@ -275,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
 	int val, ret = 0;
 
 	RES_INIT_TIMEOUT(ts);
+	/*
+	 * The fast path is not invoked for the TAS fallback, so we must grab
+	 * the deadlock detection entry here.
+	 */
 	grab_held_lock_entry(lock);
 
 	/*
@@ -397,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 		goto queue;
 	}
 
-	/*
-	 * Grab an entry in the held locks array, to enable deadlock detection.
-	 */
-	grab_held_lock_entry(lock);
+	/* Deadlock detection entry already held after failing fast path. */
 
 	/*
 	 * We're pending, wait for the owner to go away.
@@ -447,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 	 * queuing.
 	 */
queue:
-	lockevent_inc(lock_slowpath);
 	/*
-	 * Grab deadlock detection entry for the queue path.
+	 * Do not queue if we're a waiter and someone is attempting this lock on
+	 * the same CPU. In case of NMIs, this prevents long timeouts where we
+	 * interrupt the pending waiter, and the owner, that will eventually
+	 * signal the head of our queue, both of which are logically but not
+	 * physically part of the queue, hence outside the scope of the idx > 0
+	 * check above for the trylock fallback.
 	 */
-	grab_held_lock_entry(lock);
+	if (check_deadlock_AA(lock)) {
+		ret = -EDEADLK;
+		goto err_release_entry;
+	}
 
+	lockevent_inc(lock_slowpath);
+	/* Deadlock detection entry already held after failing fast path. */
 	node = this_cpu_ptr(&rqnodes[0].mcs);
 	idx = node->count++;
 	tail = encode_tail(smp_processor_id(), idx);
@@ -464,19 +463,17 @@ queue:
 	 * not be nested NMIs taking spinlocks. That may not be true in
 	 * some architectures even though the chance of needing more than
 	 * 4 nodes will still be extremely unlikely. When that happens,
-	 * we fall back to spinning on the lock directly without using
-	 * any MCS node. This is not the most elegant solution, but is
-	 * simple enough.
+	 * we fall back to attempting a trylock operation without using
+	 * any MCS node. Unlike qspinlock which cannot fail, we have the
+	 * option of failing the slow path, and under contention, such a
+	 * trylock spinning will likely be treated unfairly due to lack of
+	 * queueing, hence do not spin.
 	 */
-	if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
+	if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
 		lockevent_inc(lock_no_node);
-		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
-		while (!queued_spin_trylock(lock)) {
-			if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
-				lockevent_inc(rqspinlock_lock_timeout);
-				goto err_release_node;
-			}
-			cpu_relax();
+		if (!queued_spin_trylock(lock)) {
+			ret = -EDEADLK;
+			goto err_release_node;
 		}
 		goto release;
 	}
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
index e8dd3fbc6ea5..7b4ae5e81d32 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
@@ -33,9 +33,16 @@ static const unsigned int rqsl_hist_ms[] = {
 };
 #define RQSL_NR_HIST_BUCKETS ARRAY_SIZE(rqsl_hist_ms)
 
+enum rqsl_context {
+	RQSL_CTX_NORMAL = 0,
+	RQSL_CTX_NMI,
+	RQSL_CTX_MAX,
+};
+
 struct rqsl_cpu_hist {
-	atomic64_t normal[RQSL_NR_HIST_BUCKETS];
-	atomic64_t nmi[RQSL_NR_HIST_BUCKETS];
+	atomic64_t hist[RQSL_CTX_MAX][RQSL_NR_HIST_BUCKETS];
+	atomic64_t success[RQSL_CTX_MAX];
+	atomic64_t failure[RQSL_CTX_MAX];
 };
 
 static DEFINE_PER_CPU(struct rqsl_cpu_hist, rqsl_cpu_hists);
@@ -117,14 +124,18 @@ static u32 rqsl_hist_bucket_idx(u32 delta_ms)
 	return RQSL_NR_HIST_BUCKETS - 1;
 }
 
-static void rqsl_record_lock_time(u64 delta_ns, bool is_nmi)
+static void rqsl_record_lock_result(u64 delta_ns, enum rqsl_context ctx, int ret)
 {
 	struct rqsl_cpu_hist *hist = this_cpu_ptr(&rqsl_cpu_hists);
 	u32 delta_ms = DIV_ROUND_UP_ULL(delta_ns, NSEC_PER_MSEC);
 	u32 bucket = rqsl_hist_bucket_idx(delta_ms);
-	atomic64_t *buckets = is_nmi ? hist->nmi : hist->normal;
+	atomic64_t *buckets = hist->hist[ctx];
 
 	atomic64_inc(&buckets[bucket]);
+	if (!ret)
+		atomic64_inc(&hist->success[ctx]);
+	else
+		atomic64_inc(&hist->failure[ctx]);
 }
 
 static int rqspinlock_worker_fn(void *arg)
@@ -147,7 +158,8 @@ static int rqspinlock_worker_fn(void *arg)
 		}
 		start_ns = ktime_get_mono_fast_ns();
 		ret = raw_res_spin_lock_irqsave(worker_lock, flags);
-		rqsl_record_lock_time(ktime_get_mono_fast_ns() - start_ns, false);
+		rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns,
+					RQSL_CTX_NORMAL, ret);
 		mdelay(normal_delay);
 		if (!ret)
 			raw_res_spin_unlock_irqrestore(worker_lock, flags);
@@ -190,7 +202,8 @@ static void nmi_cb(struct perf_event *event, struct perf_sample_data *data,
 	locks = rqsl_get_lock_pair(cpu);
 	start_ns = ktime_get_mono_fast_ns();
 	ret = raw_res_spin_lock_irqsave(locks.nmi_lock, flags);
-	rqsl_record_lock_time(ktime_get_mono_fast_ns() - start_ns, true);
+	rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns,
+				RQSL_CTX_NMI, ret);
 
 	mdelay(nmi_delay);
 
@@ -300,12 +313,14 @@ static void rqsl_print_histograms(void)
 		u64 norm_counts[RQSL_NR_HIST_BUCKETS];
 		u64 nmi_counts[RQSL_NR_HIST_BUCKETS];
 		u64 total_counts[RQSL_NR_HIST_BUCKETS];
+		u64 norm_success, nmi_success, success_total;
+		u64 norm_failure, nmi_failure, failure_total;
 		u64 norm_total = 0, nmi_total = 0, total = 0;
 		bool has_slow = false;
 
 		for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) {
-			norm_counts[i] = atomic64_read(&hist->normal[i]);
-			nmi_counts[i] = atomic64_read(&hist->nmi[i]);
+			norm_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NORMAL][i]);
+			nmi_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NMI][i]);
 			total_counts[i] = norm_counts[i] + nmi_counts[i];
 			norm_total += norm_counts[i];
 			nmi_total += nmi_counts[i];
@@ -315,17 +330,33 @@ static void rqsl_print_histograms(void)
 				has_slow = true;
 		}
 
+		norm_success = atomic64_read(&hist->success[RQSL_CTX_NORMAL]);
+		nmi_success = atomic64_read(&hist->success[RQSL_CTX_NMI]);
+		norm_failure = atomic64_read(&hist->failure[RQSL_CTX_NORMAL]);
+		nmi_failure = atomic64_read(&hist->failure[RQSL_CTX_NMI]);
+		success_total = norm_success + nmi_success;
+		failure_total = norm_failure + nmi_failure;
+
 		if (!total)
 			continue;
 
 		if (!has_slow) {
-			pr_err(" cpu%d: total %llu (normal %llu, nmi %llu), all within 0-%ums\n",
-			       cpu, total, norm_total, nmi_total, RQSL_SLOW_THRESHOLD_MS);
+			pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | "
+			       "success %llu (normal %llu, nmi %llu) | "
+			       "failure %llu (normal %llu, nmi %llu), all within 0-%ums\n",
+			       cpu, total, norm_total, nmi_total,
+			       success_total, norm_success, nmi_success,
+			       failure_total, norm_failure, nmi_failure,
+			       RQSL_SLOW_THRESHOLD_MS);
			continue;
 		}
 
-		pr_err(" cpu%d: total %llu (normal %llu, nmi %llu)\n",
-		       cpu, total, norm_total, nmi_total);
+		pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | "
+		       "success %llu (normal %llu, nmi %llu) | "
+		       "failure %llu (normal %llu, nmi %llu)\n",
+		       cpu, total, norm_total, nmi_total,
+		       success_total, norm_success, nmi_success,
+		       failure_total, norm_failure, nmi_failure);
 
 		for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) {
 			unsigned int start_ms;
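The ordering rule that the res_spin_lock()/res_spin_unlock() changes above enforce can be modelled outside the kernel. The sketch below is a minimal single-threaded userspace toy, not the kernel implementation: toy_lock_old_order(), toy_lock_new_order(), nmi_acquire() and the held[] table are hypothetical stand-ins for res_spin_lock(), an NMI-context acquisition attempt, and the per-CPU held-locks table used for AA detection. It only illustrates why grabbing the held-lock entry before the cmpxchg lets a reentrant acquisition see the entry and report a deadlock instead of spinning until timeout.

/*
 * Toy userspace model (not the kernel code) of the ordering issue fixed above.
 * All names here are made up for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_HELD 4

static void *held[NR_HELD];	/* models the per-CPU held-locks table */
static int held_cnt;
static int lock_word;		/* models lock->val: 0 = unlocked, 1 = locked */

static bool aa_detected(void *lock)
{
	/* AA check: is this lock already in the interrupted context's table? */
	for (int i = 0; i < held_cnt; i++)
		if (held[i] == lock)
			return true;
	return false;
}

/* Reentrant acquisition attempt, standing in for res_spin_lock() in an NMI. */
static int nmi_acquire(void *lock)
{
	if (aa_detected(lock))
		return -1;	/* AA deadlock reported, NMI backs off */
	/* Otherwise the NMI would spin on lock_word until timeout. */
	return lock_word ? -2 : 0;
}

static int toy_lock_old_order(void *lock, int (*nmi)(void *))
{
	lock_word = 1;			/* cmpxchg succeeds first ...      */
	int nmi_ret = nmi(lock);	/* ... NMI lands here, table empty */
	held[held_cnt++] = lock;	/* ... entry recorded too late     */
	return nmi_ret;
}

static int toy_lock_new_order(void *lock, int (*nmi)(void *))
{
	held[held_cnt++] = lock;	/* entry recorded first            */
	int nmi_ret = nmi(lock);	/* NMI sees the entry, reports AA  */
	lock_word = 1;			/* cmpxchg happens afterwards      */
	return nmi_ret;
}

int main(void)
{
	int dummy;

	printf("old order: NMI result %d (would spin until timeout)\n",
	       toy_lock_old_order(&dummy, nmi_acquire));

	held_cnt = 0;
	lock_word = 0;
	printf("new order: NMI result %d (AA deadlock detected)\n",
	       toy_lock_new_order(&dummy, nmi_acquire));
	return 0;
}

The unlock side of the patch mirrors the same reasoning in the opposite direction: the smp_store_release() on the lock word is performed before the table entry is cleared, so an NMI landing between the two steps still finds the entry and reports AA rather than timing out.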
