-rw-r--r--  include/asm-generic/rqspinlock.h                               | 60
-rw-r--r--  kernel/bpf/rqspinlock.c                                        | 69
-rw-r--r--  tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c  | 55
3 files changed, 108 insertions, 76 deletions
diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h
index 6d4244d643df..0f2dcbbfee2f 100644
--- a/include/asm-generic/rqspinlock.h
+++ b/include/asm-generic/rqspinlock.h
@@ -129,8 +129,8 @@ dec:
* <error> for lock B
* release_held_lock_entry
*
- * try_cmpxchg_acquire for lock A
* grab_held_lock_entry
+ * try_cmpxchg_acquire for lock A
*
* Lack of any ordering means reordering may occur such that dec, inc
* are done before entry is overwritten. This permits a remote lock
@@ -139,13 +139,8 @@ dec:
* CPU holds a lock it is attempting to acquire, leading to false ABBA
* diagnosis).
*
- * In case of unlock, we will always do a release on the lock word after
- * releasing the entry, ensuring that other CPUs cannot hold the lock
- * (and make conclusions about deadlocks) until the entry has been
- * cleared on the local CPU, preventing any anomalies. Reordering is
- * still possible there, but a remote CPU cannot observe a lock in our
- * table which it is already holding, since visibility entails our
- * release store for the said lock has not retired.
+ * The case of unlock is treated differently due to NMI reentrancy, see
+ * comments in res_spin_unlock.
*
* In theory we don't have a problem if the dec and WRITE_ONCE above get
* reordered with each other, we either notice an empty NULL entry on
@@ -175,10 +170,22 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
int val = 0;
- if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
- grab_held_lock_entry(lock);
+ /*
+ * Grab the deadlock detection entry before doing the cmpxchg, so that
+ * an NMI landing between a successful cmpxchg and the creation of the
+ * held lock entry can still correctly detect an acquisition attempt in
+ * the interrupted context.
+ *
+ * cmpxchg lock A
+ * <NMI>
+ * res_spin_lock(A) --> missed AA, leads to timeout
+ * </NMI>
+ * grab_held_lock_entry(A)
+ */
+ grab_held_lock_entry(lock);
+
+ if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
return 0;
- }
return resilient_queued_spin_lock_slowpath(lock, val);
}
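For reference, the ordering argument above hinges on grab_held_lock_entry() reserving a slot in the per-CPU held-locks table before the lock word is touched. Below is a minimal sketch of such a helper, inferred from the table accesses visible in res_spin_unlock(); the in-tree helper may differ in detail.

static __always_inline void grab_held_lock_entry(void *lock)
{
	/* Reserve a slot first, so a nested NMI sees cnt already bumped. */
	int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

	if (unlikely(cnt > RES_NR_HELD))
		return;	/* Table full; keep the inc so the later dec balances. */

	/* Publish the lock being acquired in the reserved slot. */
	this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}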
@@ -192,28 +199,25 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
- if (unlikely(rqh->cnt > RES_NR_HELD))
- goto unlock;
- WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
-unlock:
/*
- * Release barrier, ensures correct ordering. See release_held_lock_entry
- * for details. Perform release store instead of queued_spin_unlock,
- * since we use this function for test-and-set fallback as well. When we
- * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
+ * Release barrier, ensures correct ordering. Perform release store
+ * instead of queued_spin_unlock, since we use this function for the TAS
+ * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear
+ * the full 4-byte lockword.
*
- * Like release_held_lock_entry, we can do the release before the dec.
- * We simply care about not seeing the 'lock' in our table from a remote
- * CPU once the lock has been released, which doesn't rely on the dec.
+ * Perform the smp_store_release before clearing the lock entry so that
+ * NMIs landing in the unlock path can correctly detect AA issues. The
+ * opposite order shown below may lead to missed AA checks:
*
- * Unlike smp_wmb(), release is not a two way fence, hence it is
- * possible for a inc to move up and reorder with our clearing of the
- * entry. This isn't a problem however, as for a misdiagnosis of ABBA,
- * the remote CPU needs to hold this lock, which won't be released until
- * the store below is done, which would ensure the entry is overwritten
- * to NULL, etc.
+ * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
+ * <NMI>
+ * res_spin_lock(A) --> missed AA, leads to timeout
+ * </NMI>
+ * smp_store_release(A->locked, 0)
*/
smp_store_release(&lock->locked, 0);
+ if (likely(rqh->cnt <= RES_NR_HELD))
+ WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
this_cpu_dec(rqspinlock_held_locks.cnt);
}
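As context for the rqspinlock.c changes below, where the slow path is now allowed to fail outright with -EDEADLK instead of spinning on a trylock, the following hypothetical caller sketch shows the error-handling contract; my_lock and do_locked_work are illustrative names, not part of this patch.

static rqspinlock_t my_lock;	/* illustrative lock, not part of this patch */

static int do_locked_work(void)
{
	int ret;

	ret = res_spin_lock(&my_lock);
	if (ret)	/* -EDEADLK or -ETIMEDOUT: back off rather than hang */
		return ret;

	/* ... critical section ... */

	res_spin_unlock(&my_lock);
	return 0;
}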
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 3cc23d79a9fc..f7d0c8d4644e 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -196,32 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
return 0;
}
-static noinline int check_deadlock(rqspinlock_t *lock, u32 mask)
-{
- int ret;
-
- ret = check_deadlock_AA(lock);
- if (ret)
- return ret;
- ret = check_deadlock_ABBA(lock, mask);
- if (ret)
- return ret;
-
- return 0;
-}
-
static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
struct rqspinlock_timeout *ts)
{
- u64 time = ktime_get_mono_fast_ns();
u64 prev = ts->cur;
+ u64 time;
if (!ts->timeout_end) {
- ts->cur = time;
- ts->timeout_end = time + ts->duration;
+ if (check_deadlock_AA(lock))
+ return -EDEADLK;
+ ts->cur = ktime_get_mono_fast_ns();
+ ts->timeout_end = ts->cur + ts->duration;
return 0;
}
+ time = ktime_get_mono_fast_ns();
if (time > ts->timeout_end)
return -ETIMEDOUT;
@@ -231,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
*/
if (prev + NSEC_PER_MSEC < time) {
ts->cur = time;
- return check_deadlock(lock, mask);
+ return check_deadlock_ABBA(lock, mask);
}
return 0;
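The AA check that now runs on the first check_timeout() invocation conceptually scans the current CPU's held-locks table for the lock being acquired, skipping the entry just grabbed for this attempt. A simplified sketch of such a check follows; the in-tree check_deadlock_AA() may differ in detail.

static noinline int check_deadlock_AA(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
	int cnt = min(RES_NR_HELD, rqh->cnt);
	int i;

	/*
	 * The latest entry on this CPU belongs to the current acquisition
	 * attempt; any older entry matching the same lock means we already
	 * hold it and would deadlock on ourselves.
	 */
	for (i = 0; i < cnt - 1; i++) {
		if (rqh->locks[i] == lock)
			return -EDEADLK;
	}
	return 0;
}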
@@ -275,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
int val, ret = 0;
RES_INIT_TIMEOUT(ts);
+ /*
+ * The fast path is not invoked for the TAS fallback, so we must grab
+ * the deadlock detection entry here.
+ */
grab_held_lock_entry(lock);
/*
@@ -397,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
goto queue;
}
- /*
- * Grab an entry in the held locks array, to enable deadlock detection.
- */
- grab_held_lock_entry(lock);
+ /* Deadlock detection entry already held after failing the fast path. */
/*
* We're pending, wait for the owner to go away.
@@ -447,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
* queuing.
*/
queue:
- lockevent_inc(lock_slowpath);
/*
- * Grab deadlock detection entry for the queue path.
+ * Do not queue if we're a waiter and someone is attempting this lock on
+ * the same CPU. In case of NMIs, this prevents long timeouts where we
+ * interrupt the pending waiter, or the owner that will eventually
+ * signal the head of our queue; both are logically but not physically
+ * part of the queue, and hence outside the scope of the idx > 0 check
+ * used for the trylock fallback.
*/
- grab_held_lock_entry(lock);
+ if (check_deadlock_AA(lock)) {
+ ret = -EDEADLK;
+ goto err_release_entry;
+ }
+ lockevent_inc(lock_slowpath);
+ /* Deadlock detection entry already held after failing the fast path. */
node = this_cpu_ptr(&rqnodes[0].mcs);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
@@ -464,19 +463,17 @@ queue:
* not be nested NMIs taking spinlocks. That may not be true in
* some architectures even though the chance of needing more than
* 4 nodes will still be extremely unlikely. When that happens,
- * we fall back to spinning on the lock directly without using
- * any MCS node. This is not the most elegant solution, but is
- * simple enough.
+ * we fall back to attempting a trylock operation without using
+ * any MCS node. Unlike qspinlock, which cannot fail, we have the
+ * option of failing the slow path; under contention, spinning on a
+ * trylock would likely be treated unfairly due to the lack of
+ * queueing, hence do not spin.
*/
- if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
+ if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
lockevent_inc(lock_no_node);
- RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
- while (!queued_spin_trylock(lock)) {
- if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
- lockevent_inc(rqspinlock_lock_timeout);
- goto err_release_node;
- }
- cpu_relax();
+ if (!queued_spin_trylock(lock)) {
+ ret = -EDEADLK;
+ goto err_release_node;
}
goto release;
}
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
index e8dd3fbc6ea5..7b4ae5e81d32 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
@@ -33,9 +33,16 @@ static const unsigned int rqsl_hist_ms[] = {
};
#define RQSL_NR_HIST_BUCKETS ARRAY_SIZE(rqsl_hist_ms)
+enum rqsl_context {
+ RQSL_CTX_NORMAL = 0,
+ RQSL_CTX_NMI,
+ RQSL_CTX_MAX,
+};
+
struct rqsl_cpu_hist {
- atomic64_t normal[RQSL_NR_HIST_BUCKETS];
- atomic64_t nmi[RQSL_NR_HIST_BUCKETS];
+ atomic64_t hist[RQSL_CTX_MAX][RQSL_NR_HIST_BUCKETS];
+ atomic64_t success[RQSL_CTX_MAX];
+ atomic64_t failure[RQSL_CTX_MAX];
};
static DEFINE_PER_CPU(struct rqsl_cpu_hist, rqsl_cpu_hists);
@@ -117,14 +124,18 @@ static u32 rqsl_hist_bucket_idx(u32 delta_ms)
return RQSL_NR_HIST_BUCKETS - 1;
}
-static void rqsl_record_lock_time(u64 delta_ns, bool is_nmi)
+static void rqsl_record_lock_result(u64 delta_ns, enum rqsl_context ctx, int ret)
{
struct rqsl_cpu_hist *hist = this_cpu_ptr(&rqsl_cpu_hists);
u32 delta_ms = DIV_ROUND_UP_ULL(delta_ns, NSEC_PER_MSEC);
u32 bucket = rqsl_hist_bucket_idx(delta_ms);
- atomic64_t *buckets = is_nmi ? hist->nmi : hist->normal;
+ atomic64_t *buckets = hist->hist[ctx];
atomic64_inc(&buckets[bucket]);
+ if (!ret)
+ atomic64_inc(&hist->success[ctx]);
+ else
+ atomic64_inc(&hist->failure[ctx]);
}
static int rqspinlock_worker_fn(void *arg)
@@ -147,7 +158,8 @@ static int rqspinlock_worker_fn(void *arg)
}
start_ns = ktime_get_mono_fast_ns();
ret = raw_res_spin_lock_irqsave(worker_lock, flags);
- rqsl_record_lock_time(ktime_get_mono_fast_ns() - start_ns, false);
+ rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns,
+ RQSL_CTX_NORMAL, ret);
mdelay(normal_delay);
if (!ret)
raw_res_spin_unlock_irqrestore(worker_lock, flags);
@@ -190,7 +202,8 @@ static void nmi_cb(struct perf_event *event, struct perf_sample_data *data,
locks = rqsl_get_lock_pair(cpu);
start_ns = ktime_get_mono_fast_ns();
ret = raw_res_spin_lock_irqsave(locks.nmi_lock, flags);
- rqsl_record_lock_time(ktime_get_mono_fast_ns() - start_ns, true);
+ rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns,
+ RQSL_CTX_NMI, ret);
mdelay(nmi_delay);
@@ -300,12 +313,14 @@ static void rqsl_print_histograms(void)
u64 norm_counts[RQSL_NR_HIST_BUCKETS];
u64 nmi_counts[RQSL_NR_HIST_BUCKETS];
u64 total_counts[RQSL_NR_HIST_BUCKETS];
+ u64 norm_success, nmi_success, success_total;
+ u64 norm_failure, nmi_failure, failure_total;
u64 norm_total = 0, nmi_total = 0, total = 0;
bool has_slow = false;
for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) {
- norm_counts[i] = atomic64_read(&hist->normal[i]);
- nmi_counts[i] = atomic64_read(&hist->nmi[i]);
+ norm_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NORMAL][i]);
+ nmi_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NMI][i]);
total_counts[i] = norm_counts[i] + nmi_counts[i];
norm_total += norm_counts[i];
nmi_total += nmi_counts[i];
@@ -315,17 +330,33 @@ static void rqsl_print_histograms(void)
has_slow = true;
}
+ norm_success = atomic64_read(&hist->success[RQSL_CTX_NORMAL]);
+ nmi_success = atomic64_read(&hist->success[RQSL_CTX_NMI]);
+ norm_failure = atomic64_read(&hist->failure[RQSL_CTX_NORMAL]);
+ nmi_failure = atomic64_read(&hist->failure[RQSL_CTX_NMI]);
+ success_total = norm_success + nmi_success;
+ failure_total = norm_failure + nmi_failure;
+
if (!total)
continue;
if (!has_slow) {
- pr_err(" cpu%d: total %llu (normal %llu, nmi %llu), all within 0-%ums\n",
- cpu, total, norm_total, nmi_total, RQSL_SLOW_THRESHOLD_MS);
+ pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | "
+ "success %llu (normal %llu, nmi %llu) | "
+ "failure %llu (normal %llu, nmi %llu), all within 0-%ums\n",
+ cpu, total, norm_total, nmi_total,
+ success_total, norm_success, nmi_success,
+ failure_total, norm_failure, nmi_failure,
+ RQSL_SLOW_THRESHOLD_MS);
continue;
}
- pr_err(" cpu%d: total %llu (normal %llu, nmi %llu)\n",
- cpu, total, norm_total, nmi_total);
+ pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | "
+ "success %llu (normal %llu, nmi %llu) | "
+ "failure %llu (normal %llu, nmi %llu)\n",
+ cpu, total, norm_total, nmi_total,
+ success_total, norm_success, nmi_success,
+ failure_total, norm_failure, nmi_failure);
for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) {
unsigned int start_ms;