-rw-r--r--  include/asm-generic/rqspinlock.h                               60
-rw-r--r--  kernel/bpf/rqspinlock.c                                        69
-rw-r--r--  tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c   55

3 files changed, 108 insertions, 76 deletions
diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h
index 6d4244d643df..0f2dcbbfee2f 100644
--- a/include/asm-generic/rqspinlock.h
+++ b/include/asm-generic/rqspinlock.h
@@ -129,8 +129,8 @@ dec:
 	 * <error> for lock B
 	 * release_held_lock_entry
 	 *
-	 * try_cmpxchg_acquire for lock A
 	 * grab_held_lock_entry
+	 * try_cmpxchg_acquire for lock A
 	 *
 	 * Lack of any ordering means reordering may occur such that dec, inc
 	 * are done before entry is overwritten. This permits a remote lock
@@ -139,13 +139,8 @@ dec:
 	 * CPU holds a lock it is attempting to acquire, leading to false ABBA
 	 * diagnosis).
 	 *
-	 * In case of unlock, we will always do a release on the lock word after
-	 * releasing the entry, ensuring that other CPUs cannot hold the lock
-	 * (and make conclusions about deadlocks) until the entry has been
-	 * cleared on the local CPU, preventing any anomalies. Reordering is
-	 * still possible there, but a remote CPU cannot observe a lock in our
-	 * table which it is already holding, since visibility entails our
-	 * release store for the said lock has not retired.
+	 * The case of unlock is treated differently due to NMI reentrancy, see
+	 * comments in res_spin_unlock.
 	 *
 	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
 	 * reordered with each other, we either notice an empty NULL entry on
@@ -175,10 +170,22 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock)
 {
 	int val = 0;
 
-	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
-		grab_held_lock_entry(lock);
+	/*
+	 * Grab the deadlock detection entry before doing the cmpxchg, so that
+	 * reentrancy due to NMIs between the succeeding cmpxchg and creation of
+	 * held lock entry can correctly detect an acquisition attempt in the
+	 * interrupted context.
+	 *
+	 * cmpxchg lock A
+	 * <NMI>
+	 * res_spin_lock(A) --> missed AA, leads to timeout
+	 * </NMI>
+	 * grab_held_lock_entry(A)
+	 */
+	grab_held_lock_entry(lock);
+
+	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
 		return 0;
-	}
 	return resilient_queued_spin_lock_slowpath(lock, val);
 }
 
@@ -192,28 +199,25 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock)
 {
 	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
 
-	if (unlikely(rqh->cnt > RES_NR_HELD))
-		goto unlock;
-	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
-unlock:
 	/*
-	 * Release barrier, ensures correct ordering. See release_held_lock_entry
-	 * for details. Perform release store instead of queued_spin_unlock,
-	 * since we use this function for test-and-set fallback as well. When we
-	 * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
+	 * Release barrier, ensures correct ordering. Perform release store
+	 * instead of queued_spin_unlock, since we use this function for the TAS
+	 * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear
+	 * the full 4-byte lockword.
 	 *
-	 * Like release_held_lock_entry, we can do the release before the dec.
-	 * We simply care about not seeing the 'lock' in our table from a remote
-	 * CPU once the lock has been released, which doesn't rely on the dec.
+	 * Perform the smp_store_release before clearing the lock entry so that
+	 * NMIs landing in the unlock path can correctly detect AA issues. The
+	 * opposite order shown below may lead to missed AA checks:
 	 *
-	 * Unlike smp_wmb(), release is not a two way fence, hence it is
-	 * possible for a inc to move up and reorder with our clearing of the
-	 * entry. This isn't a problem however, as for a misdiagnosis of ABBA,
-	 * the remote CPU needs to hold this lock, which won't be released until
-	 * the store below is done, which would ensure the entry is overwritten
-	 * to NULL, etc.
+	 * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
+	 * <NMI>
+	 * res_spin_lock(A) --> missed AA, leads to timeout
+	 * </NMI>
+	 * smp_store_release(A->locked, 0)
 	 */
 	smp_store_release(&lock->locked, 0);
+	if (likely(rqh->cnt <= RES_NR_HELD))
+		WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
 	this_cpu_dec(rqspinlock_held_locks.cnt);
 }
 
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 3cc23d79a9fc..f7d0c8d4644e 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -196,32 +196,21 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
 	return 0;
 }
 
-static noinline int check_deadlock(rqspinlock_t *lock, u32 mask)
-{
-	int ret;
-
-	ret = check_deadlock_AA(lock);
-	if (ret)
-		return ret;
-	ret = check_deadlock_ABBA(lock, mask);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
 static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 				  struct rqspinlock_timeout *ts)
 {
-	u64 time = ktime_get_mono_fast_ns();
 	u64 prev = ts->cur;
+	u64 time;
 
 	if (!ts->timeout_end) {
-		ts->cur = time;
-		ts->timeout_end = time + ts->duration;
+		if (check_deadlock_AA(lock))
+			return -EDEADLK;
+		ts->cur = ktime_get_mono_fast_ns();
+		ts->timeout_end = ts->cur + ts->duration;
 		return 0;
 	}
 
+	time = ktime_get_mono_fast_ns();
 	if (time > ts->timeout_end)
 		return -ETIMEDOUT;
 
@@ -231,7 +220,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 	 */
 	if (prev + NSEC_PER_MSEC < time) {
 		ts->cur = time;
-		return check_deadlock(lock, mask);
+		return check_deadlock_ABBA(lock, mask);
 	}
 
 	return 0;
@@ -275,6 +264,10 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
 	int val, ret = 0;
 
 	RES_INIT_TIMEOUT(ts);
+	/*
+	 * The fast path is not invoked for the TAS fallback, so we must grab
+	 * the deadlock detection entry here.
+	 */
 	grab_held_lock_entry(lock);
 
 	/*
@@ -397,10 +390,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 		goto queue;
 	}
 
-	/*
-	 * Grab an entry in the held locks array, to enable deadlock detection.
-	 */
-	grab_held_lock_entry(lock);
+	/* Deadlock detection entry already held after failing fast path. */
 
 	/*
 	 * We're pending, wait for the owner to go away.
@@ -447,12 +437,21 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 	 * queuing.
 	 */
queue:
-	lockevent_inc(lock_slowpath);
 	/*
-	 * Grab deadlock detection entry for the queue path.
+	 * Do not queue if we're a waiter and someone is attempting this lock on
+	 * the same CPU. In case of NMIs, this prevents long timeouts where we
+	 * interrupt the pending waiter, and the owner, that will eventually
+	 * signal the head of our queue, both of which are logically but not
+	 * physically part of the queue, hence outside the scope of the idx > 0
+	 * check above for the trylock fallback.
 	 */
-	grab_held_lock_entry(lock);
+	if (check_deadlock_AA(lock)) {
+		ret = -EDEADLK;
+		goto err_release_entry;
+	}
 
+	lockevent_inc(lock_slowpath);
+	/* Deadlock detection entry already held after failing fast path. */
 	node = this_cpu_ptr(&rqnodes[0].mcs);
 	idx = node->count++;
 	tail = encode_tail(smp_processor_id(), idx);
@@ -464,19 +463,17 @@ queue:
 	 * not be nested NMIs taking spinlocks. That may not be true in
 	 * some architectures even though the chance of needing more than
 	 * 4 nodes will still be extremely unlikely. When that happens,
-	 * we fall back to spinning on the lock directly without using
-	 * any MCS node. This is not the most elegant solution, but is
-	 * simple enough.
+	 * we fall back to attempting a trylock operation without using
+	 * any MCS node. Unlike qspinlock which cannot fail, we have the
+	 * option of failing the slow path, and under contention, such a
+	 * trylock spinning will likely be treated unfairly due to lack of
+	 * queueing, hence do not spin.
 	 */
-	if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) {
+	if (unlikely(idx >= _Q_MAX_NODES || (in_nmi() && idx > 0))) {
 		lockevent_inc(lock_no_node);
-		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
-		while (!queued_spin_trylock(lock)) {
-			if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
-				lockevent_inc(rqspinlock_lock_timeout);
-				goto err_release_node;
-			}
-			cpu_relax();
+		if (!queued_spin_trylock(lock)) {
+			ret = -EDEADLK;
+			goto err_release_node;
 		}
 		goto release;
 	}
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
index e8dd3fbc6ea5..7b4ae5e81d32 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_test_rqspinlock.c
@@ -33,9 +33,16 @@ static const unsigned int rqsl_hist_ms[] = {
 };
 #define RQSL_NR_HIST_BUCKETS ARRAY_SIZE(rqsl_hist_ms)
 
+enum rqsl_context {
+	RQSL_CTX_NORMAL = 0,
+	RQSL_CTX_NMI,
+	RQSL_CTX_MAX,
+};
+
 struct rqsl_cpu_hist {
-	atomic64_t normal[RQSL_NR_HIST_BUCKETS];
-	atomic64_t nmi[RQSL_NR_HIST_BUCKETS];
+	atomic64_t hist[RQSL_CTX_MAX][RQSL_NR_HIST_BUCKETS];
+	atomic64_t success[RQSL_CTX_MAX];
+	atomic64_t failure[RQSL_CTX_MAX];
 };
 
 static DEFINE_PER_CPU(struct rqsl_cpu_hist, rqsl_cpu_hists);
@@ -117,14 +124,18 @@ static u32 rqsl_hist_bucket_idx(u32 delta_ms)
 	return RQSL_NR_HIST_BUCKETS - 1;
 }
 
-static void rqsl_record_lock_time(u64 delta_ns, bool is_nmi)
+static void rqsl_record_lock_result(u64 delta_ns, enum rqsl_context ctx, int ret)
 {
 	struct rqsl_cpu_hist *hist = this_cpu_ptr(&rqsl_cpu_hists);
 	u32 delta_ms = DIV_ROUND_UP_ULL(delta_ns, NSEC_PER_MSEC);
 	u32 bucket = rqsl_hist_bucket_idx(delta_ms);
-	atomic64_t *buckets = is_nmi ? hist->nmi : hist->normal;
+	atomic64_t *buckets = hist->hist[ctx];
 
 	atomic64_inc(&buckets[bucket]);
+	if (!ret)
+		atomic64_inc(&hist->success[ctx]);
+	else
+		atomic64_inc(&hist->failure[ctx]);
 }
 
 static int rqspinlock_worker_fn(void *arg)
@@ -147,7 +158,8 @@ static int rqspinlock_worker_fn(void *arg)
 		}
 		start_ns = ktime_get_mono_fast_ns();
 		ret = raw_res_spin_lock_irqsave(worker_lock, flags);
-		rqsl_record_lock_time(ktime_get_mono_fast_ns() - start_ns, false);
+		rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns,
+					RQSL_CTX_NORMAL, ret);
 		mdelay(normal_delay);
 		if (!ret)
 			raw_res_spin_unlock_irqrestore(worker_lock, flags);
@@ -190,7 +202,8 @@ static void nmi_cb(struct perf_event *event, struct perf_sample_data *data,
 	locks = rqsl_get_lock_pair(cpu);
 	start_ns = ktime_get_mono_fast_ns();
 	ret = raw_res_spin_lock_irqsave(locks.nmi_lock, flags);
-	rqsl_record_lock_time(ktime_get_mono_fast_ns() - start_ns, true);
+	rqsl_record_lock_result(ktime_get_mono_fast_ns() - start_ns,
+				RQSL_CTX_NMI, ret);
 
 	mdelay(nmi_delay);
 
@@ -300,12 +313,14 @@ static void rqsl_print_histograms(void)
 		u64 norm_counts[RQSL_NR_HIST_BUCKETS];
 		u64 nmi_counts[RQSL_NR_HIST_BUCKETS];
 		u64 total_counts[RQSL_NR_HIST_BUCKETS];
+		u64 norm_success, nmi_success, success_total;
+		u64 norm_failure, nmi_failure, failure_total;
 		u64 norm_total = 0, nmi_total = 0, total = 0;
 		bool has_slow = false;
 
 		for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) {
-			norm_counts[i] = atomic64_read(&hist->normal[i]);
-			nmi_counts[i] = atomic64_read(&hist->nmi[i]);
+			norm_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NORMAL][i]);
+			nmi_counts[i] = atomic64_read(&hist->hist[RQSL_CTX_NMI][i]);
 			total_counts[i] = norm_counts[i] + nmi_counts[i];
 			norm_total += norm_counts[i];
 			nmi_total += nmi_counts[i];
@@ -315,17 +330,33 @@ static void rqsl_print_histograms(void)
 				has_slow = true;
 		}
 
+		norm_success = atomic64_read(&hist->success[RQSL_CTX_NORMAL]);
+		nmi_success = atomic64_read(&hist->success[RQSL_CTX_NMI]);
+		norm_failure = atomic64_read(&hist->failure[RQSL_CTX_NORMAL]);
+		nmi_failure = atomic64_read(&hist->failure[RQSL_CTX_NMI]);
+		success_total = norm_success + nmi_success;
+		failure_total = norm_failure + nmi_failure;
+
 		if (!total)
 			continue;
 
 		if (!has_slow) {
-			pr_err(" cpu%d: total %llu (normal %llu, nmi %llu), all within 0-%ums\n",
-			       cpu, total, norm_total, nmi_total, RQSL_SLOW_THRESHOLD_MS);
+			pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | "
+			       "success %llu (normal %llu, nmi %llu) | "
+			       "failure %llu (normal %llu, nmi %llu), all within 0-%ums\n",
+			       cpu, total, norm_total, nmi_total,
+			       success_total, norm_success, nmi_success,
+			       failure_total, norm_failure, nmi_failure,
+			       RQSL_SLOW_THRESHOLD_MS);
			continue;
 		}
 
-		pr_err(" cpu%d: total %llu (normal %llu, nmi %llu)\n",
-		       cpu, total, norm_total, nmi_total);
+		pr_err(" cpu%d: total %llu (normal %llu, nmi %llu) | "
+		       "success %llu (normal %llu, nmi %llu) | "
+		       "failure %llu (normal %llu, nmi %llu)\n",
+		       cpu, total, norm_total, nmi_total,
+		       success_total, norm_success, nmi_success,
+		       failure_total, norm_failure, nmi_failure);
 
 		for (i = 0; i < RQSL_NR_HIST_BUCKETS; i++) {
 			unsigned int start_ms;
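The ordering rule that the res_spin_lock()/res_spin_unlock() changes above enforce can be modelled outside the kernel. The sketch below is a minimal single-threaded userspace toy, not the kernel implementation: toy_lock_old_order(), toy_lock_new_order(), nmi_acquire() and the held[] table are hypothetical stand-ins for res_spin_lock(), an NMI-context acquisition attempt, and the per-CPU held-locks table used for AA detection. It only illustrates why grabbing the held-lock entry before the cmpxchg lets a reentrant acquisition see the entry and report a deadlock instead of spinning until timeout.

/*
 * Toy userspace model (not the kernel code) of the ordering issue fixed above.
 * All names here are made up for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_HELD 4

static void *held[NR_HELD];	/* models the per-CPU held-locks table */
static int held_cnt;
static int lock_word;		/* models lock->val: 0 = unlocked, 1 = locked */

static bool aa_detected(void *lock)
{
	/* AA check: is this lock already in the interrupted context's table? */
	for (int i = 0; i < held_cnt; i++)
		if (held[i] == lock)
			return true;
	return false;
}

/* Reentrant acquisition attempt, standing in for res_spin_lock() in an NMI. */
static int nmi_acquire(void *lock)
{
	if (aa_detected(lock))
		return -1;	/* AA deadlock reported, NMI backs off */
	/* Otherwise the NMI would spin on lock_word until timeout. */
	return lock_word ? -2 : 0;
}

static int toy_lock_old_order(void *lock, int (*nmi)(void *))
{
	lock_word = 1;			/* cmpxchg succeeds first ...      */
	int nmi_ret = nmi(lock);	/* ... NMI lands here, table empty */
	held[held_cnt++] = lock;	/* ... entry recorded too late     */
	return nmi_ret;
}

static int toy_lock_new_order(void *lock, int (*nmi)(void *))
{
	held[held_cnt++] = lock;	/* entry recorded first            */
	int nmi_ret = nmi(lock);	/* NMI sees the entry, reports AA  */
	lock_word = 1;			/* cmpxchg happens afterwards      */
	return nmi_ret;
}

int main(void)
{
	int dummy;

	printf("old order: NMI result %d (would spin until timeout)\n",
	       toy_lock_old_order(&dummy, nmi_acquire));

	held_cnt = 0;
	lock_word = 0;
	printf("new order: NMI result %d (AA deadlock detected)\n",
	       toy_lock_new_order(&dummy, nmi_acquire));
	return 0;
}

The unlock side of the patch mirrors the same reasoning in the opposite direction: the smp_store_release() on the lock word is performed before the table entry is cleared, so an NMI landing between the two steps still finds the entry and reports AA rather than timing out.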
