summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--kernel/sched/ext.c62
1 files changed, 18 insertions, 44 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5da699cacde1..56a6d453543a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1818,49 +1818,12 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
return dst_rq;
}
-/*
- * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
- * banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artificial delays while the
- * bypass mode is switching to guarantee timely completion.
- */
-static void scx_breather(struct rq *rq)
-{
- u64 until;
-
- lockdep_assert_rq_held(rq);
-
- if (likely(!READ_ONCE(scx_aborting)))
- return;
-
- raw_spin_rq_unlock(rq);
-
- until = ktime_get_ns() + NSEC_PER_MSEC;
-
- do {
- int cnt = 1024;
- while (READ_ONCE(scx_aborting) && --cnt)
- cpu_relax();
- } while (READ_ONCE(scx_aborting) &&
- time_before64(ktime_get_ns(), until));
-
- raw_spin_rq_lock(rq);
-}
-
static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
struct scx_dispatch_q *dsq)
{
struct task_struct *p;
retry:
/*
- * This retry loop can repeatedly race against scx_bypass() dequeueing
- * tasks from @dsq trying to put the system into the bypass mode. On
- * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock
- * the machine into soft lockups. Give a breather.
- */
- scx_breather(rq);
-
- /*
* The caller can't expect to successfully consume a task if the task's
* addition to @dsq isn't guaranteed to be visible somehow. Test
* @dsq->list without locking and skip if it seems empty.
@@ -1873,6 +1836,17 @@ retry:
nldsq_for_each_task(p, dsq) {
struct rq *task_rq = task_rq(p);
+ /*
+ * This loop can lead to multiple lockup scenarios, e.g. the BPF
+ * scheduler can put an enormous number of affinitized tasks into
+ * a contended DSQ, or the outer retry loop can repeatedly race
+ * against scx_bypass() dequeueing tasks from @dsq trying to put
+ * the system into the bypass mode. This can easily live-lock the
+ * machine. If aborting, exit from all non-bypass DSQs.
+ */
+ if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS)
+ break;
+
if (rq == task_rq) {
task_unlink_from_dsq(p, dsq);
move_local_task_to_local_dsq(p, 0, dsq, rq);
@@ -5637,6 +5611,13 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
return false;
/*
+ * If the BPF scheduler keeps calling this function repeatedly, it can
+ * cause similar live-lock conditions as consume_dispatch_q().
+ */
+ if (unlikely(READ_ONCE(scx_aborting)))
+ return false;
+
+ /*
* Can be called from either ops.dispatch() locking this_rq() or any
* context where no rq lock is held. If latter, lock @p's task_rq which
* we'll likely need anyway.
@@ -5656,13 +5637,6 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
raw_spin_rq_lock(src_rq);
}
- /*
- * If the BPF scheduler keeps calling this function repeatedly, it can
- * cause similar live-lock conditions as consume_dispatch_q(). Insert a
- * breather if necessary.
- */
- scx_breather(src_rq);
-
locked_rq = src_rq;
raw_spin_lock(&src_dsq->lock);