Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r-- | kernel/sched/ext.c | 95
1 file changed, 64 insertions, 31 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5900b06fd036..751d73d500e5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -862,7 +862,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
-static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
+static int scx_ops_bypass_depth;
+static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock);
 static bool scx_ops_init_task_enabled;
 static bool scx_switching_all;
 DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -2633,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 
 	lockdep_assert_rq_held(rq);
 	rq->scx.flags |= SCX_RQ_IN_BALANCE;
-	rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
+	rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
 
 	if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
 	    unlikely(rq->scx.cpu_released)) {
@@ -2644,7 +2645,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		 * emitted in scx_next_task_picked().
 		 */
 		if (SCX_HAS_OP(cpu_acquire))
-			SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL);
+			SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
 		rq->scx.cpu_released = false;
 	}
 
@@ -2947,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 {
 	struct task_struct *prev = rq->curr;
 	struct task_struct *p;
+	bool prev_on_scx = prev->sched_class == &ext_sched_class;
+	bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+	bool kick_idle = false;
 
 	/*
-	 * If balance_scx() is telling us to keep running @prev, replenish slice
-	 * if necessary and keep running @prev. Otherwise, pop the first one
-	 * from the local DSQ.
-	 *
 	 * WORKAROUND:
 	 *
 	 * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
@@ -2961,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
 	 * which then ends up calling pick_task_scx() without preceding
 	 * balance_scx().
 	 *
-	 * For now, ignore cases where $prev is not on SCX. This isn't great and
-	 * can theoretically lead to stalls. However, for switch_all cases, this
-	 * happens only while a BPF scheduler is being loaded or unloaded, and,
-	 * for partial cases, fair will likely keep triggering this CPU.
+	 * Keep running @prev if possible and avoid stalling from entering idle
+	 * without balancing.
 	 *
-	 * Once fair is fixed, restore WARN_ON_ONCE().
+	 * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
+	 * if pick_task_scx() is called without preceding balance_scx().
 	 */
-	if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
-	    prev->sched_class == &ext_sched_class) {
+	if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
+		if (prev_on_scx) {
+			keep_prev = true;
+		} else {
+			keep_prev = false;
+			kick_idle = true;
+		}
+	} else if (unlikely(keep_prev && !prev_on_scx)) {
+		/* only allowed during transitions */
+		WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+		keep_prev = false;
+	}
+
+	/*
+	 * If balance_scx() is telling us to keep running @prev, replenish slice
+	 * if necessary and keep running @prev. Otherwise, pop the first one
+	 * from the local DSQ.
+	 */
+	if (keep_prev) {
 		p = prev;
 		if (!p->scx.slice)
 			p->scx.slice = SCX_SLICE_DFL;
 	} else {
 		p = first_local_task(rq);
-		if (!p)
+		if (!p) {
+			if (kick_idle)
+				scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
 			return NULL;
+		}
 
 		if (unlikely(!p->scx.slice)) {
 			if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
@@ -4256,14 +4275,14 @@ static const struct kset_uevent_ops scx_uevent_ops = {
  * Used by sched_fork() and __setscheduler_prio() to pick the matching
  * sched_class. dl/rt are already handled.
  */
-bool task_should_scx(struct task_struct *p)
+bool task_should_scx(int policy)
 {
 	if (!scx_enabled() ||
 	    unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
 		return false;
 	if (READ_ONCE(scx_switching_all))
 		return true;
-	return p->policy == SCHED_EXT;
+	return policy == SCHED_EXT;
 }
 
 /**
@@ -4298,18 +4317,20 @@ bool task_should_scx(struct task_struct *p)
  */
 static void scx_ops_bypass(bool bypass)
 {
-	int depth, cpu;
+	int cpu;
+	unsigned long flags;
 
+	raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags);
 	if (bypass) {
-		depth = atomic_inc_return(&scx_ops_bypass_depth);
-		WARN_ON_ONCE(depth <= 0);
-		if (depth != 1)
-			return;
+		scx_ops_bypass_depth++;
+		WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
+		if (scx_ops_bypass_depth != 1)
+			goto unlock;
 	} else {
-		depth = atomic_dec_return(&scx_ops_bypass_depth);
-		WARN_ON_ONCE(depth < 0);
-		if (depth != 0)
-			return;
+		scx_ops_bypass_depth--;
+		WARN_ON_ONCE(scx_ops_bypass_depth < 0);
+		if (scx_ops_bypass_depth != 0)
+			goto unlock;
 	}
 
 	/*
@@ -4326,7 +4347,7 @@ static void scx_ops_bypass(bool bypass)
 		struct rq_flags rf;
 		struct task_struct *p, *n;
 
-		rq_lock_irqsave(rq, &rf);
+		rq_lock(rq, &rf);
 
 		if (bypass) {
 			WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@@ -4362,11 +4383,13 @@ static void scx_ops_bypass(bool bypass)
 			sched_enq_and_set_task(&ctx);
 		}
 
-		rq_unlock_irqrestore(rq, &rf);
+		rq_unlock(rq, &rf);
 
 		/* resched to restore ticks and idle state */
 		resched_cpu(cpu);
 	}
+unlock:
+	raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags);
 }
 
 static void free_exit_info(struct scx_exit_info *ei)
@@ -4489,11 +4512,16 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		const struct sched_class *old_class = p->sched_class;
+		const struct sched_class *new_class =
+			__setscheduler_class(p->policy, p->prio);
 		struct sched_enq_and_set_ctx ctx;
 
+		if (old_class != new_class && p->se.sched_delayed)
+			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
-		p->sched_class = __setscheduler_class(p, p->prio);
+		p->sched_class = new_class;
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
@@ -4969,7 +4997,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
 			   cpu_possible_mask)) {
-		pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation");
+		pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
 		return -EINVAL;
 	}
 
@@ -5199,12 +5227,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		const struct sched_class *old_class = p->sched_class;
+		const struct sched_class *new_class =
+			__setscheduler_class(p->policy, p->prio);
 		struct sched_enq_and_set_ctx ctx;
 
+		if (old_class != new_class && p->se.sched_delayed)
+			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
 		p->scx.slice = SCX_SLICE_DFL;
-		p->sched_class = __setscheduler_class(p, p->prio);
+		p->sched_class = new_class;
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
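
The scx_ops_bypass() hunks above replace the atomic scx_ops_bypass_depth counter with a plain int guarded by __scx_ops_bypass_lock, so the depth check and the per-rq work it gates run in one critical section instead of racing between concurrent togglers. Below is a minimal userspace sketch of that nesting-counter-under-a-lock pattern, not the kernel code itself; the names (bypass_set, bypassing) and the printf standing in for the per-CPU requeueing are illustrative assumptions.

/*
 * Sketch of the pattern only: a plain depth counter plus one lock, so the
 * 0->1 and 1->0 transitions and the work they trigger happen atomically
 * with respect to other callers.  Not kernel APIs.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t bypass_lock = PTHREAD_MUTEX_INITIALIZER;
static int bypass_depth;	/* protected by bypass_lock */
static bool bypassing;		/* flipped only on 0<->1 edges */

static void bypass_set(bool bypass)
{
	pthread_mutex_lock(&bypass_lock);
	if (bypass) {
		/* only the first enabler does the real work */
		if (bypass_depth++ != 0)
			goto unlock;
	} else {
		/* only the last disabler undoes it */
		if (--bypass_depth != 0)
			goto unlock;
	}

	/* stand-in for the per-CPU requeueing done under the same lock */
	bypassing = bypass;
	printf("bypass %s\n", bypass ? "engaged" : "released");
unlock:
	pthread_mutex_unlock(&bypass_lock);
}

int main(void)
{
	bypass_set(true);	/* depth 0 -> 1: state flips */
	bypass_set(true);	/* nested: counter only */
	bypass_set(false);	/* still nested: counter only */
	bypass_set(false);	/* depth 1 -> 0: state flips back */
	return 0;
}

Only the edge transitions perform the expensive work; nested callers merely adjust the depth under the same lock, which is the property the switch away from atomic_inc_return()/atomic_dec_return() provides in the diff.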