diff options
Diffstat (limited to 'kernel/sched')
31 files changed, 15427 insertions, 4299 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 976092b7bd45..8ae86371ddcd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -22,6 +22,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_build_policy.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_build_utility.o += -DDISABLE_BRANCH_PROFILING +endif # # Build efficiency: # diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 991fc9002535..2b331822c7e7 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -9,7 +9,7 @@ static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; #ifdef CONFIG_SYSCTL -static struct ctl_table sched_autogroup_sysctls[] = { +static const struct ctl_table sched_autogroup_sysctls[] = { { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, @@ -19,7 +19,6 @@ static struct ctl_table sched_autogroup_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static void __init sched_autogroup_sysctl_init(void) @@ -151,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p) * see this thread after that: we can no longer use signal->autogroup. * See the PF_EXITING check in task_wants_autogroup(). */ - sched_move_task(p); + sched_move_task(p, true); } static void @@ -183,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * sched_autogroup_exit_task(). */ for_each_thread(p, t) - sched_move_task(t); + sched_move_task(t, true); unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index d9dc9ab3773f..72d97aa8b726 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -16,18 +16,25 @@ #include <linux/sched/clock.h> #include <linux/sched/cputime.h> #include <linux/sched/hotplug.h> +#include <linux/sched/isolation.h> #include <linux/sched/posix-timers.h> #include <linux/sched/rt.h> #include <linux/cpuidle.h> #include <linux/jiffies.h> +#include <linux/kobject.h> #include <linux/livepatch.h> +#include <linux/pm.h> #include <linux/psi.h> +#include <linux/rhashtable.h> +#include <linux/seq_buf.h> #include <linux/seqlock_api.h> #include <linux/slab.h> #include <linux/suspend.h> #include <linux/tsacct_kern.h> #include <linux/vtime.h> +#include <linux/sysrq.h> +#include <linux/percpu-rwsem.h> #include <uapi/linux/sched/types.h> @@ -52,3 +59,9 @@ #include "cputime.c" #include "deadline.c" +#ifdef CONFIG_SCHED_CLASS_EXT +# include "ext.c" +# include "ext_idle.c" +#endif + +#include "syscalls.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 80a3df49ab47..bf9d8db94b70 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -68,9 +68,7 @@ # include "cpufreq_schedutil.c" #endif -#ifdef CONFIG_SCHED_DEBUG -# include "debug.c" -#endif +#include "debug.c" #ifdef CONFIG_SCHEDSTATS # include "stats.c" diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 3c6193de9cde..a09655b48140 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -340,7 +340,7 @@ again: this_clock = sched_clock_local(my_scd); /* * We must enforce atomic readout on 32-bit, otherwise the - * update on the remote CPU can hit inbetween the readout of + * update on the remote CPU can hit in between the readout of * the low 32-bit and the high 32-bit portion. */ remote_clock = cmpxchg64(&scd->clock, 0, 0); @@ -444,7 +444,7 @@ notrace void sched_clock_tick_stable(void) } /* - * We are going deep-idle (irqs are disabled): + * We are going deep-idle (IRQs are disabled): */ notrace void sched_clock_idle_sleep_event(void) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7019a40457a6..62b3416f5e43 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2,9 +2,10 @@ /* * kernel/sched/core.c * - * Core kernel scheduler code and related syscalls + * Core kernel CPU scheduler code * * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 1998-2024 Ingo Molnar, Red Hat */ #include <linux/highmem.h> #include <linux/hrtimer_api.h> @@ -65,6 +66,7 @@ #include <linux/vtime.h> #include <linux/wait_api.h> #include <linux/workqueue_api.h> +#include <linux/livepatch_sched.h> #ifdef CONFIG_PREEMPT_DYNAMIC # ifdef CONFIG_GENERIC_ENTRY @@ -90,7 +92,6 @@ #include "autogroup.h" #include "pelt.h" #include "smp.h" -#include "stats.h" #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" @@ -108,7 +109,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); -EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); @@ -118,7 +119,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * @@ -128,7 +128,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); */ #define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | -const_debug unsigned int sysctl_sched_features = +__read_mostly unsigned int sysctl_sched_features = #include "features.h" 0; #undef SCHED_FEAT @@ -142,13 +142,12 @@ const_debug unsigned int sysctl_sched_features = */ __read_mostly int sysctl_resched_latency_warn_ms = 100; __read_mostly int sysctl_resched_latency_warn_once = 1; -#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ -const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; +__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; __read_mostly int scheduler_running; @@ -162,13 +161,19 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == &stop_sched_class) /* trumps deadline */ return -2; - if (rt_prio(p->prio)) /* includes deadline */ + if (p->dl_server) + return -1; /* deadline */ + + if (rt_or_dl_prio(p->prio)) return p->prio; /* [-1, 99] */ if (p->sched_class == &idle_sched_class) return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ + if (task_on_scx(p)) + return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ + + return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ } /* @@ -191,12 +196,33 @@ static inline bool prio_less(const struct task_struct *a, if (-pb < -pa) return false; - if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ - return !dl_time_before(a->dl.deadline, b->dl.deadline); + if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ + const struct sched_dl_entity *a_dl, *b_dl; + + a_dl = &a->dl; + /* + * Since,'a' and 'b' can be CFS tasks served by DL server, + * __task_prio() can return -1 (for DL) even for those. In that + * case, get to the dl_server's DL entity. + */ + if (a->dl_server) + a_dl = a->dl_server; + + b_dl = &b->dl; + if (b->dl_server) + b_dl = b->dl_server; + + return !dl_time_before(a_dl->deadline, b_dl->deadline); + } if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); +#ifdef CONFIG_SCHED_CLASS_EXT + if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ + return scx_prio_less(a, b, in_fi); +#endif + return false; } @@ -239,6 +265,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) void sched_core_enqueue(struct rq *rq, struct task_struct *p) { + if (p->se.sched_delayed) + return; + rq->core->core_task_seq++; if (!p->core_cookie) @@ -249,6 +278,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p) void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { + if (p->se.sched_delayed) + return; + rq->core->core_task_seq++; if (sched_core_enqueued(p)) { @@ -457,6 +489,16 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ +/* need a wrapper since we may need to trace from modules */ +EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); + +/* Call via the helper macro trace_set_current_state. */ +void __trace_set_current_state(int state_value) +{ + trace_sched_set_state_tp(current, state_value); +} +EXPORT_SYMBOL(__trace_set_current_state); + /* * Serialization rules: * @@ -514,6 +556,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). * + * Additionally it is possible to be ->on_rq but still be considered not + * runnable when p->se.sched_delayed is true. These tasks are on the runqueue + * but will be dequeued as soon as they get picked again. See the + * task_is_runnable() helper. + * * p->on_cpu <- { 0, 1 }: * * is set by prepare_task() and cleared by finish_task() such that it will be @@ -701,40 +748,43 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) s64 __maybe_unused steal = 0, irq_delta = 0; #ifdef CONFIG_IRQ_TIME_ACCOUNTING - irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + if (irqtime_enabled()) { + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; - /* - * Since irq_time is only updated on {soft,}irq_exit, we might run into - * this case when a previous update_rq_clock() happened inside a - * {soft,}irq region. - * - * When this happens, we stop ->clock_task and only update the - * prev_irq_time stamp to account for the part that fit, so that a next - * update will consume the rest. This ensures ->clock_task is - * monotonic. - * - * It does however cause some slight miss-attribution of {soft,}irq - * time, a more accurate solution would be to update the irq_time using - * the current rq->clock timestamp, except that would require using - * atomic ops. - */ - if (irq_delta > delta) - irq_delta = delta; - - rq->prev_irq_time += irq_delta; - delta -= irq_delta; - psi_account_irqtime(rq->curr, irq_delta); - delayacct_irq(rq->curr, irq_delta); + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}IRQ region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}IRQ + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; + delayacct_irq(rq->curr, irq_delta); + } #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { - steal = paravirt_steal_clock(cpu_of(rq)); + u64 prev_steal; + + steal = prev_steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - rq->prev_steal_time_rq += steal; + rq->prev_steal_time_rq = prev_steal; delta -= steal; } #endif @@ -751,22 +801,25 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) void update_rq_clock(struct rq *rq) { s64 delta; + u64 clock; lockdep_assert_rq_held(rq); if (rq->clock_update_flags & RQCF_ACT_SKIP) return; -#ifdef CONFIG_SCHED_DEBUG if (sched_feat(WARN_DOUBLE_CLOCK)) - SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED); rq->clock_update_flags |= RQCF_UPDATED; -#endif - delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + clock = sched_clock_cpu(cpu_of(rq)); + scx_rq_clock_update(rq, clock); + + delta = clock - rq->clock; if (delta < 0) return; rq->clock += delta; + update_rq_clock_task(rq, delta); } @@ -794,7 +847,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) rq_lock(rq, &rf); update_rq_clock(rq); - rq->curr->sched_class->task_tick(rq, rq->curr, 1); + rq->donor->sched_class->task_tick(rq, rq->curr, 1); rq_unlock(rq, &rf); return HRTIMER_NORESTART; @@ -826,7 +879,7 @@ static void __hrtick_start(void *arg) /* * Called to set the hrtick timer state. * - * called with rq->lock held and irqs disabled + * called with rq->lock held and IRQs disabled */ void hrtick_start(struct rq *rq, u64 delay) { @@ -850,7 +903,7 @@ void hrtick_start(struct rq *rq, u64 delay) /* * Called to set the hrtick timer state. * - * called with rq->lock held and irqs disabled + * called with rq->lock held and IRQs disabled */ void hrtick_start(struct rq *rq, u64 delay) { @@ -870,8 +923,7 @@ static void hrtick_rq_init(struct rq *rq) #ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - rq->hrtick_timer.function = hrtick; + hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -884,7 +936,7 @@ static inline void hrtick_rq_init(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ /* - * cmpxchg based fetch_or, macro so it works for different integer types + * try_cmpxchg based fetch_or() macro so it works for different integer types: */ #define fetch_or(ptr, mask) \ ({ \ @@ -903,10 +955,9 @@ static inline void hrtick_rq_init(struct rq *rq) * this avoids any races wrt polling state changes and thereby avoids * spurious IPIs. */ -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - struct thread_info *ti = task_thread_info(p); - return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); + return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG); } /* @@ -931,9 +982,9 @@ static bool set_nr_if_polling(struct task_struct *p) } #else -static inline bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - set_tsk_need_resched(p); + set_ti_thread_flag(ti, tif); return true; } @@ -1018,9 +1069,10 @@ void wake_up_q(struct wake_q_head *head) struct task_struct *task; task = container_of(node, struct task_struct, wake_q); - /* Task can safely be re-inserted now: */ node = node->next; - task->wake_q.next = NULL; + /* pairs with cmpxchg_relaxed() in __wake_q_add() */ + WRITE_ONCE(task->wake_q.next, NULL); + /* Task can safely be re-inserted now. */ /* * wake_up_process() executes a full barrier, which pairs with @@ -1038,28 +1090,70 @@ void wake_up_q(struct wake_q_head *head) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_curr(struct rq *rq) +static void __resched_curr(struct rq *rq, int tif) { struct task_struct *curr = rq->curr; + struct thread_info *cti = task_thread_info(curr); int cpu; lockdep_assert_rq_held(rq); - if (test_tsk_need_resched(curr)) + /* + * Always immediately preempt the idle task; no point in delaying doing + * actual work. + */ + if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY) + tif = TIF_NEED_RESCHED; + + if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED)) return; cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(curr); - set_preempt_need_resched(); + set_ti_thread_flag(cti, tif); + if (tif == TIF_NEED_RESCHED) + set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(curr)) - smp_send_reschedule(cpu); - else + if (set_nr_and_not_polling(cti, tif)) { + if (tif == TIF_NEED_RESCHED) + smp_send_reschedule(cpu); + } else { trace_sched_wake_idle_without_ipi(cpu); + } +} + +void resched_curr(struct rq *rq) +{ + __resched_curr(rq, TIF_NEED_RESCHED); +} + +#ifdef CONFIG_PREEMPT_DYNAMIC +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy); +static __always_inline bool dynamic_preempt_lazy(void) +{ + return static_branch_unlikely(&sk_dynamic_preempt_lazy); +} +#else +static __always_inline bool dynamic_preempt_lazy(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_LAZY); +} +#endif + +static __always_inline int get_lazy_tif_bit(void) +{ + if (dynamic_preempt_lazy()) + return TIF_NEED_RESCHED_LAZY; + + return TIF_NEED_RESCHED; +} + +void resched_curr_lazy(struct rq *rq) +{ + __resched_curr(rq, get_lazy_tif_bit()); } void resched_cpu(int cpu) @@ -1081,7 +1175,7 @@ void resched_cpu(int cpu) * * We don't do similar optimization for completely idle system, as * selecting an idle CPU will add more delays to the timers than intended - * (as that CPU's timer base may not be uptodate wrt jiffies etc). + * (as that CPU's timer base may not be up to date wrt jiffies etc). */ int get_nohz_timer_target(void) { @@ -1089,13 +1183,13 @@ int get_nohz_timer_target(void) struct sched_domain *sd; const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } - hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); guard(rcu)(); @@ -1110,7 +1204,7 @@ int get_nohz_timer_target(void) } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); + default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE); return default_cpu; } @@ -1141,7 +1235,7 @@ static void wake_up_idle_cpu(int cpu) * nohz functions that would need to follow TIF_NR_POLLING * clearing: * - * - On most archs, a simple fetch_or on ti::flags with a + * - On most architectures, a simple fetch_or on ti::flags with a * "0" value would be enough to know if an IPI needs to be sent. * * - x86 needs to perform a last need_resched() check between @@ -1154,7 +1248,7 @@ static void wake_up_idle_cpu(int cpu) * and testing of the above solutions didn't appear to report * much benefits. */ - if (set_nr_and_not_polling(rq->idle)) + if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -1204,9 +1298,9 @@ static void nohz_csd_func(void *info) WARN_ON(!(flags & NOHZ_KICK_MASK)); rq->idle_balance = idle_cpu(cpu); - if (rq->idle_balance && !need_resched()) { + if (rq->idle_balance) { rq->nohz_idle_balance = flags; - raise_softirq_irqoff(SCHED_SOFTIRQ); + __raise_softirq_irqoff(SCHED_SOFTIRQ); } } @@ -1255,11 +1349,14 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; - * if there's more than one we need the tick for involuntary - * preemption. + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks + * left. For CFS, if there's more than one we need the tick for + * involuntary preemption. For SCX, ask. */ - if (rq->nr_running > 1) + if (scx_enabled() && !scx_can_stop_tick(rq)) + return false; + + if (rq->cfs.h_nr_queued > 1) return false; /* @@ -1269,7 +1366,7 @@ bool sched_can_stop_tick(struct rq *rq) * dequeued by migrating while the constrained task continues to run. * E.g. going from 2->1 without going through pick_next_task(). */ - if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { + if (__need_bw_check(rq, rq->curr)) { if (cfs_task_bw_constrained(rq->curr)) return false; } @@ -1324,30 +1421,27 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p, bool update_load) +void set_load_weight(struct task_struct *p, bool update_load) { int prio = p->static_prio - MAX_RT_PRIO; - struct load_weight *load = &p->se.load; + struct load_weight lw; - /* - * SCHED_IDLE tasks get minimal weight: - */ if (task_has_idle_policy(p)) { - load->weight = scale_load(WEIGHT_IDLEPRIO); - load->inv_weight = WMULT_IDLEPRIO; - return; + lw.weight = scale_load(WEIGHT_IDLEPRIO); + lw.inv_weight = WMULT_IDLEPRIO; + } else { + lw.weight = scale_load(sched_prio_to_weight[prio]); + lw.inv_weight = sched_prio_to_wmult[prio]; } /* * SCHED_OTHER tasks have to update their load when changing their * weight */ - if (update_load && p->sched_class == &fair_sched_class) { - reweight_task(p, prio); - } else { - load->weight = scale_load(sched_prio_to_weight[prio]); - load->inv_weight = sched_prio_to_wmult[prio]; - } + if (update_load && p->sched_class->reweight_task) + p->sched_class->reweight_task(task_rq(p), p, &lw); + else + p->se.load = lw; } #ifdef CONFIG_UCLAMP_TASK @@ -1361,7 +1455,7 @@ static void set_load_weight(struct task_struct *p, bool update_load) * requests are serialized using a mutex to reduce the risk of conflicting * updates or API abuses. */ -static DEFINE_MUTEX(uclamp_mutex); +static __maybe_unused DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; @@ -1384,7 +1478,7 @@ static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY * This knob will not override the system default sched_util_clamp_min defined * above. */ -static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -1409,32 +1503,6 @@ static struct uclamp_se uclamp_default[UCLAMP_CNT]; */ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); -/* Integer rounded range for each bucket */ -#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) - -#define for_each_clamp_id(clamp_id) \ - for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) - -static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) -{ - return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); -} - -static inline unsigned int uclamp_none(enum uclamp_id clamp_id) -{ - if (clamp_id == UCLAMP_MIN) - return 0; - return SCHED_CAPACITY_SCALE; -} - -static inline void uclamp_se_set(struct uclamp_se *uc_se, - unsigned int value, bool user_defined) -{ - uc_se->value = value; - uc_se->bucket_id = uclamp_bucket_id(value); - uc_se->user_defined = user_defined; -} - static inline unsigned int uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) @@ -1658,7 +1726,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, bucket = &uc_rq->bucket[uc_se->bucket_id]; - SCHED_WARN_ON(!bucket->tasks); + WARN_ON_ONCE(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; @@ -1676,16 +1744,16 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, rq_clamp = uclamp_rq_get(rq, clamp_id); /* * Defensive programming: this should never happen. If it happens, - * e.g. due to future modification, warn and fixup the expected value. + * e.g. due to future modification, warn and fix up the expected value. */ - SCHED_WARN_ON(bucket->value > rq_clamp); + WARN_ON_ONCE(bucket->value > rq_clamp); if (bucket->value >= rq_clamp) { bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); uclamp_rq_set(rq, clamp_id, bkt_clamp); } } -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { enum uclamp_id clamp_id; @@ -1695,12 +1763,16 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. */ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) return; + /* Only inc the delayed task which being woken up. */ + if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)) + return; + for_each_clamp_id(clamp_id) uclamp_rq_inc_id(rq, p, clamp_id); @@ -1719,12 +1791,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. */ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) return; + if (p->se.sched_delayed) + return; + for_each_clamp_id(clamp_id) uclamp_rq_dec_id(rq, p, clamp_id); } @@ -1835,7 +1910,7 @@ static void uclamp_sync_util_min_rt_default(void) uclamp_update_util_min_rt_default(p); } -static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, +static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; @@ -1874,12 +1949,12 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, } if (update_root_tg) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_update_root_tg(); } if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_sync_util_min_rt_default(); } @@ -1898,107 +1973,6 @@ undo: } #endif -static int uclamp_validate(struct task_struct *p, - const struct sched_attr *attr) -{ - int util_min = p->uclamp_req[UCLAMP_MIN].value; - int util_max = p->uclamp_req[UCLAMP_MAX].value; - - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { - util_min = attr->sched_util_min; - - if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) - return -EINVAL; - } - - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { - util_max = attr->sched_util_max; - - if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) - return -EINVAL; - } - - if (util_min != -1 && util_max != -1 && util_min > util_max) - return -EINVAL; - - /* - * We have valid uclamp attributes; make sure uclamp is enabled. - * - * We need to do that here, because enabling static branches is a - * blocking operation which obviously cannot be done while holding - * scheduler locks. - */ - static_branch_enable(&sched_uclamp_used); - - return 0; -} - -static bool uclamp_reset(const struct sched_attr *attr, - enum uclamp_id clamp_id, - struct uclamp_se *uc_se) -{ - /* Reset on sched class change for a non user-defined clamp value. */ - if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && - !uc_se->user_defined) - return true; - - /* Reset on sched_util_{min,max} == -1. */ - if (clamp_id == UCLAMP_MIN && - attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && - attr->sched_util_min == -1) { - return true; - } - - if (clamp_id == UCLAMP_MAX && - attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && - attr->sched_util_max == -1) { - return true; - } - - return false; -} - -static void __setscheduler_uclamp(struct task_struct *p, - const struct sched_attr *attr) -{ - enum uclamp_id clamp_id; - - for_each_clamp_id(clamp_id) { - struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; - unsigned int value; - - if (!uclamp_reset(attr, clamp_id, uc_se)) - continue; - - /* - * RT by default have a 100% boost value that could be modified - * at runtime. - */ - if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - value = sysctl_sched_uclamp_util_min_rt_default; - else - value = uclamp_none(clamp_id); - - uclamp_se_set(uc_se, value, false); - - } - - if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) - return; - - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && - attr->sched_util_min != -1) { - uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], - attr->sched_util_min, true); - } - - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && - attr->sched_util_max != -1) { - uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], - attr->sched_util_max, true); - } -} - static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; @@ -2064,15 +2038,8 @@ static void __init init_uclamp(void) } #else /* !CONFIG_UCLAMP_TASK */ -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } -static inline int uclamp_validate(struct task_struct *p, - const struct sched_attr *attr) -{ - return -EOPNOTSUPP; -} -static void __setscheduler_uclamp(struct task_struct *p, - const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } @@ -2102,24 +2069,33 @@ unsigned long get_wchan(struct task_struct *p) return ip; } -static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) { - sched_info_enqueue(rq, p); - psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); - } + /* + * Can be before ->enqueue_task() because uclamp considers the + * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared + * in ->enqueue_task(). + */ + uclamp_rq_inc(rq, p, flags); - uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + psi_enqueue(p, flags); + + if (!(flags & ENQUEUE_RESTORE)) + sched_info_enqueue(rq, p); + if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); } -static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +/* + * Must only return false when DEQUEUE_SLEEP. + */ +inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) { if (sched_core_enabled(rq)) sched_core_dequeue(rq, p, flags); @@ -2127,13 +2103,17 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) { + if (!(flags & DEQUEUE_SAVE)) sched_info_dequeue(rq, p); - psi_dequeue(p, flags & DEQUEUE_SLEEP); - } + psi_dequeue(p, flags); + + /* + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' + * and mark the task ->sched_delayed. + */ uclamp_rq_dec(rq, p); - p->sched_class->dequeue_task(rq, p, flags); + return p->sched_class->dequeue_task(rq, p, flags); } void activate_task(struct rq *rq, struct task_struct *p, int flags) @@ -2151,56 +2131,23 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); - ASSERT_EXCLUSIVE_WRITER(p->on_rq); - - dequeue_task(rq, p, flags); -} - -static inline int __normal_prio(int policy, int rt_prio, int nice) -{ - int prio; + WARN_ON_ONCE(flags & DEQUEUE_SLEEP); - if (dl_policy(policy)) - prio = MAX_DL_PRIO - 1; - else if (rt_policy(policy)) - prio = MAX_RT_PRIO - 1 - rt_prio; - else - prio = NICE_TO_PRIO(nice); + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); - return prio; -} + /* + * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before* + * dequeue_task() and cleared *after* enqueue_task(). + */ -/* - * Calculate the expected normal priority: i.e. priority - * without taking RT-inheritance into account. Might be - * boosted by interactivity modifiers. Changes upon fork, - * setprio syscalls, and whenever the interactivity - * estimator recalculates. - */ -static inline int normal_prio(struct task_struct *p) -{ - return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio)); + dequeue_task(rq, p, flags); } -/* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might - * be boosted by RT tasks, or might be boosted by - * interactivity modifiers. Will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. - */ -static int effective_prio(struct task_struct *p) +static void block_task(struct rq *rq, struct task_struct *p, int flags) { - p->normal_prio = normal_prio(p); - /* - * If we are RT tasks or we were boosted to RT priority, - * keep the priority unchanged. Otherwise, update priority - * to the normal priority: - */ - if (!rt_prio(p->prio)) - return p->normal_prio; - return p->prio; + if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) + __block_task(rq, p); } /** @@ -2215,15 +2162,26 @@ inline int task_curr(const struct task_struct *p) } /* + * ->switching_to() is called with the pi_lock and rq_lock held and must not + * mess with locking. + */ +void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class) +{ + if (prev_class != p->sched_class && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); +} + +/* * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, * use the balance_callback list if you want balancing. * * this means any call to check_class_changed() must be followed by a call to * balance_callback(). */ -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) +void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) @@ -2236,16 +2194,18 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { - if (p->sched_class == rq->curr->sched_class) - rq->curr->sched_class->wakeup_preempt(rq, p, flags); - else if (sched_class_above(p->sched_class, rq->curr->sched_class)) + struct task_struct *donor = rq->donor; + + if (p->sched_class == donor->sched_class) + donor->sched_class->wakeup_preempt(rq, p, flags); + else if (sched_class_above(p->sched_class, donor->sched_class)) resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) + if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr)) rq_clock_skip_update(rq); } @@ -2327,6 +2287,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * just go back and repeat. */ rq = task_rq_lock(p, &rf); + /* + * If task is sched_delayed, force dequeue it, to avoid always + * hitting the tick timeout in the queued case + */ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); trace_sched_wait_task(p); running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); @@ -2392,9 +2358,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); -static int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx); - static void migrate_disable_switch(struct rq *rq, struct task_struct *p) { struct affinity_context ac = { @@ -2409,7 +2372,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) return; /* - * Violates locking rules! see comment in __do_set_cpus_allowed(). + * Violates locking rules! See comment in __do_set_cpus_allowed(). */ __do_set_cpus_allowed(p, &ac); } @@ -2419,6 +2382,12 @@ void migrate_disable(void) struct task_struct *p = current; if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif p->migration_disabled++; return; } @@ -2437,14 +2406,20 @@ void migrate_enable(void) .flags = SCA_MIGRATE_ENABLE, }; +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + if (p->migration_disabled > 1) { p->migration_disabled--; return; } - if (WARN_ON_ONCE(!p->migration_disabled)) - return; - /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). @@ -2475,7 +2450,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { /* When not in the task's cpumask, no point in looking further. */ - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + if (!task_allowed_on_cpu(p, cpu)) return false; /* migrate_disabled() must be allowed to finish. */ @@ -2484,7 +2459,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) /* Non kernel threads are not allowed during either online or offline. */ if (!(p->flags & PF_KTHREAD)) - return cpu_active(cpu) && task_cpu_possible(cpu, p); + return cpu_active(cpu); /* KTHREAD_IS_PER_CPU is always allowed. */ if (kthread_is_per_cpu(p)) @@ -2576,7 +2551,7 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, } /* - * migration_cpu_stop - this will be executed by a highprio stopper thread + * migration_cpu_stop - this will be executed by a high-prio stopper thread * and performs thread migration by bumping thread off CPU then * 'pushing' onto another runqueue. */ @@ -2712,9 +2687,7 @@ int push_cpu_stop(void *arg) // XXX validate p is still the highest prio task if (task_rq(p) == rq) { - deactivate_task(rq, p, 0); - set_task_cpu(p, lowest_rq->cpu); - activate_task(lowest_rq, p, 0); + move_queued_task_locked(rq, lowest_rq, p); resched_curr(lowest_rq); } @@ -2769,12 +2742,12 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) * XXX do further audits, this smells like something putrid. */ if (ctx->flags & SCA_MIGRATE_DISABLE) - SCHED_WARN_ON(!p->on_cpu); + WARN_ON_ONCE(!p->on_cpu); else lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) { /* @@ -2788,6 +2761,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) put_prev_task(rq, p); p->sched_class->set_cpus_allowed(p, ctx); + mm_set_cpus_allowed(p->mm, ctx->new_mask); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -2821,16 +2795,6 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu); } -static cpumask_t *alloc_user_cpus_ptr(int node) -{ - /* - * See do_set_cpus_allowed() above for the rcu_head usage. - */ - int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); - - return kmalloc_node(size, GFP_KERNEL, node); -} - int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) { @@ -3199,8 +3163,7 @@ out: * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -static int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx) +int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx) { struct rq_flags rf; struct rq *rq; @@ -3319,9 +3282,6 @@ out_free_mask: free_cpumask_var(new_mask); } -static int -__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); - /* * Restore the affinity of a task @p which was previously restricted by a * call to force_compatible_cpus_allowed_ptr(). @@ -3347,7 +3307,6 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { -#ifdef CONFIG_SCHED_DEBUG unsigned int state = READ_ONCE(p->__state); /* @@ -3385,7 +3344,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(!cpu_online(new_cpu)); WARN_ON_ONCE(is_migration_disabled(p)); -#endif trace_sched_migrate_task(p, new_cpu); @@ -3414,9 +3372,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - deactivate_task(src_rq, p, 0); - set_task_cpu(p, cpu); - activate_task(dst_rq, p, 0); + move_queued_task_locked(src_rq, dst_rq, p); wakeup_preempt(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); @@ -3600,7 +3556,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * * More yuck to audit. */ - do_set_cpus_allowed(p, task_cpu_possible_mask(p)); + do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); state = fail; break; case fail: @@ -3629,14 +3585,16 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) - cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); - else + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) { + cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); + *wake_flags |= WF_RQ_SELECTED; + } else { cpu = cpumask_any(p->cpus_ptr); + } /* * In order not to call set_task_cpu() on a blocking task we need @@ -3701,12 +3659,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) #else /* CONFIG_SMP */ -static inline int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx) -{ - return set_cpus_allowed_ptr(p, ctx->new_mask); -} - static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } static inline bool rq_has_pinned_tasks(struct rq *rq) @@ -3714,11 +3666,6 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) return false; } -static inline cpumask_t *alloc_user_cpus_ptr(int node) -{ - return NULL; -} - #endif /* !CONFIG_SMP */ static void @@ -3781,6 +3728,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->nr_uninterruptible--; #ifdef CONFIG_SMP + if (wake_flags & WF_RQ_SELECTED) + en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else @@ -3818,8 +3767,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } #endif - - p->dl_server = NULL; } /* @@ -3855,12 +3802,14 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { + update_rq_clock(rq); + if (p->se.sched_delayed) + enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); if (!task_on_cpu(rq, p)) { /* * When on_rq && !on_cpu the task is preempted, see if * it should preempt the task that is current now. */ - update_rq_clock(rq); wakeup_preempt(rq, p, wake_flags); } ttwu_do_wakeup(p); @@ -3901,8 +3850,8 @@ void sched_ttwu_pending(void *arg) * it is possible for select_idle_siblings() to stack a number * of tasks on this CPU during that window. * - * It is ok to clear ttwu_pending when another task pending. - * We will receive IPI after local irq enabled and then enqueue it. + * It is OK to clear ttwu_pending when another task pending. + * We will receive IPI after local IRQ enabled and then enqueue it. * Since now nr_running > 0, idle_cpu() will always get correct result. */ WRITE_ONCE(rq->ttwu_pending, 0); @@ -3986,6 +3935,10 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ + if (!scx_allow_ttwu_queue(p)) + return false; + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. @@ -4233,6 +4186,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) guard(preempt)(); int cpu, success = 0; + wake_flags |= WF_TTWU; + if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -4240,11 +4195,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * + * Specifically, given current runs ttwu() we must be before + * schedule()'s block_task(), as such this must not observe + * sched_delayed. + * * In particular: * - we rely on Program-Order guarantees for all the ordering, * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ + WARN_ON_ONCE(p->se.sched_delayed); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4360,7 +4320,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); + cpu = select_task_rq(p, p->wake_cpu, &wake_flags); if (task_cpu(p) != cpu) { if (p->in_iowait) { delayacct_blkio_end(p); @@ -4425,9 +4385,10 @@ static bool __task_needs_rq_lock(struct task_struct *p) * @arg: Argument to function. * * Fix the task in it's current state by avoiding wakeups and or rq operations - * and call @func(@arg) on it. This function can use ->on_rq and task_curr() - * to work out what the state is, if required. Given that @func can be invoked - * with a runqueue lock held, it had better be quite lightweight. + * and call @func(@arg) on it. This function can use task_is_runnable() and + * task_curr() to work out what the state is, if required. Given that @func + * can be invoked with a runqueue lock held, it had better be quite + * lightweight. * * Returns: * Whatever @func returns @@ -4467,12 +4428,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) * @cpu: The CPU on which to snapshot the task. * * Returns the task_struct pointer of the task "currently" running on - * the specified CPU. If the same task is running on that CPU throughout, - * the return value will be a pointer to that task's task_struct structure. - * If the CPU did any context switches even vaguely concurrently with the - * execution of this function, the return value will be a pointer to the - * task_struct structure of a randomly chosen task that was running on - * that CPU somewhere around the time that this function was executing. + * the specified CPU. * * If the specified CPU was offline, the return value is whatever it * is, perhaps a pointer to the task_struct structure of that CPU's idle @@ -4486,11 +4442,16 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) */ struct task_struct *cpu_curr_snapshot(int cpu) { + struct rq *rq = cpu_rq(cpu); struct task_struct *t; + struct rq_flags rf; - smp_mb(); /* Pairing determined by caller's synchronization design. */ + rq_lock_irqsave(rq, &rf); + smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */ t = rcu_dereference(cpu_curr(cpu)); + rq_unlock_irqrestore(rq, &rf); smp_mb(); /* Pairing determined by caller's synchronization design. */ + return t; } @@ -4520,7 +4481,8 @@ int wake_up_state(struct task_struct *p, unsigned int state) * Perform scheduler related setup for a newly forked process p. * p is forked by current. * - * __sched_fork() is basic setup used by init_idle() too: + * __sched_fork() is basic setup which is also used by sched_init() to + * initialize the boot CPU's idle task. */ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { @@ -4533,9 +4495,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; p->se.vlag = 0; - p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node); + /* A delayed task cannot be in clone(). */ + WARN_ON_ONCE(p->se.sched_delayed); + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif @@ -4553,6 +4517,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->rt.on_rq = 0; p->rt.on_list = 0; +#ifdef CONFIG_SCHED_CLASS_EXT + init_scx_entity(&p->scx); +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif @@ -4603,7 +4571,7 @@ static void reset_memory_tiering(void) } } -static int sysctl_numa_balancing(struct ctl_table *table, int write, +static int sysctl_numa_balancing(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -4672,7 +4640,7 @@ out: __setup("schedstats=", setup_schedstats); #ifdef CONFIG_PROC_SYSCTL -static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, +static int sysctl_schedstats(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -4695,7 +4663,7 @@ static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SYSCTL -static struct ctl_table sched_core_sysctls[] = { +static const struct ctl_table sched_core_sysctls[] = { #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", @@ -4741,7 +4709,6 @@ static struct ctl_table sched_core_sysctls[] = { .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ - {} }; static int __init sched_core_sysctl_init(void) { @@ -4784,6 +4751,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); + p->se.custom_slice = 0; + p->se.slice = sysctl_sched_base_slice; /* * We don't need the reset flag anymore after the fork. It has @@ -4794,10 +4763,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (dl_prio(p->prio)) return -EAGAIN; - else if (rt_prio(p->prio)) + + scx_pre_fork(p); + + if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; - else +#ifdef CONFIG_SCHED_CLASS_EXT + } else if (task_should_scx(p->policy)) { + p->sched_class = &ext_sched_class; +#endif + } else { p->sched_class = &fair_sched_class; + } init_entity_runnable_average(&p->se); @@ -4817,7 +4794,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; @@ -4844,11 +4821,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return scx_fork(p); +} + +void sched_cancel_fork(struct task_struct *p) +{ + scx_cancel_fork(p); } void sched_post_fork(struct task_struct *p) { uclamp_post_fork(p); + scx_post_fork(p); } unsigned long to_ratio(u64 period, u64 runtime) @@ -4878,6 +4863,7 @@ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; + int wake_flags = WF_FORK; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); @@ -4892,15 +4878,15 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); post_init_entity_util_avg(p); - activate_task(rq, p, ENQUEUE_NOCLOCK); + activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); - wakeup_preempt(rq, p, WF_FORK); + wakeup_preempt(rq, p, wake_flags); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* @@ -5096,7 +5082,7 @@ __splice_balance_callbacks(struct rq *rq, bool split) return head; } -static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) +struct balance_callback *splice_balance_callbacks(struct rq *rq) { return __splice_balance_callbacks(rq, true); } @@ -5106,7 +5092,7 @@ static void __balance_callbacks(struct rq *rq) do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); } -static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) +void balance_callbacks(struct rq *rq, struct balance_callback *head) { unsigned long flags; @@ -5123,15 +5109,6 @@ static inline void __balance_callbacks(struct rq *rq) { } -static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) -{ - return NULL; -} - -static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) -{ -} - #endif static inline void @@ -5234,7 +5211,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * * The context switch have flipped the stack from under us and restored the * local variables which were saved when this task called schedule() in the - * past. prev == current is still correct but we need to recalculate this_rq + * past. 'prev == current' is still correct but we need to recalculate this_rq * because prev may have moved to another CPU. */ static struct rq *finish_task_switch(struct task_struct *prev) @@ -5338,6 +5315,12 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ finish_task_switch(prev); + /* + * This is a special case: the newly created task has just + * switched the context for the first time. It is returning from + * schedule for the first time in this path. + */ + trace_sched_exit_tp(true, CALLER_ADDR0); preempt_enable(); if (current->set_child_tid) @@ -5557,9 +5540,9 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat); static inline void prefetch_curr_exec_start(struct task_struct *p) { #ifdef CONFIG_FAIR_GROUP_SCHED - struct sched_entity *curr = (&p->se)->cfs_rq->curr; + struct sched_entity *curr = p->se.cfs_rq->curr; #else - struct sched_entity *curr = (&task_rq(p)->cfs)->curr; + struct sched_entity *curr = task_rq(p)->cfs.curr; #endif prefetch(curr); prefetch(&curr->exec_start); @@ -5580,7 +5563,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) /* * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. - * Reading ->on_cpu is racy, but this is ok. + * Reading ->on_cpu is racy, but this is OK. * * If we race with it leaving CPU, we'll take a lock. So we're correct. * If we race with it entering CPU, unaccounted time is 0. This is @@ -5598,7 +5581,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * project cycles that may never be accounted to this * thread, breaking clock_gettime(). */ - if (task_current(rq, p) && task_on_rq_queued(p)) { + if (task_current_donor(rq, p) && task_on_rq_queued(p)) { prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); @@ -5609,7 +5592,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_SCHED_DEBUG static u64 cpu_resched_latency(struct rq *rq) { int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); @@ -5654,39 +5636,45 @@ static int __init setup_resched_latency_warn_ms(char *str) return 1; } __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); -#else -static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } -#endif /* CONFIG_SCHED_DEBUG */ /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. */ -void scheduler_tick(void) +void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - struct task_struct *curr = rq->curr; + /* accounting goes to the donor task */ + struct task_struct *donor; struct rq_flags rf; - unsigned long thermal_pressure; + unsigned long hw_pressure; u64 resched_latency; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) arch_scale_freq_tick(); sched_clock_tick(); rq_lock(rq, &rf); + donor = rq->donor; + + psi_account_irqtime(rq, donor, NULL); update_rq_clock(rq); - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); - curr->sched_class->task_tick(rq, curr, 0); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); + + if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY)) + resched_curr(rq); + + donor->sched_class->task_tick(rq, donor, 0); if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); - task_tick_mm_cid(rq, curr); + task_tick_mm_cid(rq, donor); + scx_tick(rq); rq_unlock(rq, &rf); @@ -5695,12 +5683,14 @@ void scheduler_tick(void) perf_event_task_tick(); - if (curr->flags & PF_WQ_WORKER) - wq_worker_tick(curr); + if (donor->flags & PF_WQ_WORKER) + wq_worker_tick(donor); #ifdef CONFIG_SMP - rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); + if (!scx_switched_all()) { + rq->idle_balance = idle_cpu(cpu); + sched_balance_trigger(rq); + } #endif } @@ -5761,6 +5751,12 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr = rq->curr; if (cpu_online(cpu)) { + /* + * Since this is a remote tick for full dynticks mode, + * we are always sure that there is no proxy (only a + * single task is running). + */ + WARN_ON_ONCE(rq->curr != rq->donor); update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -5794,7 +5790,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5815,7 +5811,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5980,18 +5976,32 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); - SCHED_WARN_ON(ct_state() == CONTEXT_USER); + WARN_ON_ONCE(ct_state() == CT_STATE_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq()->sched_count); } -static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) +static void prev_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) { -#ifdef CONFIG_SMP + const struct sched_class *start_class = prev->sched_class; const struct sched_class *class; + +#ifdef CONFIG_SCHED_CLASS_EXT + /* + * SCX requires a balance() call before every pick_task() including when + * waking up from SCHED_IDLE. If @start_class is below SCX, start from + * SCX instead. Also, set a flag to detect missing balance() call. + */ + if (scx_enabled()) { + rq->scx.flags |= SCX_RQ_BAL_PENDING; + if (sched_class_above(&ext_sched_class, start_class)) + start_class = &ext_sched_class; + } +#endif + /* * We must do the balancing pass before put_prev_task(), such * that when we release the rq->lock the task is in the same @@ -6000,13 +6010,10 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * We can terminate the balance pass as soon as we know there is * a runnable task of @class priority or higher. */ - for_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) + for_active_class_range(class, start_class, &idle_sched_class) { + if (class->balance && class->balance(rq, prev, rf)) break; } -#endif - - put_prev_task(rq, prev); } /* @@ -6018,6 +6025,11 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; struct task_struct *p; + rq->dl_server = NULL; + + if (scx_enabled()) + goto restart; + /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a @@ -6025,7 +6037,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * opportunity to pull in more work from other CPUs. */ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && - rq->nr_running == rq->cfs.h_nr_running)) { + rq->nr_running == rq->cfs.h_nr_queued)) { p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) @@ -6033,35 +6045,28 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* Assume the next prioritized class is idle_sched_class */ if (!p) { - put_prev_task(rq, prev); - p = pick_next_task_idle(rq); + p = pick_task_idle(rq); + put_prev_set_next_task(rq, prev, p); } - /* - * This is the fast path; it cannot be a DL server pick; - * therefore even if @p == @prev, ->dl_server must be NULL. - */ - if (p->dl_server) - p->dl_server = NULL; - return p; } restart: - put_prev_task_balance(rq, prev, rf); - - /* - * We've updated @prev and no longer need the server link, clear it. - * Must be done before ->pick_next_task() because that can (re)set - * ->dl_server. - */ - if (prev->dl_server) - prev->dl_server = NULL; + prev_balance(rq, prev, rf); - for_each_class(class) { - p = class->pick_next_task(rq); - if (p) - return p; + for_each_active_class(class) { + if (class->pick_next_task) { + p = class->pick_next_task(rq, prev); + if (p) + return p; + } else { + p = class->pick_task(rq); + if (p) { + put_prev_set_next_task(rq, prev, p); + return p; + } + } } BUG(); /* The idle class should always have a runnable task. */ @@ -6091,7 +6096,9 @@ static inline struct task_struct *pick_task(struct rq *rq) const struct sched_class *class; struct task_struct *p; - for_each_class(class) { + rq->dl_server = NULL; + + for_each_active_class(class) { p = class->pick_task(rq); if (p) return p; @@ -6129,6 +6136,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * another cpu during offline. */ rq->core_pick = NULL; + rq->core_dl_server = NULL; return __pick_next_task(rq, prev, rf); } @@ -6147,16 +6155,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); next = rq->core_pick; - if (next != prev) { - put_prev_task(rq, prev); - set_next_task(rq, next); - } - + rq->dl_server = rq->core_dl_server; rq->core_pick = NULL; - goto out; + rq->core_dl_server = NULL; + goto out_set_next; } - put_prev_task_balance(rq, prev, rf); + prev_balance(rq, prev, rf); smt_mask = cpu_smt_mask(cpu); need_sync = !!rq->core->core_cookie; @@ -6197,6 +6202,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) next = pick_task(rq); if (!next->core_cookie) { rq->core_pick = NULL; + rq->core_dl_server = NULL; /* * For robustness, update the min_vruntime_fi for * unconstrained picks as well. @@ -6224,7 +6230,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); - p = rq_i->core_pick = pick_task(rq_i); + rq_i->core_pick = p = pick_task(rq_i); + rq_i->core_dl_server = rq_i->dl_server; + if (!max || prio_less(max, p, fi_before)) max = p; } @@ -6248,6 +6256,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } rq_i->core_pick = p; + rq_i->core_dl_server = NULL; if (p == rq_i->idle) { if (rq_i->nr_running) { @@ -6308,6 +6317,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i == cpu) { rq_i->core_pick = NULL; + rq_i->core_dl_server = NULL; continue; } @@ -6316,6 +6326,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (rq_i->curr == rq_i->core_pick) { rq_i->core_pick = NULL; + rq_i->core_dl_server = NULL; continue; } @@ -6323,8 +6334,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } out_set_next: - set_next_task(rq, next); -out: + put_prev_set_next_task(rq, prev, next); if (rq->core->core_forceidle_count && next == rq->idle) queue_core_balance(rq); @@ -6370,10 +6380,7 @@ static bool try_steal_cookie(int this, int that) if (sched_task_is_throttled(p, this)) goto next; - deactivate_task(src, p, 0); - set_task_cpu(p, this); - activate_task(dst, p, 0); - + move_queued_task_locked(src, dst, p); resched_curr(dst); success = true; @@ -6560,19 +6567,53 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * Constants for the sched_mode argument of __schedule(). * * The mode argument allows RT enabled kernels to differentiate a - * preemption from blocking on an 'sleeping' spin/rwlock. Note that - * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to - * optimize the AND operation out and just check for zero. + * preemption from blocking on an 'sleeping' spin/rwlock. */ -#define SM_NONE 0x0 -#define SM_PREEMPT 0x1 -#define SM_RTLOCK_WAIT 0x2 +#define SM_IDLE (-1) +#define SM_NONE 0 +#define SM_PREEMPT 1 +#define SM_RTLOCK_WAIT 2 -#ifndef CONFIG_PREEMPT_RT -# define SM_MASK_PREEMPT (~0U) -#else -# define SM_MASK_PREEMPT SM_PREEMPT -#endif +/* + * Helper function for __schedule() + * + * If a task does not have signals pending, deactivate it + * Otherwise marks the task's __state as RUNNING + */ +static bool try_to_block_task(struct rq *rq, struct task_struct *p, + unsigned long *task_state_p) +{ + unsigned long task_state = *task_state_p; + int flags = DEQUEUE_NOCLOCK; + + if (signal_pending_state(task_state, p)) { + WRITE_ONCE(p->__state, TASK_RUNNING); + *task_state_p = TASK_RUNNING; + return false; + } + + p->sched_contributes_to_load = + (task_state & TASK_UNINTERRUPTIBLE) && + !(task_state & TASK_NOLOAD) && + !(task_state & TASK_FROZEN); + + if (unlikely(is_special_task_state(task_state))) + flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ + block_task(rq, p, flags); + return true; +} /* * __schedule() is the main scheduler function. @@ -6585,7 +6626,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * paths. For example, see arch/x86/entry_64.S. * * To drive preemption between tasks, the scheduler sets the flag in timer - * interrupt handler scheduler_tick(). + * interrupt handler sched_tick(). * * 3. Wakeups don't really cause entry into schedule(). They add a * task to the run-queue and that's it. @@ -6613,26 +6654,36 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(unsigned int sched_mode) +static void __sched notrace __schedule(int sched_mode) { struct task_struct *prev, *next; + /* + * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted + * as a preemption by schedule_debug() and RCU. + */ + bool preempt = sched_mode > SM_NONE; + bool is_switch = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; + trace_sched_entry_tp(preempt, CALLER_ADDR0); + cpu = smp_processor_id(); rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev, !!sched_mode); + schedule_debug(prev, preempt); if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); + klp_sched_try_switch(prev); + local_irq_disable(); - rcu_note_context_switch(!!sched_mode); + rcu_note_context_switch(preempt); /* * Make sure that signal_pending_state()->signal_pending() below @@ -6661,52 +6712,34 @@ static void __sched notrace __schedule(unsigned int sched_mode) switch_count = &prev->nivcsw; + /* Task state changes only considers SM_PREEMPT as preemption */ + preempt = sched_mode == SM_PREEMPT; + /* * We must load prev->state once (task_struct::state is volatile), such * that we form a control dependency vs deactivate_task() below. */ prev_state = READ_ONCE(prev->__state); - if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { - if (signal_pending_state(prev_state, prev)) { - WRITE_ONCE(prev->__state, TASK_RUNNING); - } else { - prev->sched_contributes_to_load = - (prev_state & TASK_UNINTERRUPTIBLE) && - !(prev_state & TASK_NOLOAD) && - !(prev_state & TASK_FROZEN); - - if (prev->sched_contributes_to_load) - rq->nr_uninterruptible++; - - /* - * __schedule() ttwu() - * prev_state = prev->state; if (p->on_rq && ...) - * if (prev_state) goto out; - * p->on_rq = 0; smp_acquire__after_ctrl_dep(); - * p->state = TASK_WAKING - * - * Where __schedule() and ttwu() have matching control dependencies. - * - * After this, schedule() must not care about p->state any more. - */ - deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); - - if (prev->in_iowait) { - atomic_inc(&rq->nr_iowait); - delayacct_blkio_start(); - } + if (sched_mode == SM_IDLE) { + /* SCX must consult the BPF scheduler to tell if rq is empty */ + if (!rq->nr_running && !scx_enabled()) { + next = prev; + goto picked; } + } else if (!preempt && prev_state) { + try_to_block_task(rq, prev, &prev_state); switch_count = &prev->nvcsw; } next = pick_next_task(rq, prev, &rf); + rq_set_donor(rq, next); +picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); -#ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; -#endif - if (likely(prev != next)) { + is_switch = prev != next; + if (likely(is_switch)) { rq->nr_switches++; /* * RCU users of rcu_dereference(rq->curr) may not see @@ -6738,9 +6771,11 @@ static void __sched notrace __schedule(unsigned int sched_mode) ++*switch_count; migrate_disable_switch(rq, prev); - psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + psi_account_irqtime(rq, prev, next); + psi_sched_switch(prev, next, !task_on_rq_queued(prev) || + prev->se.sched_delayed); - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); + trace_sched_switch(preempt, prev, next, prev_state); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -6749,6 +6784,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); } + trace_sched_exit_tp(is_switch, CALLER_ADDR0); } void __noreturn do_task_dead(void) @@ -6793,7 +6829,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * deadlock if the callback attempts to acquire a lock which is * already acquired. */ - SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); + WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT); /* * If we are going to sleep and we have plugged IO queued, @@ -6816,7 +6852,7 @@ static void sched_update_worker(struct task_struct *tsk) } } -static __always_inline void __schedule_loop(unsigned int sched_mode) +static __always_inline void __schedule_loop(int sched_mode) { do { preempt_disable(); @@ -6854,14 +6890,14 @@ void __sched schedule_idle(void) { /* * As this skips calling sched_submit_work(), which the idle task does - * regardless because that function is a nop when the task is in a + * regardless because that function is a NOP when the task is in a * TASK_RUNNING state, make sure this isn't used someplace that the * current task can be in any other state. Note, idle is always in the * TASK_RUNNING state. */ WARN_ON_ONCE(current->__state); do { - __schedule(SM_NONE); + __schedule(SM_IDLE); } while (need_resched()); } @@ -6875,7 +6911,7 @@ asmlinkage __visible void __sched schedule_user(void) * we find a better solution. * * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != CONTEXT_USER, but that will trigger + * should warn if prev_state != CT_STATE_USER, but that will trigger * too frequently to make sense yet. */ enum ctx_state prev_state = exception_enter(); @@ -7049,9 +7085,9 @@ EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); /* * This is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. + * off of IRQ context. + * Note, that this is called and return with IRQs disabled. This will + * protect us against recursive calling from IRQ contexts. */ asmlinkage __visible void __sched preempt_schedule_irq(void) { @@ -7076,21 +7112,25 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); + WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); -static void __setscheduler_prio(struct task_struct *p, int prio) +const struct sched_class *__setscheduler_class(int policy, int prio) { if (dl_prio(prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; + return &dl_sched_class; - p->prio = prio; + if (rt_prio(prio)) + return &rt_sched_class; + +#ifdef CONFIG_SCHED_CLASS_EXT + if (task_should_scx(policy)) + return &ext_sched_class; +#endif + + return &fair_sched_class; } #ifdef CONFIG_RT_MUTEXES @@ -7121,21 +7161,6 @@ void rt_mutex_post_schedule(void) lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0)); } -static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -{ - if (pi_task) - prio = min(prio, pi_task->prio); - - return prio; -} - -static inline int rt_effective_prio(struct task_struct *p, int prio) -{ - struct task_struct *pi_task = rt_mutex_get_top_task(p); - - return __rt_effective_prio(pi_task, prio); -} - /* * rt_mutex_setprio - set the current priority of a task * @p: task to boost @@ -7151,7 +7176,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct rq_flags rf; struct rq *rq; @@ -7185,7 +7210,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) goto out_unlock; /* - * Idle task boosting is a nono in general. There is one + * Idle task boosting is a no-no in general. There is one * exception, when PREEMPT_RT and NOHZ is active: * * The idle task calls get_next_timer_interrupt() and holds @@ -7209,8 +7234,13 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; + next_class = __setscheduler_class(p->policy, prio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, queue_flag); if (running) @@ -7246,7 +7276,10 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) p->rt.timeout = 0; } - __setscheduler_prio(p, prio); + p->sched_class = next_class; + p->prio = prio; + + check_class_changing(rq, p, prev_class); if (queued) enqueue_task(rq, p, queue_flag); @@ -7264,1334 +7297,17 @@ out_unlock: preempt_enable(); } -#else -static inline int rt_effective_prio(struct task_struct *p, int prio) -{ - return prio; -} -#endif - -void set_user_nice(struct task_struct *p, long nice) -{ - bool queued, running; - struct rq *rq; - int old_prio; - - if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) - return; - /* - * We have to be careful, if called from sys_setpriority(), - * the task might be in the middle of scheduling on another CPU. - */ - CLASS(task_rq_lock, rq_guard)(p); - rq = rq_guard.rq; - - update_rq_clock(rq); - - /* - * The RT priorities are set via sched_setscheduler(), but we still - * allow the 'normal' nice value to be set - but as expected - * it won't have any effect on scheduling until the task is - * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: - */ - if (task_has_dl_policy(p) || task_has_rt_policy(p)) { - p->static_prio = NICE_TO_PRIO(nice); - return; - } - - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - p->sched_class->prio_changed(rq, p, old_prio); -} -EXPORT_SYMBOL(set_user_nice); - -/* - * is_nice_reduction - check if nice value is an actual reduction - * - * Similar to can_nice() but does not perform a capability check. - * - * @p: task - * @nice: nice value - */ -static bool is_nice_reduction(const struct task_struct *p, const int nice) -{ - /* Convert nice value [19,-20] to rlimit style value [1,40]: */ - int nice_rlim = nice_to_rlimit(nice); - - return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); -} - -/* - * can_nice - check if a task can reduce its nice value - * @p: task - * @nice: nice value - */ -int can_nice(const struct task_struct *p, const int nice) -{ - return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE); -} - -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. - * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -SYSCALL_DEFINE1(nice, int, increment) -{ - long nice, retval; - - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. - */ - increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); - nice = task_nice(current) + increment; - - nice = clamp_val(nice, MIN_NICE, MAX_NICE); - if (increment < 0 && !can_nice(current, nice)) - return -EPERM; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); - return 0; -} - -#endif - -/** - * task_prio - return the priority value of a given task. - * @p: the task in question. - * - * Return: The priority value as seen by users in /proc. - * - * sched policy return value kernel prio user prio/nice - * - * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19] - * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99] - * deadline -101 -1 0 - */ -int task_prio(const struct task_struct *p) -{ - return p->prio - MAX_RT_PRIO; -} - -/** - * idle_cpu - is a given CPU idle currently? - * @cpu: the processor in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->curr != rq->idle) - return 0; - - if (rq->nr_running) - return 0; - -#ifdef CONFIG_SMP - if (rq->ttwu_pending) - return 0; -#endif - - return 1; -} - -/** - * available_idle_cpu - is a given CPU idle for enqueuing work. - * @cpu: the CPU in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int available_idle_cpu(int cpu) -{ - if (!idle_cpu(cpu)) - return 0; - - if (vcpu_is_preempted(cpu)) - return 0; - - return 1; -} - -/** - * idle_task - return the idle task for a given CPU. - * @cpu: the processor in question. - * - * Return: The idle task for the CPU @cpu. - */ -struct task_struct *idle_task(int cpu) -{ - return cpu_rq(cpu)->idle; -} - -#ifdef CONFIG_SCHED_CORE -int sched_core_idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (sched_core_enabled(rq) && rq->curr == rq->idle) - return 1; - - return idle_cpu(cpu); -} - -#endif - -#ifdef CONFIG_SMP -/* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. - * - * The scheduler tracks the following metrics: - * - * cpu_util_{cfs,rt,dl,irq}() - * cpu_bw_dl() - * - * Where the cfs,rt and dl util numbers are tracked with the same metric and - * synchronized windows and are thus directly comparable. - * - * The cfs,rt,dl utilization are the running times measured with rq->clock_task - * which excludes things like IRQ and steal-time. These latter are then accrued - * in the irq utilization. - * - * The DL bandwidth number otoh is not a measured metric but a value computed - * based on the task model parameters and gives the minimal utilization - * required to meet deadlines. - */ -unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long *min, - unsigned long *max) -{ - unsigned long util, irq, scale; - struct rq *rq = cpu_rq(cpu); - - scale = arch_scale_cpu_capacity(cpu); - - /* - * Early check to see if IRQ/steal time saturates the CPU, can be - * because of inaccuracies in how we track these -- see - * update_irq_load_avg(). - */ - irq = cpu_util_irq(rq); - if (unlikely(irq >= scale)) { - if (min) - *min = scale; - if (max) - *max = scale; - return scale; - } - - if (min) { - /* - * The minimum utilization returns the highest level between: - * - the computed DL bandwidth needed with the IRQ pressure which - * steals time to the deadline task. - * - The minimum performance requirement for CFS and/or RT. - */ - *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); - - /* - * When an RT task is runnable and uclamp is not used, we must - * ensure that the task will run at maximum compute capacity. - */ - if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) - *min = max(*min, scale); - } - - /* - * Because the time spend on RT/DL tasks is visible as 'lost' time to - * CFS tasks and we use the same metric to track the effective - * utilization (PELT windows are synchronized) we can directly add them - * to obtain the CPU's actual utilization. - */ - util = util_cfs + cpu_util_rt(rq); - util += cpu_util_dl(rq); - - /* - * The maximum hint is a soft bandwidth requirement, which can be lower - * than the actual utilization because of uclamp_max requirements. - */ - if (max) - *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); - - if (util >= scale) - return scale; - - /* - * There is still idle time; further improve the number by using the - * irq metric. Because IRQ/steal time is hidden from the task clock we - * need to scale the task numbers: - * - * max - irq - * U' = irq + --------- * U - * max - */ - util = scale_irq_capacity(util, irq, scale); - util += irq; - - return min(scale, util); -} - -unsigned long sched_cpu_util(int cpu) -{ - return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); -} -#endif /* CONFIG_SMP */ - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - * - * The task of @pid, if found. %NULL otherwise. - */ -static struct task_struct *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_vpid(pid) : current; -} - -static struct task_struct *find_get_task(pid_t pid) -{ - struct task_struct *p; - guard(rcu)(); - - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - - return p; -} - -DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T), - find_get_task(pid), pid_t pid) - -/* - * sched_setparam() passes in -1 for its policy, to let the functions - * it calls know not to change it. - */ -#define SETPARAM_POLICY -1 - -static void __setscheduler_params(struct task_struct *p, - const struct sched_attr *attr) -{ - int policy = attr->sched_policy; - - if (policy == SETPARAM_POLICY) - policy = p->policy; - - p->policy = policy; - - if (dl_policy(policy)) - __setparam_dl(p, attr); - else if (fair_policy(policy)) - p->static_prio = NICE_TO_PRIO(attr->sched_nice); - - /* - * __sched_setscheduler() ensures attr->sched_priority == 0 when - * !rt_policy. Always setting this ensures that things like - * getparam()/getattr() don't report silly values for !rt tasks. - */ - p->rt_priority = attr->sched_priority; - p->normal_prio = normal_prio(p); - set_load_weight(p, true); -} - -/* - * Check the target process has a UID that matches the current process's: - */ -static bool check_same_owner(struct task_struct *p) -{ - const struct cred *cred = current_cred(), *pcred; - guard(rcu)(); - - pcred = __task_cred(p); - return (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); -} - -/* - * Allow unprivileged RT tasks to decrease priority. - * Only issue a capable test if needed and only once to avoid an audit - * event on permitted non-privileged operations: - */ -static int user_check_sched_setscheduler(struct task_struct *p, - const struct sched_attr *attr, - int policy, int reset_on_fork) -{ - if (fair_policy(policy)) { - if (attr->sched_nice < task_nice(p) && - !is_nice_reduction(p, attr->sched_nice)) - goto req_priv; - } - - if (rt_policy(policy)) { - unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - - /* Can't set/change the rt policy: */ - if (policy != p->policy && !rlim_rtprio) - goto req_priv; - - /* Can't increase priority: */ - if (attr->sched_priority > p->rt_priority && - attr->sched_priority > rlim_rtprio) - goto req_priv; - } - - /* - * Can't set/change SCHED_DEADLINE policy at all for now - * (safest behavior); in the future we would like to allow - * unprivileged DL tasks to increase their relative deadline - * or reduce their runtime (both ways reducing utilization) - */ - if (dl_policy(policy)) - goto req_priv; - - /* - * Treat SCHED_IDLE as nice 20. Only allow a switch to - * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. - */ - if (task_has_idle_policy(p) && !idle_policy(policy)) { - if (!is_nice_reduction(p, task_nice(p))) - goto req_priv; - } - - /* Can't change other user's priorities: */ - if (!check_same_owner(p)) - goto req_priv; - - /* Normal users shall not reset the sched_reset_on_fork flag: */ - if (p->sched_reset_on_fork && !reset_on_fork) - goto req_priv; - - return 0; - -req_priv: - if (!capable(CAP_SYS_NICE)) - return -EPERM; - - return 0; -} - -static int __sched_setscheduler(struct task_struct *p, - const struct sched_attr *attr, - bool user, bool pi) -{ - int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio, queued, running; - const struct sched_class *prev_class; - struct balance_callback *head; - struct rq_flags rf; - int reset_on_fork; - int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - struct rq *rq; - bool cpuset_locked = false; - - /* The pi code expects interrupts enabled */ - BUG_ON(pi && in_interrupt()); -recheck: - /* Double check policy once rq lock held: */ - if (policy < 0) { - reset_on_fork = p->sched_reset_on_fork; - policy = oldpolicy = p->policy; - } else { - reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - - if (!valid_policy(policy)) - return -EINVAL; - } - - if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) - return -EINVAL; - - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, - * SCHED_BATCH and SCHED_IDLE is 0. - */ - if (attr->sched_priority > MAX_RT_PRIO-1) - return -EINVAL; - if ((dl_policy(policy) && !__checkparam_dl(attr)) || - (rt_policy(policy) != (attr->sched_priority != 0))) - return -EINVAL; - - if (user) { - retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork); - if (retval) - return retval; - - if (attr->sched_flags & SCHED_FLAG_SUGOV) - return -EINVAL; - - retval = security_task_setscheduler(p); - if (retval) - return retval; - } - - /* Update task specific "requested" clamps */ - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { - retval = uclamp_validate(p, attr); - if (retval) - return retval; - } - - /* - * SCHED_DEADLINE bandwidth accounting relies on stable cpusets - * information. - */ - if (dl_policy(policy) || dl_policy(p->policy)) { - cpuset_locked = true; - cpuset_lock(); - } - - /* - * Make sure no PI-waiters arrive (or leave) while we are - * changing the priority of the task: - * - * To be able to change p->policy safely, the appropriate - * runqueue lock must be held. - */ - rq = task_rq_lock(p, &rf); - update_rq_clock(rq); - - /* - * Changing the policy of the stop threads its a very bad idea: - */ - if (p == rq->stop) { - retval = -EINVAL; - goto unlock; - } - - /* - * If not changing anything there's no need to proceed further, - * but store a possible modification of reset_on_fork. - */ - if (unlikely(policy == p->policy)) { - if (fair_policy(policy) && attr->sched_nice != task_nice(p)) - goto change; - if (rt_policy(policy) && attr->sched_priority != p->rt_priority) - goto change; - if (dl_policy(policy) && dl_param_changed(p, attr)) - goto change; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) - goto change; - - p->sched_reset_on_fork = reset_on_fork; - retval = 0; - goto unlock; - } -change: - - if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0 && - !task_group_is_autogroup(task_group(p))) { - retval = -EPERM; - goto unlock; - } -#endif -#ifdef CONFIG_SMP - if (dl_bandwidth_enabled() && dl_policy(policy) && - !(attr->sched_flags & SCHED_FLAG_SUGOV)) { - cpumask_t *span = rq->rd->span; - - /* - * Don't allow tasks with an affinity mask smaller than - * the entire root_domain to become SCHED_DEADLINE. We - * will also fail if there's no bandwidth available. - */ - if (!cpumask_subset(span, p->cpus_ptr) || - rq->rd->dl_bw.bw == 0) { - retval = -EPERM; - goto unlock; - } - } #endif - } - - /* Re-check policy now with rq lock held: */ - if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { - policy = oldpolicy = -1; - task_rq_unlock(rq, p, &rf); - if (cpuset_locked) - cpuset_unlock(); - goto recheck; - } - - /* - * If setscheduling to SCHED_DEADLINE (or changing the parameters - * of a SCHED_DEADLINE task) we need to check if enough bandwidth - * is available. - */ - if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { - retval = -EBUSY; - goto unlock; - } - - p->sched_reset_on_fork = reset_on_fork; - oldprio = p->prio; - - newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice); - if (pi) { - /* - * Take priority boosted tasks into account. If the new - * effective priority is unchanged, we just store the new - * normal parameters and do not touch the scheduler class and - * the runqueue. This will be done when the task deboost - * itself. - */ - newprio = rt_effective_prio(p, newprio); - if (newprio == oldprio) - queue_flags &= ~DEQUEUE_MOVE; - } - - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); - - prev_class = p->sched_class; - - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); - } - __setscheduler_uclamp(p, attr); - - if (queued) { - /* - * We enqueue to tail when the priority of a task is - * increased (user space view). - */ - if (oldprio < p->prio) - queue_flags |= ENQUEUE_HEAD; - - enqueue_task(rq, p, queue_flags); - } - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); - - /* Avoid rq from going away on us: */ - preempt_disable(); - head = splice_balance_callbacks(rq); - task_rq_unlock(rq, p, &rf); - - if (pi) { - if (cpuset_locked) - cpuset_unlock(); - rt_mutex_adjust_pi(p); - } - - /* Run balance callbacks after we've adjusted the PI chain: */ - balance_callbacks(rq, head); - preempt_enable(); - - return 0; - -unlock: - task_rq_unlock(rq, p, &rf); - if (cpuset_locked) - cpuset_unlock(); - return retval; -} - -static int _sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool check) -{ - struct sched_attr attr = { - .sched_policy = policy, - .sched_priority = param->sched_priority, - .sched_nice = PRIO_TO_NICE(p->static_prio), - }; - - /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ - if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { - attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - policy &= ~SCHED_RESET_ON_FORK; - attr.sched_policy = policy; - } - - return __sched_setscheduler(p, &attr, check, true); -} -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Use sched_set_fifo(), read its comment. - * - * Return: 0 on success. An error code otherwise. - * - * NOTE that the task may be already dead. - */ -int sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return _sched_setscheduler(p, policy, param, true); -} - -int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -{ - return __sched_setscheduler(p, attr, true, true); -} - -int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -{ - return __sched_setscheduler(p, attr, false, true); -} -EXPORT_SYMBOL_GPL(sched_setattr_nocheck); - -/** - * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Just like sched_setscheduler, only don't bother checking if the - * current context has permission. For example, this is needed in - * stop_machine(): we create temporary high priority worker threads, - * but our caller might not have that capability. - * - * Return: 0 on success. An error code otherwise. - */ -int sched_setscheduler_nocheck(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return _sched_setscheduler(p, policy, param, false); -} - -/* - * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally - * incapable of resource management, which is the one thing an OS really should - * be doing. - * - * This is of course the reason it is limited to privileged users only. - * - * Worse still; it is fundamentally impossible to compose static priority - * workloads. You cannot take two correctly working static prio workloads - * and smash them together and still expect them to work. - * - * For this reason 'all' FIFO tasks the kernel creates are basically at: - * - * MAX_RT_PRIO / 2 - * - * The administrator _MUST_ configure the system, the kernel simply doesn't - * know enough information to make a sensible choice. - */ -void sched_set_fifo(struct task_struct *p) -{ - struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; - WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -} -EXPORT_SYMBOL_GPL(sched_set_fifo); - -/* - * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. - */ -void sched_set_fifo_low(struct task_struct *p) -{ - struct sched_param sp = { .sched_priority = 1 }; - WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -} -EXPORT_SYMBOL_GPL(sched_set_fifo_low); - -void sched_set_normal(struct task_struct *p, int nice) -{ - struct sched_attr attr = { - .sched_policy = SCHED_NORMAL, - .sched_nice = nice, - }; - WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); -} -EXPORT_SYMBOL_GPL(sched_set_normal); - -static int -do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -{ - struct sched_param lparam; - - if (!param || pid < 0) - return -EINVAL; - if (copy_from_user(&lparam, param, sizeof(struct sched_param))) - return -EFAULT; - - CLASS(find_get_task, p)(pid); - if (!p) - return -ESRCH; - - return sched_setscheduler(p, policy, &lparam); -} - -/* - * Mimics kernel/events/core.c perf_copy_attr(). - */ -static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -{ - u32 size; - int ret; - - /* Zero the full structure, so that a short copy will be nice: */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - /* ABI compatibility quirk: */ - if (!size) - size = SCHED_ATTR_SIZE_VER0; - if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) - goto err_size; - - ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); - if (ret) { - if (ret == -E2BIG) - goto err_size; - return ret; - } - - if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && - size < SCHED_ATTR_SIZE_VER1) - return -EINVAL; - - /* - * XXX: Do we want to be lenient like existing syscalls; or do we want - * to be strict and return an error on out-of-bounds values? - */ - attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); - - return 0; - -err_size: - put_user(sizeof(*attr), &uattr->size); - return -E2BIG; -} - -static void get_params(struct task_struct *p, struct sched_attr *attr) -{ - if (task_has_dl_policy(p)) - __getparam_dl(p, attr); - else if (task_has_rt_policy(p)) - attr->sched_priority = p->rt_priority; - else - attr->sched_nice = task_nice(p); -} - -/** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -{ - if (policy < 0) - return -EINVAL; - - return do_sched_setscheduler(pid, policy, param); -} - -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -{ - return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -} - -/** - * sys_sched_setattr - same as above, but with extended sched_attr - * @pid: the pid in question. - * @uattr: structure containing the extended parameters. - * @flags: for future extension. - */ -SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, flags) -{ - struct sched_attr attr; - int retval; - - if (!uattr || pid < 0 || flags) - return -EINVAL; - - retval = sched_copy_attr(uattr, &attr); - if (retval) - return retval; - - if ((int)attr.sched_policy < 0) - return -EINVAL; - if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) - attr.sched_policy = SETPARAM_POLICY; - - CLASS(find_get_task, p)(pid); - if (!p) - return -ESRCH; - - if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) - get_params(p, &attr); - - return sched_setattr(p, &attr); -} - -/** - * sys_sched_getscheduler - get the policy (scheduling class) of a thread - * @pid: the pid in question. - * - * Return: On success, the policy of the thread. Otherwise, a negative error - * code. - */ -SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -{ - struct task_struct *p; - int retval; - - if (pid < 0) - return -EINVAL; - - guard(rcu)(); - p = find_process_by_pid(pid); - if (!p) - return -ESRCH; - - retval = security_task_getscheduler(p); - if (!retval) { - retval = p->policy; - if (p->sched_reset_on_fork) - retval |= SCHED_RESET_ON_FORK; - } - return retval; -} - -/** - * sys_sched_getparam - get the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the RT priority. - * - * Return: On success, 0 and the RT priority is in @param. Otherwise, an error - * code. - */ -SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -{ - struct sched_param lp = { .sched_priority = 0 }; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - - scoped_guard (rcu) { - p = find_process_by_pid(pid); - if (!p) - return -ESRCH; - - retval = security_task_getscheduler(p); - if (retval) - return retval; - - if (task_has_rt_policy(p)) - lp.sched_priority = p->rt_priority; - } - - /* - * This one might sleep, we cannot do it with a spinlock held ... - */ - return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -} - -/* - * Copy the kernel size attribute structure (which might be larger - * than what user-space knows about) to user-space. - * - * Note that all cases are valid: user-space buffer can be larger or - * smaller than the kernel-space buffer. The usual case is that both - * have the same size. - */ -static int -sched_attr_copy_to_user(struct sched_attr __user *uattr, - struct sched_attr *kattr, - unsigned int usize) -{ - unsigned int ksize = sizeof(*kattr); - - if (!access_ok(uattr, usize)) - return -EFAULT; - - /* - * sched_getattr() ABI forwards and backwards compatibility: - * - * If usize == ksize then we just copy everything to user-space and all is good. - * - * If usize < ksize then we only copy as much as user-space has space for, - * this keeps ABI compatibility as well. We skip the rest. - * - * If usize > ksize then user-space is using a newer version of the ABI, - * which part the kernel doesn't know about. Just ignore it - tooling can - * detect the kernel's knowledge of attributes from the attr->size value - * which is set to ksize in this case. - */ - kattr->size = min(usize, ksize); - - if (copy_to_user(uattr, kattr, kattr->size)) - return -EFAULT; - - return 0; -} - -/** - * sys_sched_getattr - similar to sched_getparam, but with sched_attr - * @pid: the pid in question. - * @uattr: structure containing the extended parameters. - * @usize: sizeof(attr) for fwd/bwd comp. - * @flags: for future extension. - */ -SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, usize, unsigned int, flags) -{ - struct sched_attr kattr = { }; - struct task_struct *p; - int retval; - - if (!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags) - return -EINVAL; - - scoped_guard (rcu) { - p = find_process_by_pid(pid); - if (!p) - return -ESRCH; - - retval = security_task_getscheduler(p); - if (retval) - return retval; - - kattr.sched_policy = p->policy; - if (p->sched_reset_on_fork) - kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - get_params(p, &kattr); - kattr.sched_flags &= SCHED_FLAG_ALL; - -#ifdef CONFIG_UCLAMP_TASK - /* - * This could race with another potential updater, but this is fine - * because it'll correctly read the old or the new value. We don't need - * to guarantee who wins the race as long as it doesn't return garbage. - */ - kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; - kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -#endif - } - - return sched_attr_copy_to_user(uattr, &kattr, usize); -} - -#ifdef CONFIG_SMP -int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) -{ - /* - * If the task isn't a deadline task or admission control is - * disabled then we don't care about affinity changes. - */ - if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) - return 0; - - /* - * Since bandwidth control happens on root_domain basis, - * if admission test is enabled, we only admit -deadline - * tasks allowed to run on all the CPUs in the task's - * root_domain. - */ - guard(rcu)(); - if (!cpumask_subset(task_rq(p)->rd->span, mask)) - return -EBUSY; - - return 0; -} -#endif - -static int -__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) -{ - int retval; - cpumask_var_t cpus_allowed, new_mask; - - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) - return -ENOMEM; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, ctx->new_mask, cpus_allowed); - - ctx->new_mask = new_mask; - ctx->flags |= SCA_CHECK; - - retval = dl_task_check_affinity(p, new_mask); - if (retval) - goto out_free_new_mask; - - retval = __set_cpus_allowed_ptr(p, ctx); - if (retval) - goto out_free_new_mask; - - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset update. - * Just reset the cpumask to the cpuset's cpus_allowed. - */ - cpumask_copy(new_mask, cpus_allowed); - - /* - * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr() - * will restore the previous user_cpus_ptr value. - * - * In the unlikely event a previous user_cpus_ptr exists, - * we need to further restrict the mask to what is allowed - * by that old user_cpus_ptr. - */ - if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) { - bool empty = !cpumask_and(new_mask, new_mask, - ctx->user_mask); - - if (WARN_ON_ONCE(empty)) - cpumask_copy(new_mask, cpus_allowed); - } - __set_cpus_allowed_ptr(p, ctx); - retval = -EINVAL; - } - -out_free_new_mask: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); - return retval; -} - -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -{ - struct affinity_context ac; - struct cpumask *user_mask; - int retval; - - CLASS(find_get_task, p)(pid); - if (!p) - return -ESRCH; - - if (p->flags & PF_NO_SETAFFINITY) - return -EINVAL; - - if (!check_same_owner(p)) { - guard(rcu)(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) - return -EPERM; - } - - retval = security_task_setscheduler(p); - if (retval) - return retval; - - /* - * With non-SMP configs, user_cpus_ptr/user_mask isn't used and - * alloc_user_cpus_ptr() returns NULL. - */ - user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); - if (user_mask) { - cpumask_copy(user_mask, in_mask); - } else if (IS_ENABLED(CONFIG_SMP)) { - return -ENOMEM; - } - - ac = (struct affinity_context){ - .new_mask = in_mask, - .user_mask = user_mask, - .flags = SCA_USER, - }; - - retval = __sched_setaffinity(p, &ac); - kfree(ac.user_mask); - - return retval; -} - -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - struct cpumask *new_mask) -{ - if (len < cpumask_size()) - cpumask_clear(new_mask); - else if (len > cpumask_size()) - len = cpumask_size(); - - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -} - -/** - * sys_sched_setaffinity - set the CPU affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new CPU mask - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - cpumask_var_t new_mask; - int retval; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); - if (retval == 0) - retval = sched_setaffinity(pid, new_mask); - free_cpumask_var(new_mask); - return retval; -} - -long sched_getaffinity(pid_t pid, struct cpumask *mask) -{ - struct task_struct *p; - int retval; - - guard(rcu)(); - p = find_process_by_pid(pid); - if (!p) - return -ESRCH; - - retval = security_task_getscheduler(p); - if (retval) - return retval; - - guard(raw_spinlock_irqsave)(&p->pi_lock); - cpumask_and(mask, &p->cpus_mask, cpu_active_mask); - - return 0; -} - -/** - * sys_sched_getaffinity - get the CPU affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current CPU mask - * - * Return: size of CPU mask copied to user_mask_ptr on success. An - * error code otherwise. - */ -SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - int ret; - cpumask_var_t mask; - - if ((len * BITS_PER_BYTE) < nr_cpu_ids) - return -EINVAL; - if (len & (sizeof(unsigned long)-1)) - return -EINVAL; - - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - ret = sched_getaffinity(pid, mask); - if (ret == 0) { - unsigned int retlen = min(len, cpumask_size()); - - if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen)) - ret = -EFAULT; - else - ret = retlen; - } - free_cpumask_var(mask); - - return ret; -} - -static void do_sched_yield(void) -{ - struct rq_flags rf; - struct rq *rq; - - rq = this_rq_lock_irq(&rf); - - schedstat_inc(rq->yld_count); - current->sched_class->yield_task(rq); - - preempt_disable(); - rq_unlock_irq(rq, &rf); - sched_preempt_enable_no_resched(); - - schedule(); -} - -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. - * - * Return: 0. - */ -SYSCALL_DEFINE0(sched_yield) -{ - do_sched_yield(); - return 0; -} #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) { - if (should_resched(0)) { + if (should_resched(0) && !irqs_disabled()) { preempt_schedule_common(); return 1; } /* - * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick * whether the current CPU is in an RCU read-side critical section, * so the tick can report quiescent states even for CPUs looping * in kernel context. In contrast, in non-preemptible kernels, @@ -8600,6 +7316,8 @@ int __sched __cond_resched(void) * RCU quiescent state. Therefore, the following code causes * cond_resched() to report a quiescent state, but only when RCU * is in urgent need of one. + * A third case, preemptible, but non-PREEMPT_RCU provides for + * urgently needed quiescent states via rcu_flavor_sched_clock_irq(). */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); @@ -8624,7 +7342,6 @@ EXPORT_STATIC_CALL_TRAMP(might_resched); static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { - klp_sched_try_switch(); if (!static_branch_unlikely(&sk_dynamic_cond_resched)) return 0; return __cond_resched(); @@ -8724,6 +7441,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * VOLUNTARY: * cond_resched <- __cond_resched @@ -8731,6 +7449,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false * * FULL: * cond_resched <- RET0 @@ -8738,6 +7457,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); * preempt_schedule <- preempt_schedule * preempt_schedule_notrace <- preempt_schedule_notrace * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- false + * + * LAZY: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- true */ enum { @@ -8745,36 +7473,46 @@ enum { preempt_dynamic_none, preempt_dynamic_voluntary, preempt_dynamic_full, + preempt_dynamic_lazy, }; int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { +#ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; +#endif if (!strcmp(str, "full")) return preempt_dynamic_full; +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY + if (!strcmp(str, "lazy")) + return preempt_dynamic_lazy; +#endif + return -EINVAL; } +#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) + #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) +#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) #else #error "Unsupported PREEMPT_DYNAMIC mechanism" #endif static DEFINE_MUTEX(sched_dynamic_mutex); -static bool klp_override; static void __sched_dynamic_update(int mode) { @@ -8782,46 +7520,57 @@ static void __sched_dynamic_update(int mode) * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in * the ZERO state, which is invalid. */ - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); switch (mode) { case preempt_dynamic_none: - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: none\n"); break; case preempt_dynamic_voluntary: - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: voluntary\n"); break; case preempt_dynamic_full: - if (!klp_override) - preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: full\n"); break; + + case preempt_dynamic_lazy: + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_enable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: lazy\n"); + break; } preempt_dynamic_mode = mode; @@ -8834,36 +7583,6 @@ void sched_dynamic_update(int mode) mutex_unlock(&sched_dynamic_mutex); } -#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL - -static int klp_cond_resched(void) -{ - __klp_sched_try_switch(); - return __cond_resched(); -} - -void sched_dynamic_klp_enable(void) -{ - mutex_lock(&sched_dynamic_mutex); - - klp_override = true; - static_call_update(cond_resched, klp_cond_resched); - - mutex_unlock(&sched_dynamic_mutex); -} - -void sched_dynamic_klp_disable(void) -{ - mutex_lock(&sched_dynamic_mutex); - - klp_override = false; - __sched_dynamic_update(preempt_dynamic_mode); - - mutex_unlock(&sched_dynamic_mutex); -} - -#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ - static int __init setup_preempt_mode(char *str) { int mode = sched_dynamic_mode(str); @@ -8884,6 +7603,8 @@ static void __init preempt_dynamic_init(void) sched_dynamic_update(preempt_dynamic_none); } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { sched_dynamic_update(preempt_dynamic_voluntary); + } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + sched_dynamic_update(preempt_dynamic_lazy); } else { /* Default static call setting, nothing to do */ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); @@ -8904,106 +7625,60 @@ static void __init preempt_dynamic_init(void) PREEMPT_MODEL_ACCESSOR(none); PREEMPT_MODEL_ACCESSOR(voluntary); PREEMPT_MODEL_ACCESSOR(full); +PREEMPT_MODEL_ACCESSOR(lazy); + +#else /* !CONFIG_PREEMPT_DYNAMIC: */ -#else /* !CONFIG_PREEMPT_DYNAMIC */ +#define preempt_dynamic_mode -1 static inline void preempt_dynamic_init(void) { } -#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ +#endif /* CONFIG_PREEMPT_DYNAMIC */ -/** - * yield - yield the current processor to other threads. - * - * Do not ever use this function, there's a 99% chance you're doing it wrong. - * - * The scheduler is at all times free to pick the calling task as the most - * eligible task to run, if removing the yield() call from your code breaks - * it, it's already broken. - * - * Typical broken usage is: - * - * while (!event) - * yield(); - * - * where one assumes that yield() will let 'the other' process run that will - * make event true. If the current task is a SCHED_FIFO task that will never - * happen. Never use yield() as a progress guarantee!! - * - * If you want to use yield() to wait for something, use wait_event(). - * If you want to use yield() to be 'nice' for others, use cond_resched(). - * If you still want to use yield(), do not! - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - do_sched_yield(); -} -EXPORT_SYMBOL(yield); +const char *preempt_modes[] = { + "none", "voluntary", "full", "lazy", NULL, +}; -/** - * yield_to - yield the current processor to another thread in - * your thread group, or accelerate that thread toward the - * processor it's on. - * @p: target task - * @preempt: whether task preemption is allowed or not - * - * It's the caller's job to ensure that the target task struct - * can't go away on us before we can do any checks. - * - * Return: - * true (>0) if we indeed boosted the target task. - * false (0) if we failed to boost the target. - * -ESRCH if there's no task to yield to. - */ -int __sched yield_to(struct task_struct *p, bool preempt) +const char *preempt_model_str(void) { - struct task_struct *curr = current; - struct rq *rq, *p_rq; - int yielded = 0; - - scoped_guard (irqsave) { - rq = this_rq(); - -again: - p_rq = task_rq(p); - /* - * If we're the only runnable task on the rq and target rq also - * has only one task, there's absolutely no point in yielding. - */ - if (rq->nr_running == 1 && p_rq->nr_running == 1) - return -ESRCH; + bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) && + (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) || + IS_ENABLED(CONFIG_PREEMPT_LAZY)); + static char buf[128]; - guard(double_rq_lock)(rq, p_rq); - if (task_rq(p) != p_rq) - goto again; + if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) { + struct seq_buf s; - if (!curr->sched_class->yield_to_task) - return 0; + seq_buf_init(&s, buf, sizeof(buf)); + seq_buf_puts(&s, "PREEMPT"); - if (curr->sched_class != p->sched_class) - return 0; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + seq_buf_printf(&s, "%sRT%s", + brace ? "_{" : "_", + brace ? "," : ""); - if (task_on_cpu(p_rq, p) || !task_is_running(p)) - return 0; + if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { + seq_buf_printf(&s, "(%s)%s", + preempt_dynamic_mode > 0 ? + preempt_modes[preempt_dynamic_mode] : "undef", + brace ? "}" : ""); + return seq_buf_str(&s); + } - yielded = curr->sched_class->yield_to_task(rq, p); - if (yielded) { - schedstat_inc(rq->yld_count); - /* - * Make p's CPU reschedule; pick_next_entity - * takes care of fairness. - */ - if (preempt && rq != p_rq) - resched_curr(p_rq); + if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + seq_buf_printf(&s, "LAZY%s", + brace ? "}" : ""); + return seq_buf_str(&s); } + + return seq_buf_str(&s); } - if (yielded) - schedule(); + if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD)) + return "VOLUNTARY"; - return yielded; + return "NONE"; } -EXPORT_SYMBOL_GPL(yield_to); int io_schedule_prepare(void) { @@ -9046,126 +7721,9 @@ void __sched io_schedule(void) } EXPORT_SYMBOL(io_schedule); -/** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the maximum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_RT_PRIO-1; - break; - case SCHED_DEADLINE: - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the minimum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_DEADLINE: - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - } - return ret; -} - -static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -{ - unsigned int time_slice = 0; - int retval; - - if (pid < 0) - return -EINVAL; - - scoped_guard (rcu) { - struct task_struct *p = find_process_by_pid(pid); - if (!p) - return -ESRCH; - - retval = security_task_getscheduler(p); - if (retval) - return retval; - - scoped_guard (task_rq_lock, p) { - struct rq *rq = scope.rq; - if (p->sched_class->get_rr_interval) - time_slice = p->sched_class->get_rr_interval(rq, p); - } - } - - jiffies_to_timespec64(time_slice, t); - return 0; -} - -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct __kernel_timespec __user *, interval) -{ - struct timespec64 t; - int retval = sched_rr_get_interval(pid, &t); - - if (retval == 0) - retval = put_timespec64(&t, interval); - - return retval; -} - -#ifdef CONFIG_COMPAT_32BIT_TIME -SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, - struct old_timespec32 __user *, interval) -{ - struct timespec64 t; - int retval = sched_rr_get_interval(pid, &t); - - if (retval == 0) - retval = put_old_timespec32(&t, interval); - return retval; -} -#endif - void sched_show_task(struct task_struct *p) { - unsigned long free = 0; + unsigned long free; int ppid; if (!try_get_task_stack(p)) @@ -9175,20 +7733,19 @@ void sched_show_task(struct task_struct *p) if (task_is_running(p)) pr_cont(" running task "); -#ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); -#endif ppid = 0; rcu_read_lock(); if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n", + pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d task_flags:0x%04x flags:0x%08lx\n", free, task_pid_nr(p), task_tgid_nr(p), - ppid, read_task_thread_flags(p)); + ppid, p->flags, read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); + print_scx_info(KERN_INFO, p); show_stack(p, NULL, KERN_INFO); put_task_stack(p); } @@ -9237,10 +7794,9 @@ void show_state_filter(unsigned int state_filter) sched_show_task(p); } -#ifdef CONFIG_SCHED_DEBUG if (!state_filter) sysrq_sched_debug_show(); -#endif + rcu_read_unlock(); /* * Only show locks if all tasks are dumped: @@ -9268,8 +7824,6 @@ void __init init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - __sched_fork(0, idle); - raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_rq_lock(rq); @@ -9284,10 +7838,8 @@ void __init init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP /* - * It's possible that init_idle() gets called multiple times on a task, - * in that case do_set_cpus_allowed() will not do the right thing. - * - * And since this is boot we can forgo the serialization. + * No validation and serialization required at boot time and for + * setting up the idle tasks of not yet online CPUs. */ set_cpus_allowed_common(idle, &ac); #endif @@ -9306,6 +7858,7 @@ void __init init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->idle = idle; + rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP @@ -9395,7 +7948,7 @@ void sched_setnuma(struct task_struct *p, int nid) rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); @@ -9414,19 +7967,26 @@ void sched_setnuma(struct task_struct *p, int nid) #ifdef CONFIG_HOTPLUG_CPU /* - * Ensure that the idle task is using init_mm right before its CPU goes - * offline. + * Invoked on the outgoing CPU in context of the CPU hotplug thread + * after ensuring that there are no user space tasks left on the CPU. + * + * If there is a lazy mm in use on the hotplug thread, drop it and + * switch to init_mm. + * + * The reference count on init_mm is dropped in finish_cpu(). */ -void idle_task_exit(void) +static void sched_force_init_mm(void) { struct mm_struct *mm = current->active_mm; - BUG_ON(cpu_online(smp_processor_id())); - BUG_ON(current != this_rq()->idle); - if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + mmgrab_lazy_tlb(&init_mm); + local_irq_disable(); + current->active_mm = &init_mm; + switch_mm_irqs_off(mm, &init_mm, current); + local_irq_enable(); finish_arch_post_lock_switch(); + mmdrop_lazy_tlb(mm); } /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ @@ -9605,6 +8165,30 @@ void set_rq_offline(struct rq *rq) } } +static inline void sched_set_rq_online(struct rq *rq, int cpu) +{ + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + rq_unlock_irqrestore(rq, &rf); +} + +static inline void sched_set_rq_offline(struct rq *rq, int cpu) +{ + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + rq_unlock_irqrestore(rq, &rf); +} + /* * used to mark begin/end of suspend/resume: */ @@ -9627,7 +8211,7 @@ static void cpuset_cpu_active(void) * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); if (--num_cpus_frozen) return; /* @@ -9640,25 +8224,35 @@ static void cpuset_cpu_active(void) cpuset_update_active_cpus(); } -static int cpuset_cpu_inactive(unsigned int cpu) +static void cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - int ret = dl_bw_check_overflow(cpu); - - if (ret) - return ret; cpuset_update_active_cpus(); } else { num_cpus_frozen++; - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); } - return 0; +} + +static inline void sched_smt_present_inc(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_inc_cpuslocked(&sched_smt_present); +#endif +} + +static inline void sched_smt_present_dec(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_dec_cpuslocked(&sched_smt_present); +#endif } int sched_cpu_activate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; /* * Clear the balance_push callback and prepare to schedule @@ -9666,13 +8260,10 @@ int sched_cpu_activate(unsigned int cpu) */ balance_push_set(cpu, false); -#ifdef CONFIG_SCHED_SMT /* * When going up, increment the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) == 2) - static_branch_inc_cpuslocked(&sched_smt_present); -#endif + sched_smt_present_inc(cpu); set_cpu_active(cpu, true); if (sched_smp_initialized) { @@ -9681,6 +8272,8 @@ int sched_cpu_activate(unsigned int cpu) cpuset_cpu_active(); } + scx_rq_activate(rq); + /* * Put the rq online, if not already. This happens: * @@ -9690,12 +8283,7 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); - } - rq_unlock_irqrestore(rq, &rf); + sched_set_rq_online(rq, cpu); return 0; } @@ -9703,9 +8291,13 @@ int sched_cpu_activate(unsigned int cpu) int sched_cpu_deactivate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; int ret; + ret = dl_bw_deactivate(cpu); + + if (ret) + return ret; + /* * Remove CPU from nohz.idle_cpus_mask to prevent participating in * load balancing when not active @@ -9730,24 +8322,20 @@ int sched_cpu_deactivate(unsigned int cpu) * Specifically, we rely on ttwu to no longer target this CPU, see * ttwu_queue_cond() and is_cpu_allowed(). * - * Do sync before park smpboot threads to take care the rcu boost case. + * Do sync before park smpboot threads to take care the RCU boost case. */ synchronize_rcu(); - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - rq_unlock_irqrestore(rq, &rf); + sched_set_rq_offline(rq, cpu); + + scx_rq_deactivate(rq); -#ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) == 2) - static_branch_dec_cpuslocked(&sched_smt_present); + sched_smt_present_dec(cpu); +#ifdef CONFIG_SCHED_SMT sched_core_cpu_deactivate(cpu); #endif @@ -9755,13 +8343,7 @@ int sched_cpu_deactivate(unsigned int cpu) return 0; sched_update_numa(cpu, false); - ret = cpuset_cpu_inactive(cpu); - if (ret) { - balance_push_set(cpu, false); - set_cpu_active(cpu, true); - sched_update_numa(cpu, true); - return ret; - } + cpuset_cpu_inactive(cpu); sched_domains_numa_masks_clear(cpu); return 0; } @@ -9798,6 +8380,7 @@ int sched_cpu_starting(unsigned int cpu) int sched_cpu_wait_empty(unsigned int cpu) { balance_hotplug_wait(); + sched_force_init_mm(); return 0; } @@ -9805,7 +8388,7 @@ int sched_cpu_wait_empty(unsigned int cpu) * Since this CPU is going 'away' for a while, fold any nr_active delta we * might have. Called from the CPU stopper task after ensuring that the * stopper is the last running task on the CPU, so nr_active count is - * stable. We need to take the teardown thread which is calling this into + * stable. We need to take the tear-down thread which is calling this into * account, so we hand in adjust = 1 to the load calculation. * * Also see the comment "Global load-average calculations". @@ -9869,9 +8452,9 @@ void __init sched_init_smp(void) * CPU masks are stable and all blatant races in the below code cannot * happen. */ - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); sched_init_domains(cpu_active_mask); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) @@ -9924,11 +8507,15 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ - BUG_ON(&idle_sched_class != &fair_sched_class + 1 || - &fair_sched_class != &rt_sched_class + 1 || - &rt_sched_class != &dl_sched_class + 1); #ifdef CONFIG_SMP - BUG_ON(&dl_sched_class != &stop_sched_class + 1); + BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); +#endif + BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); + BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); + BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); +#ifdef CONFIG_SCHED_CLASS_EXT + BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); #endif wait_bit_init(); @@ -9952,6 +8539,9 @@ void __init sched_init(void) root_task_group.shares = ROOT_TASK_GROUP_LOAD; init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EXT_GROUP_SCHED + root_task_group.scx_weight = CGROUP_WEIGHT_DFL; +#endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -9962,8 +8552,6 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } - init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); - #ifdef CONFIG_SMP init_defrootdomain(); #endif @@ -9999,7 +8587,7 @@ void __init sched_init(void) /* * How much CPU bandwidth does root_task_group get? * - * In case of task-groups formed thr' the cgroup filesystem, it + * In case of task-groups formed through the cgroup filesystem, it * gets 100% of the CPU resources in the system. This overall * system CPU resource is divided among the tasks of * root_task_group and its child task-groups in a fair manner, @@ -10018,8 +8606,13 @@ void __init sched_init(void) init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ - rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED + /* + * This is required for init cpu because rt.c:__enable_runtime() + * starts working after scheduler_running, which is not the case + * yet. + */ + rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif #ifdef CONFIG_SMP @@ -10051,10 +8644,12 @@ void __init sched_init(void) #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); + fair_server_init(rq); #ifdef CONFIG_SCHED_CORE rq->core = rq; rq->core_pick = NULL; + rq->core_dl_server = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; rq->core_forceidle_count = 0; @@ -10067,6 +8662,7 @@ void __init sched_init(void) } set_load_weight(&init_task, false); + init_task.se.slice = sysctl_sched_base_slice, /* * The boot idle thread does lazy MMU switching as well: @@ -10088,6 +8684,7 @@ void __init sched_init(void) * but because we are the idle thread, we just pick up running again * when this runqueue becomes "idle". */ + __sched_fork(0, current); init_idle(current, smp_processor_id()); calc_load_update = jiffies + LOAD_FREQ; @@ -10097,6 +8694,7 @@ void __init sched_init(void) balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); + init_sched_ext_class(); psi_init(); @@ -10282,7 +8880,7 @@ void normalize_rt_tasks(void) schedstat_set(p->stats.sleep_start, 0); schedstat_set(p->stats.block_start, 0); - if (!dl_task(p) && !rt_task(p)) { + if (!rt_or_dl_task(p)) { /* * Renice negative nice level userspace * tasks back to 0: @@ -10301,7 +8899,7 @@ void normalize_rt_tasks(void) #if defined(CONFIG_KGDB_KDB) /* - * These functions are only useful for kdb. + * These functions are only useful for KDB. * * They can only be called when the whole system has been * stopped - every CPU needs to be quiescent, and no scheduling @@ -10382,6 +8980,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); alloc_uclamp_sched_group(tg, parent); return tg; @@ -10396,7 +8995,7 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) unsigned long flags; spin_lock_irqsave(&task_group_lock, flags); - list_add_rcu(&tg->list, &task_groups); + list_add_tail_rcu(&tg->list, &task_groups); /* Root should already exist: */ WARN_ON(!parent); @@ -10409,7 +9008,7 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) online_fair_sched_group(tg); } -/* rcu callback to free various structures associated with a task group */ +/* RCU callback to free various structures associated with a task group */ static void sched_unregister_group_rcu(struct rcu_head *rhp) { /* Now it should be safe to free those cfs_rqs: */ @@ -10445,7 +9044,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static struct task_group *sched_get_task_group(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -10457,13 +9056,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk) tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); - - return tg; -} - -static void sched_change_group(struct task_struct *tsk, struct task_group *group) -{ - tsk->sched_task_group = group; + tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) @@ -10480,27 +9073,18 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect * its new group. */ -void sched_move_task(struct task_struct *tsk) +void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - struct task_group *group; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - /* - * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous - * group changes. - */ - group = sched_get_task_group(tsk); - if (group == tsk->sched_task_group) - return; - update_rq_clock(rq); - running = task_current(rq, tsk); + running = task_current_donor(rq, tsk); queued = task_on_rq_queued(tsk); if (queued) @@ -10508,7 +9092,9 @@ void sched_move_task(struct task_struct *tsk) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, group); + sched_change_group(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -10523,11 +9109,6 @@ void sched_move_task(struct task_struct *tsk) } } -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct task_group, css) : NULL; -} - static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -10551,6 +9132,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); + int ret; + + ret = scx_tg_online(tg); + if (ret) + return ret; if (parent) sched_online_group(tg, parent); @@ -10565,6 +9151,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) return 0; } +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + scx_tg_offline(tg); +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -10582,19 +9175,23 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_unregister_group(tg); } -#ifdef CONFIG_RT_GROUP_SCHED static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_RT_GROUP_SCHED struct task_struct *task; struct cgroup_subsys_state *css; + if (!rt_group_sched_enabled()) + goto scx_check; + cgroup_taskset_for_each(task, css, tset) { if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } - return 0; +scx_check: +#endif /* CONFIG_RT_GROUP_SCHED */ + return scx_cgroup_can_attach(tset); } -#endif static void cpu_cgroup_attach(struct cgroup_taskset *tset) { @@ -10602,7 +9199,14 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); + sched_move_task(task, false); + + scx_cgroup_finish_attach(); +} + +static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + scx_cgroup_cancel_attach(tset); } #ifdef CONFIG_UCLAMP_TASK_GROUP @@ -10616,7 +9220,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) unsigned int clamps; lockdep_assert_held(&uclamp_mutex); - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent @@ -10708,7 +9312,7 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); guard(mutex)(&uclamp_mutex); guard(rcu)(); @@ -10779,22 +9383,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) } #endif /* CONFIG_UCLAMP_TASK_GROUP */ +#ifdef CONFIG_GROUP_SCHED_WEIGHT +static unsigned long tg_weight(struct task_group *tg) +{ #ifdef CONFIG_FAIR_GROUP_SCHED + return scale_load_down(tg->shares); +#else + return sched_weight_from_cgroup(tg->scx_weight); +#endif +} + static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + int ret; + if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; - return sched_group_set_shares(css_tg(css), scale_load(shareval)); + ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(shareval)); + return ret; } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct task_group *tg = css_tg(css); - - return (u64) scale_load_down(tg->shares); + return tg_weight(css_tg(css)); } +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ #ifdef CONFIG_CFS_BANDWIDTH static DEFINE_MUTEX(cfs_constraints_mutex); @@ -11140,7 +9758,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } #endif /* CONFIG_CFS_BANDWIDTH */ -#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, @@ -11168,7 +9785,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -11178,12 +9795,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, static int cpu_idle_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 idle) { - return sched_group_set_idle(css_tg(css), idle); + int ret; + + ret = sched_group_set_idle(css_tg(css), idle); + if (!ret) + scx_group_set_idle(css_tg(css), idle); + return ret; } #endif static struct cftype cpu_legacy_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT { .name = "shares", .read_u64 = cpu_shares_read_u64, @@ -11220,18 +9842,6 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_cfs_local_stat_show, }, #endif -#ifdef CONFIG_RT_GROUP_SCHED - { - .name = "rt_runtime_us", - .read_s64 = cpu_rt_runtime_read, - .write_s64 = cpu_rt_runtime_write, - }, - { - .name = "rt_period_us", - .read_u64 = cpu_rt_period_read_uint, - .write_u64 = cpu_rt_period_write_uint, - }, -#endif #ifdef CONFIG_UCLAMP_TASK_GROUP { .name = "uclamp.min", @@ -11249,6 +9859,55 @@ static struct cftype cpu_legacy_files[] = { { } /* Terminate */ }; +#ifdef CONFIG_RT_GROUP_SCHED +static struct cftype rt_group_files[] = { + { + .name = "rt_runtime_us", + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, + }, + { + .name = "rt_period_us", + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, + }, + { } /* Terminate */ +}; + +# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED +DEFINE_STATIC_KEY_FALSE(rt_group_sched); +# else +DEFINE_STATIC_KEY_TRUE(rt_group_sched); +# endif + +static int __init setup_rt_group_sched(char *str) +{ + long val; + + if (kstrtol(str, 0, &val) || val < 0 || val > 1) { + pr_warn("Unable to set rt_group_sched\n"); + return 1; + } + if (val) + static_branch_enable(&rt_group_sched); + else + static_branch_disable(&rt_group_sched); + + return 1; +} +__setup("rt_group_sched=", setup_rt_group_sched); + +static int __init cpu_rt_group_init(void) +{ + if (!rt_group_sched_enabled()) + return 0; + + WARN_ON(cgroup_add_legacy_cftypes(&cpu_cgrp_subsys, rt_group_files)); + return 0; +} +subsys_initcall(cpu_rt_group_init); +#endif /* CONFIG_RT_GROUP_SCHED */ + static int cpu_extra_stat_show(struct seq_file *sf, struct cgroup_subsys_state *css) { @@ -11293,38 +9952,35 @@ static int cpu_local_stat_show(struct seq_file *sf, return 0; } -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct task_group *tg = css_tg(css); - u64 weight = scale_load_down(tg->shares); - - return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); + return sched_weight_to_cgroup(tg_weight(css_tg(css))); } static int cpu_weight_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 weight) + struct cftype *cft, u64 cgrp_weight) { - /* - * cgroup weight knobs should use the common MIN, DFL and MAX - * values which are 1, 100 and 10000 respectively. While it loses - * a bit of range on both ends, it maps pretty well onto the shares - * value used by scheduler and the round-trip conversions preserve - * the original value over the entire range. - */ - if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + unsigned long weight; + int ret; + + if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; - weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), cgrp_weight); + return ret; } static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - unsigned long weight = scale_load_down(css_tg(css)->shares); + unsigned long weight = tg_weight(css_tg(css)); int last_delta = INT_MAX; int prio, delta; @@ -11343,7 +9999,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; - int idx; + int idx, ret; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; @@ -11352,9 +10008,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, idx = array_index_nospec(idx, 40); weight = sched_prio_to_weight[idx]; - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(weight)); + return ret; } -#endif +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, long period, long quota) @@ -11402,7 +10062,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, { struct task_group *tg = css_tg(of_css(of)); u64 period = tg_get_cfs_period(tg); - u64 burst = tg_get_cfs_burst(tg); + u64 burst = tg->cfs_bandwidth.burst; u64 quota; int ret; @@ -11414,7 +10074,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, #endif static struct cftype cpu_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, @@ -11468,14 +10128,14 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, .css_local_stat_show = cpu_local_stat_show, -#ifdef CONFIG_RT_GROUP_SCHED .can_attach = cpu_cgroup_can_attach, -#endif .attach = cpu_cgroup_attach, + .cancel_attach = cpu_cgroup_cancel_attach, .legacy_cftypes = cpu_legacy_files, .dfl_cftypes = cpu_files, .early_init = true, @@ -11486,7 +10146,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { void dump_cpu_task(int cpu) { - if (cpu == smp_processor_id() && in_hardirq()) { + if (in_hardirq() && cpu == smp_processor_id()) { struct pt_regs *regs; regs = get_irq_regs(); @@ -11527,10 +10187,10 @@ const int sched_prio_to_weight[40] = { }; /* - * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, pre-calculated. * * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions + * pre-calculated inverse to speed up arithmetics by turning divisions * into multiplications: */ const u32 sched_prio_to_wmult[40] = { @@ -11759,6 +10419,7 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, */ if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) return -1; + WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); return src_cid; } @@ -11771,7 +10432,8 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; struct mm_struct *mm = t->mm; - int src_cid, dst_cid, src_cpu; + int src_cid, src_cpu; + bool dst_cid_is_set; struct rq *src_rq; lockdep_assert_rq_held(dst_rq); @@ -11786,21 +10448,21 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) /* * Move the src cid if the dst cid is unset. This keeps id * allocation closest to 0 in cases where few threads migrate around - * many cpus. + * many CPUs. * - * If destination cid is already set, we may have to just clear - * the src cid to ensure compactness in frequent migrations - * scenarios. + * If destination cid or recent cid is already set, we may have + * to just clear the src cid to ensure compactness in frequent + * migrations scenarios. * * It is not useful to clear the src cid when the number of threads is - * greater or equal to the number of allowed cpus, because user-space + * greater or equal to the number of allowed CPUs, because user-space * can expect that the number of allowed cids can reach the number of - * allowed cpus. + * allowed CPUs. */ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); - dst_cid = READ_ONCE(dst_pcpu_cid->cid); - if (!mm_cid_is_unset(dst_cid) && - atomic_read(&mm->mm_users) >= t->nr_cpus_allowed) + dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || + !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); + if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) return; src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); src_rq = cpu_rq(src_cpu); @@ -11811,13 +10473,14 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) src_cid); if (src_cid == -1) return; - if (!mm_cid_is_unset(dst_cid)) { + if (dst_cid_is_set) { __mm_cid_put(mm, src_cid); return; } /* Move src_cid to dst cpu. */ mm_cid_snapshot_time(dst_rq, mm); WRITE_ONCE(dst_pcpu_cid->cid, src_cid); + WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); } static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, @@ -11929,7 +10592,7 @@ static void task_mm_cid_work(struct callback_head *work) struct mm_struct *mm; int weight, cpu; - SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); + WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); work->next = work; /* Prevent double-add */ if (t->flags & PF_EXITING) @@ -11989,6 +10652,8 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) return; + + /* No page allocation under rq lock */ task_work_add(curr, work, TWA_RESUME); } @@ -12054,9 +10719,8 @@ void sched_mm_cid_after_execve(struct task_struct *t) * Matches barrier in sched_mm_cid_remote_clear_old(). */ smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); } - rseq_set_notify_resume(t); } void sched_mm_cid_fork(struct task_struct *t) @@ -12065,3 +10729,38 @@ void sched_mm_cid_fork(struct task_struct *t) t->mm_cid_active = 1; } #endif + +#ifdef CONFIG_SCHED_CLASS_EXT +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); + + *ctx = (struct sched_enq_and_set_ctx){ + .p = p, + .queue_flags = queue_flags, + .queued = task_on_rq_queued(p), + .running = task_current(rq, p), + }; + + update_rq_clock(rq); + if (ctx->queued) + dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + if (ctx->running) + put_prev_task(rq, p); +} + +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(ctx->p); + + lockdep_assert_rq_held(rq); + + if (ctx->queued) + enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + if (ctx->running) + set_next_task(rq, ctx->p); +} +#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index a57fd8f27498..c4606ca89210 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -65,7 +65,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, * a cookie until after we've removed it, we must have core scheduling * enabled here. */ - SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); + WARN_ON_ONCE((p->core_cookie || cookie) && !sched_core_enabled(rq)); if (sched_core_enqueued(p)) sched_core_dequeue(rq, p, DEQUEUE_SAVE); @@ -279,7 +279,7 @@ void __sched_core_account_forceidle(struct rq *rq) continue; /* - * Note: this will account forceidle to the current cpu, even + * Note: this will account forceidle to the current CPU, even * if it comes from our SMT sibling. */ __account_forceidle_time(p, delta); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index eece6244f9d2..461242ec958a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -81,9 +81,23 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (!cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (unlikely(sg_policy->limits_changed)) { - sg_policy->limits_changed = false; + if (unlikely(READ_ONCE(sg_policy->limits_changed))) { + WRITE_ONCE(sg_policy->limits_changed, false); sg_policy->need_freq_update = true; + + /* + * The above limits_changed update must occur before the reads + * of policy limits in cpufreq_driver_resolve_freq() or a policy + * limits update might be missed, so use a memory barrier to + * ensure it. + * + * This pairs with the write memory barrier in sugov_limits(). + */ + smp_mb(); + + return true; + } else if (sg_policy->need_freq_update) { + /* ignore_dl_rate_limit() wants a new frequency to be found. */ return true; } @@ -95,10 +109,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->need_freq_update) - sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); - else if (sg_policy->next_freq == next_freq) + if (sg_policy->need_freq_update) { + sg_policy->need_freq_update = false; + /* + * The policy limits have changed, but if the return value of + * cpufreq_driver_resolve_freq() after applying the new limits + * is still equal to the previously selected frequency, the + * driver callback need not be invoked unless the driver + * specifically wants that to happen on every update of the + * policy limits. + */ + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) + return false; + } else if (sg_policy->next_freq == next_freq) { return false; + } sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -197,8 +223,10 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) { - unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); + unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu); + if (!scx_switched_all()) + util += cpu_util_cfs_boost(sg_cpu->cpu); util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); util = max(util, boost); sg_cpu->bw_min = min; @@ -325,16 +353,35 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, } #ifdef CONFIG_NO_HZ_COMMON -static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); - bool ret = idle_calls == sg_cpu->saved_idle_calls; + unsigned long idle_calls; + bool ret; + + /* + * The heuristics in this function is for the fair class. For SCX, the + * performance target comes directly from the BPF scheduler. Let's just + * follow it. + */ + if (scx_switched_all()) + return false; + + /* if capped by uclamp_max, always update to be in compliance */ + if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) + return false; + + /* + * Maintain the frequency if the CPU has not been idle recently, as + * reduction is likely to be premature. + */ + idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); + ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; return ret; } #else -static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } #endif /* CONFIG_NO_HZ_COMMON */ /* @@ -344,7 +391,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) - sg_cpu->sg_policy->limits_changed = true; + sg_cpu->sg_policy->need_freq_update = true; } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, @@ -382,14 +429,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, return; next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. - */ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && + + if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq && !sg_policy->need_freq_update) { next_f = sg_policy->next_freq; @@ -436,14 +477,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; - /* - * Do not reduce the target performance level if the CPU has not been - * idle recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. - */ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, @@ -596,32 +630,7 @@ static const struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ -#ifdef CONFIG_ENERGY_MODEL -static void rebuild_sd_workfn(struct work_struct *work) -{ - rebuild_sched_domains_energy(); -} - -static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); - -/* - * EAS shouldn't be attempted without sugov, so rebuild the sched_domains - * on governor changes to make sure the scheduler knows about it. - */ -static void sugov_eas_rebuild_sd(void) -{ - /* - * When called from the cpufreq_register_driver() path, the - * cpu_hotplug_lock is already held, so use a work item to - * avoid nested locking in rebuild_sched_domains(). - */ - schedule_work(&rebuild_sd_work); -} -#else -static inline void sugov_eas_rebuild_sd(void) { }; -#endif - -struct cpufreq_governor schedutil_gov; +static struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) { @@ -654,9 +663,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) * Fake (unused) bandwidth; workaround to "fix" * priority inheritance. */ - .sched_runtime = 1000000, - .sched_deadline = 10000000, - .sched_period = 10000000, + .sched_runtime = NSEC_PER_MSEC, + .sched_deadline = 10 * NSEC_PER_MSEC, + .sched_period = 10 * NSEC_PER_MSEC, }; struct cpufreq_policy *policy = sg_policy->policy; int ret; @@ -683,7 +692,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + if (policy->dvfs_possible_from_any_cpu) + set_cpus_allowed_ptr(thread, policy->related_cpus); + else + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -775,9 +788,12 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - sugov_eas_rebuild_sd(); - out: + /* + * Schedutil is the preferred governor for EAS, so rebuild sched domains + * on governor changes to make sure the scheduler knows about them. + */ + em_rebuild_sched_domains(); mutex_unlock(&global_tunables_lock); return 0; @@ -819,7 +835,7 @@ static void sugov_exit(struct cpufreq_policy *policy) sugov_policy_free(sg_policy); cpufreq_disable_fast_switch(policy); - sugov_eas_rebuild_sd(); + em_rebuild_sched_domains(); } static int sugov_start(struct cpufreq_policy *policy) @@ -881,10 +897,19 @@ static void sugov_limits(struct cpufreq_policy *policy) mutex_unlock(&sg_policy->work_lock); } - sg_policy->limits_changed = true; + /* + * The limits_changed update below must take place before the updates + * of policy limits in cpufreq_set_policy() or a policy limits update + * might be missed, so use a memory barrier to ensure it. + * + * This pairs with the memory barrier in sugov_should_update_freq(). + */ + smp_wmb(); + + WRITE_ONCE(sg_policy->limits_changed, true); } -struct cpufreq_governor schedutil_gov = { +static struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, @@ -902,4 +927,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif +bool sugov_is_governor(struct cpufreq_policy *policy) +{ + return policy->governor == &schedutil_gov; +} + cpufreq_governor_init(schedutil_gov); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index af7952f12e6c..6dab4854c6c0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -14,15 +14,15 @@ * They are only modified in vtime_account, on corresponding CPU * with interrupts disabled. So, writes are safe. * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can + * This may result in other CPU reading this CPU's IRQ time and can * race with irq/vtime_account on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. + * or new value with a side effect of accounting a slice of IRQ time to wrong + * task when IRQ is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each IRQ in account_system_time. */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); -static int sched_clock_irqtime; +int sched_clock_irqtime; void enable_sched_clock_irqtime(void) { @@ -57,7 +57,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) s64 delta; int cpu; - if (!sched_clock_irqtime) + if (!irqtime_enabled()) return; cpu = smp_processor_id(); @@ -90,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime) #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -#define sched_clock_irqtime (0) - static u64 irqtime_tick_accounted(u64 dummy) { return 0; @@ -269,7 +267,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime) } /* - * Account how much elapsed time was spent in steal, irq, or softirq time. + * Account how much elapsed time was spent in steal, IRQ, or softirq time. */ static inline u64 account_other_time(u64 max) { @@ -370,7 +368,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) * Check for hardirq is done both for system and user time as there is * no timer going off while we are on hardirq and hence we may never get an * opportunity to update it solely in system time. - * p->stime and friends are only updated on system time and not on irq + * p->stime and friends are only updated on system time and not on IRQ * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, @@ -380,7 +378,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, /* * When returning from idle, many ticks can get accounted at - * once, including some ticks of steal, irq, and softirq time. + * once, including some ticks of steal, IRQ, and softirq time. * Subtract those ticks from the amount of time accounted to * idle, or potentially user or system time. Due to rounding, * other time can exceed ticks occasionally. @@ -424,19 +422,6 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -# ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_kernel(prev); - - vtime_flush(prev); - arch_vtime_task_switch(prev); -} -# endif - void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { unsigned int pc = irq_count() - offset; @@ -491,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (vtime_accounting_enabled_this_cpu()) return; - if (sched_clock_irqtime) { + if (irqtime_enabled()) { irqtime_account_process_tick(p, user_tick, 1); return; } @@ -520,7 +505,7 @@ void account_idle_ticks(unsigned long ticks) { u64 cputime, steal; - if (sched_clock_irqtime) { + if (irqtime_enabled()) { irqtime_account_idle_ticks(ticks); return; } @@ -595,6 +580,12 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, } stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); + /* + * Because mul_u64_u64_div_u64() can approximate on some + * achitectures; enforce the constraint that: a*b/(b+c) <= a. + */ + if (unlikely(stime > rtime)) + stime = rtime; update: /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a04a436af8cc..ad45a8fea245 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -26,7 +26,7 @@ static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ #ifdef CONFIG_SYSCTL -static struct ctl_table sched_dl_sysctls[] = { +static const struct ctl_table sched_dl_sysctls[] = { { .procname = "sched_deadline_period_max_us", .data = &sysctl_sched_dl_period_max, @@ -43,7 +43,6 @@ static struct ctl_table sched_dl_sysctls[] = { .proc_handler = proc_douintvec_minmax, .extra2 = (void *)&sysctl_sched_dl_period_max, }, - {} }; static int __init sched_dl_sysctl_init(void) @@ -167,14 +166,14 @@ static inline unsigned long dl_bw_capacity(int i) } } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { struct root_domain *rd = cpu_rq(cpu)->rd; - if (rd->visit_gen == gen) + if (rd->visit_cookie == cookie) return true; - rd->visit_gen = gen; + rd->visit_cookie = cookie; return false; } @@ -208,7 +207,7 @@ static inline unsigned long dl_bw_capacity(int i) return SCHED_CAPACITY_SCALE; } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { return false; } @@ -250,8 +249,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; - SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); /* kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } @@ -263,7 +262,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) dl_rq->running_bw = 0; /* kick cpufreq (see the comment in kernel/sched/sched.h). */ @@ -277,7 +276,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; - SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->this_bw < old); /* overflow */ } static inline @@ -287,10 +286,10 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) dl_rq->this_bw = 0; - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); } static inline @@ -321,19 +320,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) __sub_running_bw(dl_se->dl_bw, dl_rq); } -static void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) { - struct rq *rq; - - WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); - - if (task_on_rq_queued(p)) - return; + if (dl_se->dl_non_contending) { + sub_running_bw(dl_se, &rq->dl); + dl_se->dl_non_contending = 0; - rq = task_rq(p); - if (p->dl.dl_non_contending) { - sub_running_bw(&p->dl, &rq->dl); - p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the * timer cannot be canceled, inactive_task_timer() @@ -341,13 +333,48 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } - __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __sub_rq_bw(dl_se->dl_bw, &rq->dl); __add_rq_bw(new_bw, &rq->dl); } +static __always_inline +void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer) +{ + /* + * If the timer callback was running (hrtimer_try_to_cancel == -1), + * it will eventually call put_task_struct(). + */ + if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); +} + +static __always_inline +void cancel_replenish_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->dl_timer); +} + +static __always_inline +void cancel_inactive_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->inactive_timer); +} + +static void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); + + if (task_on_rq_queued(p)) + return; + + dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); +} + static void __dl_clear_params(struct sched_dl_entity *dl_se); /* @@ -491,10 +518,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { - if (!dl_server(dl_se)) - put_task_struct(dl_task_of(dl_se)); - } + cancel_inactive_timer(dl_se); } else { /* * Since "dl_non_contending" is not set, the @@ -709,7 +733,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p } /* - * And we finally need to fixup root_domain(s) bandwidth accounting, + * And we finally need to fix up root_domain(s) bandwidth accounting, * since p is still hanging out in the old (now moved to default) root * domain. */ @@ -772,6 +796,15 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, /* for non-boosted task, pi_of(dl_se) == dl_se */ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; dl_se->runtime = pi_of(dl_se)->dl_runtime; + + /* + * If it is a deferred reservation, and the server + * is not handling an starvation case, defer it. + */ + if (dl_se->dl_defer && !dl_se->dl_defer_running) { + dl_se->dl_throttled = 1; + dl_se->dl_defer_armed = 1; + } } /* @@ -810,6 +843,9 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) replenish_dl_new_period(dl_se, rq); } +static int start_dl_timer(struct sched_dl_entity *dl_se); +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); + /* * Pure Earliest Deadline First (EDF) scheduling does not deal with the * possibility of a entity lasting more than what it declared, and thus @@ -838,9 +874,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. + * + * Or, it could be the case of a deferred reservation that + * was not able to consume its runtime in background and + * reached this point with current u > U. + * + * In both cases, set a new period. */ - if (dl_se->dl_deadline == 0) - replenish_dl_new_period(dl_se, rq); + if (dl_se->dl_deadline == 0 || + (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; + } if (dl_se->dl_yielded && dl_se->runtime > 0) dl_se->runtime = 0; @@ -874,6 +919,44 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) dl_se->dl_yielded = 0; if (dl_se->dl_throttled) dl_se->dl_throttled = 0; + + /* + * If this is the replenishment of a deferred reservation, + * clear the flag and return. + */ + if (dl_se->dl_defer_armed) { + dl_se->dl_defer_armed = 0; + return; + } + + /* + * A this point, if the deferred server is not armed, and the deadline + * is in the future, if it is not running already, throttle the server + * and arm the defer timer. + */ + if (dl_se->dl_defer && !dl_se->dl_defer_running && + dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { + if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + + /* + * Set dl_se->dl_defer_armed and dl_throttled variables to + * inform the start_dl_timer() that this is a deferred + * activation. + */ + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + if (!start_dl_timer(dl_se)) { + /* + * If for whatever reason (delays), a previous timer was + * queued but not serviced, cancel it and clean the + * deferrable server variables intended for start_dl_timer(). + */ + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } + } + } } /* @@ -993,7 +1076,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * is detected, the runtime and deadline need to be updated. * * If the task has an implicit deadline, i.e., deadline == period, the Original - * CBS is applied. the runtime is replenished and a new absolute deadline is + * CBS is applied. The runtime is replenished and a new absolute deadline is * set, as in the previous cases. * * However, the Original CBS does not work properly for tasks with @@ -1024,6 +1107,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) } replenish_dl_new_period(dl_se, rq); + } else if (dl_server(dl_se) && dl_se->dl_defer) { + /* + * The server can still use its previous deadline, so check if + * it left the dl_defer_running state. + */ + if (!dl_se->dl_defer_running) { + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + } } } @@ -1056,8 +1148,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) * We want the timer to fire at the deadline, but considering * that it is actually coming from rq->clock and not from * hrtimer's time base reading. + * + * The deferred reservation will have its timer set to + * (deadline - runtime). At that point, the CBS rule will decide + * if the current deadline can be used, or if a replenishment is + * required to avoid add too much pressure on the system + * (current u > U). */ - act = ns_to_ktime(dl_next_period(dl_se)); + if (dl_se->dl_defer_armed) { + WARN_ON_ONCE(!dl_se->dl_throttled); + act = ns_to_ktime(dl_se->deadline - dl_se->runtime); + } else { + /* act = deadline - rel-deadline + period */ + act = ns_to_ktime(dl_next_period(dl_se)); + } + now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -1107,6 +1212,62 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) #endif } +/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ +static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; + +static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) +{ + struct rq *rq = rq_of_dl_se(dl_se); + u64 fw; + + scoped_guard (rq_lock, rq) { + struct rq_flags *rf = &scope.rf; + + if (!dl_se->dl_throttled || !dl_se->dl_runtime) + return HRTIMER_NORESTART; + + sched_clock_tick(); + update_rq_clock(rq); + + if (!dl_se->dl_runtime) + return HRTIMER_NORESTART; + + if (!dl_se->server_has_tasks(dl_se)) { + replenish_dl_entity(dl_se); + return HRTIMER_NORESTART; + } + + if (dl_se->dl_defer_armed) { + /* + * First check if the server could consume runtime in background. + * If so, it is possible to push the defer timer for this amount + * of time. The dl_server_min_res serves as a limit to avoid + * forwarding the timer for a too small amount of time. + */ + if (dl_time_before(rq_clock(dl_se->rq), + (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { + + /* reset the defer timer */ + fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; + + hrtimer_forward_now(timer, ns_to_ktime(fw)); + return HRTIMER_RESTART; + } + + dl_se->dl_defer_running = 1; + } + + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) + resched_curr(rq); + + __push_dl_task(rq, rf); + } + + return HRTIMER_NORESTART; +} + /* * This is the bandwidth enforcement timer callback. If here, we know * a task is not on its dl_rq, since the fact that the timer was running @@ -1129,28 +1290,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct rq_flags rf; struct rq *rq; - if (dl_server(dl_se)) { - struct rq *rq = rq_of_dl_se(dl_se); - struct rq_flags rf; - - rq_lock(rq, &rf); - if (dl_se->dl_throttled) { - sched_clock_tick(); - update_rq_clock(rq); - - if (dl_se->server_has_tasks(dl_se)) { - enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); - resched_curr(rq); - __push_dl_task(rq, &rf); - } else { - replenish_dl_entity(dl_se); - } - - } - rq_unlock(rq, &rf); - - return HRTIMER_NORESTART; - } + if (dl_server(dl_se)) + return dl_server_timer(timer, dl_se); p = dl_task_of(dl_se); rq = task_rq_lock(p, &rf); @@ -1218,7 +1359,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) #endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -1241,8 +1382,7 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = dl_task_timer; + hrtimer_setup(timer, dl_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } /* @@ -1295,7 +1435,7 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw - * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. * Since delta is a 64 bit variable, to have an overflow its value should be * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is * not an issue here. @@ -1320,22 +1460,10 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) return (delta * u_act) >> BW_SHIFT; } -static inline void -update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, - int flags); -static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) +s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { s64 scaled_delta_exec; - if (unlikely(delta_exec <= 0)) { - if (unlikely(dl_se->dl_yielded)) - goto throttle; - return; - } - - if (dl_entity_is_special(dl_se)) - return; - /* * For tasks that participate in GRUB, we implement GRUB-PA: the * spare reclaimed bandwidth is used to clock down frequency. @@ -1354,8 +1482,64 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); } + return scaled_delta_exec; +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags); +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) +{ + s64 scaled_delta_exec; + + if (unlikely(delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; + return; + } + + if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) + return; + + if (dl_entity_is_special(dl_se)) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + dl_se->runtime -= scaled_delta_exec; + /* + * The fair server can consume its runtime while throttled (not queued/ + * running as regular CFS). + * + * If the server consumes its entire runtime in this state. The server + * is not required for the current period. Thus, reset the server by + * starting a new period, pushing the activation. + */ + if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { + /* + * If the server was previously activated - the starving condition + * took place, it this point it went away because the fair scheduler + * was able to get runtime in background. So return to the initial + * state. + */ + dl_se->dl_defer_running = 0; + + hrtimer_try_to_cancel(&dl_se->dl_timer); + + replenish_dl_new_period(dl_se, dl_se->rq); + + /* + * Not being able to start the timer seems problematic. If it could not + * be started for whatever reason, we need to "unthrottle" the DL server + * and queue right away. Otherwise nothing might queue it. That's similar + * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. + */ + WARN_ON_ONCE(!start_dl_timer(dl_se)); + + return; + } + throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; @@ -1383,6 +1567,14 @@ throttle: } /* + * The fair server (sole dl_server) does not account for real-time + * workload because it is running fair work. + */ + if (dl_se == &rq->fair_server) + return; + +#ifdef CONFIG_RT_GROUP_SCHED + /* * Because -- for now -- we share the rt bandwidth, we need to * account our runtime there too, otherwise actual rt tasks * would be able to exceed the shared quota. @@ -1406,34 +1598,157 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } +#endif +} + +/* + * In the non-defer mode, the idle time is not accounted, as the + * server provides a guarantee. + * + * If the dl_server is in defer mode, the idle time is also considered + * as time available for the fair server, avoiding a penalty for the + * rt scheduler that did not consumed that time. + */ +void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) +{ + s64 delta_exec, scaled_delta_exec; + + if (!rq->fair_server.dl_defer) + return; + + /* no need to discount more */ + if (rq->fair_server.runtime < 0) + return; + + delta_exec = rq_clock_task(rq) - p->se.exec_start; + if (delta_exec < 0) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); + + rq->fair_server.runtime -= scaled_delta_exec; + + if (rq->fair_server.runtime < 0) { + rq->fair_server.dl_defer_running = 0; + rq->fair_server.runtime = 0; + } + + p->se.exec_start = rq_clock_task(rq); } void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { - update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + /* 0 runtime = fair server disabled */ + if (dl_se->dl_runtime) + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } void dl_server_start(struct sched_dl_entity *dl_se) { + struct rq *rq = dl_se->rq; + + /* + * XXX: the apply do not work fine at the init phase for the + * fair server because things are not yet set. We need to improve + * this before getting generic. + */ if (!dl_server(dl_se)) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + dl_server_apply_params(dl_se, runtime, period, 1); + dl_se->dl_server = 1; + dl_se->dl_defer = 1; setup_new_dl_entity(dl_se); } + + if (!dl_se->dl_runtime) + return; + + dl_se->dl_server_active = 1; enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) + resched_curr(dl_se->rq); } void dl_server_stop(struct sched_dl_entity *dl_se) { + if (!dl_se->dl_runtime) + return; + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + dl_se->dl_server_active = 0; } void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, - dl_server_pick_f pick) + dl_server_pick_f pick_task) { dl_se->rq = rq; dl_se->server_has_tasks = has_tasks; - dl_se->server_pick = pick; + dl_se->server_pick_task = pick_task; +} + +void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 new_bw = dl_se->dl_bw; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + + dl_b = dl_bw_of(cpu_of(rq)); + guard(raw_spinlock)(&dl_b->lock); + + if (!dl_bw_cpus(cpu)) + return; + + __dl_add(dl_b, new_bw, dl_bw_cpus(cpu)); +} + +int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) +{ + u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); + u64 new_bw = to_ratio(period, runtime); + struct rq *rq = dl_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + unsigned long cap; + int retval = 0; + int cpus; + + dl_b = dl_bw_of(cpu); + guard(raw_spinlock)(&dl_b->lock); + + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + + if (__dl_overflow(dl_b, cap, old_bw, new_bw)) + return -EBUSY; + + if (init) { + __add_rq_bw(new_bw, &rq->dl); + __dl_add(dl_b, new_bw, cpus); + } else { + __dl_sub(dl_b, dl_se->dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + + dl_rq_change_utilization(rq, dl_se, new_bw); + } + + dl_se->dl_runtime = runtime; + dl_se->dl_deadline = period; + dl_se->dl_period = period; + + dl_se->runtime = 0; + dl_se->deadline = 0; + + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); + + return retval; } /* @@ -1442,11 +1757,11 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, */ static void update_curr_dl(struct rq *rq) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; + struct task_struct *donor = rq->donor; + struct sched_dl_entity *dl_se = &donor->dl; s64 delta_exec; - if (!dl_task(curr) || !on_dl_rq(dl_se)) + if (!dl_task(donor) || !on_dl_rq(dl_se)) return; /* @@ -1523,8 +1838,7 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = inactive_task_timer; + hrtimer_setup(timer, inactive_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #define __node_2_dle(node) \ @@ -1600,46 +1914,40 @@ static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); } -static inline struct sched_statistics * +static __always_inline struct sched_statistics * __schedstats_from_dl_se(struct sched_dl_entity *dl_se) { + if (!schedstat_enabled()) + return NULL; + + if (dl_server(dl_se)) + return NULL; + return &dl_task_of(dl_se)->stats; } static inline void update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) { - struct sched_statistics *stats; - - if (!schedstat_enabled()) - return; - - stats = __schedstats_from_dl_se(dl_se); - __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); } static inline void update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) { - struct sched_statistics *stats; - - if (!schedstat_enabled()) - return; - - stats = __schedstats_from_dl_se(dl_se); - __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); } static inline void update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) { - struct sched_statistics *stats; - - if (!schedstat_enabled()) - return; - - stats = __schedstats_from_dl_se(dl_se); - __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); } static inline void @@ -1736,7 +2044,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) * be counted in the active utilization; hence, we need to call * add_running_bw(). */ - if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { if (flags & ENQUEUE_WAKEUP) task_contending(dl_se, flags); @@ -1754,10 +2062,30 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && + !is_dl_boosted(dl_se) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { setup_new_dl_entity(dl_se); } + /* + * If the reservation is still throttled, e.g., it got replenished but is a + * deferred task and still got to wait, don't enqueue. + */ + if (dl_se->dl_throttled && start_dl_timer(dl_se)) + return; + + /* + * We're about to enqueue, make sure we're not ->dl_throttled! + * In case the timer was not started, say because the defer time + * has passed, mark as not throttled and mark unarmed. + * Also cancel earlier timers, since letting those run is pointless. + */ + if (dl_se->dl_throttled) { + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } + __enqueue_dl_entity(dl_se); } @@ -1806,7 +2134,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * problem if it fires concurrently: boosted threads * are ignored in dl_task_timer(). */ - hrtimer_try_to_cancel(&p->dl.dl_timer); + cancel_replenish_timer(&p->dl); p->dl.dl_throttled = 0; } } else if (!dl_prio(p->normal_prio)) { @@ -1842,7 +2170,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_pushable_dl_task(rq, p); } -static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); @@ -1852,6 +2180,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) dequeue_dl_entity(&p->dl, flags); if (!p->dl.dl_throttled && !dl_server(&p->dl)) dequeue_pushable_dl_task(rq, p); + + return true; } /* @@ -1899,7 +2229,7 @@ static int find_later_rq(struct task_struct *task); static int select_task_rq_dl(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; bool select_rq; struct rq *rq; @@ -1910,6 +2240,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If we are dealing with a -deadline task, we must @@ -1920,9 +2251,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - select_rq = unlikely(dl_task(curr)) && + select_rq = unlikely(dl_task(donor)) && (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && + !dl_entity_preempt(&p->dl, &donor->dl)) && p->nr_cpus_allowed > 1; /* @@ -1971,8 +2302,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); } sub_rq_bw(&p->dl, &rq->dl); rq_unlock(rq, &rf); @@ -1985,7 +2315,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) + !cpudl_find(&rq->rd->cpudl, rq->donor, NULL)) return; /* @@ -2024,7 +2354,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { - if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; } @@ -2034,7 +2364,7 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... */ - if ((p->dl.deadline == rq->curr->dl.deadline) && + if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); #endif /* CONFIG_SMP */ @@ -2066,10 +2396,13 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (rq->curr->sched_class != &dl_sched_class) + if (rq->donor->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); deadline_queue_push_tasks(rq); + + if (hrtick_enabled_dl(rq)) + start_hrtick_dl(rq, &p->dl); } static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) @@ -2082,7 +2415,11 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) return __node_2_dle(left); } -static struct task_struct *pick_task_dl(struct rq *rq) +/* + * __pick_next_task_dl - Helper to pick the next -deadline task to run. + * @rq: The runqueue to pick the next task from. + */ +static struct task_struct *__pick_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -2096,14 +2433,15 @@ again: WARN_ON_ONCE(!dl_se); if (dl_server(dl_se)) { - p = dl_se->server_pick(dl_se); + p = dl_se->server_pick_task(dl_se); if (!p) { - WARN_ON_ONCE(1); - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); + if (dl_server_active(dl_se)) { + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + } goto again; } - p->dl_server = dl_se; + rq->dl_server = dl_se; } else { p = dl_task_of(dl_se); } @@ -2111,24 +2449,12 @@ again: return p; } -static struct task_struct *pick_next_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq) { - struct task_struct *p; - - p = pick_task_dl(rq); - if (!p) - return p; - - if (!p->dl_server) - set_next_task_dl(rq, p, true); - - if (hrtick_enabled(rq)) - start_hrtick_dl(rq, &p->dl); - - return p; + return __pick_task_dl(rq); } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) { struct sched_dl_entity *dl_se = &p->dl; struct dl_rq *dl_rq = &rq->dl; @@ -2179,14 +2505,6 @@ static void task_fork_dl(struct task_struct *p) /* Only try algorithms three times */ #define DL_MAX_TRIES 3 -static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_on_cpu(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - return 0; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: @@ -2200,16 +2518,13 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu return NULL; next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); - -next_node: - if (next_node) { + while (next_node) { p = __node_2_pdl(next_node); - if (pick_dl_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); - goto next_node; } return NULL; @@ -2399,8 +2714,8 @@ retry: * can move away, it makes sense to just reschedule * without going further in pushing next_task. */ - if (dl_task(rq->curr) && - dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + if (dl_task(rq->donor) && + dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { resched_curr(rq); return 0; @@ -2443,9 +2758,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, later_rq->cpu); - activate_task(later_rq, next_task, 0); + move_queued_task_locked(rq, later_rq, next_task); ret = 1; resched_curr(later_rq); @@ -2489,7 +2802,7 @@ static void pull_dl_task(struct rq *this_rq) src_rq = cpu_rq(cpu); /* - * It looks racy, abd it is! However, as in sched_rt.c, + * It looks racy, and it is! However, as in sched_rt.c, * we are fine with this. */ if (this_rq->dl.dl_nr_running && @@ -2525,15 +2838,13 @@ static void pull_dl_task(struct rq *this_rq) * deadline than the current task of its runqueue. */ if (dl_time_before(p->dl.deadline, - src_rq->curr->dl.deadline)) + src_rq->donor->dl.deadline)) goto skip; if (is_migration_disabled(p)) { push_task = get_push_task(src_rq); } else { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + move_queued_task_locked(src_rq, this_rq, p); dmin = p->dl.deadline; resched = true; } @@ -2566,9 +2877,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - dl_task(rq->curr) && + dl_task(rq->donor) && (rq->curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &rq->curr->dl))) { + !dl_entity_preempt(&p->dl, &rq->donor->dl))) { push_dl_tasks(rq); } } @@ -2643,7 +2954,7 @@ void dl_add_task_root_domain(struct task_struct *p) struct dl_bw *dl_b; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - if (!dl_task(p)) { + if (!dl_task(p) || dl_entity_is_special(&p->dl)) { raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return; } @@ -2662,11 +2973,26 @@ void dl_add_task_root_domain(struct task_struct *p) void dl_clear_root_domain(struct root_domain *rd) { - unsigned long flags; + int i; - raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); rd->dl_bw.total_bw = 0; - raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); + + /* + * dl_servers are not tasks. Since dl_add_task_root_domain ignores + * them, we need to account for them here explicitly. + */ + for_each_cpu(i, rd->span) { + struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; + + if (dl_server(dl_se) && cpu_active(i)) + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i)); + } +} + +void dl_clear_root_domain_cpu(int cpu) +{ + dl_clear_root_domain(cpu_rq(cpu)->rd); } #endif /* CONFIG_SMP */ @@ -2727,8 +3053,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); /* * In case a task is setscheduled to SCHED_DEADLINE we need to keep @@ -2743,12 +3068,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - if (rq->curr != p) { + if (rq->donor != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); #endif - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -2777,7 +3102,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, if (!rq->dl.overloaded) deadline_queue_pull_task(rq); - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this @@ -2820,13 +3145,12 @@ DEFINE_SCHED_CLASS(dl) = { .wakeup_preempt = wakeup_preempt_dl, - .pick_next_task = pick_next_task_dl, + .pick_task = pick_task_dl, .put_prev_task = put_prev_task_dl, .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP .balance = balance_dl, - .pick_task = pick_task_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, @@ -2849,15 +3173,18 @@ DEFINE_SCHED_CLASS(dl) = { #endif }; -/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ -static u64 dl_generation; +/* + * Used for dl_bw check and update, used under sched_rt_handler()::mutex and + * sched_domains_mutex. + */ +u64 dl_cookie; int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu, cpus, ret = 0; unsigned long flags; @@ -2867,10 +3194,10 @@ int sched_dl_global_validate(void) * value smaller than the currently allocated bandwidth in * any of the root_domains. */ - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) + if (dl_bw_visited(cpu, cookie)) goto next; dl_b = dl_bw_of(cpu); @@ -2907,7 +3234,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) void sched_dl_do_global(void) { u64 new_bw = -1; - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -2918,7 +3245,7 @@ void sched_dl_do_global(void) for_each_possible_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) { + if (dl_bw_visited(cpu, cookie)) { rcu_read_unlock_sched(); continue; } @@ -3152,29 +3479,31 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, } enum dl_bw_request { - dl_bw_req_check_overflow = 0, + dl_bw_req_deactivate = 0, dl_bw_req_alloc, dl_bw_req_free }; static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow = 0; + u64 fair_server_bw = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (req == dl_bw_req_free) { + cap = dl_bw_capacity(cpu); + switch (req) { + case dl_bw_req_free: __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); - } else { - unsigned long cap = dl_bw_capacity(cpu); - + break; + case dl_bw_req_alloc: overflow = __dl_overflow(dl_b, cap, 0, dl_bw); - if (req == dl_bw_req_alloc && !overflow) { + if (!overflow) { /* * We reserve space in the destination * root_domain, as we can't fail after this point. @@ -3183,6 +3512,42 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) */ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); } + break; + case dl_bw_req_deactivate: + /* + * cpu is not off yet, but we need to do the math by + * considering it off already (i.e., what would happen if we + * turn cpu off?). + */ + cap -= arch_scale_cpu_capacity(cpu); + + /* + * cpu is going offline and NORMAL tasks will be moved away + * from it. We can thus discount dl_server bandwidth + * contribution as it won't need to be servicing tasks after + * the cpu is off. + */ + if (cpu_rq(cpu)->fair_server.dl_server) + fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw; + + /* + * Not much to check if no DEADLINE bandwidth is present. + * dl_servers we can discount, as tasks will be moved out the + * offlined CPUs anyway. + */ + if (dl_b->total_bw - fair_server_bw > 0) { + /* + * Leaving at least one CPU for DEADLINE tasks seems a + * wise thing to do. As said above, cpu is not offline + * yet, so account for that. + */ + if (dl_bw_cpus(cpu) - 1) + overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); + else + overflow = 1; + } + + break; } raw_spin_unlock_irqrestore(&dl_b->lock, flags); @@ -3191,9 +3556,9 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) return overflow ? -EBUSY : 0; } -int dl_bw_check_overflow(int cpu) +int dl_bw_deactivate(int cpu) { - return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); + return dl_bw_manage(dl_bw_req_deactivate, cpu, 0); } int dl_bw_alloc(int cpu, u64 dl_bw) @@ -3207,9 +3572,7 @@ void dl_bw_free(int cpu, u64 dl_bw) } #endif -#ifdef CONFIG_SCHED_DEBUG void print_dl_stats(struct seq_file *m, int cpu) { print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); } -#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8d5d98a5834d..557246880a7e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -244,12 +244,15 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { - static const char * preempt_modes[] = { - "none", "voluntary", "full" - }; - int i; + int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + int j; + + /* Count entries in NULL terminated preempt_modes */ + for (j = 0; preempt_modes[j]; j++) + ; + j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); - for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { + for (; i < j; i++) { if (preempt_dynamic_mode == i) seq_puts(m, "("); seq_puts(m, preempt_modes[i]); @@ -291,7 +294,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, bool orig; cpus_read_lock(); - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); orig = sched_debug_verbose; result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); @@ -303,7 +306,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, sd_dentry = NULL; } - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); cpus_read_unlock(); return result; @@ -333,8 +336,165 @@ static const struct file_operations sched_debug_fops = { .release = seq_release, }; +enum dl_param { + DL_RUNTIME = 0, + DL_PERIOD, +}; + +static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ +static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ + +static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, enum dl_param param) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + u64 runtime, period; + size_t err; + int retval; + u64 value; + + err = kstrtoull_from_user(ubuf, cnt, 10, &value); + if (err) + return err; + + scoped_guard (rq_lock_irqsave, rq) { + runtime = rq->fair_server.dl_runtime; + period = rq->fair_server.dl_period; + + switch (param) { + case DL_RUNTIME: + if (runtime == value) + break; + runtime = value; + break; + case DL_PERIOD: + if (value == period) + break; + period = value; + break; + } + + if (runtime > period || + period > fair_server_period_max || + period < fair_server_period_min) { + return -EINVAL; + } + + if (rq->cfs.h_nr_queued) { + update_rq_clock(rq); + dl_server_stop(&rq->fair_server); + } + + retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); + if (retval) + cnt = retval; + + if (!runtime) + printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", + cpu_of(rq)); + + if (rq->cfs.h_nr_queued) + dl_server_start(&rq->fair_server); + } + + *ppos += cnt; + return cnt; +} + +static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + u64 value; + + switch (param) { + case DL_RUNTIME: + value = rq->fair_server.dl_runtime; + break; + case DL_PERIOD: + value = rq->fair_server.dl_period; + break; + } + + seq_printf(m, "%llu\n", value); + return 0; + +} + +static ssize_t +sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); +} + +static int sched_fair_server_runtime_show(struct seq_file *m, void *v) +{ + return sched_fair_server_show(m, v, DL_RUNTIME); +} + +static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_fair_server_runtime_show, inode->i_private); +} + +static const struct file_operations fair_server_runtime_fops = { + .open = sched_fair_server_runtime_open, + .write = sched_fair_server_runtime_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static ssize_t +sched_fair_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); +} + +static int sched_fair_server_period_show(struct seq_file *m, void *v) +{ + return sched_fair_server_show(m, v, DL_PERIOD); +} + +static int sched_fair_server_period_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_fair_server_period_show, inode->i_private); +} + +static const struct file_operations fair_server_period_fops = { + .open = sched_fair_server_period_open, + .write = sched_fair_server_period_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static struct dentry *debugfs_sched; +static void debugfs_fair_server_init(void) +{ + struct dentry *d_fair; + unsigned long cpu; + + d_fair = debugfs_create_dir("fair_server", debugfs_sched); + if (!d_fair) + return; + + for_each_possible_cpu(cpu) { + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%lu", cpu); + d_cpu = debugfs_create_dir(buf, d_fair); + + debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops); + debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops); + } +} + static __init int sched_init_debug(void) { struct dentry __maybe_unused *numa; @@ -357,9 +517,9 @@ static __init int sched_init_debug(void) debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); update_sched_domain_debugfs(); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); #endif #ifdef CONFIG_NUMA_BALANCING @@ -374,6 +534,8 @@ static __init int sched_init_debug(void) debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + debugfs_fair_server_init(); + return 0; } late_initcall(sched_init_debug); @@ -425,6 +587,11 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); + debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level); + + if (sd->flags & SD_ASYM_PACKING) + debugfs_create_u32("group_asym_prefer_cpu", 0444, parent, + (u32 *)&sd->groups->asym_prefer_cpu); } void update_sched_domain_debugfs(void) @@ -579,27 +746,27 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', SPLIT_NS(p->se.deadline), + p->se.custom_slice ? 'S' : ' ', SPLIT_NS(p->se.slice), SPLIT_NS(p->se.sum_exec_runtime), (long long)(p->nvcsw + p->nivcsw), p->prio); - SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", + SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld", SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), - SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); #ifdef CONFIG_NUMA_BALANCING - SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif #ifdef CONFIG_CGROUP_SCHED - SEQ_printf_task_group_path(m, task_group(p), " %s") + SEQ_printf_task_group_path(m, task_group(p), " %s") #endif SEQ_printf(m, "\n"); @@ -611,10 +778,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" - " wait-time sum-exec sum-sleep\n"); + SEQ_printf(m, " S task PID vruntime eligible " + "deadline slice sum-exec switches " + "prio wait-time sum-sleep sum-block" +#ifdef CONFIG_NUMA_BALANCING + " node group-id" +#endif +#ifdef CONFIG_CGROUP_SCHED + " group-path" +#endif + "\n"); SEQ_printf(m, "-------------------------------------------------------" - "------------------------------------------------------\n"); + "------------------------------------------------------" + "------------------------------------------------------" +#ifdef CONFIG_NUMA_BALANCING + "--------------" +#endif +#ifdef CONFIG_CGROUP_SCHED + "--------------" +#endif + "\n"); rcu_read_lock(); for_each_process_thread(g, p) { @@ -640,8 +823,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, "\n"); SEQ_printf(m, "cfs_rq[%d]:\n", cpu); #endif - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", - SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); root = __pick_root_entity(cfs_rq); @@ -668,14 +849,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(right_vruntime)); spread = right_vruntime - left_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", - cfs_rq->nr_spread_over); - SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); - SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", - cfs_rq->idle_nr_running); - SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", - cfs_rq->idle_h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", @@ -729,9 +906,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) PU(rt_nr_running); + +#ifdef CONFIG_RT_GROUP_SCHED P(rt_throttled); PN(rt_time); PN(rt_runtime); +#endif #undef PN #undef PU @@ -1088,7 +1268,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, if (task_has_dl_policy(p)) { P(dl.runtime); P(dl.deadline); + } else if (fair_policy(p->policy)) { + P(se.slice); } +#ifdef CONFIG_SCHED_CLASS_EXT + __PS("ext.enabled", task_on_scx(p)); +#endif #undef PN_SCHEDSTAT #undef P_SCHEDSTAT @@ -1115,8 +1300,10 @@ void resched_latency_warn(int cpu, u64 latency) { static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1); - WARN(__ratelimit(&latency_check_ratelimit), - "sched: CPU %d need_resched set for > %llu ns (%d ticks) " - "without schedule\n", - cpu, latency, cpu_rq(cpu)->ticks_without_resched); + if (likely(!__ratelimit(&latency_check_ratelimit))) + return; + + pr_err("sched: CPU %d need_resched set for > %llu ns (%d ticks) without schedule\n", + cpu, latency, cpu_rq(cpu)->ticks_without_resched); + dump_stack(); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c new file mode 100644 index 000000000000..2c41c78be61e --- /dev/null +++ b/kernel/sched/ext.c @@ -0,0 +1,7673 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <linux/btf_ids.h> +#include "ext_idle.h" + +#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) + +enum scx_consts { + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_DFL_LEN = 32768, + + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_TASK_ITER_BATCH = 32, +}; + +enum scx_exit_kind { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ + SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ + SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +}; + +/* + * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), + * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes + * are 64bit of the format: + * + * Bits: [63 .. 48 47 .. 32 31 .. 0] + * [ SYS ACT ] [ SYS RSN ] [ USR ] + * + * SYS ACT: System-defined exit actions + * SYS RSN: System-defined exit reasons + * USR : User-defined exit codes and reasons + * + * Using the above, users may communicate intention and context by ORing system + * actions and/or system reasons with a user-defined exit code. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +/* + * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is + * being disabled. + */ +struct scx_exit_info { + /* %SCX_EXIT_* - broad category of the exit reason */ + enum scx_exit_kind kind; + + /* exit code if gracefully exiting */ + s64 exit_code; + + /* textual representation of the above */ + const char *reason; + + /* backtrace if exiting due to an error */ + unsigned long *bt; + u32 bt_len; + + /* informational message */ + char *msg; + + /* debug dump */ + char *dump; +}; + +/* sched_ext_ops.flags */ +enum scx_ops_flags { + /* + * Keep built-in idle tracking even if ops.update_idle() is implemented. + */ + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + + /* + * By default, if there are no other task to run on the CPU, ext core + * keeps running the current task even after its slice expires. If this + * flag is specified, such tasks are passed to ops.enqueue() with + * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. + */ + SCX_OPS_ENQ_LAST = 1LLU << 1, + + /* + * An exiting task may schedule after PF_EXITING is set. In such cases, + * bpf_task_from_pid() may not be able to find the task and if the BPF + * scheduler depends on pid lookup for dispatching, the task will be + * lost leading to various issues including RCU grace period stalls. + * + * To mask this problem, by default, unhashed tasks are automatically + * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't + * depend on pid lookups and wants to handle these tasks directly, the + * following flag can be used. + */ + SCX_OPS_ENQ_EXITING = 1LLU << 2, + + /* + * If set, only tasks with policy set to SCHED_EXT are attached to + * sched_ext. If clear, SCHED_NORMAL tasks are also included. + */ + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + + /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue. If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). + */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes + * ops.enqueue() on the ops.select_cpu() selected or the wakee's + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline + * transfers. When this optimization is enabled, ops.select_cpu() is + * skipped in some cases (when racing against the wakee switching out). + * As the BPF scheduler may depend on ops.select_cpu() being invoked + * during wakeups, queued wakeup is disabled by default. + * + * If this ops flag is set, queued wakeup optimization is enabled and + * the BPF scheduler must be able to handle ops.enqueue() invoked on the + * wakee's CPU without preceding ops.select_cpu() even for tasks which + * may be executed on multiple CPUs. + */ + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, + + /* + * If set, enable per-node idle cpumasks. If clear, use a single global + * flat idle cpumask. + */ + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, + + /* + * CPU cgroup support flags + */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ + + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | + SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | + SCX_OPS_HAS_CGROUP_WEIGHT, + + /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ + __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, + + SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, +}; + +/* argument container for ops.init_task() */ +struct scx_init_task_args { + /* + * Set if ops.init_task() is being invoked on the fork path, as opposed + * to the scheduler transition path. + */ + bool fork; +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif +}; + +/* argument container for ops.exit_task() */ +struct scx_exit_task_args { + /* Whether the task exited before running on sched_ext. */ + bool cancelled; +}; + +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; +}; + +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future. + */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + +/* + * Informational context provided to dump operations. + */ +struct scx_dump_ctx { + enum scx_exit_kind kind; + s64 exit_code; + const char *reason; + u64 at_ns; + u64 at_jiffies; +}; + +/** + * struct sched_ext_ops - Operation table for BPF scheduler implementation + * + * A BPF scheduler can implement an arbitrary scheduling policy by + * implementing and loading operations in this table. Note that a userland + * scheduling policy can also be implemented using the BPF scheduler + * as a shim layer. + */ +struct sched_ext_ops { + /** + * @select_cpu: Pick the target CPU for a task which is being woken up + * @p: task being woken up + * @prev_cpu: the cpu @p was on before sleeping + * @wake_flags: SCX_WAKE_* + * + * Decision made here isn't final. @p may be moved to any CPU while it + * is getting dispatched for execution later. However, as @p is not on + * the rq at this point, getting the eventual execution CPU right here + * saves a small bit of overhead down the line. + * + * If an idle CPU is returned, the CPU is kicked and will try to + * dispatch. While an explicit custom mechanism can be added, + * select_cpu() serves as the default way to wake up idle CPUs. + * + * @p may be inserted into a DSQ directly by calling + * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. + * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ + * of the CPU returned by this operation. + * + * Note that select_cpu() is never called for tasks that can only run + * on a single CPU or tasks with migration disabled, as they don't have + * the option to select a different CPU. See select_task_rq() for + * details. + */ + s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); + + /** + * @enqueue: Enqueue a task on the BPF scheduler + * @p: task being enqueued + * @enq_flags: %SCX_ENQ_* + * + * @p is ready to run. Insert directly into a DSQ by calling + * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly + * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, + * the task will stall. + * + * If @p was inserted into a DSQ from ops.select_cpu(), this callback is + * skipped. + */ + void (*enqueue)(struct task_struct *p, u64 enq_flags); + + /** + * @dequeue: Remove a task from the BPF scheduler + * @p: task being dequeued + * @deq_flags: %SCX_DEQ_* + * + * Remove @p from the BPF scheduler. This is usually called to isolate + * the task while updating its scheduling properties (e.g. priority). + * + * The ext core keeps track of whether the BPF side owns a given task or + * not and can gracefully ignore spurious dispatches from BPF side, + * which makes it safe to not implement this method. However, depending + * on the scheduling logic, this can lead to confusing behaviors - e.g. + * scheduling position not being updated across a priority change. + */ + void (*dequeue)(struct task_struct *p, u64 deq_flags); + + /** + * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs + * @cpu: CPU to dispatch tasks for + * @prev: previous task being switched out + * + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ + * using scx_bpf_dsq_move_to_local(). + * + * The maximum number of times scx_bpf_dsq_insert() can be called + * without an intervening scx_bpf_dsq_move_to_local() is specified by + * ops.dispatch_max_batch. See the comments on top of the two functions + * for more details. + * + * When not %NULL, @prev is an SCX task with its slice depleted. If + * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in + * @prev->scx.flags, it is not enqueued yet and will be enqueued after + * ops.dispatch() returns. To keep executing @prev, return without + * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. + */ + void (*dispatch)(s32 cpu, struct task_struct *prev); + + /** + * @tick: Periodic tick + * @p: task running currently + * + * This operation is called every 1/HZ seconds on CPUs which are + * executing an SCX task. Setting @p->scx.slice to 0 will trigger an + * immediate dispatch cycle on the CPU. + */ + void (*tick)(struct task_struct *p); + + /** + * @runnable: A task is becoming runnable on its associated CPU + * @p: task becoming runnable + * @enq_flags: %SCX_ENQ_* + * + * This and the following three functions can be used to track a task's + * execution state transitions. A task becomes ->runnable() on a CPU, + * and then goes through one or more ->running() and ->stopping() pairs + * as it runs on the CPU, and eventually becomes ->quiescent() when it's + * done running on the CPU. + * + * @p is becoming runnable on the CPU because it's + * + * - waking up (%SCX_ENQ_WAKEUP) + * - being moved from another CPU + * - being restored after temporarily taken off the queue for an + * attribute change. + * + * This and ->enqueue() are related but not coupled. This operation + * notifies @p's state transition and may not be followed by ->enqueue() + * e.g. when @p is being dispatched to a remote CPU, or when @p is + * being enqueued on a CPU experiencing a hotplug event. Likewise, a + * task may be ->enqueue()'d without being preceded by this operation + * e.g. after exhausting its slice. + */ + void (*runnable)(struct task_struct *p, u64 enq_flags); + + /** + * @running: A task is starting to run on its associated CPU + * @p: task starting to run + * + * Note that this callback may be called from a CPU other than the + * one the task is going to run on. This can happen when a task + * property is changed (i.e., affinity), since scx_next_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to determine the + * target CPU the task is going to use. + * + * See ->runnable() for explanation on the task state notifiers. + */ + void (*running)(struct task_struct *p); + + /** + * @stopping: A task is stopping execution + * @p: task stopping to run + * @runnable: is task @p still runnable? + * + * Note that this callback may be called from a CPU other than the + * one the task was running on. This can happen when a task + * property is changed (i.e., affinity), since dequeue_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU + * the task was running on. + * + * See ->runnable() for explanation on the task state notifiers. If + * !@runnable, ->quiescent() will be invoked after this operation + * returns. + */ + void (*stopping)(struct task_struct *p, bool runnable); + + /** + * @quiescent: A task is becoming not runnable on its associated CPU + * @p: task becoming not runnable + * @deq_flags: %SCX_DEQ_* + * + * See ->runnable() for explanation on the task state notifiers. + * + * @p is becoming quiescent on the CPU because it's + * + * - sleeping (%SCX_DEQ_SLEEP) + * - being moved to another CPU + * - being temporarily taken off the queue for an attribute change + * (%SCX_DEQ_SAVE) + * + * This and ->dequeue() are related but not coupled. This operation + * notifies @p's state transition and may not be preceded by ->dequeue() + * e.g. when @p is being dispatched to a remote CPU. + */ + void (*quiescent)(struct task_struct *p, u64 deq_flags); + + /** + * @yield: Yield CPU + * @from: yielding task + * @to: optional yield target task + * + * If @to is NULL, @from is yielding the CPU to other runnable tasks. + * The BPF scheduler should ensure that other available tasks are + * dispatched before the yielding task. Return value is ignored in this + * case. + * + * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf + * scheduler can implement the request, return %true; otherwise, %false. + */ + bool (*yield)(struct task_struct *from, struct task_struct *to); + + /** + * @core_sched_before: Task ordering for core-sched + * @a: task A + * @b: task B + * + * Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. + * + * If not specified, the default is ordering them according to when they + * became runnable. + */ + bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); + + /** + * @set_weight: Set task weight + * @p: task to set weight for + * @weight: new weight [1..10000] + * + * Update @p's weight to @weight. + */ + void (*set_weight)(struct task_struct *p, u32 weight); + + /** + * @set_cpumask: Set CPU affinity + * @p: task to set CPU affinity for + * @cpumask: cpumask of cpus that @p can run on + * + * Update @p's CPU affinity to @cpumask. + */ + void (*set_cpumask)(struct task_struct *p, + const struct cpumask *cpumask); + + /** + * @update_idle: Update the idle state of a CPU + * @cpu: CPU to update the idle state for + * @idle: whether entering or exiting the idle state + * + * This operation is called when @rq's CPU goes or leaves the idle + * state. By default, implementing this operation disables the built-in + * idle CPU tracking and the following helpers become unavailable: + * + * - scx_bpf_select_cpu_dfl() + * - scx_bpf_select_cpu_and() + * - scx_bpf_test_and_clear_cpu_idle() + * - scx_bpf_pick_idle_cpu() + * + * The user also must implement ops.select_cpu() as the default + * implementation relies on scx_bpf_select_cpu_dfl(). + * + * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle + * tracking. + */ + void (*update_idle)(s32 cpu, bool idle); + + /** + * @cpu_acquire: A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * @cpu_release: A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + + /** + * @init_task: Initialize a task to run in a BPF scheduler + * @p: task to initialize for BPF scheduling + * @args: init arguments, see the struct definition + * + * Either we're loading a BPF scheduler or a new task is being forked. + * Initialize @p for BPF scheduling. This operation may block and can + * be used for allocations, and is called exactly once for a task. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During a fork, it + * will abort that specific fork. + */ + s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); + + /** + * @exit_task: Exit a previously-running task from the system + * @p: task to exit + * @args: exit arguments, see the struct definition + * + * @p is exiting or the BPF scheduler is being unloaded. Perform any + * necessary cleanup for @p. + */ + void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); + + /** + * @enable: Enable BPF scheduling for a task + * @p: task to enable BPF scheduling for + * + * Enable @p for BPF scheduling. enable() is called on @p any time it + * enters SCX, and is always paired with a matching disable(). + */ + void (*enable)(struct task_struct *p); + + /** + * @disable: Disable BPF scheduling for a task + * @p: task to disable BPF scheduling for + * + * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. + * Disable BPF scheduling for @p. A disable() call is always matched + * with a prior enable() call. + */ + void (*disable)(struct task_struct *p); + + /** + * @dump: Dump BPF scheduler state on error + * @ctx: debug dump context + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. + */ + void (*dump)(struct scx_dump_ctx *ctx); + + /** + * @dump_cpu: Dump BPF scheduler state for a CPU on error + * @ctx: debug dump context + * @cpu: CPU to generate debug dump for + * @idle: @cpu is currently idle without any runnable tasks + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @cpu. If @idle is %true and this operation doesn't produce any + * output, @cpu is skipped for dump. + */ + void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); + + /** + * @dump_task: Dump BPF scheduler state for a runnable task on error + * @ctx: debug dump context + * @p: runnable task to generate debug dump for + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @p. + */ + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); + +#ifdef CONFIG_EXT_GROUP_SCHED + /** + * @cgroup_init: Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During cgroup + * creation, it will abort the specific cgroup creation. + */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * @cgroup_exit: Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. This operation my block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * @cgroup_prep_move: Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. + */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_move: Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. + */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_cancel_move: Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation. + */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_set_weight: A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @tg's weight to @weight. + */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); +#endif /* CONFIG_EXT_GROUP_SCHED */ + + /* + * All online ops must come before ops.cpu_online(). + */ + + /** + * @cpu_online: A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * @cpu_offline: A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). + */ + + /** + * @init: Initialize the BPF scheduler + */ + s32 (*init)(void); + + /** + * @exit: Clean up after the BPF scheduler + * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed. + */ + void (*exit)(struct scx_exit_info *info); + + /** + * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch + */ + u32 dispatch_max_batch; + + /** + * @flags: %SCX_OPS_* flags + */ + u64 flags; + + /** + * @timeout_ms: The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. + */ + u32 timeout_ms; + + /** + * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default + * value of 32768 is used. + */ + u32 exit_dump_len; + + /** + * @hotplug_seq: A sequence number that may be set by the scheduler to + * detect when a hotplug event has occurred during the loading process. + * If 0, no detection occurs. Otherwise, the scheduler will fail to + * load if the sequence number does not match @scx_hotplug_seq on the + * enable path. + */ + u64 hotplug_seq; + + /** + * @name: BPF scheduler's name + * + * Must be a non-zero valid BPF object name including only isalnum(), + * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the + * BPF scheduler is enabled. + */ + char name[SCX_OPS_NAME_LEN]; + + /* internal use only, must be NULL */ + void *priv; +}; + +enum scx_opi { + SCX_OPI_BEGIN = 0, + SCX_OPI_NORMAL_BEGIN = 0, + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), + SCX_OPI_END = SCX_OP_IDX(init), +}; + +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + s64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. In this case, the task is bounced to the global DSQ. + */ + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + s64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + s64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. + */ + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + + /* + * Total number of times a task's time slice was refilled with the + * default value (SCX_SLICE_DFL). + */ + s64 SCX_EV_REFILL_SLICE_DFL; + + /* + * The total duration of bypass modes in nanoseconds. + */ + s64 SCX_EV_BYPASS_DURATION; + + /* + * The number of tasks dispatched in the bypassing mode. + */ + s64 SCX_EV_BYPASS_DISPATCH; + + /* + * The number of times the bypassing mode has been activated. + */ + s64 SCX_EV_BYPASS_ACTIVATE; +}; + +struct scx_sched { + struct sched_ext_ops ops; + DECLARE_BITMAP(has_op, SCX_OPI_END); + + /* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. + * This is to avoid live-locking in bypass mode where all tasks are + * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If + * per-node split isn't sufficient, it can be further split. + */ + struct rhashtable dsq_hash; + struct scx_dispatch_q **global_dsqs; + + /* + * The event counters are in a per-CPU variable to minimize the + * accounting overhead. A system-wide view on the event counter is + * constructed when requested by scx_bpf_events(). + */ + struct scx_event_stats __percpu *event_stats_cpu; + + bool warned_zero_slice; + + atomic_t exit_kind; + struct scx_exit_info *exit_info; + + struct kobject kobj; + + struct kthread_worker *helper; + struct irq_work error_irq_work; + struct kthread_work disable_work; + struct rcu_work rcu_work; +}; + +enum scx_wake_flags { + /* expose select WF_* flags as enums */ + SCX_WAKE_FORK = WF_FORK, + SCX_WAKE_TTWU = WF_TTWU, + SCX_WAKE_SYNC = WF_SYNC, +}; + +enum scx_enq_flags { + /* expose select ENQUEUE_* flags as enums */ + SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, + SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, + + /* high 32bits are SCX specific */ + + /* + * Set the following to trigger preemption when calling + * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + + /* + * The task being enqueued is the only task available for the cpu. By + * default, ext core keeps executing such tasks but when + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the + * %SCX_ENQ_LAST flag set. + * + * The BPF scheduler is responsible for triggering a follow-up + * scheduling event. Otherwise, Execution may stall. + */ + SCX_ENQ_LAST = 1LLU << 41, + + /* high 8 bits are internal */ + __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, + + SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, +}; + +enum scx_deq_flags { + /* expose select DEQUEUE_* flags as enums */ + SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. + */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, +}; + +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ +}; + +enum scx_kick_flags { + /* + * Kick the target CPU if idle. Guarantees that the target CPU goes + * through at least one full scheduling cycle before going idle. If the + * target CPU can be determined to be currently not idle and going to go + * through a scheduling cycle before going idle, noop. + */ + SCX_KICK_IDLE = 1LLU << 0, + + /* + * Preempt the current task and execute the dispatch path. If the + * current task of the target CPU is an SCX task, its ->scx.slice is + * cleared to zero before the scheduling path is invoked so that the + * task expires and the dispatch path is invoked. + */ + SCX_KICK_PREEMPT = 1LLU << 1, + + /* + * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will + * return after the target CPU finishes picking the next task. + */ + SCX_KICK_WAIT = 1LLU << 2, +}; + +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + +enum scx_enable_state { + SCX_ENABLING, + SCX_ENABLED, + SCX_DISABLING, + SCX_DISABLED, +}; + +static const char *scx_enable_state_str[] = { + [SCX_ENABLING] = "enabling", + [SCX_ENABLED] = "enabled", + [SCX_DISABLING] = "disabling", + [SCX_DISABLED] = "disabled", +}; + +/* + * sched_ext_entity->ops_state + * + * Used to track the task ownership between the SCX core and the BPF scheduler. + * State transitions look as follows: + * + * NONE -> QUEUEING -> QUEUED -> DISPATCHING + * ^ | | + * | v v + * \-------------------------------/ + * + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call + * sites for explanations on the conditions being waited upon and why they are + * safe. Transitions out of them into NONE or QUEUED must store_release and the + * waiters should load_acquire. + * + * Tracking scx_ops_state enables sched_ext core to reliably determine whether + * any given task can be dispatched by the BPF scheduler at all times and thus + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler + * to try to dispatch any task anytime regardless of its state as the SCX core + * can safely reject invalid dispatches. + */ +enum scx_ops_state { + SCX_OPSS_NONE, /* owned by the SCX core */ + SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ + SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ + SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ + + /* + * QSEQ brands each QUEUED instance so that, when dispatch races + * dequeue/requeue, the dispatcher can tell whether it still has a claim + * on the task being dispatched. + * + * As some 32bit archs can't do 64bit store_release/load_acquire, + * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on + * 32bit machines. The dispatch race window QSEQ protects is very narrow + * and runs with IRQ disabled. 30 bits should be sufficient. + */ + SCX_OPSS_QSEQ_SHIFT = 2, +}; + +/* Use macros to ensure that the type is unsigned long for the masks */ +#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) + +/* + * NOTE: sched_ext is in the process of growing multiple scheduler support and + * scx_root usage is in a transitional state. Naked dereferences are safe if the + * caller is one of the tasks attached to SCX and explicit RCU dereference is + * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but + * are used as temporary markers to indicate that the dereferences need to be + * updated to point to the associated scheduler instances rather than scx_root. + */ +static struct scx_sched __rcu *scx_root; + +/* + * During exit, a task may schedule after losing its PIDs. When disabling the + * BPF scheduler, we need to be able to iterate tasks in every state to + * guarantee system safety. Maintain a dedicated task list which contains every + * task between its fork and eventual free. + */ +static DEFINE_SPINLOCK(scx_tasks_lock); +static LIST_HEAD(scx_tasks); + +/* ops enable/disable */ +static DEFINE_MUTEX(scx_enable_mutex); +DEFINE_STATIC_KEY_FALSE(__scx_enabled); +DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); +static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); +static unsigned long scx_in_softlockup; +static atomic_t scx_breather_depth = ATOMIC_INIT(0); +static int scx_bypass_depth; +static bool scx_init_task_enabled; +static bool scx_switching_all; +DEFINE_STATIC_KEY_FALSE(__scx_switched_all); + +static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); +static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); + +/* + * A monotically increasing sequence number that is incremented every time a + * scheduler is enabled. This can be used by to check if any custom sched_ext + * scheduler has ever been used in the system. + */ +static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); + +/* + * The maximum amount of time in jiffies that a task may be runnable without + * being scheduled on a CPU. If this timeout is exceeded, it will trigger + * scx_error(). + */ +static unsigned long scx_watchdog_timeout; + +/* + * The last time the delayed work was run. This delayed work relies on + * ksoftirqd being able to run to service timer interrupts, so it's possible + * that this work itself could get wedged. To account for this, we check that + * it's not stalled in the timer tick, and trigger an error if it is. + */ +static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; + +static struct delayed_work scx_watchdog_work; + +/* for %SCX_KICK_WAIT */ +static unsigned long __percpu *scx_kick_cpus_pnt_seqs; + +/* + * Direct dispatch marker. + * + * Non-NULL values are used for direct dispatch from enqueue path. A valid + * pointer points to the task currently being enqueued. An ERR_PTR value is used + * to indicate that direct dispatch has already happened. + */ +static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); + +static const struct rhashtable_params dsq_hash_params = { + .key_len = sizeof_field(struct scx_dispatch_q, id), + .key_offset = offsetof(struct scx_dispatch_q, id), + .head_offset = offsetof(struct scx_dispatch_q, hash_node), +}; + +static LLIST_HEAD(dsqs_to_free); + +/* dispatch buf */ +struct scx_dsp_buf_ent { + struct task_struct *task; + unsigned long qseq; + u64 dsq_id; + u64 enq_flags; +}; + +static u32 scx_dsp_max_batch; + +struct scx_dsp_ctx { + struct rq *rq; + u32 cursor; + u32 nr_tasks; + struct scx_dsp_buf_ent buf[]; +}; + +static struct scx_dsp_ctx __percpu *scx_dsp_ctx; + +/* string formatting from BPF */ +struct scx_bstr_buf { + u64 data[MAX_BPRINTF_VARARGS]; + char line[SCX_EXIT_MSG_LEN]; +}; + +static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); +static struct scx_bstr_buf scx_exit_bstr_buf; + +/* ops debug dump */ +struct scx_dump_data { + s32 cpu; + bool first; + s32 cursor; + struct seq_buf *s; + const char *prefix; + struct scx_bstr_buf buf; +}; + +static struct scx_dump_data scx_dump_data = { + .cpu = -1, +}; + +/* /sys/kernel/sched_ext interface */ +static struct kset *scx_kset; + +#define CREATE_TRACE_POINTS +#include <trace/events/sched_ext.h> + +static void process_ddsp_deferred_locals(struct rq *rq); +static void scx_bpf_kick_cpu(s32 cpu, u64 flags); +static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, + s64 exit_code, const char *fmt, va_list args); + +static __printf(4, 5) void scx_exit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + scx_vexit(sch, kind, exit_code, fmt, args); + va_end(args); +} + +static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code, + const char *fmt, ...) +{ + struct scx_sched *sch; + va_list args; + + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) { + va_start(args, fmt); + scx_vexit(sch, kind, exit_code, fmt, args); + va_end(args); + } + rcu_read_unlock(); +} + +#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) +#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args) + +#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) + +static long jiffies_delta_msecs(unsigned long at, unsigned long now) +{ + if (time_after(at, now)) + return jiffies_to_msecs(at - now); + else + return -(long)jiffies_to_msecs(now - at); +} + +/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ +static u32 higher_bits(u32 flags) +{ + return ~((1 << fls(flags)) - 1); +} + +/* return the mask with only the highest bit set */ +static u32 highest_bit(u32 flags) +{ + int bit = fls(flags); + return ((u64)1 << bit) >> 1; +} + +static bool u32_before(u32 a, u32 b) +{ + return (s32)(a - b) < 0; +} + +static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +{ + struct scx_sched *sch = scx_root; + + return sch->global_dsqs[cpu_to_node(task_cpu(p))]; +} + +static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) +{ + return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); +} + +/* + * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX + * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate + * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check + * whether it's running from an allowed context. + * + * @mask is constant, always inline to cull the mask calculations. + */ +static __always_inline void scx_kf_allow(u32 mask) +{ + /* nesting is allowed only in increasing scx_kf_mask order */ + WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, + "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", + current->scx.kf_mask, mask); + current->scx.kf_mask |= mask; + barrier(); +} + +static void scx_kf_disallow(u32 mask) +{ + barrier(); + current->scx.kf_mask &= ~mask; +} + +/* + * Track the rq currently locked. + * + * This allows kfuncs to safely operate on rq from any scx ops callback, + * knowing which rq is already locked. + */ +static DEFINE_PER_CPU(struct rq *, locked_rq); + +static inline void update_locked_rq(struct rq *rq) +{ + /* + * Check whether @rq is actually locked. This can help expose bugs + * or incorrect assumptions about the context in which a kfunc or + * callback is executed. + */ + if (rq) + lockdep_assert_rq_held(rq); + __this_cpu_write(locked_rq, rq); +} + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. + */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(locked_rq); +} + +#define SCX_CALL_OP(sch, mask, op, rq, args...) \ +do { \ + update_locked_rq(rq); \ + if (mask) { \ + scx_kf_allow(mask); \ + (sch)->ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + (sch)->ops.op(args); \ + } \ + update_locked_rq(NULL); \ +} while (0) + +#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \ +({ \ + __typeof__((sch)->ops.op(args)) __ret; \ + \ + update_locked_rq(rq); \ + if (mask) { \ + scx_kf_allow(mask); \ + __ret = (sch)->ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + __ret = (sch)->ops.op(args); \ + } \ + update_locked_rq(NULL); \ + __ret; \ +}) + +/* + * Some kfuncs are allowed only on the tasks that are subjects of the + * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such + * restrictions, the following SCX_CALL_OP_*() variants should be used when + * invoking scx_ops operations that take task arguments. These can only be used + * for non-nesting operations due to the way the tasks are tracked. + * + * kfuncs which can only operate on such tasks can in turn use + * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on + * the specific task. + */ +#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \ +do { \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + SCX_CALL_OP((sch), mask, op, rq, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ +} while (0) + +#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \ +({ \ + __typeof__((sch)->ops.op(task, ##args)) __ret; \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + __ret; \ +}) + +#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \ +({ \ + __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task0; \ + current->scx.kf_tasks[1] = task1; \ + __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + current->scx.kf_tasks[1] = NULL; \ + __ret; \ +}) + +/* @mask is constant, always inline to cull unnecessary branches */ +static __always_inline bool scx_kf_allowed(u32 mask) +{ + if (unlikely(!(current->scx.kf_mask & mask))) { + scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); + return false; + } + + /* + * Enforce nesting boundaries. e.g. A kfunc which can be called from + * DISPATCH must not be called if we're running DEQUEUE which is nested + * inside ops.dispatch(). We don't need to check boundaries for any + * blocking kfuncs as the verifier ensures they're only called from + * sleepable progs. + */ + if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && + (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { + scx_kf_error("cpu_release kfunc called from a nested operation"); + return false; + } + + if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && + (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { + scx_kf_error("dispatch kfunc called from a nested operation"); + return false; + } + + return true; +} + +/* see SCX_CALL_OP_TASK() */ +static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, + struct task_struct *p) +{ + if (!scx_kf_allowed(mask)) + return false; + + if (unlikely((p != current->scx.kf_tasks[0] && + p != current->scx.kf_tasks[1]))) { + scx_kf_error("called on a task not being operated on"); + return false; + } + + return true; +} + +/** + * nldsq_next_task - Iterate to the next task in a non-local DSQ + * @dsq: user dsq being iterated + * @cur: current position, %NULL to start iteration + * @rev: walk backwards + * + * Returns %NULL when iteration is finished. + */ +static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, + struct task_struct *cur, bool rev) +{ + struct list_head *list_node; + struct scx_dsq_list_node *dsq_lnode; + + lockdep_assert_held(&dsq->lock); + + if (cur) + list_node = &cur->scx.dsq_list.node; + else + list_node = &dsq->list; + + /* find the next task, need to skip BPF iteration cursors */ + do { + if (rev) + list_node = list_node->prev; + else + list_node = list_node->next; + + if (list_node == &dsq->list) + return NULL; + + dsq_lnode = container_of(list_node, struct scx_dsq_list_node, + node); + } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); + + return container_of(dsq_lnode, struct task_struct, scx.dsq_list); +} + +#define nldsq_for_each_task(p, dsq) \ + for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ + (p) = nldsq_next_task((dsq), (p), false)) + + +/* + * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] + * dispatch order. BPF-visible iterator is opaque and larger to allow future + * changes without breaking backward compatibility. Can be used with + * bpf_for_each(). See bpf_iter_scx_dsq_*(). + */ +enum scx_dsq_iter_flags { + /* iterate in the reverse dispatch order */ + SCX_DSQ_ITER_REV = 1U << 16, + + __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, + __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, + + __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, + __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | + __SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME, +}; + +struct bpf_iter_scx_dsq_kern { + struct scx_dsq_list_node cursor; + struct scx_dispatch_q *dsq; + u64 slice; + u64 vtime; +} __attribute__((aligned(8))); + +struct bpf_iter_scx_dsq { + u64 __opaque[6]; +} __attribute__((aligned(8))); + + +/* + * SCX task iterator. + */ +struct scx_task_iter { + struct sched_ext_entity cursor; + struct task_struct *locked; + struct rq *rq; + struct rq_flags rf; + u32 cnt; +}; + +/** + * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration + * @iter: iterator to init + * + * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter + * must eventually be stopped with scx_task_iter_stop(). + * + * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() + * between this and the first next() call or between any two next() calls. If + * the locks are released between two next() calls, the caller is responsible + * for ensuring that the task being iterated remains accessible either through + * RCU read lock or obtaining a reference count. + * + * All tasks which existed when the iteration started are guaranteed to be + * visited as long as they still exist. + */ +static void scx_task_iter_start(struct scx_task_iter *iter) +{ + BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & + ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + + spin_lock_irq(&scx_tasks_lock); + + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; + list_add(&iter->cursor.tasks_node, &scx_tasks); + iter->locked = NULL; + iter->cnt = 0; +} + +static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) +{ + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } +} + +/** + * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator + * @iter: iterator to unlock + * + * If @iter is in the middle of a locked iteration, it may be locking the rq of + * the task currently being visited in addition to scx_tasks_lock. Unlock both. + * This function can be safely called anytime during an iteration. + */ +static void scx_task_iter_unlock(struct scx_task_iter *iter) +{ + __scx_task_iter_rq_unlock(iter); + spin_unlock_irq(&scx_tasks_lock); +} + +/** + * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() + * @iter: iterator to re-lock + * + * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it + * doesn't re-lock the rq lock. Must be called before other iterator operations. + */ +static void scx_task_iter_relock(struct scx_task_iter *iter) +{ + spin_lock_irq(&scx_tasks_lock); +} + +/** + * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock + * @iter: iterator to exit + * + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held + * which is released on return. If the iterator holds a task's rq lock, that rq + * lock is also released. See scx_task_iter_start() for details. + */ +static void scx_task_iter_stop(struct scx_task_iter *iter) +{ + list_del_init(&iter->cursor.tasks_node); + scx_task_iter_unlock(iter); +} + +/** + * scx_task_iter_next - Next task + * @iter: iterator to walk + * + * Visit the next task. See scx_task_iter_start() for details. Locks are dropped + * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls + * by holding scx_tasks_lock for too long. + */ +static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) +{ + struct list_head *cursor = &iter->cursor.tasks_node; + struct sched_ext_entity *pos; + + if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { + scx_task_iter_unlock(iter); + cond_resched(); + scx_task_iter_relock(iter); + } + + list_for_each_entry(pos, cursor, tasks_node) { + if (&pos->tasks_node == &scx_tasks) + return NULL; + if (!(pos->flags & SCX_TASK_CURSOR)) { + list_move(cursor, &pos->tasks_node); + return container_of(pos, struct task_struct, scx); + } + } + + /* can't happen, should always terminate at scx_tasks above */ + BUG(); +} + +/** + * scx_task_iter_next_locked - Next non-idle task with its rq locked + * @iter: iterator to walk + * + * Visit the non-idle task with its rq lock held. Allows callers to specify + * whether they would like to filter out dead tasks. See scx_task_iter_start() + * for details. + */ +static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) +{ + struct task_struct *p; + + __scx_task_iter_rq_unlock(iter); + + while ((p = scx_task_iter_next(iter))) { + /* + * scx_task_iter is used to prepare and move tasks into SCX + * while loading the BPF scheduler and vice-versa while + * unloading. The init_tasks ("swappers") should be excluded + * from the iteration because: + * + * - It's unsafe to use __setschduler_prio() on an init_task to + * determine the sched_class to use as it won't preserve its + * idle_sched_class. + * + * - ops.init/exit_task() can easily be confused if called with + * init_tasks as they, e.g., share PID 0. + * + * As init_tasks are never scheduled through SCX, they can be + * skipped safely. Note that is_idle_task() which tests %PF_IDLE + * doesn't work here: + * + * - %PF_IDLE may not be set for an init_task whose CPU hasn't + * yet been onlined. + * + * - %PF_IDLE can be set on tasks that are not init_tasks. See + * play_idle_precise() used by CONFIG_IDLE_INJECT. + * + * Test for idle_sched_class as only init_tasks are on it. + */ + if (p->sched_class != &idle_sched_class) + break; + } + if (!p) + return NULL; + + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked = p; + + return p; +} + +/** + * scx_add_event - Increase an event counter for 'name' by 'cnt' + * @sch: scx_sched to account events for + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This can be used when preemption is not disabled. + */ +#define scx_add_event(sch, name, cnt) do { \ + this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + trace_sched_ext_event(#name, (cnt)); \ +} while(0) + +/** + * __scx_add_event - Increase an event counter for 'name' by 'cnt' + * @sch: scx_sched to account events for + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This should be used only when preemption is disabled. + */ +#define __scx_add_event(sch, name, cnt) do { \ + __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + trace_sched_ext_event(#name, cnt); \ +} while(0) + +/** + * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' + * @dst_e: destination event stats + * @src_e: source event stats + * @kind: a kind of event to be aggregated + */ +#define scx_agg_event(dst_e, src_e, kind) do { \ + (dst_e)->kind += READ_ONCE((src_e)->kind); \ +} while(0) + +/** + * scx_dump_event - Dump an event 'kind' in 'events' to 's' + * @s: output seq_buf + * @events: event stats + * @kind: a kind of event to dump + */ +#define scx_dump_event(s, events, kind) do { \ + dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ +} while (0) + + +static void scx_read_events(struct scx_sched *sch, + struct scx_event_stats *events); + +static enum scx_enable_state scx_enable_state(void) +{ + return atomic_read(&scx_enable_state_var); +} + +static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to) +{ + return atomic_xchg(&scx_enable_state_var, to); +} + +static bool scx_tryset_enable_state(enum scx_enable_state to, + enum scx_enable_state from) +{ + int from_v = from; + + return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); +} + +static bool scx_rq_bypassing(struct rq *rq) +{ + return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +} + +/** + * wait_ops_state - Busy-wait the specified ops state to end + * @p: target task + * @opss: state to wait the end of + * + * Busy-wait for @p to transition out of @opss. This can only be used when the + * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also + * has load_acquire semantics to ensure that the caller can see the updates made + * in the enqueueing and dispatching paths. + */ +static void wait_ops_state(struct task_struct *p, unsigned long opss) +{ + do { + cpu_relax(); + } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); +} + +static inline bool __cpu_valid(s32 cpu) +{ + return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); +} + +/** + * ops_cpu_valid - Verify a cpu number, to be used on ops input args + * @sch: scx_sched to abort on error + * @cpu: cpu number which came from a BPF ops + * @where: extra information reported on error + * + * @cpu is a cpu number which came from the BPF scheduler and can be any value. + * Verify that it is in range and one of the possible cpus. If invalid, trigger + * an ops error. + */ +static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) +{ + if (__cpu_valid(cpu)) { + return true; + } else { + scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); + return false; + } +} + +/** + * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args + * @cpu: cpu number which came from a BPF ops + * @where: extra information reported on error + * + * The same as ops_cpu_valid() but @sch is implicit. + */ +static bool kf_cpu_valid(u32 cpu, const char *where) +{ + if (__cpu_valid(cpu)) { + return true; + } else { + scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); + return false; + } +} + +/** + * ops_sanitize_err - Sanitize a -errno value + * @sch: scx_sched to error out on error + * @ops_name: operation to blame on failure + * @err: -errno value to sanitize + * + * Verify @err is a valid -errno. If not, trigger scx_error() and return + * -%EPROTO. This is necessary because returning a rogue -errno up the chain can + * cause misbehaviors. For an example, a large negative return from + * ops.init_task() triggers an oops when passed up the call chain because the + * value fails IS_ERR() test after being encoded with ERR_PTR() and then is + * handled as a pointer. + */ +static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err) +{ + if (err < 0 && err >= -MAX_ERRNO) + return err; + + scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err); + return -EPROTO; +} + +static void run_deferred(struct rq *rq) +{ + process_ddsp_deferred_locals(rq); +} + +#ifdef CONFIG_SMP +static void deferred_bal_cb_workfn(struct rq *rq) +{ + run_deferred(rq); +} +#endif + +static void deferred_irq_workfn(struct irq_work *irq_work) +{ + struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work); + + raw_spin_rq_lock(rq); + run_deferred(rq); + raw_spin_rq_unlock(rq); +} + +/** + * schedule_deferred - Schedule execution of deferred actions on an rq + * @rq: target rq + * + * Schedule execution of deferred actions on @rq. Must be called with @rq + * locked. Deferred actions are executed with @rq locked but unpinned, and thus + * can unlock @rq to e.g. migrate tasks to other rqs. + */ +static void schedule_deferred(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + +#ifdef CONFIG_SMP + /* + * If in the middle of waking up a task, task_woken_scx() will be called + * afterwards which will then run the deferred actions, no need to + * schedule anything. + */ + if (rq->scx.flags & SCX_RQ_IN_WAKEUP) + return; + + /* + * If in balance, the balance callbacks will be called before rq lock is + * released. Schedule one. + */ + if (rq->scx.flags & SCX_RQ_IN_BALANCE) { + queue_balance_callback(rq, &rq->scx.deferred_bal_cb, + deferred_bal_cb_workfn); + return; + } +#endif + /* + * No scheduler hooks available. Queue an irq work. They are executed on + * IRQ re-enable which may take a bit longer than the scheduler hooks. + * The above WAKEUP and BALANCE paths should cover most of the cases and + * the time to IRQ re-enable shouldn't be long. + */ + irq_work_queue(&rq->scx.deferred_irq_work); +} + +/** + * touch_core_sched - Update timestamp used for core-sched task ordering + * @rq: rq to read clock from, must be locked + * @p: task to update the timestamp for + * + * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to + * implement global or local-DSQ FIFO ordering for core-sched. Should be called + * when a task becomes runnable and its turn on the CPU ends (e.g. slice + * exhaustion). + */ +static void touch_core_sched(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + +#ifdef CONFIG_SCHED_CORE + /* + * It's okay to update the timestamp spuriously. Use + * sched_core_disabled() which is cheaper than enabled(). + * + * As this is used to determine ordering between tasks of sibling CPUs, + * it may be better to use per-core dispatch sequence instead. + */ + if (!sched_core_disabled()) + p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq)); +#endif +} + +/** + * touch_core_sched_dispatch - Update core-sched timestamp on dispatch + * @rq: rq to read clock from, must be locked + * @p: task being dispatched + * + * If the BPF scheduler implements custom core-sched ordering via + * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO + * ordering within each local DSQ. This function is called from dispatch paths + * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. + */ +static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + +#ifdef CONFIG_SCHED_CORE + if (unlikely(SCX_HAS_OP(scx_root, core_sched_before))) + touch_core_sched(rq, p); +#endif +} + +static void update_curr_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + s64 delta_exec; + + delta_exec = update_curr_common(rq); + if (unlikely(delta_exec <= 0)) + return; + + if (curr->scx.slice != SCX_SLICE_INF) { + curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec); + if (!curr->scx.slice) + touch_core_sched(rq, curr); + } +} + +static bool scx_dsq_priq_less(struct rb_node *node_a, + const struct rb_node *node_b) +{ + const struct task_struct *a = + container_of(node_a, struct task_struct, scx.dsq_priq); + const struct task_struct *b = + container_of(node_b, struct task_struct, scx.dsq_priq); + + return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); +} + +static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) +{ + /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ + WRITE_ONCE(dsq->nr, dsq->nr + delta); +} + +static void refill_task_slice_dfl(struct task_struct *p) +{ + p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1); +} + +static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, + struct task_struct *p, u64 enq_flags) +{ + bool is_local = dsq->id == SCX_DSQ_LOCAL; + + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); + WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || + !RB_EMPTY_NODE(&p->scx.dsq_priq)); + + if (!is_local) { + raw_spin_lock(&dsq->lock); + if (unlikely(dsq->id == SCX_DSQ_INVALID)) { + scx_error(sch, "attempting to dispatch to a destroyed dsq"); + /* fall back to the global dsq */ + raw_spin_unlock(&dsq->lock); + dsq = find_global_dsq(p); + raw_spin_lock(&dsq->lock); + } + } + + if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && + (enq_flags & SCX_ENQ_DSQ_PRIQ))) { + /* + * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from + * their FIFO queues. To avoid confusion and accidentally + * starving vtime-dispatched tasks by FIFO-dispatched tasks, we + * disallow any internal DSQ from doing vtime ordering of + * tasks. + */ + scx_error(sch, "cannot use vtime ordering for built-in DSQs"); + enq_flags &= ~SCX_ENQ_DSQ_PRIQ; + } + + if (enq_flags & SCX_ENQ_DSQ_PRIQ) { + struct rb_node *rbp; + + /* + * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are + * linked to both the rbtree and list on PRIQs, this can only be + * tested easily when adding the first task. + */ + if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && + nldsq_next_task(dsq, NULL, false))) + scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); + + p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; + rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); + + /* + * Find the previous task and insert after it on the list so + * that @dsq->list is vtime ordered. + */ + rbp = rb_prev(&p->scx.dsq_priq); + if (rbp) { + struct task_struct *prev = + container_of(rbp, struct task_struct, + scx.dsq_priq); + list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); + } else { + list_add(&p->scx.dsq_list.node, &dsq->list); + } + } else { + /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ + if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) + scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); + + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + list_add(&p->scx.dsq_list.node, &dsq->list); + else + list_add_tail(&p->scx.dsq_list.node, &dsq->list); + } + + /* seq records the order tasks are queued, used by BPF DSQ iterator */ + dsq->seq++; + p->scx.dsq_seq = dsq->seq; + + dsq_mod_nr(dsq, 1); + p->scx.dsq = dsq; + + /* + * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the + * direct dispatch path, but we clear them here because the direct + * dispatch verdict may be overridden on the enqueue path during e.g. + * bypass. + */ + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; + + /* + * We're transitioning out of QUEUEING or DISPATCHING. store_release to + * match waiters' load_acquire. + */ + if (enq_flags & SCX_ENQ_CLEAR_OPSS) + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + if (is_local) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; + + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } + + if (preempt || sched_class_above(&ext_sched_class, + rq->curr->sched_class)) + resched_curr(rq); + } else { + raw_spin_unlock(&dsq->lock); + } +} + +static void task_unlink_from_dsq(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); + + if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { + rb_erase(&p->scx.dsq_priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_priq); + p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; + } + + list_del_init(&p->scx.dsq_list.node); + dsq_mod_nr(dsq, -1); +} + +static void dispatch_dequeue(struct rq *rq, struct task_struct *p) +{ + struct scx_dispatch_q *dsq = p->scx.dsq; + bool is_local = dsq == &rq->scx.local_dsq; + + if (!dsq) { + /* + * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. + * Unlinking is all that's needed to cancel. + */ + if (unlikely(!list_empty(&p->scx.dsq_list.node))) + list_del_init(&p->scx.dsq_list.node); + + /* + * When dispatching directly from the BPF scheduler to a local + * DSQ, the task isn't associated with any DSQ but + * @p->scx.holding_cpu may be set under the protection of + * %SCX_OPSS_DISPATCHING. + */ + if (p->scx.holding_cpu >= 0) + p->scx.holding_cpu = -1; + + return; + } + + if (!is_local) + raw_spin_lock(&dsq->lock); + + /* + * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't + * change underneath us. + */ + if (p->scx.holding_cpu < 0) { + /* @p must still be on @dsq, dequeue */ + task_unlink_from_dsq(p, dsq); + } else { + /* + * We're racing against dispatch_to_local_dsq() which already + * removed @p from @dsq and set @p->scx.holding_cpu. Clear the + * holding_cpu which tells dispatch_to_local_dsq() that it lost + * the race. + */ + WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node)); + p->scx.holding_cpu = -1; + } + p->scx.dsq = NULL; + + if (!is_local) + raw_spin_unlock(&dsq->lock); +} + +static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, + struct rq *rq, u64 dsq_id, + struct task_struct *p) +{ + struct scx_dispatch_q *dsq; + + if (dsq_id == SCX_DSQ_LOCAL) + return &rq->scx.local_dsq; + + if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) + return find_global_dsq(p); + + return &cpu_rq(cpu)->scx.local_dsq; + } + + if (dsq_id == SCX_DSQ_GLOBAL) + dsq = find_global_dsq(p); + else + dsq = find_user_dsq(sch, dsq_id); + + if (unlikely(!dsq)) { + scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", + dsq_id, p->comm, p->pid); + return find_global_dsq(p); + } + + return dsq; +} + +static void mark_direct_dispatch(struct task_struct *ddsp_task, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + /* + * Mark that dispatch already happened from ops.select_cpu() or + * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value + * which can never match a valid task pointer. + */ + __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); + + /* @p must match the task on the enqueue path */ + if (unlikely(p != ddsp_task)) { + if (IS_ERR(ddsp_task)) + scx_kf_error("%s[%d] already direct-dispatched", + p->comm, p->pid); + else + scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + ddsp_task->comm, ddsp_task->pid, + p->comm, p->pid); + return; + } + + WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); + WARN_ON_ONCE(p->scx.ddsp_enq_flags); + + p->scx.ddsp_dsq_id = dsq_id; + p->scx.ddsp_enq_flags = enq_flags; +} + +static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, + u64 enq_flags) +{ + struct rq *rq = task_rq(p); + struct scx_dispatch_q *dsq = + find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); + + touch_core_sched_dispatch(rq, p); + + p->scx.ddsp_enq_flags |= enq_flags; + + /* + * We are in the enqueue path with @rq locked and pinned, and thus can't + * double lock a remote rq and enqueue to its local DSQ. For + * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer + * the enqueue so that it's executed when @rq can be unlocked. + */ + if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { + unsigned long opss; + + opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_NONE: + break; + case SCX_OPSS_QUEUEING: + /* + * As @p was never passed to the BPF side, _release is + * not strictly necessary. Still do it for consistency. + */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + break; + default: + WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()", + p->comm, p->pid, opss); + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + break; + } + + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); + list_add_tail(&p->scx.dsq_list.node, + &rq->scx.ddsp_deferred_locals); + schedule_deferred(rq); + return; + } + + dispatch_enqueue(sch, dsq, p, + p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); +} + +static bool scx_rq_online(struct rq *rq) +{ + /* + * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates + * the online state as seen from the BPF scheduler. cpu_active() test + * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will + * stay set until the current scheduling operation is complete even if + * we aren't locking @rq. + */ + return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); +} + +static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, + int sticky_cpu) +{ + struct scx_sched *sch = scx_root; + struct task_struct **ddsp_taskp; + unsigned long qseq; + + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + + /* rq migration */ + if (sticky_cpu == cpu_of(rq)) + goto local_norefill; + + /* + * If !scx_rq_online(), we already told the BPF scheduler that the CPU + * is offline and are just running the hotplug path. Don't bother the + * BPF scheduler. + */ + if (!scx_rq_online(rq)) + goto local; + + if (scx_rq_bypassing(rq)) { + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); + goto global; + } + + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) + goto direct; + + /* see %SCX_OPS_ENQ_EXITING */ + if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) && + unlikely(p->flags & PF_EXITING)) { + __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); + goto local; + } + + /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ + if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) && + is_migration_disabled(p)) { + __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); + goto local; + } + + if (unlikely(!SCX_HAS_OP(sch, enqueue))) + goto global; + + /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ + qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; + + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; + + SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags); + + *ddsp_taskp = NULL; + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) + goto direct; + + /* + * If not directly dispatched, QUEUEING isn't clear yet and dispatch or + * dequeue may be waiting. The store_release matches their load_acquire. + */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); + return; + +direct: + direct_dispatch(sch, p, enq_flags); + return; + +local: + /* + * For task-ordering, slice refill must be treated as implying the end + * of the current slice. Otherwise, the longer @p stays on the CPU, the + * higher priority it becomes from scx_prio_less()'s POV. + */ + touch_core_sched(rq, p); + refill_task_slice_dfl(p); +local_norefill: + dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); + return; + +global: + touch_core_sched(rq, p); /* see the comment in local: */ + refill_task_slice_dfl(p); + dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags); +} + +static bool task_runnable(const struct task_struct *p) +{ + return !list_empty(&p->scx.runnable_node); +} + +static void set_task_runnable(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + + if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { + p->scx.runnable_at = jiffies; + p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; + } + + /* + * list_add_tail() must be used. scx_bypass() depends on tasks being + * appended to the runnable_list. + */ + list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); +} + +static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) +{ + list_del_init(&p->scx.runnable_node); + if (reset_runnable_at) + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; +} + +static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) +{ + struct scx_sched *sch = scx_root; + int sticky_cpu = p->scx.sticky_cpu; + + if (enq_flags & ENQUEUE_WAKEUP) + rq->scx.flags |= SCX_RQ_IN_WAKEUP; + + enq_flags |= rq->scx.extra_enq_flags; + + if (sticky_cpu >= 0) + p->scx.sticky_cpu = -1; + + /* + * Restoring a running task will be immediately followed by + * set_next_task_scx() which expects the task to not be on the BPF + * scheduler as tasks can only start running through local DSQs. Force + * direct-dispatch into the local DSQ by setting the sticky_cpu. + */ + if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) + sticky_cpu = cpu_of(rq); + + if (p->scx.flags & SCX_TASK_QUEUED) { + WARN_ON_ONCE(!task_runnable(p)); + goto out; + } + + set_task_runnable(rq, p); + p->scx.flags |= SCX_TASK_QUEUED; + rq->scx.nr_running++; + add_nr_running(rq, 1); + + if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags); + + if (enq_flags & SCX_ENQ_WAKEUP) + touch_core_sched(rq, p); + + do_enqueue_task(rq, p, enq_flags, sticky_cpu); +out: + rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; + + if ((enq_flags & SCX_ENQ_CPU_SELECTED) && + unlikely(cpu_of(rq) != p->scx.selected_cpu)) + __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1); +} + +static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) +{ + struct scx_sched *sch = scx_root; + unsigned long opss; + + /* dequeue is always temporary, don't reset runnable_at */ + clr_task_runnable(p, false); + + /* acquire ensures that we see the preceding updates on QUEUED */ + opss = atomic_long_read_acquire(&p->scx.ops_state); + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_NONE: + break; + case SCX_OPSS_QUEUEING: + /* + * QUEUEING is started and finished while holding @p's rq lock. + * As we're holding the rq lock now, we shouldn't see QUEUEING. + */ + BUG(); + case SCX_OPSS_QUEUED: + if (SCX_HAS_OP(sch, dequeue)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, + p, deq_flags); + + if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_NONE)) + break; + fallthrough; + case SCX_OPSS_DISPATCHING: + /* + * If @p is being dispatched from the BPF scheduler to a DSQ, + * wait for the transfer to complete so that @p doesn't get + * added to its DSQ after dequeueing is complete. + * + * As we're waiting on DISPATCHING with the rq locked, the + * dispatching side shouldn't try to lock the rq while + * DISPATCHING is set. See dispatch_to_local_dsq(). + * + * DISPATCHING shouldn't have qseq set and control can reach + * here with NONE @opss from the above QUEUED case block. + * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. + */ + wait_ops_state(p, SCX_OPSS_DISPATCHING); + BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + break; + } +} + +static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) +{ + struct scx_sched *sch = scx_root; + + if (!(p->scx.flags & SCX_TASK_QUEUED)) { + WARN_ON_ONCE(task_runnable(p)); + return true; + } + + ops_dequeue(rq, p, deq_flags); + + /* + * A currently running task which is going off @rq first gets dequeued + * and then stops running. As we want running <-> stopping transitions + * to be contained within runnable <-> quiescent transitions, trigger + * ->stopping() early here instead of in put_prev_task_scx(). + * + * @p may go through multiple stopping <-> running transitions between + * here and put_prev_task_scx() if task attribute changes occur while + * balance_scx() leaves @rq unlocked. However, they don't contain any + * information meaningful to the BPF scheduler and can be suppressed by + * skipping the callbacks if the task is !QUEUED. + */ + if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) { + update_curr_scx(rq); + SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false); + } + + if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags); + + if (deq_flags & SCX_DEQ_SLEEP) + p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; + else + p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; + + p->scx.flags &= ~SCX_TASK_QUEUED; + rq->scx.nr_running--; + sub_nr_running(rq, 1); + + dispatch_dequeue(rq, p); + return true; +} + +static void yield_task_scx(struct rq *rq) +{ + struct scx_sched *sch = scx_root; + struct task_struct *p = rq->curr; + + if (SCX_HAS_OP(sch, yield)) + SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); + else + p->scx.slice = 0; +} + +static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) +{ + struct scx_sched *sch = scx_root; + struct task_struct *from = rq->curr; + + if (SCX_HAS_OP(sch, yield)) + return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, + from, to); + else + return false; +} + +static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, + struct scx_dispatch_q *src_dsq, + struct rq *dst_rq) +{ + struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; + + /* @dsq is locked and @p is on @dst_rq */ + lockdep_assert_held(&src_dsq->lock); + lockdep_assert_rq_held(dst_rq); + + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + list_add(&p->scx.dsq_list.node, &dst_dsq->list); + else + list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); + + dsq_mod_nr(dst_dsq, 1); + p->scx.dsq = dst_dsq; +} + +#ifdef CONFIG_SMP +/** + * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ + * @p: task to move + * @enq_flags: %SCX_ENQ_* + * @src_rq: rq to move the task from, locked on entry, released on return + * @dst_rq: rq to move the task into, locked on return + * + * Move @p which is currently on @src_rq to @dst_rq's local DSQ. + */ +static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, + struct rq *src_rq, struct rq *dst_rq) +{ + lockdep_assert_rq_held(src_rq); + + /* the following marks @p MIGRATING which excludes dequeue */ + deactivate_task(src_rq, p, 0); + set_task_cpu(p, cpu_of(dst_rq)); + p->scx.sticky_cpu = cpu_of(dst_rq); + + raw_spin_rq_unlock(src_rq); + raw_spin_rq_lock(dst_rq); + + /* + * We want to pass scx-specific enq_flags but activate_task() will + * truncate the upper 32 bit. As we own @rq, we can pass them through + * @rq->scx.extra_enq_flags instead. + */ + WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); + WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); + dst_rq->scx.extra_enq_flags = enq_flags; + activate_task(dst_rq, p, 0); + dst_rq->scx.extra_enq_flags = 0; +} + +/* + * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two + * differences: + * + * - is_cpu_allowed() asks "Can this task run on this CPU?" while + * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to + * this CPU?". + * + * While migration is disabled, is_cpu_allowed() has to say "yes" as the task + * must be allowed to finish on the CPU that it's currently on regardless of + * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the + * BPF scheduler shouldn't attempt to migrate a task which has migration + * disabled. + * + * - The BPF scheduler is bypassed while the rq is offline and we can always say + * no to the BPF scheduler initiated migrations while offline. + * + * The caller must ensure that @p and @rq are on different CPUs. + */ +static bool task_can_run_on_remote_rq(struct scx_sched *sch, + struct task_struct *p, struct rq *rq, + bool enforce) +{ + int cpu = cpu_of(rq); + + WARN_ON_ONCE(task_cpu(p) == cpu); + + /* + * If @p has migration disabled, @p->cpus_ptr is updated to contain only + * the pinned CPU in migrate_disable_switch() while @p is being switched + * out. However, put_prev_task_scx() is called before @p->cpus_ptr is + * updated and thus another CPU may see @p on a DSQ inbetween leading to + * @p passing the below task_allowed_on_cpu() check while migration is + * disabled. + * + * Test the migration disabled state first as the race window is narrow + * and the BPF scheduler failing to check migration disabled state can + * easily be masked if task_allowed_on_cpu() is done first. + */ + if (unlikely(is_migration_disabled(p))) { + if (enforce) + scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", + p->comm, p->pid, task_cpu(p), cpu); + return false; + } + + /* + * We don't require the BPF scheduler to avoid dispatching to offline + * CPUs mostly for convenience but also because CPUs can go offline + * between scx_bpf_dsq_insert() calls and here. Trigger error iff the + * picked CPU is outside the allowed mask. + */ + if (!task_allowed_on_cpu(p, cpu)) { + if (enforce) + scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", + cpu, p->comm, p->pid); + return false; + } + + if (!scx_rq_online(rq)) { + if (enforce) + __scx_add_event(scx_root, + SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); + return false; + } + + return true; +} + +/** + * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq + * @p: target task + * @dsq: locked DSQ @p is currently on + * @src_rq: rq @p is currently on, stable with @dsq locked + * + * Called with @dsq locked but no rq's locked. We want to move @p to a different + * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is + * required when transferring into a local DSQ. Even when transferring into a + * non-local DSQ, it's better to use the same mechanism to protect against + * dequeues and maintain the invariant that @p->scx.dsq can only change while + * @src_rq is locked, which e.g. scx_dump_task() depends on. + * + * We want to grab @src_rq but that can deadlock if we try while locking @dsq, + * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As + * this may race with dequeue, which can't drop the rq lock or fail, do a little + * dancing from our side. + * + * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets + * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu + * would be cleared to -1. While other cpus may have updated it to different + * values afterwards, as this operation can't be preempted or recurse, the + * holding_cpu can never become this CPU again before we're done. Thus, we can + * tell whether we lost to dequeue by testing whether the holding_cpu still + * points to this CPU. See dispatch_dequeue() for the counterpart. + * + * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is + * still valid. %false if lost to dequeue. + */ +static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, + struct scx_dispatch_q *dsq, + struct rq *src_rq) +{ + s32 cpu = raw_smp_processor_id(); + + lockdep_assert_held(&dsq->lock); + + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + task_unlink_from_dsq(p, dsq); + p->scx.holding_cpu = cpu; + + raw_spin_unlock(&dsq->lock); + raw_spin_rq_lock(src_rq); + + /* task_rq couldn't have changed if we're still the holding cpu */ + return likely(p->scx.holding_cpu == cpu) && + !WARN_ON_ONCE(src_rq != task_rq(p)); +} + +static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, + struct scx_dispatch_q *dsq, struct rq *src_rq) +{ + raw_spin_rq_unlock(this_rq); + + if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { + move_remote_task_to_local_dsq(p, 0, src_rq, this_rq); + return true; + } else { + raw_spin_rq_unlock(src_rq); + raw_spin_rq_lock(this_rq); + return false; + } +} +#else /* CONFIG_SMP */ +static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } +static inline bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { return false; } +static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } +#endif /* CONFIG_SMP */ + +/** + * move_task_between_dsqs() - Move a task from one DSQ to another + * @sch: scx_sched being operated on + * @p: target task + * @enq_flags: %SCX_ENQ_* + * @src_dsq: DSQ @p is currently on, must not be a local DSQ + * @dst_dsq: DSQ @p is being moved to, can be any DSQ + * + * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local + * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq + * will change. As @p's task_rq is locked, this function doesn't need to use the + * holding_cpu mechanism. + * + * On return, @src_dsq is unlocked and only @p's new task_rq, which is the + * return value, is locked. + */ +static struct rq *move_task_between_dsqs(struct scx_sched *sch, + struct task_struct *p, u64 enq_flags, + struct scx_dispatch_q *src_dsq, + struct scx_dispatch_q *dst_dsq) +{ + struct rq *src_rq = task_rq(p), *dst_rq; + + BUG_ON(src_dsq->id == SCX_DSQ_LOCAL); + lockdep_assert_held(&src_dsq->lock); + lockdep_assert_rq_held(src_rq); + + if (dst_dsq->id == SCX_DSQ_LOCAL) { + dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { + dst_dsq = find_global_dsq(p); + dst_rq = src_rq; + } + } else { + /* no need to migrate if destination is a non-local DSQ */ + dst_rq = src_rq; + } + + /* + * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different + * CPU, @p will be migrated. + */ + if (dst_dsq->id == SCX_DSQ_LOCAL) { + /* @p is going from a non-local DSQ to a local DSQ */ + if (src_rq == dst_rq) { + task_unlink_from_dsq(p, src_dsq); + move_local_task_to_local_dsq(p, enq_flags, + src_dsq, dst_rq); + raw_spin_unlock(&src_dsq->lock); + } else { + raw_spin_unlock(&src_dsq->lock); + move_remote_task_to_local_dsq(p, enq_flags, + src_rq, dst_rq); + } + } else { + /* + * @p is going from a non-local DSQ to a non-local DSQ. As + * $src_dsq is already locked, do an abbreviated dequeue. + */ + task_unlink_from_dsq(p, src_dsq); + p->scx.dsq = NULL; + raw_spin_unlock(&src_dsq->lock); + + dispatch_enqueue(sch, dst_dsq, p, enq_flags); + } + + return dst_rq; +} + +/* + * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly + * banging on the same DSQ on a large NUMA system to the point where switching + * to the bypass mode can take a long time. Inject artificial delays while the + * bypass mode is switching to guarantee timely completion. + */ +static void scx_breather(struct rq *rq) +{ + u64 until; + + lockdep_assert_rq_held(rq); + + if (likely(!atomic_read(&scx_breather_depth))) + return; + + raw_spin_rq_unlock(rq); + + until = ktime_get_ns() + NSEC_PER_MSEC; + + do { + int cnt = 1024; + while (atomic_read(&scx_breather_depth) && --cnt) + cpu_relax(); + } while (atomic_read(&scx_breather_depth) && + time_before64(ktime_get_ns(), until)); + + raw_spin_rq_lock(rq); +} + +static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, + struct scx_dispatch_q *dsq) +{ + struct task_struct *p; +retry: + /* + * This retry loop can repeatedly race against scx_bypass() dequeueing + * tasks from @dsq trying to put the system into the bypass mode. On + * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock + * the machine into soft lockups. Give a breather. + */ + scx_breather(rq); + + /* + * The caller can't expect to successfully consume a task if the task's + * addition to @dsq isn't guaranteed to be visible somehow. Test + * @dsq->list without locking and skip if it seems empty. + */ + if (list_empty(&dsq->list)) + return false; + + raw_spin_lock(&dsq->lock); + + nldsq_for_each_task(p, dsq) { + struct rq *task_rq = task_rq(p); + + if (rq == task_rq) { + task_unlink_from_dsq(p, dsq); + move_local_task_to_local_dsq(p, 0, dsq, rq); + raw_spin_unlock(&dsq->lock); + return true; + } + + if (task_can_run_on_remote_rq(sch, p, rq, false)) { + if (likely(consume_remote_task(rq, p, dsq, task_rq))) + return true; + goto retry; + } + } + + raw_spin_unlock(&dsq->lock); + return false; +} + +static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq) +{ + int node = cpu_to_node(cpu_of(rq)); + + return consume_dispatch_q(sch, rq, sch->global_dsqs[node]); +} + +/** + * dispatch_to_local_dsq - Dispatch a task to a local dsq + * @sch: scx_sched being operated on + * @rq: current rq which is locked + * @dst_dsq: destination DSQ + * @p: task to dispatch + * @enq_flags: %SCX_ENQ_* + * + * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local + * DSQ. This function performs all the synchronization dancing needed because + * local DSQs are protected with rq locks. + * + * The caller must have exclusive ownership of @p (e.g. through + * %SCX_OPSS_DISPATCHING). + */ +static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, + struct scx_dispatch_q *dst_dsq, + struct task_struct *p, u64 enq_flags) +{ + struct rq *src_rq = task_rq(p); + struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); +#ifdef CONFIG_SMP + struct rq *locked_rq = rq; +#endif + + /* + * We're synchronized against dequeue through DISPATCHING. As @p can't + * be dequeued, its task_rq and cpus_allowed are stable too. + * + * If dispatching to @rq that @p is already on, no lock dancing needed. + */ + if (rq == src_rq && rq == dst_rq) { + dispatch_enqueue(sch, dst_dsq, p, + enq_flags | SCX_ENQ_CLEAR_OPSS); + return; + } + +#ifdef CONFIG_SMP + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { + dispatch_enqueue(sch, find_global_dsq(p), p, + enq_flags | SCX_ENQ_CLEAR_OPSS); + return; + } + + /* + * @p is on a possibly remote @src_rq which we need to lock to move the + * task. If dequeue is in progress, it'd be locking @src_rq and waiting + * on DISPATCHING, so we can't grab @src_rq lock while holding + * DISPATCHING. + * + * As DISPATCHING guarantees that @p is wholly ours, we can pretend that + * we're moving from a DSQ and use the same mechanism - mark the task + * under transfer with holding_cpu, release DISPATCHING and then follow + * the same protocol. See unlink_dsq_and_lock_src_rq(). + */ + p->scx.holding_cpu = raw_smp_processor_id(); + + /* store_release ensures that dequeue sees the above */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + /* switch to @src_rq lock */ + if (locked_rq != src_rq) { + raw_spin_rq_unlock(locked_rq); + locked_rq = src_rq; + raw_spin_rq_lock(src_rq); + } + + /* task_rq couldn't have changed if we're still the holding cpu */ + if (likely(p->scx.holding_cpu == raw_smp_processor_id()) && + !WARN_ON_ONCE(src_rq != task_rq(p))) { + /* + * If @p is staying on the same rq, there's no need to go + * through the full deactivate/activate cycle. Optimize by + * abbreviating move_remote_task_to_local_dsq(). + */ + if (src_rq == dst_rq) { + p->scx.holding_cpu = -1; + dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p, + enq_flags); + } else { + move_remote_task_to_local_dsq(p, enq_flags, + src_rq, dst_rq); + /* task has been moved to dst_rq, which is now locked */ + locked_rq = dst_rq; + } + + /* if the destination CPU is idle, wake it up */ + if (sched_class_above(p->sched_class, dst_rq->curr->sched_class)) + resched_curr(dst_rq); + } + + /* switch back to @rq lock */ + if (locked_rq != rq) { + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(rq); + } +#else /* CONFIG_SMP */ + BUG(); /* control can not reach here on UP */ +#endif /* CONFIG_SMP */ +} + +/** + * finish_dispatch - Asynchronously finish dispatching a task + * @rq: current rq which is locked + * @p: task to finish dispatching + * @qseq_at_dispatch: qseq when @p started getting dispatched + * @dsq_id: destination DSQ ID + * @enq_flags: %SCX_ENQ_* + * + * Dispatching to local DSQs may need to wait for queueing to complete or + * require rq lock dancing. As we don't wanna do either while inside + * ops.dispatch() to avoid locking order inversion, we split dispatching into + * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the + * task and its qseq. Once ops.dispatch() returns, this function is called to + * finish up. + * + * There is no guarantee that @p is still valid for dispatching or even that it + * was valid in the first place. Make sure that the task is still owned by the + * BPF scheduler and claim the ownership before dispatching. + */ +static void finish_dispatch(struct scx_sched *sch, struct rq *rq, + struct task_struct *p, + unsigned long qseq_at_dispatch, + u64 dsq_id, u64 enq_flags) +{ + struct scx_dispatch_q *dsq; + unsigned long opss; + + touch_core_sched_dispatch(rq, p); +retry: + /* + * No need for _acquire here. @p is accessed only after a successful + * try_cmpxchg to DISPATCHING. + */ + opss = atomic_long_read(&p->scx.ops_state); + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_DISPATCHING: + case SCX_OPSS_NONE: + /* someone else already got to it */ + return; + case SCX_OPSS_QUEUED: + /* + * If qseq doesn't match, @p has gone through at least one + * dispatch/dequeue and re-enqueue cycle between + * scx_bpf_dsq_insert() and here and we have no claim on it. + */ + if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) + return; + + /* + * While we know @p is accessible, we don't yet have a claim on + * it - the BPF scheduler is allowed to dispatch tasks + * spuriously and there can be a racing dequeue attempt. Let's + * claim @p by atomically transitioning it from QUEUED to + * DISPATCHING. + */ + if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_DISPATCHING))) + break; + goto retry; + case SCX_OPSS_QUEUEING: + /* + * do_enqueue_task() is in the process of transferring the task + * to the BPF scheduler while holding @p's rq lock. As we aren't + * holding any kernel or BPF resource that the enqueue path may + * depend upon, it's safe to wait. + */ + wait_ops_state(p, opss); + goto retry; + } + + BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); + + dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p); + + if (dsq->id == SCX_DSQ_LOCAL) + dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); + else + dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); +} + +static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + u32 u; + + for (u = 0; u < dspc->cursor; u++) { + struct scx_dsp_buf_ent *ent = &dspc->buf[u]; + + finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id, + ent->enq_flags); + } + + dspc->nr_tasks += dspc->cursor; + dspc->cursor = 0; +} + +static int balance_one(struct rq *rq, struct task_struct *prev) +{ + struct scx_sched *sch = scx_root; + struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + bool prev_on_scx = prev->sched_class == &ext_sched_class; + bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; + int nr_loops = SCX_DSP_MAX_LOOPS; + + lockdep_assert_rq_held(rq); + rq->scx.flags |= SCX_RQ_IN_BALANCE; + rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); + + if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && + unlikely(rq->scx.cpu_released)) { + /* + * If the previous sched_class for the current CPU was not SCX, + * notify the BPF scheduler that it again has control of the + * core. This callback complements ->cpu_release(), which is + * emitted in switch_class(). + */ + if (SCX_HAS_OP(sch, cpu_acquire)) + SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq, + cpu_of(rq), NULL); + rq->scx.cpu_released = false; + } + + if (prev_on_scx) { + update_curr_scx(rq); + + /* + * If @prev is runnable & has slice left, it has priority and + * fetching more just increases latency for the fetched tasks. + * Tell pick_task_scx() to keep running @prev. If the BPF + * scheduler wants to handle this explicitly, it should + * implement ->cpu_release(). + * + * See scx_disable_workfn() for the explanation on the bypassing + * test. + */ + if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { + rq->scx.flags |= SCX_RQ_BAL_KEEP; + goto has_tasks; + } + } + + /* if there already are tasks to run, nothing to do */ + if (rq->scx.local_dsq.nr) + goto has_tasks; + + if (consume_global_dsq(sch, rq)) + goto has_tasks; + + if (unlikely(!SCX_HAS_OP(sch, dispatch)) || + scx_rq_bypassing(rq) || !scx_rq_online(rq)) + goto no_tasks; + + dspc->rq = rq; + + /* + * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, + * the local DSQ might still end up empty after a successful + * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() + * produced some tasks, retry. The BPF scheduler may depend on this + * looping behavior to simplify its implementation. + */ + do { + dspc->nr_tasks = 0; + + SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, + cpu_of(rq), prev_on_scx ? prev : NULL); + + flush_dispatch_buf(sch, rq); + + if (prev_on_rq && prev->scx.slice) { + rq->scx.flags |= SCX_RQ_BAL_KEEP; + goto has_tasks; + } + if (rq->scx.local_dsq.nr) + goto has_tasks; + if (consume_global_dsq(sch, rq)) + goto has_tasks; + + /* + * ops.dispatch() can trap us in this loop by repeatedly + * dispatching ineligible tasks. Break out once in a while to + * allow the watchdog to run. As IRQ can't be enabled in + * balance(), we want to complete this scheduling cycle and then + * start a new one. IOW, we want to call resched_curr() on the + * next, most likely idle, task, not the current one. Use + * scx_bpf_kick_cpu() for deferred kicking. + */ + if (unlikely(!--nr_loops)) { + scx_bpf_kick_cpu(cpu_of(rq), 0); + break; + } + } while (dspc->nr_tasks); + +no_tasks: + /* + * Didn't find another task to run. Keep running @prev unless + * %SCX_OPS_ENQ_LAST is in effect. + */ + if (prev_on_rq && + (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) { + rq->scx.flags |= SCX_RQ_BAL_KEEP; + __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); + goto has_tasks; + } + rq->scx.flags &= ~SCX_RQ_IN_BALANCE; + return false; + +has_tasks: + rq->scx.flags &= ~SCX_RQ_IN_BALANCE; + return true; +} + +static int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ + int ret; + + rq_unpin_lock(rq, rf); + + ret = balance_one(rq, prev); + +#ifdef CONFIG_SCHED_SMT + /* + * When core-sched is enabled, this ops.balance() call will be followed + * by pick_task_scx() on this CPU and the SMT siblings. Balance the + * siblings too. + */ + if (sched_core_enabled(rq)) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); + int scpu; + + for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { + struct rq *srq = cpu_rq(scpu); + struct task_struct *sprev = srq->curr; + + WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); + update_rq_clock(srq); + balance_one(srq, sprev); + } + } +#endif + rq_repin_lock(rq, rf); + + return ret; +} + +static void process_ddsp_deferred_locals(struct rq *rq) +{ + struct task_struct *p; + + lockdep_assert_rq_held(rq); + + /* + * Now that @rq can be unlocked, execute the deferred enqueueing of + * tasks directly dispatched to the local DSQs of other CPUs. See + * direct_dispatch(). Keep popping from the head instead of using + * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq + * temporarily. + */ + while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, + struct task_struct, scx.dsq_list.node))) { + struct scx_sched *sch = scx_root; + struct scx_dispatch_q *dsq; + + list_del_init(&p->scx.dsq_list.node); + + dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); + if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) + dispatch_to_local_dsq(sch, rq, dsq, p, + p->scx.ddsp_enq_flags); + } +} + +static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) +{ + struct scx_sched *sch = scx_root; + + if (p->scx.flags & SCX_TASK_QUEUED) { + /* + * Core-sched might decide to execute @p before it is + * dispatched. Call ops_dequeue() to notify the BPF scheduler. + */ + ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC); + dispatch_dequeue(rq, p); + } + + p->se.exec_start = rq_clock_task(rq); + + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p); + + clr_task_runnable(p, true); + + /* + * @p is getting newly scheduled or got kicked after someone updated its + * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). + */ + if ((p->scx.slice == SCX_SLICE_INF) != + (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { + if (p->scx.slice == SCX_SLICE_INF) + rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; + else + rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; + + sched_update_tick_dependency(rq); + + /* + * For now, let's refresh the load_avgs just when transitioning + * in and out of nohz. In the future, we might want to add a + * mechanism which calls the following periodically on + * tick-stopped CPUs. + */ + update_other_load_avgs(rq); + } +} + +static enum scx_cpu_preempt_reason +preempt_reason_from_class(const struct sched_class *class) +{ +#ifdef CONFIG_SMP + if (class == &stop_sched_class) + return SCX_CPU_PREEMPT_STOP; +#endif + if (class == &dl_sched_class) + return SCX_CPU_PREEMPT_DL; + if (class == &rt_sched_class) + return SCX_CPU_PREEMPT_RT; + return SCX_CPU_PREEMPT_UNKNOWN; +} + +static void switch_class(struct rq *rq, struct task_struct *next) +{ + struct scx_sched *sch = scx_root; + const struct sched_class *next_class = next->sched_class; + +#ifdef CONFIG_SMP + /* + * Pairs with the smp_load_acquire() issued by a CPU in + * kick_cpus_irq_workfn() who is waiting for this CPU to perform a + * resched. + */ + smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); +#endif + if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) + return; + + /* + * The callback is conceptually meant to convey that the CPU is no + * longer under the control of SCX. Therefore, don't invoke the callback + * if the next class is below SCX (in which case the BPF scheduler has + * actively decided not to schedule any tasks on the CPU). + */ + if (sched_class_above(&ext_sched_class, next_class)) + return; + + /* + * At this point we know that SCX was preempted by a higher priority + * sched_class, so invoke the ->cpu_release() callback if we have not + * done so already. We only send the callback once between SCX being + * preempted, and it regaining control of the CPU. + * + * ->cpu_release() complements ->cpu_acquire(), which is emitted the + * next time that balance_scx() is invoked. + */ + if (!rq->scx.cpu_released) { + if (SCX_HAS_OP(sch, cpu_release)) { + struct scx_cpu_release_args args = { + .reason = preempt_reason_from_class(next_class), + .task = next, + }; + + SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq, + cpu_of(rq), &args); + } + rq->scx.cpu_released = true; + } +} + +static void put_prev_task_scx(struct rq *rq, struct task_struct *p, + struct task_struct *next) +{ + struct scx_sched *sch = scx_root; + update_curr_scx(rq); + + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true); + + if (p->scx.flags & SCX_TASK_QUEUED) { + set_task_runnable(rq, p); + + /* + * If @p has slice left and is being put, @p is getting + * preempted by a higher priority scheduler class or core-sched + * forcing a different task. Leave it at the head of the local + * DSQ. + */ + if (p->scx.slice && !scx_rq_bypassing(rq)) { + dispatch_enqueue(sch, &rq->scx.local_dsq, p, + SCX_ENQ_HEAD); + goto switch_class; + } + + /* + * If @p is runnable but we're about to enter a lower + * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell + * ops.enqueue() that @p is the only one available for this cpu, + * which should trigger an explicit follow-up scheduling event. + */ + if (sched_class_above(&ext_sched_class, next->sched_class)) { + WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); + do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); + } else { + do_enqueue_task(rq, p, 0, -1); + } + } + +switch_class: + if (next && next->sched_class != &ext_sched_class) + switch_class(rq, next); +} + +static struct task_struct *first_local_task(struct rq *rq) +{ + return list_first_entry_or_null(&rq->scx.local_dsq.list, + struct task_struct, scx.dsq_list.node); +} + +static struct task_struct *pick_task_scx(struct rq *rq) +{ + struct task_struct *prev = rq->curr; + struct task_struct *p; + bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + bool kick_idle = false; + + /* + * WORKAROUND: + * + * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just + * have gone through balance_scx(). Unfortunately, there currently is a + * bug where fair could say yes on balance() but no on pick_task(), + * which then ends up calling pick_task_scx() without preceding + * balance_scx(). + * + * Keep running @prev if possible and avoid stalling from entering idle + * without balancing. + * + * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() + * if pick_task_scx() is called without preceding balance_scx(). + */ + if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { + if (prev->scx.flags & SCX_TASK_QUEUED) { + keep_prev = true; + } else { + keep_prev = false; + kick_idle = true; + } + } else if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { + /* + * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is + * conditional on scx_enabled() and may have been skipped. + */ + WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); + keep_prev = false; + } + + /* + * If balance_scx() is telling us to keep running @prev, replenish slice + * if necessary and keep running @prev. Otherwise, pop the first one + * from the local DSQ. + */ + if (keep_prev) { + p = prev; + if (!p->scx.slice) + refill_task_slice_dfl(p); + } else { + p = first_local_task(rq); + if (!p) { + if (kick_idle) + scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); + return NULL; + } + + if (unlikely(!p->scx.slice)) { + struct scx_sched *sch = scx_root; + + if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", + p->comm, p->pid, __func__); + sch->warned_zero_slice = true; + } + refill_task_slice_dfl(p); + } + } + + return p; +} + +#ifdef CONFIG_SCHED_CORE +/** + * scx_prio_less - Task ordering for core-sched + * @a: task A + * @b: task B + * @in_fi: in forced idle state + * + * Core-sched is implemented as an additional scheduling layer on top of the + * usual sched_class'es and needs to find out the expected task ordering. For + * SCX, core-sched calls this function to interrogate the task ordering. + * + * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used + * to implement the default task ordering. The older the timestamp, the higher + * priority the task - the global FIFO ordering matching the default scheduling + * behavior. + * + * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to + * implement FIFO ordering within each local DSQ. See pick_task_scx(). + */ +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) +{ + struct scx_sched *sch = scx_root; + + /* + * The const qualifiers are dropped from task_struct pointers when + * calling ops.core_sched_before(). Accesses are controlled by the + * verifier. + */ + if (SCX_HAS_OP(sch, core_sched_before) && + !scx_rq_bypassing(task_rq(a))) + return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before, + NULL, + (struct task_struct *)a, + (struct task_struct *)b); + else + return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); +} +#endif /* CONFIG_SCHED_CORE */ + +#ifdef CONFIG_SMP + +static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) +{ + struct scx_sched *sch = scx_root; + bool rq_bypass; + + /* + * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it + * can be a good migration opportunity with low cache and memory + * footprint. Returning a CPU different than @prev_cpu triggers + * immediate rq migration. However, for SCX, as the current rq + * association doesn't dictate where the task is going to run, this + * doesn't fit well. If necessary, we can later add a dedicated method + * which can decide to preempt self to force it through the regular + * scheduling path. + */ + if (unlikely(wake_flags & WF_EXEC)) + return prev_cpu; + + rq_bypass = scx_rq_bypassing(task_rq(p)); + if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) { + s32 cpu; + struct task_struct **ddsp_taskp; + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; + + cpu = SCX_CALL_OP_TASK_RET(sch, + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, + select_cpu, NULL, p, prev_cpu, + wake_flags); + p->scx.selected_cpu = cpu; + *ddsp_taskp = NULL; + if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) + return cpu; + else + return prev_cpu; + } else { + s32 cpu; + + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); + if (cpu >= 0) { + refill_task_slice_dfl(p); + p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; + } else { + cpu = prev_cpu; + } + p->scx.selected_cpu = cpu; + + if (rq_bypass) + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); + return cpu; + } +} + +static void task_woken_scx(struct rq *rq, struct task_struct *p) +{ + run_deferred(rq); +} + +static void set_cpus_allowed_scx(struct task_struct *p, + struct affinity_context *ac) +{ + struct scx_sched *sch = scx_root; + + set_cpus_allowed_common(p, ac); + + /* + * The effective cpumask is stored in @p->cpus_ptr which may temporarily + * differ from the configured one in @p->cpus_mask. Always tell the bpf + * scheduler the effective one. + * + * Fine-grained memory write control is enforced by BPF making the const + * designation pointless. Cast it away when calling the operation. + */ + if (SCX_HAS_OP(sch, set_cpumask)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL, + p, (struct cpumask *)p->cpus_ptr); +} + +static void handle_hotplug(struct rq *rq, bool online) +{ + struct scx_sched *sch = scx_root; + int cpu = cpu_of(rq); + + atomic_long_inc(&scx_hotplug_seq); + + /* + * scx_root updates are protected by cpus_read_lock() and will stay + * stable here. Note that we can't depend on scx_enabled() test as the + * hotplug ops need to be enabled before __scx_enabled is set. + */ + if (unlikely(!sch)) + return; + + if (scx_enabled()) + scx_idle_update_selcpu_topology(&sch->ops); + + if (online && SCX_HAS_OP(sch, cpu_online)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu); + else if (!online && SCX_HAS_OP(sch, cpu_offline)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu); + else + scx_exit(sch, SCX_EXIT_UNREG_KERN, + SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, + online ? "online" : "offline"); +} + +void scx_rq_activate(struct rq *rq) +{ + handle_hotplug(rq, true); +} + +void scx_rq_deactivate(struct rq *rq) +{ + handle_hotplug(rq, false); +} + +static void rq_online_scx(struct rq *rq) +{ + rq->scx.flags |= SCX_RQ_ONLINE; +} + +static void rq_offline_scx(struct rq *rq) +{ + rq->scx.flags &= ~SCX_RQ_ONLINE; +} + +#endif /* CONFIG_SMP */ + +static bool check_rq_for_timeouts(struct rq *rq) +{ + struct scx_sched *sch; + struct task_struct *p; + struct rq_flags rf; + bool timed_out = false; + + rq_lock_irqsave(rq, &rf); + sch = rcu_dereference_bh(scx_root); + if (unlikely(!sch)) + goto out_unlock; + + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { + unsigned long last_runnable = p->scx.runnable_at; + + if (unlikely(time_after(jiffies, + last_runnable + scx_watchdog_timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); + + scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, dur_ms / 1000, dur_ms % 1000); + timed_out = true; + break; + } + } +out_unlock: + rq_unlock_irqrestore(rq, &rf); + return timed_out; +} + +static void scx_watchdog_workfn(struct work_struct *work) +{ + int cpu; + + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + + for_each_online_cpu(cpu) { + if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) + break; + + cond_resched(); + } + queue_delayed_work(system_unbound_wq, to_delayed_work(work), + scx_watchdog_timeout / 2); +} + +void scx_tick(struct rq *rq) +{ + struct scx_sched *sch; + unsigned long last_check; + + if (!scx_enabled()) + return; + + sch = rcu_dereference_bh(scx_root); + if (unlikely(!sch)) + return; + + last_check = READ_ONCE(scx_watchdog_timestamp); + if (unlikely(time_after(jiffies, + last_check + READ_ONCE(scx_watchdog_timeout)))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_check); + + scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); + } + + update_other_load_avgs(rq); +} + +static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) +{ + struct scx_sched *sch = scx_root; + + update_curr_scx(rq); + + /* + * While disabling, always resched and refresh core-sched timestamp as + * we can't trust the slice management or ops.core_sched_before(). + */ + if (scx_rq_bypassing(rq)) { + curr->scx.slice = 0; + touch_core_sched(rq, curr); + } else if (SCX_HAS_OP(sch, tick)) { + SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr); + } + + if (!curr->scx.slice) + resched_curr(rq); +} + +#ifdef CONFIG_EXT_GROUP_SCHED +static struct cgroup *tg_cgrp(struct task_group *tg) +{ + /* + * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, + * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the + * root cgroup. + */ + if (tg && tg->css.cgroup) + return tg->css.cgroup; + else + return &cgrp_dfl_root.cgrp; +} + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), + +#else /* CONFIG_EXT_GROUP_SCHED */ + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) + +#endif /* CONFIG_EXT_GROUP_SCHED */ + +static enum scx_task_state scx_get_task_state(const struct task_struct *p) +{ + return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; +} + +static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) +{ + enum scx_task_state prev_state = scx_get_task_state(p); + bool warn = false; + + BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); + + switch (state) { + case SCX_TASK_NONE: + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_READY: + warn = prev_state == SCX_TASK_NONE; + break; + case SCX_TASK_ENABLED: + warn = prev_state != SCX_TASK_READY; + break; + default: + warn = true; + return; + } + + WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", + prev_state, state, p->comm, p->pid); + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + p->scx.flags |= state << SCX_TASK_STATE_SHIFT; +} + +static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork) +{ + struct scx_sched *sch = scx_root; + int ret; + + p->scx.disallow = false; + + if (SCX_HAS_OP(sch, init_task)) { + struct scx_init_task_args args = { + SCX_INIT_TASK_ARGS_CGROUP(tg) + .fork = fork, + }; + + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL, + p, &args); + if (unlikely(ret)) { + ret = ops_sanitize_err(sch, "init_task", ret); + return ret; + } + } + + scx_set_task_state(p, SCX_TASK_INIT); + + if (p->scx.disallow) { + if (!fork) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + + /* + * We're in the load path and @p->policy will be applied + * right after. Reverting @p->policy here and rejecting + * %SCHED_EXT transitions from scx_check_setscheduler() + * guarantees that if ops.init_task() sets @p->disallow, + * @p can never be in SCX. + */ + if (p->policy == SCHED_EXT) { + p->policy = SCHED_NORMAL; + atomic_long_inc(&scx_nr_rejected); + } + + task_rq_unlock(rq, p, &rf); + } else if (p->policy == SCHED_EXT) { + scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", + p->comm, p->pid); + } + } + + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; + return 0; +} + +static void scx_enable_task(struct task_struct *p) +{ + struct scx_sched *sch = scx_root; + struct rq *rq = task_rq(p); + u32 weight; + + lockdep_assert_rq_held(rq); + + /* + * Set the weight before calling ops.enable() so that the scheduler + * doesn't see a stale value if they inspect the task struct. + */ + if (task_has_idle_policy(p)) + weight = WEIGHT_IDLEPRIO; + else + weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; + + p->scx.weight = sched_weight_to_cgroup(weight); + + if (SCX_HAS_OP(sch, enable)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p); + scx_set_task_state(p, SCX_TASK_ENABLED); + + if (SCX_HAS_OP(sch, set_weight)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, + p, p->scx.weight); +} + +static void scx_disable_task(struct task_struct *p) +{ + struct scx_sched *sch = scx_root; + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); + WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); + + if (SCX_HAS_OP(sch, disable)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p); + scx_set_task_state(p, SCX_TASK_READY); +} + +static void scx_exit_task(struct task_struct *p) +{ + struct scx_sched *sch = scx_root; + struct scx_exit_task_args args = { + .cancelled = false, + }; + + lockdep_assert_rq_held(task_rq(p)); + + switch (scx_get_task_state(p)) { + case SCX_TASK_NONE: + return; + case SCX_TASK_INIT: + args.cancelled = true; + break; + case SCX_TASK_READY: + break; + case SCX_TASK_ENABLED: + scx_disable_task(p); + break; + default: + WARN_ON_ONCE(true); + return; + } + + if (SCX_HAS_OP(sch, exit_task)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p), + p, &args); + scx_set_task_state(p, SCX_TASK_NONE); +} + +void init_scx_entity(struct sched_ext_entity *scx) +{ + memset(scx, 0, sizeof(*scx)); + INIT_LIST_HEAD(&scx->dsq_list.node); + RB_CLEAR_NODE(&scx->dsq_priq); + scx->sticky_cpu = -1; + scx->holding_cpu = -1; + INIT_LIST_HEAD(&scx->runnable_node); + scx->runnable_at = jiffies; + scx->ddsp_dsq_id = SCX_DSQ_INVALID; + scx->slice = SCX_SLICE_DFL; +} + +void scx_pre_fork(struct task_struct *p) +{ + /* + * BPF scheduler enable/disable paths want to be able to iterate and + * update all tasks which can become complex when racing forks. As + * enable/disable are very cold paths, let's use a percpu_rwsem to + * exclude forks. + */ + percpu_down_read(&scx_fork_rwsem); +} + +int scx_fork(struct task_struct *p) +{ + percpu_rwsem_assert_held(&scx_fork_rwsem); + + if (scx_init_task_enabled) + return scx_init_task(p, task_group(p), true); + else + return 0; +} + +void scx_post_fork(struct task_struct *p) +{ + if (scx_init_task_enabled) { + scx_set_task_state(p, SCX_TASK_READY); + + /* + * Enable the task immediately if it's running on sched_ext. + * Otherwise, it'll be enabled in switching_to_scx() if and + * when it's ever configured to run with a SCHED_EXT policy. + */ + if (p->sched_class == &ext_sched_class) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_enable_task(p); + task_rq_unlock(rq, p, &rf); + } + } + + spin_lock_irq(&scx_tasks_lock); + list_add_tail(&p->scx.tasks_node, &scx_tasks); + spin_unlock_irq(&scx_tasks_lock); + + percpu_up_read(&scx_fork_rwsem); +} + +void scx_cancel_fork(struct task_struct *p) +{ + if (scx_enabled()) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); + scx_exit_task(p); + task_rq_unlock(rq, p, &rf); + } + + percpu_up_read(&scx_fork_rwsem); +} + +void sched_ext_free(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&scx_tasks_lock, flags); + list_del_init(&p->scx.tasks_node); + spin_unlock_irqrestore(&scx_tasks_lock, flags); + + /* + * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED + * transitions can't race us. Disable ops for @p. + */ + if (scx_get_task_state(p) != SCX_TASK_NONE) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_exit_task(p); + task_rq_unlock(rq, p, &rf); + } +} + +static void reweight_task_scx(struct rq *rq, struct task_struct *p, + const struct load_weight *lw) +{ + struct scx_sched *sch = scx_root; + + lockdep_assert_rq_held(task_rq(p)); + + p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); + if (SCX_HAS_OP(sch, set_weight)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, + p, p->scx.weight); +} + +static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +{ +} + +static void switching_to_scx(struct rq *rq, struct task_struct *p) +{ + struct scx_sched *sch = scx_root; + + scx_enable_task(p); + + /* + * set_cpus_allowed_scx() is not called while @p is associated with a + * different scheduler class. Keep the BPF scheduler up-to-date. + */ + if (SCX_HAS_OP(sch, set_cpumask)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq, + p, (struct cpumask *)p->cpus_ptr); +} + +static void switched_from_scx(struct rq *rq, struct task_struct *p) +{ + scx_disable_task(p); +} + +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void switched_to_scx(struct rq *rq, struct task_struct *p) {} + +int scx_check_setscheduler(struct task_struct *p, int policy) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* if disallow, reject transitioning into SCX */ + if (scx_enabled() && READ_ONCE(p->scx.disallow) && + p->policy != policy && policy == SCHED_EXT) + return -EACCES; + + return 0; +} + +#ifdef CONFIG_NO_HZ_FULL +bool scx_can_stop_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (scx_rq_bypassing(rq)) + return false; + + if (p->sched_class != &ext_sched_class) + return true; + + /* + * @rq can dispatch from different DSQs, so we can't tell whether it + * needs the tick or not by looking at nr_running. Allow stopping ticks + * iff the BPF scheduler indicated so. See set_next_task_scx(). + */ + return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; +} +#endif + +#ifdef CONFIG_EXT_GROUP_SCHED + +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +static bool scx_cgroup_enabled; + +int scx_tg_online(struct task_group *tg) +{ + struct scx_sched *sch = scx_root; + int ret = 0; + + WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (scx_cgroup_enabled) { + if (SCX_HAS_OP(sch, cgroup_init)) { + struct scx_cgroup_init_args args = + { .weight = tg->scx_weight }; + + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, + NULL, tg->css.cgroup, &args); + if (ret) + ret = ops_sanitize_err(sch, "cgroup_init", ret); + } + if (ret == 0) + tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + } else { + tg->scx_flags |= SCX_TG_ONLINE; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ret; +} + +void scx_tg_offline(struct task_group *tg) +{ + struct scx_sched *sch = scx_root; + + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && + (tg->scx_flags & SCX_TG_INITED)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, + tg->css.cgroup); + tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + + percpu_up_read(&scx_cgroup_rwsem); +} + +int scx_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct scx_sched *sch = scx_root; + struct cgroup_subsys_state *css; + struct task_struct *p; + int ret; + + /* released in scx_finish/cancel_attach() */ + percpu_down_read(&scx_cgroup_rwsem); + + if (!scx_cgroup_enabled) + return 0; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup *from = tg_cgrp(task_group(p)); + struct cgroup *to = tg_cgrp(css_tg(css)); + + WARN_ON_ONCE(p->scx.cgrp_moving_from); + + /* + * sched_move_task() omits identity migrations. Let's match the + * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() + * always match one-to-one. + */ + if (from == to) + continue; + + if (SCX_HAS_OP(sch, cgroup_prep_move)) { + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, + cgroup_prep_move, NULL, + p, from, css->cgroup); + if (ret) + goto err; + } + + p->scx.cgrp_moving_from = from; + } + + return 0; + +err: + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(sch, cgroup_cancel_move) && + p->scx.cgrp_moving_from) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ops_sanitize_err(sch, "cgroup_prep_move", ret); +} + +void scx_cgroup_move_task(struct task_struct *p) +{ + struct scx_sched *sch = scx_root; + + if (!scx_cgroup_enabled) + return; + + /* + * @p must have ops.cgroup_prep_move() called on it and thus + * cgrp_moving_from set. + */ + if (SCX_HAS_OP(sch, cgroup_move) && + !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) + SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL, + p, p->scx.cgrp_moving_from, + tg_cgrp(task_group(p))); + p->scx.cgrp_moving_from = NULL; +} + +void scx_cgroup_finish_attach(void) +{ + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + struct scx_sched *sch = scx_root; + struct cgroup_subsys_state *css; + struct task_struct *p; + + if (!scx_cgroup_enabled) + goto out_unlock; + + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(sch, cgroup_cancel_move) && + p->scx.cgrp_moving_from) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } +out_unlock: + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_weight(struct task_group *tg, unsigned long weight) +{ + struct scx_sched *sch = scx_root; + + percpu_down_read(&scx_cgroup_rwsem); + + if (scx_cgroup_enabled && tg->scx_weight != weight) { + if (SCX_HAS_OP(sch, cgroup_set_weight)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, + tg_cgrp(tg), weight); + tg->scx_weight = weight; + } + + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_idle(struct task_group *tg, bool idle) +{ + /* TODO: Implement ops->cgroup_set_idle() */ +} + +static void scx_cgroup_lock(void) +{ + percpu_down_write(&scx_cgroup_rwsem); +} + +static void scx_cgroup_unlock(void) +{ + percpu_up_write(&scx_cgroup_rwsem); +} + +#else /* CONFIG_EXT_GROUP_SCHED */ + +static inline void scx_cgroup_lock(void) {} +static inline void scx_cgroup_unlock(void) {} + +#endif /* CONFIG_EXT_GROUP_SCHED */ + +/* + * Omitted operations: + * + * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task + * isn't tied to the CPU at that point. Preemption is implemented by resetting + * the victim task's slice to 0 and triggering reschedule on the target CPU. + * + * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. + * + * - task_fork/dead: We need fork/dead notifications for all tasks regardless of + * their current sched_class. Call them directly from sched core instead. + */ +DEFINE_SCHED_CLASS(ext) = { + .enqueue_task = enqueue_task_scx, + .dequeue_task = dequeue_task_scx, + .yield_task = yield_task_scx, + .yield_to_task = yield_to_task_scx, + + .wakeup_preempt = wakeup_preempt_scx, + + .balance = balance_scx, + .pick_task = pick_task_scx, + + .put_prev_task = put_prev_task_scx, + .set_next_task = set_next_task_scx, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_scx, + .task_woken = task_woken_scx, + .set_cpus_allowed = set_cpus_allowed_scx, + + .rq_online = rq_online_scx, + .rq_offline = rq_offline_scx, +#endif + + .task_tick = task_tick_scx, + + .switching_to = switching_to_scx, + .switched_from = switched_from_scx, + .switched_to = switched_to_scx, + .reweight_task = reweight_task_scx, + .prio_changed = prio_changed_scx, + + .update_curr = update_curr_scx, + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif +}; + +static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) +{ + memset(dsq, 0, sizeof(*dsq)); + + raw_spin_lock_init(&dsq->lock); + INIT_LIST_HEAD(&dsq->list); + dsq->id = dsq_id; +} + +static void free_dsq_irq_workfn(struct irq_work *irq_work) +{ + struct llist_node *to_free = llist_del_all(&dsqs_to_free); + struct scx_dispatch_q *dsq, *tmp_dsq; + + llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) + kfree_rcu(dsq, rcu); +} + +static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); + +static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) +{ + struct scx_dispatch_q *dsq; + unsigned long flags; + + rcu_read_lock(); + + dsq = find_user_dsq(sch, dsq_id); + if (!dsq) + goto out_unlock_rcu; + + raw_spin_lock_irqsave(&dsq->lock, flags); + + if (dsq->nr) { + scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)", + dsq->id, dsq->nr); + goto out_unlock_dsq; + } + + if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node, + dsq_hash_params)) + goto out_unlock_dsq; + + /* + * Mark dead by invalidating ->id to prevent dispatch_enqueue() from + * queueing more tasks. As this function can be called from anywhere, + * freeing is bounced through an irq work to avoid nesting RCU + * operations inside scheduler locks. + */ + dsq->id = SCX_DSQ_INVALID; + llist_add(&dsq->free_node, &dsqs_to_free); + irq_work_queue(&free_dsq_irq_work); + +out_unlock_dsq: + raw_spin_unlock_irqrestore(&dsq->lock, flags); +out_unlock_rcu: + rcu_read_unlock(); +} + +#ifdef CONFIG_EXT_GROUP_SCHED +static void scx_cgroup_exit(struct scx_sched *sch) +{ + struct cgroup_subsys_state *css; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + scx_cgroup_enabled = false; + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and exit all the inited ones, all online cgroups are exited. + */ + rcu_read_lock(); + css_for_each_descendant_post(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + + if (!(tg->scx_flags & SCX_TG_INITED)) + continue; + tg->scx_flags &= ~SCX_TG_INITED; + + if (!sch->ops.cgroup_exit) + continue; + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, + css->cgroup); + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); +} + +static int scx_cgroup_init(struct scx_sched *sch) +{ + struct cgroup_subsys_state *css; + int ret; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and init, all online cgroups are initialized. + */ + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + if ((tg->scx_flags & + (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) + continue; + + if (!sch->ops.cgroup_init) { + tg->scx_flags |= SCX_TG_INITED; + continue; + } + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, + css->cgroup, &args); + if (ret) { + css_put(css); + scx_error(sch, "ops.cgroup_init() failed (%d)", ret); + return ret; + } + tg->scx_flags |= SCX_TG_INITED; + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); + + WARN_ON_ONCE(scx_cgroup_enabled); + scx_cgroup_enabled = true; + + return 0; +} + +#else +static void scx_cgroup_exit(struct scx_sched *sch) {} +static int scx_cgroup_init(struct scx_sched *sch) { return 0; } +#endif + + +/******************************************************************************** + * Sysfs interface and ops enable/disable. + */ + +#define SCX_ATTR(_name) \ + static struct kobj_attribute scx_attr_##_name = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = scx_attr_##_name##_show, \ + } + +static ssize_t scx_attr_state_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]); +} +SCX_ATTR(state); + +static ssize_t scx_attr_switch_all_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); +} +SCX_ATTR(switch_all); + +static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); +} +SCX_ATTR(nr_rejected); + +static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); +} +SCX_ATTR(hotplug_seq); + +static ssize_t scx_attr_enable_seq_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq)); +} +SCX_ATTR(enable_seq); + +static struct attribute *scx_global_attrs[] = { + &scx_attr_state.attr, + &scx_attr_switch_all.attr, + &scx_attr_nr_rejected.attr, + &scx_attr_hotplug_seq.attr, + &scx_attr_enable_seq.attr, + NULL, +}; + +static const struct attribute_group scx_global_attr_group = { + .attrs = scx_global_attrs, +}; + +static void free_exit_info(struct scx_exit_info *ei); + +static void scx_sched_free_rcu_work(struct work_struct *work) +{ + struct rcu_work *rcu_work = to_rcu_work(work); + struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work); + struct rhashtable_iter rht_iter; + struct scx_dispatch_q *dsq; + int node; + + kthread_stop(sch->helper->task); + free_percpu(sch->event_stats_cpu); + + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); + kfree(sch->global_dsqs); + + rhashtable_walk_enter(&sch->dsq_hash, &rht_iter); + do { + rhashtable_walk_start(&rht_iter); + + while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) + destroy_dsq(sch, dsq->id); + + rhashtable_walk_stop(&rht_iter); + } while (dsq == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&rht_iter); + + rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); + free_exit_info(sch->exit_info); + kfree(sch); +} + +static void scx_kobj_release(struct kobject *kobj) +{ + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + + INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work); + queue_rcu_work(system_unbound_wq, &sch->rcu_work); +} + +static ssize_t scx_attr_ops_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%s\n", scx_root->ops.name); +} +SCX_ATTR(ops); + +#define scx_attr_event_show(buf, at, events, kind) ({ \ + sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ +}) + +static ssize_t scx_attr_events_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + struct scx_event_stats events; + int at = 0; + + scx_read_events(sch, &events); + at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); + return at; +} +SCX_ATTR(events); + +static struct attribute *scx_sched_attrs[] = { + &scx_attr_ops.attr, + &scx_attr_events.attr, + NULL, +}; +ATTRIBUTE_GROUPS(scx_sched); + +static const struct kobj_type scx_ktype = { + .release = scx_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = scx_sched_groups, +}; + +static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name); +} + +static const struct kset_uevent_ops scx_uevent_ops = { + .uevent = scx_uevent, +}; + +/* + * Used by sched_fork() and __setscheduler_prio() to pick the matching + * sched_class. dl/rt are already handled. + */ +bool task_should_scx(int policy) +{ + if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING)) + return false; + if (READ_ONCE(scx_switching_all)) + return true; + return policy == SCHED_EXT; +} + +bool scx_allow_ttwu_queue(const struct task_struct *p) +{ + return !scx_enabled() || + (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) || + p->sched_class != &ext_sched_class; +} + +/** + * scx_softlockup - sched_ext softlockup handler + * @dur_s: number of seconds of CPU stuck due to soft lockup + * + * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can + * live-lock the system by making many CPUs target the same DSQ to the point + * where soft-lockup detection triggers. This function is called from + * soft-lockup watchdog when the triggering point is close and tries to unjam + * the system by enabling the breather and aborting the BPF scheduler. + */ +void scx_softlockup(u32 dur_s) +{ + struct scx_sched *sch; + + rcu_read_lock(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + goto out_unlock; + + switch (scx_enable_state()) { + case SCX_ENABLING: + case SCX_ENABLED: + break; + default: + goto out_unlock; + } + + /* allow only one instance, cleared at the end of scx_bypass() */ + if (test_and_set_bit(0, &scx_in_softlockup)) + goto out_unlock; + + printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", + smp_processor_id(), dur_s, scx_root->ops.name); + + /* + * Some CPUs may be trapped in the dispatch paths. Enable breather + * immediately; otherwise, we might even be able to get to scx_bypass(). + */ + atomic_inc(&scx_breather_depth); + + scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s); +out_unlock: + rcu_read_unlock(); +} + +static void scx_clear_softlockup(void) +{ + if (test_and_clear_bit(0, &scx_in_softlockup)) + atomic_dec(&scx_breather_depth); +} + +/** + * scx_bypass - [Un]bypass scx_ops and guarantee forward progress + * @bypass: true for bypass, false for unbypass + * + * Bypassing guarantees that all runnable tasks make forward progress without + * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might + * be held by tasks that the BPF scheduler is forgetting to run, which + * unfortunately also excludes toggling the static branches. + * + * Let's work around by overriding a couple ops and modifying behaviors based on + * the DISABLING state and then cycling the queued tasks through dequeue/enqueue + * to force global FIFO scheduling. + * + * - ops.select_cpu() is ignored and the default select_cpu() is used. + * + * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * %SCX_OPS_ENQ_LAST is also ignored. + * + * - ops.dispatch() is ignored. + * + * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice + * can't be trusted. Whenever a tick triggers, the running task is rotated to + * the tail of the queue with core_sched_at touched. + * + * - pick_next_task() suppresses zero slice warning. + * + * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. + * + * - scx_prio_less() reverts to the default core_sched_at order. + */ +static void scx_bypass(bool bypass) +{ + static DEFINE_RAW_SPINLOCK(bypass_lock); + static unsigned long bypass_timestamp; + struct scx_sched *sch; + unsigned long flags; + int cpu; + + raw_spin_lock_irqsave(&bypass_lock, flags); + sch = rcu_dereference_bh(scx_root); + + if (bypass) { + scx_bypass_depth++; + WARN_ON_ONCE(scx_bypass_depth <= 0); + if (scx_bypass_depth != 1) + goto unlock; + bypass_timestamp = ktime_get_ns(); + if (sch) + scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); + } else { + scx_bypass_depth--; + WARN_ON_ONCE(scx_bypass_depth < 0); + if (scx_bypass_depth != 0) + goto unlock; + if (sch) + scx_add_event(sch, SCX_EV_BYPASS_DURATION, + ktime_get_ns() - bypass_timestamp); + } + + atomic_inc(&scx_breather_depth); + + /* + * No task property is changing. We just need to make sure all currently + * queued tasks are re-queued according to the new scx_rq_bypassing() + * state. As an optimization, walk each rq's runnable_list instead of + * the scx_tasks list. + * + * This function can't trust the scheduler and thus can't use + * cpus_read_lock(). Walk all possible CPUs instead of online. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct task_struct *p, *n; + + raw_spin_rq_lock(rq); + + if (bypass) { + WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); + rq->scx.flags |= SCX_RQ_BYPASSING; + } else { + WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING)); + rq->scx.flags &= ~SCX_RQ_BYPASSING; + } + + /* + * We need to guarantee that no tasks are on the BPF scheduler + * while bypassing. Either we see enabled or the enable path + * sees scx_rq_bypassing() before moving tasks to SCX. + */ + if (!scx_enabled()) { + raw_spin_rq_unlock(rq); + continue; + } + + /* + * The use of list_for_each_entry_safe_reverse() is required + * because each task is going to be removed from and added back + * to the runnable_list during iteration. Because they're added + * to the tail of the list, safe reverse iteration can still + * visit all nodes. + */ + list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, + scx.runnable_node) { + struct sched_enq_and_set_ctx ctx; + + /* cycling deq/enq is enough, see the function comment */ + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + sched_enq_and_set_task(&ctx); + } + + /* resched to restore ticks and idle state */ + if (cpu_online(cpu) || cpu == smp_processor_id()) + resched_curr(rq); + + raw_spin_rq_unlock(rq); + } + + atomic_dec(&scx_breather_depth); +unlock: + raw_spin_unlock_irqrestore(&bypass_lock, flags); + scx_clear_softlockup(); +} + +static void free_exit_info(struct scx_exit_info *ei) +{ + kvfree(ei->dump); + kfree(ei->msg); + kfree(ei->bt); + kfree(ei); +} + +static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) +{ + struct scx_exit_info *ei; + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) + return NULL; + + ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); + ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); + ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); + + if (!ei->bt || !ei->msg || !ei->dump) { + free_exit_info(ei); + return NULL; + } + + return ei; +} + +static const char *scx_exit_reason(enum scx_exit_kind kind) +{ + switch (kind) { + case SCX_EXIT_UNREG: + return "unregistered from user space"; + case SCX_EXIT_UNREG_BPF: + return "unregistered from BPF"; + case SCX_EXIT_UNREG_KERN: + return "unregistered from the main kernel"; + case SCX_EXIT_SYSRQ: + return "disabled by sysrq-S"; + case SCX_EXIT_ERROR: + return "runtime error"; + case SCX_EXIT_ERROR_BPF: + return "scx_bpf_error"; + case SCX_EXIT_ERROR_STALL: + return "runnable task stall"; + default: + return "<UNKNOWN>"; + } +} + +static void scx_disable_workfn(struct kthread_work *work) +{ + struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); + struct scx_exit_info *ei = sch->exit_info; + struct scx_task_iter sti; + struct task_struct *p; + int kind, cpu; + + kind = atomic_read(&sch->exit_kind); + while (true) { + if (kind == SCX_EXIT_DONE) /* already disabled? */ + return; + WARN_ON_ONCE(kind == SCX_EXIT_NONE); + if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) + break; + } + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + + /* guarantee forward progress by bypassing scx_ops */ + scx_bypass(true); + + switch (scx_set_enable_state(SCX_DISABLING)) { + case SCX_DISABLING: + WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); + break; + case SCX_DISABLED: + pr_warn("sched_ext: ops error detected without ops (%s)\n", + sch->exit_info->msg); + WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); + goto done; + default: + break; + } + + /* + * Here, every runnable task is guaranteed to make forward progress and + * we can safely use blocking synchronization constructs. Actually + * disable ops. + */ + mutex_lock(&scx_enable_mutex); + + static_branch_disable(&__scx_switched_all); + WRITE_ONCE(scx_switching_all, false); + + /* + * Shut down cgroup support before tasks so that the cgroup attach path + * doesn't race against scx_exit_task(). + */ + scx_cgroup_lock(); + scx_cgroup_exit(sch); + scx_cgroup_unlock(); + + /* + * The BPF scheduler is going away. All tasks including %TASK_DEAD ones + * must be switched out and exited synchronously. + */ + percpu_down_write(&scx_fork_rwsem); + + scx_init_task_enabled = false; + + scx_task_iter_start(&sti); + while ((p = scx_task_iter_next_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); + struct sched_enq_and_set_ctx ctx; + + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + + p->sched_class = new_class; + check_class_changing(task_rq(p), p, old_class); + + sched_enq_and_set_task(&ctx); + + check_class_changed(task_rq(p), p, old_class, p->prio); + scx_exit_task(p); + } + scx_task_iter_stop(&sti); + percpu_up_write(&scx_fork_rwsem); + + /* + * Invalidate all the rq clocks to prevent getting outdated + * rq clocks from a previous scx scheduler. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + scx_rq_clock_invalidate(rq); + } + + /* no task is on scx, turn off all the switches and flush in-progress calls */ + static_branch_disable(&__scx_enabled); + bitmap_zero(sch->has_op, SCX_OPI_END); + scx_idle_disable(); + synchronize_rcu(); + + if (ei->kind >= SCX_EXIT_ERROR) { + pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", + sch->ops.name, ei->reason); + + if (ei->msg[0] != '\0') + pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); +#ifdef CONFIG_STACKTRACE + stack_trace_print(ei->bt, ei->bt_len, 2); +#endif + } else { + pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", + sch->ops.name, ei->reason); + } + + if (sch->ops.exit) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei); + + cancel_delayed_work_sync(&scx_watchdog_work); + + /* + * scx_root clearing must be inside cpus_read_lock(). See + * handle_hotplug(). + */ + cpus_read_lock(); + RCU_INIT_POINTER(scx_root, NULL); + cpus_read_unlock(); + + /* + * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs + * could observe an object of the same name still in the hierarchy when + * the next scheduler is loaded. + */ + kobject_del(&sch->kobj); + + free_percpu(scx_dsp_ctx); + scx_dsp_ctx = NULL; + scx_dsp_max_batch = 0; + + mutex_unlock(&scx_enable_mutex); + + WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); +done: + scx_bypass(false); +} + +static void scx_disable(enum scx_exit_kind kind) +{ + int none = SCX_EXIT_NONE; + struct scx_sched *sch; + + if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) + kind = SCX_EXIT_ERROR; + + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) { + atomic_try_cmpxchg(&sch->exit_kind, &none, kind); + kthread_queue_work(sch->helper, &sch->disable_work); + } + rcu_read_unlock(); +} + +static void dump_newline(struct seq_buf *s) +{ + trace_sched_ext_dump(""); + + /* @s may be zero sized and seq_buf triggers WARN if so */ + if (s->size) + seq_buf_putc(s, '\n'); +} + +static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) +{ + va_list args; + +#ifdef CONFIG_TRACEPOINTS + if (trace_sched_ext_dump_enabled()) { + /* protected by scx_dump_state()::dump_lock */ + static char line_buf[SCX_EXIT_MSG_LEN]; + + va_start(args, fmt); + vscnprintf(line_buf, sizeof(line_buf), fmt, args); + va_end(args); + + trace_sched_ext_dump(line_buf); + } +#endif + /* @s may be zero sized and seq_buf triggers WARN if so */ + if (s->size) { + va_start(args, fmt); + seq_buf_vprintf(s, fmt, args); + va_end(args); + + seq_buf_putc(s, '\n'); + } +} + +static void dump_stack_trace(struct seq_buf *s, const char *prefix, + const unsigned long *bt, unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) + dump_line(s, "%s%pS", prefix, (void *)bt[i]); +} + +static void ops_dump_init(struct seq_buf *s, const char *prefix) +{ + struct scx_dump_data *dd = &scx_dump_data; + + lockdep_assert_irqs_disabled(); + + dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ + dd->first = true; + dd->cursor = 0; + dd->s = s; + dd->prefix = prefix; +} + +static void ops_dump_flush(void) +{ + struct scx_dump_data *dd = &scx_dump_data; + char *line = dd->buf.line; + + if (!dd->cursor) + return; + + /* + * There's something to flush and this is the first line. Insert a blank + * line to distinguish ops dump. + */ + if (dd->first) { + dump_newline(dd->s); + dd->first = false; + } + + /* + * There may be multiple lines in $line. Scan and emit each line + * separately. + */ + while (true) { + char *end = line; + char c; + + while (*end != '\n' && *end != '\0') + end++; + + /* + * If $line overflowed, it may not have newline at the end. + * Always emit with a newline. + */ + c = *end; + *end = '\0'; + dump_line(dd->s, "%s%s", dd->prefix, line); + if (c == '\0') + break; + + /* move to the next line */ + end++; + if (*end == '\0') + break; + line = end; + } + + dd->cursor = 0; +} + +static void ops_dump_exit(void) +{ + ops_dump_flush(); + scx_dump_data.cpu = -1; +} + +static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, + struct task_struct *p, char marker) +{ + static unsigned long bt[SCX_EXIT_BT_LEN]; + struct scx_sched *sch = scx_root; + char dsq_id_buf[19] = "(n/a)"; + unsigned long ops_state = atomic_long_read(&p->scx.ops_state); + unsigned int bt_len = 0; + + if (p->scx.dsq) + scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", + (unsigned long long)p->scx.dsq->id); + + dump_newline(s); + dump_line(s, " %c%c %s[%d] %+ldms", + marker, task_state_to_char(p), p->comm, p->pid, + jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); + dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", + scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, + p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, + ops_state >> SCX_OPSS_QSEQ_SHIFT); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", + p->scx.dsq_vtime, p->scx.slice, p->scx.weight); + dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); + + if (SCX_HAS_OP(sch, dump_task)) { + ops_dump_init(s, " "); + SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p); + ops_dump_exit(); + } + +#ifdef CONFIG_STACKTRACE + bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); +#endif + if (bt_len) { + dump_newline(s); + dump_stack_trace(s, " ", bt, bt_len); + } +} + +static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) +{ + static DEFINE_SPINLOCK(dump_lock); + static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; + struct scx_sched *sch = scx_root; + struct scx_dump_ctx dctx = { + .kind = ei->kind, + .exit_code = ei->exit_code, + .reason = ei->reason, + .at_ns = ktime_get_ns(), + .at_jiffies = jiffies, + }; + struct seq_buf s; + struct scx_event_stats events; + unsigned long flags; + char *buf; + int cpu; + + spin_lock_irqsave(&dump_lock, flags); + + seq_buf_init(&s, ei->dump, dump_len); + + if (ei->kind == SCX_EXIT_NONE) { + dump_line(&s, "Debug dump triggered by %s", ei->reason); + } else { + dump_line(&s, "%s[%d] triggered exit kind %d:", + current->comm, current->pid, ei->kind); + dump_line(&s, " %s (%s)", ei->reason, ei->msg); + dump_newline(&s); + dump_line(&s, "Backtrace:"); + dump_stack_trace(&s, " ", ei->bt, ei->bt_len); + } + + if (SCX_HAS_OP(sch, dump)) { + ops_dump_init(&s, ""); + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx); + ops_dump_exit(); + } + + dump_newline(&s); + dump_line(&s, "CPU states"); + dump_line(&s, "----------"); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p; + struct seq_buf ns; + size_t avail, used; + bool idle; + + rq_lock(rq, &rf); + + idle = list_empty(&rq->scx.runnable_list) && + rq->curr->sched_class == &idle_sched_class; + + if (idle && !SCX_HAS_OP(sch, dump_cpu)) + goto next; + + /* + * We don't yet know whether ops.dump_cpu() will produce output + * and we may want to skip the default CPU dump if it doesn't. + * Use a nested seq_buf to generate the standard dump so that we + * can decide whether to commit later. + */ + avail = seq_buf_get_buf(&s, &buf); + seq_buf_init(&ns, buf, avail); + + dump_newline(&ns); + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", + cpu, rq->scx.nr_running, rq->scx.flags, + rq->scx.cpu_released, rq->scx.ops_qseq, + rq->scx.pnt_seq); + dump_line(&ns, " curr=%s[%d] class=%ps", + rq->curr->comm, rq->curr->pid, + rq->curr->sched_class); + if (!cpumask_empty(rq->scx.cpus_to_kick)) + dump_line(&ns, " cpus_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick)); + if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) + dump_line(&ns, " idle_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); + if (!cpumask_empty(rq->scx.cpus_to_preempt)) + dump_line(&ns, " cpus_to_preempt: %*pb", + cpumask_pr_args(rq->scx.cpus_to_preempt)); + if (!cpumask_empty(rq->scx.cpus_to_wait)) + dump_line(&ns, " cpus_to_wait : %*pb", + cpumask_pr_args(rq->scx.cpus_to_wait)); + + used = seq_buf_used(&ns); + if (SCX_HAS_OP(sch, dump_cpu)) { + ops_dump_init(&ns, " "); + SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL, + &dctx, cpu, idle); + ops_dump_exit(); + } + + /* + * If idle && nothing generated by ops.dump_cpu(), there's + * nothing interesting. Skip. + */ + if (idle && used == seq_buf_used(&ns)) + goto next; + + /* + * $s may already have overflowed when $ns was created. If so, + * calling commit on it will trigger BUG. + */ + if (avail) { + seq_buf_commit(&s, seq_buf_used(&ns)); + if (seq_buf_has_overflowed(&ns)) + seq_buf_set_overflow(&s); + } + + if (rq->curr->sched_class == &ext_sched_class) + scx_dump_task(&s, &dctx, rq->curr, '*'); + + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) + scx_dump_task(&s, &dctx, p, ' '); + next: + rq_unlock(rq, &rf); + } + + dump_newline(&s); + dump_line(&s, "Event counters"); + dump_line(&s, "--------------"); + + scx_read_events(sch, &events); + scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); + scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); + scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); + scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); + scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); + + if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) + memcpy(ei->dump + dump_len - sizeof(trunc_marker), + trunc_marker, sizeof(trunc_marker)); + + spin_unlock_irqrestore(&dump_lock, flags); +} + +static void scx_error_irq_workfn(struct irq_work *irq_work) +{ + struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work); + struct scx_exit_info *ei = sch->exit_info; + + if (ei->kind >= SCX_EXIT_ERROR) + scx_dump_state(ei, sch->ops.exit_dump_len); + + kthread_queue_work(sch->helper, &sch->disable_work); +} + +static void scx_vexit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, + const char *fmt, va_list args) +{ + struct scx_exit_info *ei = sch->exit_info; + int none = SCX_EXIT_NONE; + + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) + return; + + ei->exit_code = exit_code; +#ifdef CONFIG_STACKTRACE + if (kind >= SCX_EXIT_ERROR) + ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); +#endif + vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); + + /* + * Set ei->kind and ->reason for scx_dump_state(). They'll be set again + * in scx_disable_workfn(). + */ + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + + irq_work_queue(&sch->error_irq_work); +} + +static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) +{ + struct scx_sched *sch; + int node, ret; + + sch = kzalloc(sizeof(*sch), GFP_KERNEL); + if (!sch) + return ERR_PTR(-ENOMEM); + + sch->exit_info = alloc_exit_info(ops->exit_dump_len); + if (!sch->exit_info) { + ret = -ENOMEM; + goto err_free_sch; + } + + ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params); + if (ret < 0) + goto err_free_ei; + + sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]), + GFP_KERNEL); + if (!sch->global_dsqs) { + ret = -ENOMEM; + goto err_free_hash; + } + + for_each_node_state(node, N_POSSIBLE) { + struct scx_dispatch_q *dsq; + + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) { + ret = -ENOMEM; + goto err_free_gdsqs; + } + + init_dsq(dsq, SCX_DSQ_GLOBAL); + sch->global_dsqs[node] = dsq; + } + + sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); + if (!sch->event_stats_cpu) + goto err_free_gdsqs; + + sch->helper = kthread_run_worker(0, "sched_ext_helper"); + if (!sch->helper) + goto err_free_event_stats; + sched_set_fifo(sch->helper->task); + + atomic_set(&sch->exit_kind, SCX_EXIT_NONE); + init_irq_work(&sch->error_irq_work, scx_error_irq_workfn); + kthread_init_work(&sch->disable_work, scx_disable_workfn); + sch->ops = *ops; + ops->priv = sch; + + sch->kobj.kset = scx_kset; + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); + if (ret < 0) + goto err_stop_helper; + + return sch; + +err_stop_helper: + kthread_stop(sch->helper->task); +err_free_event_stats: + free_percpu(sch->event_stats_cpu); +err_free_gdsqs: + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); + kfree(sch->global_dsqs); +err_free_hash: + rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); +err_free_ei: + free_exit_info(sch->exit_info); +err_free_sch: + kfree(sch); + return ERR_PTR(ret); +} + +static void check_hotplug_seq(struct scx_sched *sch, + const struct sched_ext_ops *ops) +{ + unsigned long long global_hotplug_seq; + + /* + * If a hotplug event has occurred between when a scheduler was + * initialized, and when we were able to attach, exit and notify user + * space about it. + */ + if (ops->hotplug_seq) { + global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); + if (ops->hotplug_seq != global_hotplug_seq) { + scx_exit(sch, SCX_EXIT_UNREG_KERN, + SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); + } + } +} + +static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) +{ + /* + * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the + * ops.enqueue() callback isn't implemented. + */ + if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { + scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); + return -EINVAL; + } + + /* + * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle + * selection policy to be enabled. + */ + if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && + (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); + return -EINVAL; + } + + if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) + pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); + + return 0; +} + +static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) +{ + struct scx_sched *sch; + struct scx_task_iter sti; + struct task_struct *p; + unsigned long timeout; + int i, cpu, ret; + + if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), + cpu_possible_mask)) { + pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); + return -EINVAL; + } + + mutex_lock(&scx_enable_mutex); + + if (scx_enable_state() != SCX_DISABLED) { + ret = -EBUSY; + goto err_unlock; + } + + sch = scx_alloc_and_add_sched(ops); + if (IS_ERR(sch)) { + ret = PTR_ERR(sch); + goto err_unlock; + } + + /* + * Transition to ENABLING and clear exit info to arm the disable path. + * Failure triggers full disabling from here on. + */ + WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); + WARN_ON_ONCE(scx_root); + + atomic_long_set(&scx_nr_rejected, 0); + + for_each_possible_cpu(cpu) + cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + + /* + * Keep CPUs stable during enable so that the BPF scheduler can track + * online CPUs by watching ->on/offline_cpu() after ->init(). + */ + cpus_read_lock(); + + /* + * Make the scheduler instance visible. Must be inside cpus_read_lock(). + * See handle_hotplug(). + */ + rcu_assign_pointer(scx_root, sch); + + scx_idle_enable(ops); + + if (sch->ops.init) { + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); + if (ret) { + ret = ops_sanitize_err(sch, "init", ret); + cpus_read_unlock(); + scx_error(sch, "ops.init() failed (%d)", ret); + goto err_disable; + } + } + + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) + if (((void (**)(void))ops)[i]) + set_bit(i, sch->has_op); + + check_hotplug_seq(sch, ops); + scx_idle_update_selcpu_topology(ops); + + cpus_read_unlock(); + + ret = validate_ops(sch, ops); + if (ret) + goto err_disable; + + WARN_ON_ONCE(scx_dsp_ctx); + scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; + scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, + scx_dsp_max_batch), + __alignof__(struct scx_dsp_ctx)); + if (!scx_dsp_ctx) { + ret = -ENOMEM; + goto err_disable; + } + + if (ops->timeout_ms) + timeout = msecs_to_jiffies(ops->timeout_ms); + else + timeout = SCX_WATCHDOG_MAX_TIMEOUT; + + WRITE_ONCE(scx_watchdog_timeout, timeout); + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + queue_delayed_work(system_unbound_wq, &scx_watchdog_work, + scx_watchdog_timeout / 2); + + /* + * Once __scx_enabled is set, %current can be switched to SCX anytime. + * This can lead to stalls as some BPF schedulers (e.g. userspace + * scheduling) may not function correctly before all tasks are switched. + * Init in bypass mode to guarantee forward progress. + */ + scx_bypass(true); + + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) + if (((void (**)(void))ops)[i]) + set_bit(i, sch->has_op); + + if (sch->ops.cpu_acquire || sch->ops.cpu_release) + sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; + + /* + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. + */ + percpu_down_write(&scx_fork_rwsem); + + WARN_ON_ONCE(scx_init_task_enabled); + scx_init_task_enabled = true; + + /* + * Enable ops for every task. Fork is excluded by scx_fork_rwsem + * preventing new tasks from being added. No need to exclude tasks + * leaving as sched_ext_free() can handle both prepped and enabled + * tasks. Prep all tasks first and then enable them with preemption + * disabled. + * + * All cgroups should be initialized before scx_init_task() so that the + * BPF scheduler can reliably track each task's cgroup membership from + * scx_init_task(). Lock out cgroup on/offlining and task migrations + * while tasks are being initialized so that scx_cgroup_can_attach() + * never sees uninitialized tasks. + */ + scx_cgroup_lock(); + ret = scx_cgroup_init(sch); + if (ret) + goto err_disable_unlock_all; + + scx_task_iter_start(&sti); + while ((p = scx_task_iter_next_locked(&sti))) { + /* + * @p may already be dead, have lost all its usages counts and + * be waiting for RCU grace period before being freed. @p can't + * be initialized for SCX in such cases and should be ignored. + */ + if (!tryget_task_struct(p)) + continue; + + scx_task_iter_unlock(&sti); + + ret = scx_init_task(p, task_group(p), false); + if (ret) { + put_task_struct(p); + scx_task_iter_relock(&sti); + scx_task_iter_stop(&sti); + scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", + ret, p->comm, p->pid); + goto err_disable_unlock_all; + } + + scx_set_task_state(p, SCX_TASK_READY); + + put_task_struct(p); + scx_task_iter_relock(&sti); + } + scx_task_iter_stop(&sti); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + /* + * All tasks are READY. It's safe to turn on scx_enabled() and switch + * all eligible tasks. + */ + WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); + static_branch_enable(&__scx_enabled); + + /* + * We're fully committed and can't fail. The task READY -> ENABLED + * transitions here are synchronized against sched_ext_free() through + * scx_tasks_lock. + */ + percpu_down_write(&scx_fork_rwsem); + scx_task_iter_start(&sti); + while ((p = scx_task_iter_next_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); + struct sched_enq_and_set_ctx ctx; + + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + + p->scx.slice = SCX_SLICE_DFL; + p->sched_class = new_class; + check_class_changing(task_rq(p), p, old_class); + + sched_enq_and_set_task(&ctx); + + check_class_changed(task_rq(p), p, old_class, p->prio); + } + scx_task_iter_stop(&sti); + percpu_up_write(&scx_fork_rwsem); + + scx_bypass(false); + + if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { + WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); + goto err_disable; + } + + if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) + static_branch_enable(&__scx_switched_all); + + pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", + sch->ops.name, scx_switched_all() ? "" : " (partial)"); + kobject_uevent(&sch->kobj, KOBJ_ADD); + mutex_unlock(&scx_enable_mutex); + + atomic_long_inc(&scx_enable_seq); + + return 0; + +err_unlock: + mutex_unlock(&scx_enable_mutex); + return ret; + +err_disable_unlock_all: + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + scx_bypass(false); +err_disable: + mutex_unlock(&scx_enable_mutex); + /* + * Returning an error code here would not pass all the error information + * to userspace. Record errno using scx_error() for cases scx_error() + * wasn't already invoked and exit indicating success so that the error + * is notified through ops.exit() with all the details. + * + * Flush scx_disable_work to ensure that error is reported before init + * completion. sch's base reference will be put by bpf_scx_unreg(). + */ + scx_error(sch, "scx_enable() failed (%d)", ret); + kthread_flush_work(&sch->disable_work); + return 0; +} + + +/******************************************************************************** + * bpf_struct_ops plumbing. + */ +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> + +static const struct btf_type *task_struct_type; + +static bool bpf_scx_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type != BPF_READ) + return false; + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (off % size != 0) + return false; + + return btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, int off, + int size) +{ + const struct btf_type *t; + + t = btf_type_by_id(reg->btf, reg->btf_id); + if (t == task_struct_type) { + if (off >= offsetof(struct task_struct, scx.slice) && + off + size <= offsetofend(struct task_struct, scx.slice)) + return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.dsq_vtime) && + off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) + return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.disallow) && + off + size <= offsetofend(struct task_struct, scx.disallow)) + return SCALAR_VALUE; + } + + return -EACCES; +} + +static const struct bpf_verifier_ops bpf_scx_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = bpf_scx_is_valid_access, + .btf_struct_access = bpf_scx_btf_struct_access, +}; + +static int bpf_scx_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct sched_ext_ops *uops = udata; + struct sched_ext_ops *ops = kdata; + u32 moff = __btf_member_bit_offset(t, member) / 8; + int ret; + + switch (moff) { + case offsetof(struct sched_ext_ops, dispatch_max_batch): + if (*(u32 *)(udata + moff) > INT_MAX) + return -E2BIG; + ops->dispatch_max_batch = *(u32 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, flags): + if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) + return -EINVAL; + ops->flags = *(u64 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, name): + ret = bpf_obj_name_cpy(ops->name, uops->name, + sizeof(ops->name)); + if (ret < 0) + return ret; + if (ret == 0) + return -EINVAL; + return 1; + case offsetof(struct sched_ext_ops, timeout_ms): + if (msecs_to_jiffies(*(u32 *)(udata + moff)) > + SCX_WATCHDOG_MAX_TIMEOUT) + return -E2BIG; + ops->timeout_ms = *(u32 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, exit_dump_len): + ops->exit_dump_len = + *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; + return 1; + case offsetof(struct sched_ext_ops, hotplug_seq): + ops->hotplug_seq = *(u64 *)(udata + moff); + return 1; + } + + return 0; +} + +static int bpf_scx_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct sched_ext_ops, init_task): +#ifdef CONFIG_EXT_GROUP_SCHED + case offsetof(struct sched_ext_ops, cgroup_init): + case offsetof(struct sched_ext_ops, cgroup_exit): + case offsetof(struct sched_ext_ops, cgroup_prep_move): +#endif + case offsetof(struct sched_ext_ops, cpu_online): + case offsetof(struct sched_ext_ops, cpu_offline): + case offsetof(struct sched_ext_ops, init): + case offsetof(struct sched_ext_ops, exit): + break; + default: + if (prog->sleepable) + return -EINVAL; + } + + return 0; +} + +static int bpf_scx_reg(void *kdata, struct bpf_link *link) +{ + return scx_enable(kdata, link); +} + +static void bpf_scx_unreg(void *kdata, struct bpf_link *link) +{ + struct sched_ext_ops *ops = kdata; + struct scx_sched *sch = ops->priv; + + scx_disable(SCX_EXIT_UNREG); + kthread_flush_work(&sch->disable_work); + kobject_put(&sch->kobj); +} + +static int bpf_scx_init(struct btf *btf) +{ + task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); + + return 0; +} + +static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) +{ + /* + * sched_ext does not support updating the actively-loaded BPF + * scheduler, as registering a BPF scheduler can always fail if the + * scheduler returns an error code for e.g. ops.init(), ops.init_task(), + * etc. Similarly, we can always race with unregistration happening + * elsewhere, such as with sysrq. + */ + return -EOPNOTSUPP; +} + +static int bpf_scx_validate(void *kdata) +{ + return 0; +} + +static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } +static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {} +static void sched_ext_ops__tick(struct task_struct *p) {} +static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__running(struct task_struct *p) {} +static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {} +static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {} +static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; } +static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; } +static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {} +static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {} +static void sched_ext_ops__update_idle(s32 cpu, bool idle) {} +static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {} +static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {} +static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } +static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {} +static void sched_ext_ops__enable(struct task_struct *p) {} +static void sched_ext_ops__disable(struct task_struct *p) {} +#ifdef CONFIG_EXT_GROUP_SCHED +static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } +static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {} +static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } +static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} +#endif +static void sched_ext_ops__cpu_online(s32 cpu) {} +static void sched_ext_ops__cpu_offline(s32 cpu) {} +static s32 sched_ext_ops__init(void) { return -EINVAL; } +static void sched_ext_ops__exit(struct scx_exit_info *info) {} +static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {} +static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} +static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {} + +static struct sched_ext_ops __bpf_ops_sched_ext_ops = { + .select_cpu = sched_ext_ops__select_cpu, + .enqueue = sched_ext_ops__enqueue, + .dequeue = sched_ext_ops__dequeue, + .dispatch = sched_ext_ops__dispatch, + .tick = sched_ext_ops__tick, + .runnable = sched_ext_ops__runnable, + .running = sched_ext_ops__running, + .stopping = sched_ext_ops__stopping, + .quiescent = sched_ext_ops__quiescent, + .yield = sched_ext_ops__yield, + .core_sched_before = sched_ext_ops__core_sched_before, + .set_weight = sched_ext_ops__set_weight, + .set_cpumask = sched_ext_ops__set_cpumask, + .update_idle = sched_ext_ops__update_idle, + .cpu_acquire = sched_ext_ops__cpu_acquire, + .cpu_release = sched_ext_ops__cpu_release, + .init_task = sched_ext_ops__init_task, + .exit_task = sched_ext_ops__exit_task, + .enable = sched_ext_ops__enable, + .disable = sched_ext_ops__disable, +#ifdef CONFIG_EXT_GROUP_SCHED + .cgroup_init = sched_ext_ops__cgroup_init, + .cgroup_exit = sched_ext_ops__cgroup_exit, + .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, + .cgroup_move = sched_ext_ops__cgroup_move, + .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, + .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, +#endif + .cpu_online = sched_ext_ops__cpu_online, + .cpu_offline = sched_ext_ops__cpu_offline, + .init = sched_ext_ops__init, + .exit = sched_ext_ops__exit, + .dump = sched_ext_ops__dump, + .dump_cpu = sched_ext_ops__dump_cpu, + .dump_task = sched_ext_ops__dump_task, +}; + +static struct bpf_struct_ops bpf_sched_ext_ops = { + .verifier_ops = &bpf_scx_verifier_ops, + .reg = bpf_scx_reg, + .unreg = bpf_scx_unreg, + .check_member = bpf_scx_check_member, + .init_member = bpf_scx_init_member, + .init = bpf_scx_init, + .update = bpf_scx_update, + .validate = bpf_scx_validate, + .name = "sched_ext_ops", + .owner = THIS_MODULE, + .cfi_stubs = &__bpf_ops_sched_ext_ops +}; + + +/******************************************************************************** + * System integration and init. + */ + +static void sysrq_handle_sched_ext_reset(u8 key) +{ + scx_disable(SCX_EXIT_SYSRQ); +} + +static const struct sysrq_key_op sysrq_sched_ext_reset_op = { + .handler = sysrq_handle_sched_ext_reset, + .help_msg = "reset-sched-ext(S)", + .action_msg = "Disable sched_ext and revert all tasks to CFS", + .enable_mask = SYSRQ_ENABLE_RTNICE, +}; + +static void sysrq_handle_sched_ext_dump(u8 key) +{ + struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; + + if (scx_enabled()) + scx_dump_state(&ei, 0); +} + +static const struct sysrq_key_op sysrq_sched_ext_dump_op = { + .handler = sysrq_handle_sched_ext_dump, + .help_msg = "dump-sched-ext(D)", + .action_msg = "Trigger sched_ext debug dump", + .enable_mask = SYSRQ_ENABLE_RTNICE, +}; + +static bool can_skip_idle_kick(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + /* + * We can skip idle kicking if @rq is going to go through at least one + * full SCX scheduling cycle before going idle. Just checking whether + * curr is not idle is insufficient because we could be racing + * balance_one() trying to pull the next task from a remote rq, which + * may fail, and @rq may become idle afterwards. + * + * The race window is small and we don't and can't guarantee that @rq is + * only kicked while idle anyway. Skip only when sure. + */ + return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); +} + +static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) +{ + struct rq *rq = cpu_rq(cpu); + struct scx_rq *this_scx = &this_rq->scx; + bool should_wait = false; + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + /* + * During CPU hotplug, a CPU may depend on kicking itself to make + * forward progress. Allow kicking self regardless of online state. + */ + if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { + if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { + if (rq->curr->sched_class == &ext_sched_class) + rq->curr->scx.slice = 0; + cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); + } + + if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { + pseqs[cpu] = rq->scx.pnt_seq; + should_wait = true; + } + + resched_curr(rq); + } else { + cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + } + + raw_spin_rq_unlock_irqrestore(rq, flags); + + return should_wait; +} + +static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + if (!can_skip_idle_kick(rq) && + (cpu_online(cpu) || cpu == cpu_of(this_rq))) + resched_curr(rq); + + raw_spin_rq_unlock_irqrestore(rq, flags); +} + +static void kick_cpus_irq_workfn(struct irq_work *irq_work) +{ + struct rq *this_rq = this_rq(); + struct scx_rq *this_scx = &this_rq->scx; + unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + bool should_wait = false; + s32 cpu; + + for_each_cpu(cpu, this_scx->cpus_to_kick) { + should_wait |= kick_one_cpu(cpu, this_rq, pseqs); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); + } + + for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { + kick_one_cpu_if_idle(cpu, this_rq); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); + } + + if (!should_wait) + return; + + for_each_cpu(cpu, this_scx->cpus_to_wait) { + unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; + + if (cpu != cpu_of(this_rq)) { + /* + * Pairs with smp_store_release() issued by this CPU in + * switch_class() on the resched path. + * + * We busy-wait here to guarantee that no other task can + * be scheduled on our core before the target CPU has + * entered the resched path. + */ + while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) + cpu_relax(); + } + + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + } +} + +/** + * print_scx_info - print out sched_ext scheduler state + * @log_lvl: the log level to use when printing + * @p: target task + * + * If a sched_ext scheduler is enabled, print the name and state of the + * scheduler. If @p is on sched_ext, print further information about the task. + * + * This function can be safely called on any task as long as the task_struct + * itself is accessible. While safe, this function isn't synchronized and may + * print out mixups or garbages of limited length. + */ +void print_scx_info(const char *log_lvl, struct task_struct *p) +{ + struct scx_sched *sch = scx_root; + enum scx_enable_state state = scx_enable_state(); + const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; + char runnable_at_buf[22] = "?"; + struct sched_class *class; + unsigned long runnable_at; + + if (state == SCX_DISABLED) + return; + + /* + * Carefully check if the task was running on sched_ext, and then + * carefully copy the time it's been runnable, and its state. + */ + if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || + class != &ext_sched_class) { + printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name, + scx_enable_state_str[state], all); + return; + } + + if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, + sizeof(runnable_at))) + scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", + jiffies_delta_msecs(runnable_at, jiffies)); + + /* print everything onto one line to conserve console space */ + printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", + log_lvl, sch->ops.name, scx_enable_state_str[state], all, + runnable_at_buf); +} + +static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) +{ + /* + * SCX schedulers often have userspace components which are sometimes + * involved in critial scheduling paths. PM operations involve freezing + * userspace which can lead to scheduling misbehaviors including stalls. + * Let's bypass while PM operations are in progress. + */ + switch (event) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + case PM_RESTORE_PREPARE: + scx_bypass(true); + break; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + case PM_POST_RESTORE: + scx_bypass(false); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block scx_pm_notifier = { + .notifier_call = scx_pm_handler, +}; + +void __init init_sched_ext_class(void) +{ + s32 cpu, v; + + /* + * The following is to prevent the compiler from optimizing out the enum + * definitions so that BPF scheduler implementations can use them + * through the generated vmlinux.h. + */ + WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | + SCX_TG_ONLINE); + + scx_idle_init_masks(); + + scx_kick_cpus_pnt_seqs = + __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, + __alignof__(scx_kick_cpus_pnt_seqs[0])); + BUG_ON(!scx_kick_cpus_pnt_seqs); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + int n = cpu_to_node(cpu); + + init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); + INIT_LIST_HEAD(&rq->scx.runnable_list); + INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); + + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); + init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); + init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + + if (cpu_online(cpu)) + cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; + } + + register_sysrq_key('S', &sysrq_sched_ext_reset_op); + register_sysrq_key('D', &sysrq_sched_ext_dump_op); + INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); +} + + +/******************************************************************************** + * Helpers that can be called from the BPF scheduler. + */ +static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) +{ + if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + return false; + + lockdep_assert_irqs_disabled(); + + if (unlikely(!p)) { + scx_kf_error("called with NULL task"); + return false; + } + + if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { + scx_kf_error("invalid enq_flags 0x%llx", enq_flags); + return false; + } + + return true; +} + +static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct task_struct *ddsp_task; + + ddsp_task = __this_cpu_read(direct_dispatch_task); + if (ddsp_task) { + mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + return; + } + + if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { + scx_kf_error("dispatch buffer overflow"); + return; + } + + dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ + .task = p, + .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, + .dsq_id = dsq_id, + .enq_flags = enq_flags, + }; +} + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into + * @slice: duration @p can run for in nsecs, 0 to keep the current value + * @enq_flags: SCX_ENQ_* + * + * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to + * call this function spuriously. Can be called from ops.enqueue(), + * ops.select_cpu(), and ops.dispatch(). + * + * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch + * and @p must match the task being enqueued. + * + * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p + * will be directly inserted into the corresponding dispatch queue after + * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be + * inserted into the local DSQ of the CPU returned by ops.select_cpu(). + * @enq_flags are OR'd with the enqueue flags on the enqueue path before the + * task is inserted. + * + * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id + * and this function can be called upto ops.dispatch_max_batch times to insert + * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the + * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. + * + * This function doesn't have any locking restrictions and may be called under + * BPF locks (in the future when BPF introduces more flexible locking). + * + * @p is allowed to run for @slice. The scheduling path is triggered on slice + * exhaustion. If zero, the current residual slice is maintained. If + * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with + * scx_bpf_kick_cpu() to trigger scheduling. + */ +__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) +{ + if (!scx_dsq_insert_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + scx_dsq_insert_commit(p, dsq_id, enq_flags); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()"); + scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags); +} + +/** + * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into + * @slice: duration @p can run for in nsecs, 0 to keep the current value + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * + * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. + * Tasks queued into the priority queue are ordered by @vtime. All other aspects + * are identical to scx_bpf_dsq_insert(). + * + * @vtime ordering is according to time_before64() which considers wrapping. A + * numerically larger vtime may indicate an earlier position in the ordering and + * vice-versa. + * + * A DSQ can only be used as a FIFO or priority queue at any given time and this + * function must not be called on a DSQ which already has one or more FIFO tasks + * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and + * SCX_DSQ_GLOBAL) cannot be used as priority queues. + */ +__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + if (!scx_dsq_insert_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()"); + scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_enqueue_dispatch, +}; + +static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, + struct task_struct *p, u64 dsq_id, u64 enq_flags) +{ + struct scx_sched *sch = scx_root; + struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; + struct rq *this_rq, *src_rq, *locked_rq; + bool dispatched = false; + bool in_balance; + unsigned long flags; + + if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) + return false; + + /* + * Can be called from either ops.dispatch() locking this_rq() or any + * context where no rq lock is held. If latter, lock @p's task_rq which + * we'll likely need anyway. + */ + src_rq = task_rq(p); + + local_irq_save(flags); + this_rq = this_rq(); + in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; + + if (in_balance) { + if (this_rq != src_rq) { + raw_spin_rq_unlock(this_rq); + raw_spin_rq_lock(src_rq); + } + } else { + raw_spin_rq_lock(src_rq); + } + + /* + * If the BPF scheduler keeps calling this function repeatedly, it can + * cause similar live-lock conditions as consume_dispatch_q(). Insert a + * breather if necessary. + */ + scx_breather(src_rq); + + locked_rq = src_rq; + raw_spin_lock(&src_dsq->lock); + + /* + * Did someone else get to it? @p could have already left $src_dsq, got + * re-enqueud, or be in the process of being consumed by someone else. + */ + if (unlikely(p->scx.dsq != src_dsq || + u32_before(kit->cursor.priv, p->scx.dsq_seq) || + p->scx.holding_cpu >= 0) || + WARN_ON_ONCE(src_rq != task_rq(p))) { + raw_spin_unlock(&src_dsq->lock); + goto out; + } + + /* @p is still on $src_dsq and stable, determine the destination */ + dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p); + + /* + * Apply vtime and slice updates before moving so that the new time is + * visible before inserting into $dst_dsq. @p is still on $src_dsq but + * this is safe as we're locking it. + */ + if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) + p->scx.dsq_vtime = kit->vtime; + if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) + p->scx.slice = kit->slice; + + /* execute move */ + locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq); + dispatched = true; +out: + if (in_balance) { + if (this_rq != locked_rq) { + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(this_rq); + } + } else { + raw_spin_rq_unlock_irqrestore(locked_rq, flags); + } + + kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME); + return dispatched; +} + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots + * + * Can only be called from ops.dispatch(). + */ +__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) +{ + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return 0; + + return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); +} + +/** + * scx_bpf_dispatch_cancel - Cancel the latest dispatch + * + * Cancel the latest dispatch. Can be called multiple times to cancel further + * dispatches. Can only be called from ops.dispatch(). + */ +__bpf_kfunc void scx_bpf_dispatch_cancel(void) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return; + + if (dspc->cursor > 0) + dspc->cursor--; + else + scx_kf_error("dispatch buffer underflow"); +} + +/** + * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ + * @dsq_id: DSQ to move task from + * + * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's + * local DSQ for execution. Can only be called from ops.dispatch(). + * + * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() + * before trying to move from the specified DSQ. It may also grab rq locks and + * thus can't be called under any BPF locks. + * + * Returns %true if a task has been moved, %false if there isn't any task to + * move. + */ +__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) +{ + struct scx_sched *sch = scx_root; + struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_dispatch_q *dsq; + + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return false; + + flush_dispatch_buf(sch, dspc->rq); + + dsq = find_user_dsq(sch, dsq_id); + if (unlikely(!dsq)) { + scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); + return false; + } + + if (consume_dispatch_q(sch, dspc->rq, dsq)) { + /* + * A successfully consumed task can be dequeued before it starts + * running while the CPU is trying to migrate other dispatched + * tasks. Bump nr_tasks to tell balance_scx() to retry on empty + * local DSQ. + */ + dspc->nr_tasks++; + return true; + } else { + return false; + } +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); + return scx_bpf_dsq_move_to_local(dsq_id); +} + +/** + * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs + * @it__iter: DSQ iterator in progress + * @slice: duration the moved task can run for in nsecs + * + * Override the slice of the next task that will be moved from @it__iter using + * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous + * slice duration is kept. + */ +__bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, + u64 slice) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; + + kit->slice = slice; + kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( + struct bpf_iter_scx_dsq *it__iter, u64 slice) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()"); + scx_bpf_dsq_move_set_slice(it__iter, slice); +} + +/** + * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs + * @it__iter: DSQ iterator in progress + * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ + * + * Override the vtime of the next task that will be moved from @it__iter using + * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice + * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the + * override is ignored and cleared. + */ +__bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, + u64 vtime) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; + + kit->vtime = vtime; + kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( + struct bpf_iter_scx_dsq *it__iter, u64 vtime) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()"); + scx_bpf_dsq_move_set_vtime(it__iter, vtime); +} + +/** + * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ + * @it__iter: DSQ iterator in progress + * @p: task to transfer + * @dsq_id: DSQ to move @p to + * @enq_flags: SCX_ENQ_* + * + * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ + * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can + * be the destination. + * + * For the transfer to be successful, @p must still be on the DSQ and have been + * queued before the DSQ iteration started. This function doesn't care whether + * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have + * been queued before the iteration started. + * + * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update. + * + * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq + * lock (e.g. BPF timers or SYSCALL programs). + * + * Returns %true if @p has been consumed, %false if @p had already been consumed + * or dequeued. + */ +__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); + return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags); +} + +/** + * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ + * @it__iter: DSQ iterator in progress + * @p: task to transfer + * @dsq_id: DSQ to move @p to + * @enq_flags: SCX_ENQ_* + * + * Transfer @p which is on the DSQ currently iterated by @it__iter to the + * priority queue of the DSQ specified by @dsq_id. The destination must be a + * user DSQ as only user DSQs support priority queue. + * + * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() + * and scx_bpf_dsq_move_set_vtime() to update. + * + * All other aspects are identical to scx_bpf_dsq_move(). See + * scx_bpf_dsq_insert_vtime() for more information on @vtime. + */ +__bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()"); + return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) +BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) +BTF_ID_FLAGS(func, scx_bpf_consume) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_dispatch, +}; + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + */ +__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +{ + LIST_HEAD(tasks); + u32 nr_enqueued = 0; + struct rq *rq; + struct task_struct *p, *n; + + if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + + /* + * The BPF scheduler may choose to dispatch tasks back to + * @rq->scx.local_dsq. Move all candidate tasks off to a private list + * first to avoid processing the same tasks repeatedly. + */ + list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, + scx.dsq_list.node) { + /* + * If @p is being migrated, @p's current CPU may not agree with + * its allowed CPUs and the migration_cpu_stop is about to + * deactivate and re-activate @p anyway. Skip re-enqueueing. + * + * While racing sched property changes may also dequeue and + * re-enqueue a migrating task while its current CPU and allowed + * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to + * the current local DSQ for running tasks and thus are not + * visible to the BPF scheduler. + * + * Also skip re-enqueueing tasks that can only run on this + * CPU, as they would just be re-added to the same local + * DSQ without any benefit. + */ + if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) + continue; + + dispatch_dequeue(rq, p); + list_add_tail(&p->scx.dsq_list.node, &tasks); + } + + list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { + list_del_init(&p->scx.dsq_list.node); + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + nr_enqueued++; + } + + return nr_enqueued; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) +BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) + +static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_cpu_release, +}; + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_create_dsq - Create a custom DSQ + * @dsq_id: DSQ to create + * @node: NUMA node to allocate from + * + * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable + * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. + */ +__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) +{ + struct scx_dispatch_q *dsq; + struct scx_sched *sch; + s32 ret; + + if (unlikely(node >= (int)nr_node_ids || + (node < 0 && node != NUMA_NO_NODE))) + return -EINVAL; + + if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) + return -EINVAL; + + dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) + return -ENOMEM; + + init_dsq(dsq, dsq_id); + + rcu_read_lock(); + + sch = rcu_dereference(scx_root); + if (sch) + ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, + dsq_hash_params); + else + ret = -ENODEV; + + rcu_read_unlock(); + if (ret) + kfree(dsq); + return ret; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_unlocked) +BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_unlocked) + +static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_unlocked, +}; + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. + */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct rq *this_rq; + unsigned long irq_flags; + + if (!kf_cpu_valid(cpu, NULL)) + return; + + local_irq_save(irq_flags); + + this_rq = this_rq(); + + /* + * While bypassing for PM ops, IRQ handling may not be online which can + * lead to irq_work_queue() malfunction such as infinite busy wait for + * IRQ status update. Suppress kicking. + */ + if (scx_rq_bypassing(this_rq)) + goto out; + + /* + * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting + * rq locks. We can probably be smarter and avoid bouncing if called + * from ops which don't hold a rq lock. + */ + if (flags & SCX_KICK_IDLE) { + struct rq *target_rq = cpu_rq(cpu); + + if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) + scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + + if (raw_spin_rq_trylock(target_rq)) { + if (can_skip_idle_kick(target_rq)) { + raw_spin_rq_unlock(target_rq); + goto out; + } + raw_spin_rq_unlock(target_rq); + } + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); + } else { + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); + + if (flags & SCX_KICK_PREEMPT) + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); + if (flags & SCX_KICK_WAIT) + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); + } + + irq_work_queue(&this_rq->scx.kick_cpus_irq_work); +out: + local_irq_restore(irq_flags); +} + +/** + * scx_bpf_dsq_nr_queued - Return the number of queued tasks + * @dsq_id: id of the DSQ + * + * Return the number of tasks in the DSQ matching @dsq_id. If not found, + * -%ENOENT is returned. + */ +__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) +{ + struct scx_sched *sch; + struct scx_dispatch_q *dsq; + s32 ret; + + preempt_disable(); + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) { + ret = -ENODEV; + goto out; + } + + if (dsq_id == SCX_DSQ_LOCAL) { + ret = READ_ONCE(this_rq()->scx.local_dsq.nr); + goto out; + } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (ops_cpu_valid(sch, cpu, NULL)) { + ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); + goto out; + } + } else { + dsq = find_user_dsq(sch, dsq_id); + if (dsq) { + ret = READ_ONCE(dsq->nr); + goto out; + } + } + ret = -ENOENT; +out: + preempt_enable(); + return ret; +} + +/** + * scx_bpf_destroy_dsq - Destroy a custom DSQ + * @dsq_id: DSQ to destroy + * + * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with + * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is + * empty and no further tasks are dispatched to it. Ignored if called on a DSQ + * which doesn't exist. Can be called from any online scx_ops operations. + */ +__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) +{ + struct scx_sched *sch; + + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) + destroy_dsq(sch, dsq_id); + rcu_read_unlock(); +} + +/** + * bpf_iter_scx_dsq_new - Create a DSQ iterator + * @it: iterator to initialize + * @dsq_id: DSQ to iterate + * @flags: %SCX_DSQ_ITER_* + * + * Initialize BPF iterator @it which can be used with bpf_for_each() to walk + * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes + * tasks which are already queued when this function is invoked. + */ +__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, + u64 flags) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + struct scx_sched *sch; + + BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > + sizeof(struct bpf_iter_scx_dsq)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != + __alignof__(struct bpf_iter_scx_dsq)); + + /* + * next() and destroy() will be called regardless of the return value. + * Always clear $kit->dsq. + */ + kit->dsq = NULL; + + sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held()); + if (unlikely(!sch)) + return -ENODEV; + + if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) + return -EINVAL; + + kit->dsq = find_user_dsq(sch, dsq_id); + if (!kit->dsq) + return -ENOENT; + + INIT_LIST_HEAD(&kit->cursor.node); + kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; + kit->cursor.priv = READ_ONCE(kit->dsq->seq); + + return 0; +} + +/** + * bpf_iter_scx_dsq_next - Progress a DSQ iterator + * @it: iterator to progress + * + * Return the next task. See bpf_iter_scx_dsq_new(). + */ +__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; + struct task_struct *p; + unsigned long flags; + + if (!kit->dsq) + return NULL; + + raw_spin_lock_irqsave(&kit->dsq->lock, flags); + + if (list_empty(&kit->cursor.node)) + p = NULL; + else + p = container_of(&kit->cursor, struct task_struct, scx.dsq_list); + + /* + * Only tasks which were queued before the iteration started are + * visible. This bounds BPF iterations and guarantees that vtime never + * jumps in the other direction while iterating. + */ + do { + p = nldsq_next_task(kit->dsq, p, rev); + } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq))); + + if (p) { + if (rev) + list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); + else + list_move(&kit->cursor.node, &p->scx.dsq_list.node); + } else { + list_del_init(&kit->cursor.node); + } + + raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); + + return p; +} + +/** + * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator + * @it: iterator to destroy + * + * Undo scx_iter_scx_dsq_new(). + */ +__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + + if (!kit->dsq) + return; + + if (!list_empty(&kit->cursor.node)) { + unsigned long flags; + + raw_spin_lock_irqsave(&kit->dsq->lock, flags); + list_del_init(&kit->cursor.node); + raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); + } + kit->dsq = NULL; +} + +__bpf_kfunc_end_defs(); + +static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, + char *fmt, unsigned long long *data, u32 data__sz) +{ + struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; + s32 ret; + + if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || + (data__sz && !data)) { + scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz); + return -EINVAL; + } + + ret = copy_from_kernel_nofault(data_buf, data, data__sz); + if (ret < 0) { + scx_kf_error("failed to read data fields (%d)", ret); + return ret; + } + + ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, + &bprintf_data); + if (ret < 0) { + scx_kf_error("format preparation failed (%d)", ret); + return ret; + } + + ret = bstr_printf(line_buf, line_size, fmt, + bprintf_data.bin_args); + bpf_bprintf_cleanup(&bprintf_data); + if (ret < 0) { + scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz); + return ret; + } + + return ret; +} + +static s32 bstr_format(struct scx_bstr_buf *buf, + char *fmt, unsigned long long *data, u32 data__sz) +{ + return __bstr_format(buf->data, buf->line, sizeof(buf->line), + fmt, data, data__sz); +} + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. + * @exit_code: Exit value to pass to user space via struct scx_exit_info. + * @fmt: error message format string + * @data: format string parameters packaged using ___bpf_fill() macro + * @data__sz: @data len, must end in '__sz' for the verifier + * + * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops + * disabling. + */ +__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, + unsigned long long *data, u32 data__sz) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); + if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); + raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); +} + +/** + * scx_bpf_error_bstr - Indicate fatal error + * @fmt: error message format string + * @data: format string parameters packaged using ___bpf_fill() macro + * @data__sz: @data len, must end in '__sz' for the verifier + * + * Indicate that the BPF scheduler encountered a fatal error and initiate ops + * disabling. + */ +__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, + u32 data__sz) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); + if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); + raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); +} + +/** + * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler + * @fmt: format string + * @data: format string parameters packaged using ___bpf_fill() macro + * @data__sz: @data len, must end in '__sz' for the verifier + * + * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and + * dump_task() to generate extra debug dump specific to the BPF scheduler. + * + * The extra dump may be multiple lines. A single line may be split over + * multiple calls. The last line is automatically terminated. + */ +__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, + u32 data__sz) +{ + struct scx_dump_data *dd = &scx_dump_data; + struct scx_bstr_buf *buf = &dd->buf; + s32 ret; + + if (raw_smp_processor_id() != dd->cpu) { + scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends"); + return; + } + + /* append the formatted string to the line buf */ + ret = __bstr_format(buf->data, buf->line + dd->cursor, + sizeof(buf->line) - dd->cursor, fmt, data, data__sz); + if (ret < 0) { + dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", + dd->prefix, fmt, data, data__sz, ret); + return; + } + + dd->cursor += ret; + dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); + + if (!dd->cursor) + return; + + /* + * If the line buf overflowed or ends in a newline, flush it into the + * dump. This is to allow the caller to generate a single line over + * multiple calls. As ops_dump_flush() can also handle multiple lines in + * the line buf, the only case which can lead to an unexpected + * truncation is when the caller keeps generating newlines in the middle + * instead of the end consecutively. Don't do that. + */ + if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') + ops_dump_flush(); +} + +/** + * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU + * @cpu: CPU of interest + * + * Return the maximum relative capacity of @cpu in relation to the most + * performant CPU in the system. The return value is in the range [1, + * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). + */ +__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) +{ + if (kf_cpu_valid(cpu, NULL)) + return arch_scale_cpu_capacity(cpu); + else + return SCX_CPUPERF_ONE; +} + +/** + * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU + * @cpu: CPU of interest + * + * Return the current relative performance of @cpu in relation to its maximum. + * The return value is in the range [1, %SCX_CPUPERF_ONE]. + * + * The current performance level of a CPU in relation to the maximum performance + * available in the system can be calculated as follows: + * + * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE + * + * The result is in the range [1, %SCX_CPUPERF_ONE]. + */ +__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) +{ + if (kf_cpu_valid(cpu, NULL)) + return arch_scale_freq_capacity(cpu); + else + return SCX_CPUPERF_ONE; +} + +/** + * scx_bpf_cpuperf_set - Set the relative performance target of a CPU + * @cpu: CPU of interest + * @perf: target performance level [0, %SCX_CPUPERF_ONE] + * + * Set the target performance level of @cpu to @perf. @perf is in linear + * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the + * schedutil cpufreq governor chooses the target frequency. + * + * The actual performance level chosen, CPU grouping, and the overhead and + * latency of the operations are dependent on the hardware and cpufreq driver in + * use. Consult hardware and cpufreq documentation for more information. The + * current performance level can be monitored using scx_bpf_cpuperf_cur(). + */ +__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) +{ + if (unlikely(perf > SCX_CPUPERF_ONE)) { + scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + return; + } + + if (kf_cpu_valid(cpu, NULL)) { + struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); + struct rq_flags rf; + + /* + * When called with an rq lock held, restrict the operation + * to the corresponding CPU to prevent ABBA deadlocks. + */ + if (locked_rq && rq != locked_rq) { + scx_kf_error("Invalid target CPU %d", cpu); + return; + } + + /* + * If no rq lock is held, allow to operate on any CPU by + * acquiring the corresponding rq lock. + */ + if (!locked_rq) { + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + } + + rq->scx.cpuperf_target = perf; + cpufreq_update_util(rq, 0); + + if (!locked_rq) + rq_unlock_irqrestore(rq, &rf); + } +} + +/** + * scx_bpf_nr_node_ids - Return the number of possible node IDs + * + * All valid node IDs in the system are smaller than the returned value. + */ +__bpf_kfunc u32 scx_bpf_nr_node_ids(void) +{ + return nr_node_ids; +} + +/** + * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs + * + * All valid CPU IDs in the system are smaller than the returned value. + */ +__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) +{ + return nr_cpu_ids; +} + +/** + * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) +{ + return cpu_possible_mask; +} + +/** + * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) +{ + return cpu_online_mask; +} + +/** + * scx_bpf_put_cpumask - Release a possible/online cpumask + * @cpumask: cpumask to release + */ +__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) +{ + /* + * Empty function body because we aren't actually acquiring or releasing + * a reference to a global cpumask, which is read-only in the caller and + * is never released. The acquire / release semantics here are just used + * to make the cpumask is a trusted pointer in the caller. + */ +} + +/** + * scx_bpf_task_running - Is task currently running? + * @p: task of interest + */ +__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) +{ + return task_rq(p)->curr == p; +} + +/** + * scx_bpf_task_cpu - CPU a task is currently associated with + * @p: task of interest + */ +__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) +{ + return task_cpu(p); +} + +/** + * scx_bpf_cpu_rq - Fetch the rq of a CPU + * @cpu: CPU of the rq + */ +__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) +{ + if (!kf_cpu_valid(cpu, NULL)) + return NULL; + + return cpu_rq(cpu); +} + +/** + * scx_bpf_task_cgroup - Return the sched cgroup of a task + * @p: task of interest + * + * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with + * from the scheduler's POV. SCX operations should use this function to + * determine @p's current cgroup as, unlike following @p->cgroups, + * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all + * rq-locked operations. Can be called on the parameter tasks of rq-locked + * operations. The restriction guarantees that @p's rq is locked by the caller. + */ +#ifdef CONFIG_CGROUP_SCHED +__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) +{ + struct task_group *tg = p->sched_task_group; + struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + + if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + goto out; + + cgrp = tg_cgrp(tg); + +out: + cgroup_get(cgrp); + return cgrp; +} +#endif + +/** + * scx_bpf_now - Returns a high-performance monotonically non-decreasing + * clock for the current CPU. The clock returned is in nanoseconds. + * + * It provides the following properties: + * + * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently + * to account for execution time and track tasks' runtime properties. + * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which + * eventually reads a hardware timestamp counter -- is neither performant nor + * scalable. scx_bpf_now() aims to provide a high-performance clock by + * using the rq clock in the scheduler core whenever possible. + * + * 2) High enough resolution for the BPF scheduler use cases: In most BPF + * scheduler use cases, the required clock resolution is lower than the most + * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically + * uses the rq clock in the scheduler core whenever it is valid. It considers + * that the rq clock is valid from the time the rq clock is updated + * (update_rq_clock) until the rq is unlocked (rq_unpin_lock). + * + * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now() + * guarantees the clock never goes backward when comparing them in the same + * CPU. On the other hand, when comparing clocks in different CPUs, there + * is no such guarantee -- the clock can go backward. It provides a + * monotonically *non-decreasing* clock so that it would provide the same + * clock values in two different scx_bpf_now() calls in the same CPU + * during the same period of when the rq clock is valid. + */ +__bpf_kfunc u64 scx_bpf_now(void) +{ + struct rq *rq; + u64 clock; + + preempt_disable(); + + rq = this_rq(); + if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) { + /* + * If the rq clock is valid, use the cached rq clock. + * + * Note that scx_bpf_now() is re-entrant between a process + * context and an interrupt context (e.g., timer interrupt). + * However, we don't need to consider the race between them + * because such race is not observable from a caller. + */ + clock = READ_ONCE(rq->scx.clock); + } else { + /* + * Otherwise, return a fresh rq clock. + * + * The rq clock is updated outside of the rq lock. + * In this case, keep the updated rq clock invalid so the next + * kfunc call outside the rq lock gets a fresh rq clock. + */ + clock = sched_clock_cpu(cpu_of(rq)); + } + + preempt_enable(); + + return clock; +} + +static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events) +{ + struct scx_event_stats *e_cpu; + int cpu; + + /* Aggregate per-CPU event counters into @events. */ + memset(events, 0, sizeof(*events)); + for_each_possible_cpu(cpu) { + e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); + scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); + scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE); + } +} + +/* + * scx_bpf_events - Get a system-wide event counter to + * @events: output buffer from a BPF program + * @events__sz: @events len, must end in '__sz'' for the verifier + */ +__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, + size_t events__sz) +{ + struct scx_sched *sch; + struct scx_event_stats e_sys; + + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) + scx_read_events(sch, &e_sys); + else + memset(&e_sys, 0, sizeof(e_sys)); + rcu_read_unlock(); + + /* + * We cannot entirely trust a BPF-provided size since a BPF program + * might be compiled against a different vmlinux.h, of which + * scx_event_stats would be larger (a newer vmlinux.h) or smaller + * (an older vmlinux.h). Hence, we use the smaller size to avoid + * memory corruption. + */ + events__sz = min(events__sz, sizeof(*events)); + memcpy(events, &e_sys, events__sz); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_kick_cpu) +BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) +BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) +BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) +BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_cpu_rq) +#ifdef CONFIG_CGROUP_SCHED +BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) +#endif +BTF_ID_FLAGS(func, scx_bpf_now) +BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(scx_kfunc_ids_any) + +static const struct btf_kfunc_id_set scx_kfunc_set_any = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_any, +}; + +static int __init scx_init(void) +{ + int ret; + + /* + * kfunc registration can't be done from init_sched_ext_class() as + * register_btf_kfunc_id_set() needs most of the system to be up. + * + * Some kfuncs are context-sensitive and can only be called from + * specific SCX ops. They are grouped into BTF sets accordingly. + * Unfortunately, BPF currently doesn't have a way of enforcing such + * restrictions. Eventually, the verifier should be able to enforce + * them. For now, register them the same and make each kfunc explicitly + * check using scx_kf_allowed(). + */ + if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_enqueue_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_cpu_release)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_unlocked)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, + &scx_kfunc_set_unlocked)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, + &scx_kfunc_set_any))) { + pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); + return ret; + } + + ret = scx_idle_init(); + if (ret) { + pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); + return ret; + } + + ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); + if (ret) { + pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); + return ret; + } + + ret = register_pm_notifier(&scx_pm_notifier); + if (ret) { + pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); + return ret; + } + + scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); + if (!scx_kset) { + pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); + return -ENOMEM; + } + + ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); + if (ret < 0) { + pr_err("sched_ext: Failed to add global attributes\n"); + return ret; + } + + return 0; +} +__initcall(scx_init); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h new file mode 100644 index 000000000000..6e5072f57771 --- /dev/null +++ b/kernel/sched/ext.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifdef CONFIG_SCHED_CLASS_EXT + +static inline bool scx_kf_allowed_if_unlocked(void) +{ + return !current->scx.kf_mask; +} + +DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); + +void scx_tick(struct rq *rq); +void init_scx_entity(struct sched_ext_entity *scx); +void scx_pre_fork(struct task_struct *p); +int scx_fork(struct task_struct *p); +void scx_post_fork(struct task_struct *p); +void scx_cancel_fork(struct task_struct *p); +bool scx_can_stop_tick(struct rq *rq); +void scx_rq_activate(struct rq *rq); +void scx_rq_deactivate(struct rq *rq); +int scx_check_setscheduler(struct task_struct *p, int policy); +bool task_should_scx(int policy); +bool scx_allow_ttwu_queue(const struct task_struct *p); +void init_sched_ext_class(void); + +static inline u32 scx_cpuperf_target(s32 cpu) +{ + if (scx_enabled()) + return cpu_rq(cpu)->scx.cpuperf_target; + else + return 0; +} + +static inline bool task_on_scx(const struct task_struct *p) +{ + return scx_enabled() && p->sched_class == &ext_sched_class; +} + +#ifdef CONFIG_SCHED_CORE +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi); +#endif + +#else /* CONFIG_SCHED_CLASS_EXT */ + +static inline void scx_tick(struct rq *rq) {} +static inline void scx_pre_fork(struct task_struct *p) {} +static inline int scx_fork(struct task_struct *p) { return 0; } +static inline void scx_post_fork(struct task_struct *p) {} +static inline void scx_cancel_fork(struct task_struct *p) {} +static inline u32 scx_cpuperf_target(s32 cpu) { return 0; } +static inline bool scx_can_stop_tick(struct rq *rq) { return true; } +static inline void scx_rq_activate(struct rq *rq) {} +static inline void scx_rq_deactivate(struct rq *rq) {} +static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } +static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; } +static inline void init_sched_ext_class(void) {} + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify); + +static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + if (scx_enabled()) + __scx_update_idle(rq, idle, do_notify); +} +#else +static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} +#endif + +#ifdef CONFIG_CGROUP_SCHED +#ifdef CONFIG_EXT_GROUP_SCHED +int scx_tg_online(struct task_group *tg); +void scx_tg_offline(struct task_group *tg); +int scx_cgroup_can_attach(struct cgroup_taskset *tset); +void scx_cgroup_move_task(struct task_struct *p); +void scx_cgroup_finish_attach(void); +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); +void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); +void scx_group_set_idle(struct task_group *tg, bool idle); +#else /* CONFIG_EXT_GROUP_SCHED */ +static inline int scx_tg_online(struct task_group *tg) { return 0; } +static inline void scx_tg_offline(struct task_group *tg) {} +static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } +static inline void scx_cgroup_move_task(struct task_struct *p) {} +static inline void scx_cgroup_finish_attach(void) {} +static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} +static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} +static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} +#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c new file mode 100644 index 000000000000..66da03cc0b33 --- /dev/null +++ b/kernel/sched/ext_idle.c @@ -0,0 +1,1314 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Built-in idle CPU tracking policy. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#include "ext_idle.h" + +/* Enable/disable built-in idle CPU selection policy */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +/* Enable/disable per-node idle cpumasks */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP +/* Enable/disable LLC aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); + +/* Enable/disable NUMA aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); + +/* + * cpumasks to track idle CPUs within each NUMA node. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask + * from is used to track all the idle CPUs in the system. + */ +struct scx_idle_cpus { + cpumask_var_t cpu; + cpumask_var_t smt; +}; + +/* + * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE + * is not enabled). + */ +static struct scx_idle_cpus scx_idle_global_masks; + +/* + * Per-node idle cpumasks. + */ +static struct scx_idle_cpus **scx_idle_node_masks; + +/* + * Local per-CPU cpumasks (used to generate temporary idle cpumasks). + */ +static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask); +static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask); +static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask); + +/* + * Return the idle masks associated to a target @node. + * + * NUMA_NO_NODE identifies the global idle cpumask. + */ +static struct scx_idle_cpus *idle_cpumask(int node) +{ + return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node]; +} + +/* + * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if + * per-node idle cpumasks are disabled. + */ +static int scx_cpu_node_if_enabled(int cpu) +{ + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +} + +bool scx_idle_test_and_clear_cpu(int cpu) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + +#ifdef CONFIG_SCHED_SMT + /* + * SMT mask should be cleared whether we can claim @cpu or not. The SMT + * cluster is not wholly idle either way. This also prevents + * scx_pick_idle_cpu() from getting caught in an infinite loop. + */ + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + /* + * If offline, @cpu is not its own sibling and + * scx_pick_idle_cpu() can get caught in an infinite loop as + * @cpu is never cleared from the idle SMT mask. Ensure that + * @cpu is eventually cleared. + * + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to + * reduce memory writes, which may help alleviate cache + * coherence pressure. + */ + if (cpumask_intersects(smt, idle_smts)) + cpumask_andnot(idle_smts, idle_smts, smt); + else if (cpumask_test_cpu(cpu, idle_smts)) + __cpumask_clear_cpu(cpu, idle_smts); + } +#endif + + return cpumask_test_and_clear_cpu(cpu, idle_cpus); +} + +/* + * Pick an idle CPU in a specific NUMA node. + */ +static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + int cpu; + +retry: + if (sched_smt_active()) { + cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed); + if (cpu < nr_cpu_ids) + goto found; + + if (flags & SCX_PICK_IDLE_CORE) + return -EBUSY; + } + + cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (scx_idle_test_and_clear_cpu(cpu)) + return cpu; + else + goto retry; +} + +/* + * Tracks nodes that have not yet been visited when searching for an idle + * CPU across all available nodes. + */ +static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited); + +/* + * Search for an idle CPU across all nodes, excluding @node. + */ +static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + nodemask_t *unvisited; + s32 cpu = -EBUSY; + + preempt_disable(); + unvisited = this_cpu_ptr(&per_cpu_unvisited); + + /* + * Restrict the search to the online nodes (excluding the current + * node that has been visited already). + */ + nodes_copy(*unvisited, node_states[N_ONLINE]); + node_clear(node, *unvisited); + + /* + * Traverse all nodes in order of increasing distance, starting + * from @node. + * + * This loop is O(N^2), with N being the amount of NUMA nodes, + * which might be quite expensive in large NUMA systems. However, + * this complexity comes into play only when a scheduler enables + * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU + * without specifying a target NUMA node, so it shouldn't be a + * bottleneck is most cases. + * + * As a future optimization we may want to cache the list of nodes + * in a per-node array, instead of actually traversing them every + * time. + */ + for_each_node_numadist(node, *unvisited) { + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + break; + } + preempt_enable(); + + return cpu; +} + +/* + * Find an idle CPU in the system, starting from @node. + */ +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + s32 cpu; + + /* + * Always search in the starting node first (this is an + * optimization that can save some cycles even when the search is + * not limited to a single node). + */ + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + /* + * Stop the search if we are using only a single global cpumask + * (NUMA_NO_NODE) or if the search is restricted to the first node + * only. + */ + if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE) + return -EBUSY; + + /* + * Extend the search to the other online nodes. + */ + return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags); +} + +/* + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC + * domain is not defined). + */ +static unsigned int llc_weight(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sd->span_weight; +} + +/* + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC + * domain is not defined). + */ +static struct cpumask *llc_span(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sched_domain_span(sd); +} + +/* + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the + * NUMA domain is not defined). + */ +static unsigned int numa_weight(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return 0; + sg = sd->groups; + if (!sg) + return 0; + + return sg->group_weight; +} + +/* + * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA + * domain is not defined). + */ +static struct cpumask *numa_span(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return NULL; + sg = sd->groups; + if (!sg) + return NULL; + + return sched_group_span(sg); +} + +/* + * Return true if the LLC domains do not perfectly overlap with the NUMA + * domains, false otherwise. + */ +static bool llc_numa_mismatch(void) +{ + int cpu; + + /* + * We need to scan all online CPUs to verify whether their scheduling + * domains overlap. + * + * While it is rare to encounter architectures with asymmetric NUMA + * topologies, CPU hotplugging or virtualized environments can result + * in asymmetric configurations. + * + * For example: + * + * NUMA 0: + * - LLC 0: cpu0..cpu7 + * - LLC 1: cpu8..cpu15 [offline] + * + * NUMA 1: + * - LLC 0: cpu16..cpu23 + * - LLC 1: cpu24..cpu31 + * + * In this case, if we only check the first online CPU (cpu0), we might + * incorrectly assume that the LLC and NUMA domains are fully + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC + * domains). + */ + for_each_online_cpu(cpu) + if (llc_weight(cpu) != numa_weight(cpu)) + return true; + + return false; +} + +/* + * Initialize topology-aware scheduling. + * + * Detect if the system has multiple LLC or multiple NUMA domains and enable + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle + * selection policy. + * + * Assumption: the kernel's internal topology representation assumes that each + * CPU belongs to a single LLC domain, and that each LLC domain is entirely + * contained within a single NUMA node. + */ +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) +{ + bool enable_llc = false, enable_numa = false; + unsigned int nr_cpus; + s32 cpu = cpumask_first(cpu_online_mask); + + /* + * Enable LLC domain optimization only when there are multiple LLC + * domains among the online CPUs. If all online CPUs are part of a + * single LLC domain, the idle CPU selection logic can choose any + * online CPU without bias. + * + * Note that it is sufficient to check the LLC domain of the first + * online CPU to determine whether a single LLC domain includes all + * CPUs. + */ + rcu_read_lock(); + nr_cpus = llc_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus()) + enable_llc = true; + pr_debug("sched_ext: LLC=%*pb weight=%u\n", + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); + } + + /* + * Enable NUMA optimization only when there are multiple NUMA domains + * among the online CPUs and the NUMA domains don't perfectly overlaps + * with the LLC domains. + * + * If all CPUs belong to the same NUMA node and the same LLC domain, + * enabling both NUMA and LLC optimizations is unnecessary, as checking + * for an idle CPU in the same domain twice is redundant. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled ignore the NUMA + * optimization, as we would naturally select idle CPUs within + * specific NUMA nodes querying the corresponding per-node cpumask. + */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + nr_cpus = numa_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) + enable_numa = true; + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", + cpumask_pr_args(numa_span(cpu)), nr_cpus); + } + } + rcu_read_unlock(); + + pr_debug("sched_ext: LLC idle selection %s\n", + str_enabled_disabled(enable_llc)); + pr_debug("sched_ext: NUMA idle selection %s\n", + str_enabled_disabled(enable_numa)); + + if (enable_llc) + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); + if (enable_numa) + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); +} + +/* + * Return true if @p can run on all possible CPUs, false otherwise. + */ +static inline bool task_affinity_all(const struct task_struct *p) +{ + return p->nr_cpus_allowed >= num_possible_cpus(); +} + +/* + * Built-in CPU idle selection policy: + * + * 1. Prioritize full-idle cores: + * - always prioritize CPUs from fully idle cores (both logical CPUs are + * idle) to avoid interference caused by SMT. + * + * 2. Reuse the same CPU: + * - prefer the last used CPU to take advantage of cached data (L1, L2) and + * branch prediction optimizations. + * + * 3. Pick a CPU within the same LLC (Last-Level Cache): + * - if the above conditions aren't met, pick a CPU that shares the same + * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain + * cache locality. + * + * 4. Pick a CPU within the same NUMA node, if enabled: + * - choose a CPU from the same NUMA node, if the node cpumask is a + * subset of @cpus_allowed, to reduce memory access latency. + * + * 5. Pick any idle CPU within the @cpus_allowed domain. + * + * Step 3 and 4 are performed only if the system has, respectively, + * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and + * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs. + * + * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always + * begin in @prev_cpu's node and proceed to other nodes in order of + * increasing distance. + * + * Return the picked CPU if idle, or a negative value otherwise. + * + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because + * we never call ops.select_cpu() for them, see select_task_rq(). + */ +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) +{ + const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL; + const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr; + int node = scx_cpu_node_if_enabled(prev_cpu); + s32 cpu; + + preempt_disable(); + + /* + * Determine the subset of CPUs usable by @p within @cpus_allowed. + */ + if (allowed != p->cpus_ptr) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask); + + if (task_affinity_all(p)) { + allowed = cpus_allowed; + } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) { + allowed = local_cpus; + } else { + cpu = -EBUSY; + goto out_enable; + } + + /* + * If @prev_cpu is not in the allowed CPUs, skip topology + * optimizations and try to pick any idle CPU usable by the + * task. + * + * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, prioritize + * the current node, as it may optimize some waker->wakee + * workloads. + */ + if (!cpumask_test_cpu(prev_cpu, allowed)) { + node = scx_cpu_node_if_enabled(smp_processor_id()); + cpu = scx_pick_idle_cpu(allowed, node, flags); + goto out_enable; + } + } + + /* + * This is necessary to protect llc_cpus. + */ + rcu_read_lock(); + + /* + * Determine the subset of CPUs that the task can use in its + * current LLC and node. + * + * If the task can run on all CPUs, use the node and LLC cpumasks + * directly. + */ + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask); + const struct cpumask *cpus = numa_span(prev_cpu); + + if (allowed == p->cpus_ptr && task_affinity_all(p)) + numa_cpus = cpus; + else if (cpus && cpumask_and(local_cpus, allowed, cpus)) + numa_cpus = local_cpus; + } + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask); + const struct cpumask *cpus = llc_span(prev_cpu); + + if (allowed == p->cpus_ptr && task_affinity_all(p)) + llc_cpus = cpus; + else if (cpus && cpumask_and(local_cpus, allowed, cpus)) + llc_cpus = local_cpus; + } + + /* + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. + */ + if (wake_flags & SCX_WAKE_SYNC) { + int waker_node; + + /* + * If the waker's CPU is cache affine and prev_cpu is idle, + * then avoid a migration. + */ + cpu = smp_processor_id(); + if (cpus_share_cache(cpu, prev_cpu) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * If the waker's local DSQ is empty, and the system is under + * utilized, try to wake up @p to the local DSQ of the waker. + * + * Checking only for an empty local DSQ is insufficient as it + * could give the wakee an unfair advantage when the system is + * oversaturated. + * + * Checking only for the presence of idle CPUs is also + * insufficient as the local DSQ of the waker could have tasks + * piled up on it even if there is an idle core elsewhere on + * the system. + */ + waker_node = cpu_to_node(cpu); + if (!(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0 && + (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && + !cpumask_empty(idle_cpumask(waker_node)->cpu)) { + if (cpumask_test_cpu(cpu, allowed)) + goto out_unlock; + } + } + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (sched_smt_active()) { + /* + * Keep using @prev_cpu if it's part of a fully idle core. + */ + if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any fully idle core in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any fully idle core in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any full-idle core usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always + * begin in prev_cpu's node and proceed to other nodes in + * order of increasing distance. + */ + cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + + /* + * Give up if we're strictly looking for a full-idle SMT + * core. + */ + if (flags & SCX_PICK_IDLE_CORE) { + cpu = -EBUSY; + goto out_unlock; + } + } + + /* + * Use @prev_cpu if it's idle. + */ + if (scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any idle CPU in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin + * in prev_cpu's node and proceed to other nodes in order of + * increasing distance. + */ + cpu = scx_pick_idle_cpu(allowed, node, flags); + +out_unlock: + rcu_read_unlock(); +out_enable: + preempt_enable(); + + return cpu; +} + +/* + * Initialize global and per-node idle cpumasks. + */ +void scx_idle_init_masks(void) +{ + int i; + + /* Allocate global idle cpumasks */ + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); + + /* Allocate per-node idle cpumasks */ + scx_idle_node_masks = kcalloc(num_possible_nodes(), + sizeof(*scx_idle_node_masks), GFP_KERNEL); + BUG_ON(!scx_idle_node_masks); + + for_each_node(i) { + scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks), + GFP_KERNEL, i); + BUG_ON(!scx_idle_node_masks[i]); + + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i)); + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i)); + } + + /* Allocate local per-cpu idle cpumasks */ + for_each_possible_cpu(i) { + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); + } +} + +static void update_builtin_idle(int cpu, bool idle) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + + assign_cpu(cpu, idle_cpus, idle); + +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + if (idle) { + /* + * idle_smt handling is racy but that's fine as it's + * only for optimization and self-correcting. + */ + if (!cpumask_subset(smt, idle_cpus)) + return; + cpumask_or(idle_smts, idle_smts, smt); + } else { + cpumask_andnot(idle_smts, idle_smts, smt); + } + } +#endif +} + +/* + * Update the idle state of a CPU to @idle. + * + * If @do_notify is true, ops.update_idle() is invoked to notify the scx + * scheduler of an actual idle state transition (idle to busy or vice + * versa). If @do_notify is false, only the idle state in the idle masks is + * refreshed without invoking ops.update_idle(). + * + * This distinction is necessary, because an idle CPU can be "reserved" and + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as + * busy even if no tasks are dispatched. In this case, the CPU may return + * to idle without a true state transition. Refreshing the idle masks + * without invoking ops.update_idle() ensures accurate idle state tracking + * while avoiding unnecessary updates and maintaining balanced state + * transitions. + */ +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + struct scx_sched *sch = scx_root; + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + /* + * Update the idle masks: + * - for real idle transitions (do_notify == true) + * - for idle-to-idle transitions (indicated by the previous task + * being the idle thread, managed by pick_task_idle()) + * + * Skip updating idle masks if the previous task is not the idle + * thread, since set_next_task_idle() has already handled it when + * transitioning from a task to the idle thread (calling this + * function with do_notify == true). + * + * In this way we can avoid updating the idle masks twice, + * unnecessarily. + */ + if (static_branch_likely(&scx_builtin_idle_enabled)) + if (do_notify || is_idle_task(rq->curr)) + update_builtin_idle(cpu, idle); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + * + * This must come after builtin idle update so that BPF schedulers can + * create interlocking between ops.update_idle() and ops.enqueue() - + * either enqueue() sees the idle bit or update_idle() sees the task + * that enqueue() queued. + */ + if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); +} + +static void reset_idle_masks(struct sched_ext_ops *ops) +{ + int node; + + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. + */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask); + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask); + return; + } + + for_each_node(node) { + const struct cpumask *node_mask = cpumask_of_node(node); + + cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask); + cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); + } +} +#endif /* CONFIG_SMP */ + +void scx_idle_enable(struct sched_ext_ops *ops) +{ + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) + static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); + else + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + + if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) + static_branch_enable_cpuslocked(&scx_builtin_idle_per_node); + else + static_branch_disable_cpuslocked(&scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP + reset_idle_masks(ops); +#endif +} + +void scx_idle_disable(void) +{ + static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_per_node); +} + +/******************************************************************************** + * Helpers that can be called from the BPF scheduler. + */ + +static int validate_node(int node) +{ + if (!static_branch_likely(&scx_builtin_idle_per_node)) { + scx_kf_error("per-node idle tracking is disabled"); + return -EOPNOTSUPP; + } + + /* Return no entry for NUMA_NO_NODE (not a critical scx error) */ + if (node == NUMA_NO_NODE) + return -ENOENT; + + /* Make sure node is in a valid range */ + if (node < 0 || node >= nr_node_ids) { + scx_kf_error("invalid node %d", node); + return -EINVAL; + } + + /* Make sure the node is part of the set of possible nodes */ + if (!node_possible(node)) { + scx_kf_error("unavailable node %d", node); + return -EINVAL; + } + + return node; +} + +__bpf_kfunc_start_defs(); + +static bool check_builtin_idle_enabled(void) +{ + if (static_branch_likely(&scx_builtin_idle_enabled)) + return true; + + scx_kf_error("built-in idle tracking is disabled"); + return false; +} + +s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *allowed, u64 flags) +{ + struct rq *rq; + struct rq_flags rf; + s32 cpu; + + if (!kf_cpu_valid(prev_cpu, NULL)) + return -EINVAL; + + if (!check_builtin_idle_enabled()) + return -EBUSY; + + /* + * If called from an unlocked context, acquire the task's rq lock, + * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed. + * + * Otherwise, allow to use this kfunc only from ops.select_cpu() + * and ops.select_enqueue(). + */ + if (scx_kf_allowed_if_unlocked()) { + rq = task_rq_lock(p, &rf); + } else { + if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) + return -EPERM; + rq = scx_locked_rq(); + } + + /* + * Validate locking correctness to access p->cpus_ptr and + * p->nr_cpus_allowed: if we're holding an rq lock, we're safe; + * otherwise, assert that p->pi_lock is held. + */ + if (!rq) + lockdep_assert_held(&p->pi_lock); + +#ifdef CONFIG_SMP + /* + * This may also be called from ops.enqueue(), so we need to handle + * per-CPU tasks as well. For these tasks, we can skip all idle CPU + * selection optimizations and simply check whether the previously + * used CPU is idle and within the allowed cpumask. + */ + if (p->nr_cpus_allowed == 1) { + if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && + scx_idle_test_and_clear_cpu(prev_cpu)) + cpu = prev_cpu; + else + cpu = -EBUSY; + } else { + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, + allowed ?: p->cpus_ptr, flags); + } +#else + cpu = -EBUSY; +#endif + if (scx_kf_allowed_if_unlocked()) + task_rq_unlock(rq, p, &rf); + + return cpu; +} + +/** + * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or + * trigger an error if @cpu is invalid + * @cpu: target CPU + */ +__bpf_kfunc int scx_bpf_cpu_node(s32 cpu) +{ +#ifdef CONFIG_NUMA + if (!kf_cpu_valid(cpu, NULL)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +#else + return 0; +#endif +} + +/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @is_idle: out parameter indicating whether the returned CPU is idle + * + * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked + * context such as a BPF test_run() call, as long as built-in CPU selection + * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE + * is set. + * + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is + * currently idle and thus a good candidate for direct dispatching. + */ +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *is_idle) +{ + s32 cpu; + + cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0); + if (cpu >= 0) { + *is_idle = true; + return cpu; + } + *is_idle = false; + + return prev_cpu; +} + +/** + * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p, + * prioritizing those in @cpus_allowed + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @cpus_allowed: cpumask of allowed CPUs + * @flags: %SCX_PICK_IDLE* flags + * + * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked + * context such as a BPF test_run() call, as long as built-in CPU selection + * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE + * is set. + * + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * + * Returns the selected idle CPU, which will be automatically awakened upon + * returning from ops.select_cpu() and can be used for direct dispatch, or + * a negative value if no idle CPU is available. + */ +__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) +{ + return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags); +} + +/** + * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the + * idle-tracking per-CPU cpumask of a target NUMA node. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking + * per-CPU cpumask. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the + * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be + * used to determine if an entire physical core is free. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(node)->smt; + else + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, + * per-physical-core cpumask. Can be used to determine if an entire physical + * core is free. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(NUMA_NO_NODE)->smt; + else + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to + * either the percpu, or SMT idle-tracking cpumask. + * @idle_mask: &cpumask to use + */ +__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) +{ + /* + * Empty function body because we aren't actually acquiring or releasing + * a reference to a global idle cpumask, which is read-only in the + * caller and is never released. The acquire / release semantics here + * are just used to make the cpumask a trusted pointer in the caller. + */ +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!check_builtin_idle_enabled()) + return false; + + if (kf_cpu_valid(cpu, NULL)) + return scx_idle_test_and_clear_cpu(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_* flags + * + * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node. + * + * Returns the picked idle cpu number on success, or -%EBUSY if no matching + * cpu was found. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node). + * + * Always returns an error if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if + * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + node = validate_node(node); + if (node < 0) + return node; + + return scx_pick_idle_cpu(cpus_allowed, node, flags); +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. + * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_idle_cpu_node() instead. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_kf_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (!check_builtin_idle_enabled()) + return -EBUSY; + + return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); +} + +/** + * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available + * or pick any CPU from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node, regardless of + * the CPU idle state). + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + s32 cpu; + + node = validate_node(node); + if (node < 0) + return node; + + cpu = scx_pick_idle_cpu(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + if (flags & SCX_PICK_IDLE_IN_NODE) + cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed); + else + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_any_cpu_node() instead. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + s32 cpu; + + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_kf_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_idle) +BTF_ID_FLAGS(func, scx_bpf_cpu_node) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_idle) + +static const struct btf_kfunc_id_set scx_kfunc_set_idle = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_idle, +}; + +int scx_idle_init(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); + + return ret; +} diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h new file mode 100644 index 000000000000..37be78a7502b --- /dev/null +++ b/kernel/sched/ext_idle.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#ifndef _KERNEL_SCHED_EXT_IDLE_H +#define _KERNEL_SCHED_EXT_IDLE_H + +struct sched_ext_ops; + +#ifdef CONFIG_SMP +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); +void scx_idle_init_masks(void); +bool scx_idle_test_and_clear_cpu(int cpu); +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags); +#else /* !CONFIG_SMP */ +static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} +static inline void scx_idle_init_masks(void) {} +static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } +static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + return -EBUSY; +} +#endif /* CONFIG_SMP */ + +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags); +void scx_idle_enable(struct sched_ext_ops *ops); +void scx_idle_disable(void); +int scx_idle_init(void); + +#endif /* _KERNEL_SCHED_EXT_IDLE_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c62805dbd608..b4326827e326 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -37,6 +37,7 @@ #include <linux/sched/cputime.h> #include <linux/sched/isolation.h> #include <linux/sched/nohz.h> +#include <linux/sched/prio.h> #include <linux/cpuidle.h> #include <linux/interrupt.h> @@ -51,6 +52,8 @@ #include <asm/switch_to.h> +#include <uapi/linux/sched/types.h> + #include "sched.h" #include "stats.h" #include "autogroup.h" @@ -61,7 +64,7 @@ * Options are: * * SCHED_TUNABLESCALING_NONE - unscaled, always *1 - * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -71,22 +74,16 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_base_slice = 750000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +unsigned int sysctl_sched_base_slice = 700000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; -const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; -int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { - int _shift = 0; - - if (kstrtoint(str, 0, &_shift)) - pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); - - sched_thermal_decay_shift = clamp(_shift, 0, 10); + pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n"); return 1; } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); @@ -136,7 +133,7 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #endif #ifdef CONFIG_SYSCTL -static struct ctl_table sched_fair_sysctls[] = { +static const struct ctl_table sched_fair_sysctls[] = { #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", @@ -157,7 +154,6 @@ static struct ctl_table sched_fair_sysctls[] = { .extra1 = SYSCTL_ZERO, }, #endif /* CONFIG_NUMA_BALANCING */ - {} }; static int __init sched_fair_sysctl_init(void) @@ -388,8 +384,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) /* * With cfs_rq being unthrottled/throttled during an enqueue, - * it can happen the tmp_alone_branch points the a leaf that - * we finally want to del. In this case, tmp_alone_branch moves + * it can happen the tmp_alone_branch points to the leaf that + * we finally want to delete. In this case, tmp_alone_branch moves * to the prev element but it will point to rq->leaf_cfs_rq_list * at the end of the enqueue. */ @@ -403,10 +399,10 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) static inline void assert_list_leaf_cfs_rq(struct rq *rq) { - SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); + WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); } -/* Iterate thr' all leaf cfs_rq's on a runqueue */ +/* Iterate through all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ leaf_cfs_rq_list) @@ -518,7 +514,7 @@ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) static int se_is_idle(struct sched_entity *se) { - return 0; + return task_has_idle_policy(task_of(se)); } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -530,7 +526,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ -static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) +static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - max_vruntime); if (delta > 0) @@ -539,7 +535,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) return max_vruntime; } -static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - min_vruntime); if (delta < 0) @@ -595,13 +591,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * [[ NOTE: this is only equal to the ideal scheduler under the condition * that join/leave operations happen at lag_i = 0, otherwise the - * virtual time has non-continguous motion equivalent to: + * virtual time has non-contiguous motion equivalent to: * * V +-= lag_i / W * * Also see the comment in place_entity() that deals with this. ]] * - * However, since v_i is u64, and the multiplcation could easily overflow + * However, since v_i is u64, and the multiplication could easily overflow * transform it into a relative form that uses smaller quantities: * * Substitute: v_i == (v_i - v0) + v0 @@ -671,7 +667,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) } if (load) { - /* sign flips effective floor / ceil */ + /* sign flips effective floor / ceiling */ if (avg < 0) avg -= (load - 1); avg = div_s64(avg, load); @@ -696,21 +692,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) * * XXX could add max_slice to the augmented data to track this. */ -static s64 entity_lag(u64 avruntime, struct sched_entity *se) +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { s64 vlag, limit; - vlag = avruntime - se->vruntime; - limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); + WARN_ON_ONCE(!se->on_rq); - return clamp(vlag, -limit, limit); -} - -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - SCHED_WARN_ON(!se->on_rq); + vlag = avg_vruntime(cfs_rq) - se->vruntime; + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); - se->vlag = entity_lag(avg_vruntime(cfs_rq), se); + se->vlag = clamp(vlag, -limit, limit); } /* @@ -727,7 +718,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) * - * Note: using 'avg_vruntime() > se->vruntime' is inacurate due + * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division. */ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) @@ -786,8 +777,22 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } /* ensure we never gain time by being placed backwards. */ - u64_u32_store(cfs_rq->min_vruntime, - __update_min_vruntime(cfs_rq, vruntime)); + cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); +} + +static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) +{ + struct sched_entity *root = __pick_root_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 min_slice = ~0ULL; + + if (curr && curr->on_rq) + min_slice = curr->slice; + + if (root) + min_slice = min(min_slice, root->min_slice); + + return min_slice; } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) @@ -806,19 +811,34 @@ static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node } } +static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (rse->min_slice < se->min_slice) + se->min_slice = rse->min_slice; + } +} + /* * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { u64 old_min_vruntime = se->min_vruntime; + u64 old_min_slice = se->min_slice; struct rb_node *node = &se->run_node; se->min_vruntime = se->vruntime; __min_vruntime_update(se, node->rb_right); __min_vruntime_update(se, node->rb_left); - return se->min_vruntime == old_min_vruntime; + se->min_slice = se->slice; + __min_slice_update(se, node->rb_right); + __min_slice_update(se, node->rb_left); + + return se->min_vruntime == old_min_vruntime && + se->min_slice == old_min_slice; } RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, @@ -831,6 +851,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { avg_vruntime_add(cfs_rq, se); se->min_vruntime = se->vruntime; + se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less, &min_vruntime_cb); } @@ -863,6 +884,26 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) } /* + * HACK, stash a copy of deadline at the point of pick in vlag, + * which isn't used until dequeue. + */ +static inline void set_protect_slice(struct sched_entity *se) +{ + se->vlag = se->deadline; +} + +static inline bool protect_slice(struct sched_entity *se) +{ + return se->vlag == se->deadline; +} + +static inline void cancel_protect_slice(struct sched_entity *se) +{ + if (protect_slice(se)) + se->vlag = se->deadline + 1; +} + +/* * Earliest Eligible Virtual Deadline First * * In order to provide latency guarantees for different request sizes @@ -892,17 +933,13 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) * We can safely skip eligibility check if there is only one entity * in this cfs_rq, saving some cycles. */ - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se; if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - /* - * Once selected, run a task until it either becomes non-eligible or - * until it gets a new slice. See the HACK in set_next_entity(). - */ - if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -946,7 +983,6 @@ found: return best; } -#ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -973,7 +1009,6 @@ int sched_update_scaling(void) return 0; } #endif -#endif static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -981,17 +1016,18 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i * this is probably good enough. */ -static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) +static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { if ((s64)(se->vruntime - se->deadline) < 0) - return; + return false; /* * For EEVDF the virtual time slope is determined by w_i (iow. * nice) while the request time r_i is determined by * sysctl_sched_base_slice. */ - se->slice = sysctl_sched_base_slice; + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice; /* * EEVDF: vd_i = ve_i + r_i / w_i @@ -1001,10 +1037,7 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * The task has consumed its request, reschedule. */ - if (cfs_rq->nr_running > 1) { - resched_curr(rq_of(cfs_rq)); - clear_buddies(cfs_rq, se); - } + return true; } #include "pelt.h" @@ -1030,14 +1063,15 @@ void init_entity_runnable_average(struct sched_entity *se) if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); - /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ + /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */ } /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: * - * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1) + * * se_weight(se) * * However, in many cases, the above util_avg does not give a desired * value. Moreover, the sum of the util_avgs may be divergent, such @@ -1084,7 +1118,7 @@ void post_init_entity_util_avg(struct task_struct *p) if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg = cfs_rq->avg.util_avg * se_weight(se); sa->util_avg /= (cfs_rq->avg.load_avg + 1); if (sa->util_avg > cap) @@ -1137,8 +1171,38 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) trace_sched_stat_runtime(p, delta_exec); account_group_exec_runtime(p, delta_exec); cgroup_account_cputime(p, delta_exec); - if (p->dl_server) - dl_server_update(p->dl_server, delta_exec); +} + +static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + if (!sched_feat(PREEMPT_SHORT)) + return false; + + if (curr->vlag == curr->deadline) + return false; + + return !entity_eligible(cfs_rq, curr); +} + +static inline bool do_preempt_short(struct cfs_rq *cfs_rq, + struct sched_entity *pse, struct sched_entity *se) +{ + if (!sched_feat(PREEMPT_SHORT)) + return false; + + if (pse->slice >= se->slice) + return false; + + if (!entity_eligible(cfs_rq, pse)) + return false; + + if (entity_before(pse, se)) + return true; + + if (!entity_eligible(cfs_rq, se)) + return true; + + return false; } /* @@ -1146,12 +1210,12 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) */ s64 update_curr_common(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; s64 delta_exec; - delta_exec = update_curr_se(rq, &curr->se); + delta_exec = update_curr_se(rq, &donor->se); if (likely(delta_exec > 0)) - update_curr_task(curr, delta_exec); + update_curr_task(donor, delta_exec); return delta_exec; } @@ -1162,28 +1226,54 @@ s64 update_curr_common(struct rq *rq) static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; + struct rq *rq = rq_of(cfs_rq); s64 delta_exec; + bool resched; if (unlikely(!curr)) return; - delta_exec = update_curr_se(rq_of(cfs_rq), curr); + delta_exec = update_curr_se(rq, curr); if (unlikely(delta_exec <= 0)) return; curr->vruntime += calc_delta_fair(delta_exec, curr); - update_deadline(cfs_rq, curr); + resched = update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); - if (entity_is_task(curr)) - update_curr_task(task_of(curr), delta_exec); + if (entity_is_task(curr)) { + struct task_struct *p = task_of(curr); + + update_curr_task(p, delta_exec); + + /* + * If the fair_server is active, we need to account for the + * fair_server time whether or not the task is running on + * behalf of fair_server or not: + * - If the task is running on behalf of fair_server, we need + * to limit its time based on the assigned runtime. + * - Fair task that runs outside of fair_server should account + * against fair_server such that it can account for this time + * and possibly avoid running this period. + */ + if (dl_server_active(&rq->fair_server)) + dl_server_update(&rq->fair_server, delta_exec); + } account_cfs_rq_runtime(cfs_rq, delta_exec); + + if (cfs_rq->nr_queued == 1) + return; + + if (resched || did_preempt_short(cfs_rq, curr)) { + resched_curr_lazy(rq); + clear_buddies(cfs_rq, curr); + } } static void update_curr_fair(struct rq *rq) { - update_curr(cfs_rq_of(&rq->curr->se)); + update_curr(cfs_rq_of(&rq->donor->se)); } static inline void @@ -1622,7 +1712,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, - * which should be ok given the number of nodes rarely exceeds 8. + * which should be OK given the number of nodes rarely exceeds 8. */ for_each_online_node(node) { unsigned long faults; @@ -1748,7 +1838,7 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat) continue; if (zone_watermark_ok(zone, 0, - wmark_pages(zone, WMARK_PROMO) + enough_wmark, + promo_wmark_pages(zone) + enough_wmark, ZONE_MOVABLE, 0)) return true; } @@ -1846,8 +1936,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, * The pages in slow memory node should be migrated according * to hot/cold instead of private/shared. */ - if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && - !node_is_toptier(src_nid)) { + if (folio_use_access_time(folio)) { struct pglist_data *pgdat; unsigned long rate_limit; unsigned int latency, th, def_th; @@ -2054,7 +2143,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->load += cpu_load(rq); ns->runnable += cpu_runnable(rq); ns->util += cpu_util_cfs(cpu); - ns->nr_running += rq->cfs.h_nr_running; + ns->nr_running += rq->cfs.h_nr_runnable; ns->compute_capacity += capacity_of(cpu); if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { @@ -3194,6 +3283,15 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) return true; } + /* + * This vma has not been accessed for a while, and if the number + * the threads in the same process is low, which means no other + * threads can help scan this vma, force a vma scan. + */ + if (READ_ONCE(mm->numa_scan_seq) > + (vma->numab_state->prev_scan_seq + get_nr_threads(current))) + return true; + return false; } @@ -3217,7 +3315,7 @@ static void task_numa_work(struct callback_head *work) bool vma_pids_skipped; bool vma_pids_forced = false; - SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); work->next = work; /* @@ -3231,6 +3329,15 @@ static void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * Memory is pinned to only one NUMA node via cpuset.mems, naturally + * no page can be migrated. + */ + if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) { + trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed); + return; + } + if (!mm->numa_next_scan) { mm->numa_next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); @@ -3286,7 +3393,7 @@ retry_pids: vma = vma_next(&vmi); } - do { + for (; vma; vma = vma_next(&vmi)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); @@ -3296,7 +3403,7 @@ retry_pids: /* * Shared library pages mapped by multiple processes are not * migrated as it is expected they are cache replicated. Avoid - * hinting faults in read-only file-backed mappings or the vdso + * hinting faults in read-only file-backed mappings or the vDSO * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || @@ -3307,7 +3414,7 @@ retry_pids: /* * Skip inaccessible VMAs to avoid any confusion between - * PROT_NONE and NUMA hinting ptes + * PROT_NONE and NUMA hinting PTEs */ if (!vma_is_accessible(vma)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); @@ -3316,11 +3423,17 @@ retry_pids: /* Initialise new per-VMA NUMAB state. */ if (!vma->numab_state) { - vma->numab_state = kzalloc(sizeof(struct vma_numab_state), - GFP_KERNEL); - if (!vma->numab_state) + struct vma_numab_state *ptr; + + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + if (!ptr) continue; + if (cmpxchg(&vma->numab_state, NULL, ptr)) { + kfree(ptr); + continue; + } + vma->numab_state->start_scan_seq = mm->numa_scan_seq; vma->numab_state->next_scan = now + @@ -3339,7 +3452,7 @@ retry_pids: } /* - * Scanning the VMA's of short lived tasks add more overhead. So + * Scanning the VMAs of short lived tasks add more overhead. So * delay the scan for new VMAs. */ if (mm->numa_scan_seq && time_before(jiffies, @@ -3383,7 +3496,7 @@ retry_pids: /* * Try to scan sysctl_numa_balancing_size worth of * hpages that have at least one present PTE that - * is not already pte-numa. If the VMA contains + * is not already PTE-numa. If the VMA contains * areas that are unused or already full of prot_numa * PTEs, scan up to virtpages, to skip through those * areas faster. @@ -3408,7 +3521,7 @@ retry_pids: */ if (vma_pids_forced) break; - } for_each_vma(vmi, vma); + } /* * If no VMAs are remaining and VMAs were skipped due to the PID @@ -3590,9 +3703,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &rq->cfs_tasks); } #endif - cfs_rq->nr_running++; - if (se_is_idle(se)) - cfs_rq->idle_nr_running++; + cfs_rq->nr_queued++; } static void @@ -3605,9 +3716,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } #endif - cfs_rq->nr_running--; - if (se_is_idle(se)) - cfs_rq->idle_nr_running--; + cfs_rq->nr_queued--; } /* @@ -3682,137 +3791,33 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif -static void reweight_eevdf(struct sched_entity *se, u64 avruntime, - unsigned long weight) -{ - unsigned long old_weight = se->load.weight; - s64 vlag, vslice; - - /* - * VRUNTIME - * ======== - * - * COROLLARY #1: The virtual runtime of the entity needs to be - * adjusted if re-weight at !0-lag point. - * - * Proof: For contradiction assume this is not true, so we can - * re-weight without changing vruntime at !0-lag point. - * - * Weight VRuntime Avg-VRuntime - * before w v V - * after w' v' V' - * - * Since lag needs to be preserved through re-weight: - * - * lag = (V - v)*w = (V'- v')*w', where v = v' - * ==> V' = (V - v)*w/w' + v (1) - * - * Let W be the total weight of the entities before reweight, - * since V' is the new weighted average of entities: - * - * V' = (WV + w'v - wv) / (W + w' - w) (2) - * - * by using (1) & (2) we obtain: - * - * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v - * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v - * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v - * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) - * - * Since we are doing at !0-lag point which means V != v, we - * can simplify (3): - * - * ==> W / (W + w' - w) = w / w' - * ==> Ww' = Ww + ww' - ww - * ==> W * (w' - w) = w * (w' - w) - * ==> W = w (re-weight indicates w' != w) - * - * So the cfs_rq contains only one entity, hence vruntime of - * the entity @v should always equal to the cfs_rq's weighted - * average vruntime @V, which means we will always re-weight - * at 0-lag point, thus breach assumption. Proof completed. - * - * - * COROLLARY #2: Re-weight does NOT affect weighted average - * vruntime of all the entities. - * - * Proof: According to corollary #1, Eq. (1) should be: - * - * (V - v)*w = (V' - v')*w' - * ==> v' = V' - (V - v)*w/w' (4) - * - * According to the weighted average formula, we have: - * - * V' = (WV - wv + w'v') / (W - w + w') - * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') - * = (WV - wv + w'V' - Vw + wv) / (W - w + w') - * = (WV + w'V' - Vw) / (W - w + w') - * - * ==> V'*(W - w + w') = WV + w'V' - Vw - * ==> V' * (W - w) = (W - w) * V (5) - * - * If the entity is the only one in the cfs_rq, then reweight - * always occurs at 0-lag point, so V won't change. Or else - * there are other entities, hence W != w, then Eq. (5) turns - * into V' = V. So V won't change in either case, proof done. - * - * - * So according to corollary #1 & #2, the effect of re-weight - * on vruntime should be: - * - * v' = V' - (V - v) * w / w' (4) - * = V - (V - v) * w / w' - * = V - vl * w / w' - * = V - vl' - */ - if (avruntime != se->vruntime) { - vlag = entity_lag(avruntime, se); - vlag = div_s64(vlag * old_weight, weight); - se->vruntime = avruntime - vlag; - } - - /* - * DEADLINE - * ======== - * - * When the weight changes, the virtual time slope changes and - * we should adjust the relative virtual deadline accordingly. - * - * d' = v' + (d - v)*w/w' - * = V' - (V - v)*w/w' + (d - v)*w/w' - * = V - (V - v)*w/w' + (d - v)*w/w' - * = V + (d - V)*w/w' - */ - vslice = (s64)(se->deadline - avruntime); - vslice = div_s64(vslice * old_weight, weight); - se->deadline = avruntime + vslice; -} +static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; - u64 avruntime; if (se->on_rq) { /* commit outstanding execution time */ update_curr(cfs_rq); - avruntime = avg_vruntime(cfs_rq); + update_entity_lag(cfs_rq, se); + se->deadline -= se->vruntime; + se->rel_deadline = 1; + cfs_rq->nr_queued--; if (!curr) __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); - if (se->on_rq) { - reweight_eevdf(se, avruntime, weight); - } else { - /* - * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), - * we need to scale se->vlag when w_i changes. - */ - se->vlag = div_s64(se->vlag * se->load.weight, weight); - } + /* + * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), + * we need to scale se->vlag when w_i changes. + */ + se->vlag = div_s64(se->vlag * se->load.weight, weight); + if (se->rel_deadline) + se->deadline = div_s64(se->deadline * se->load.weight, weight); update_load_set(&se->load, weight); @@ -3826,9 +3831,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { + place_entity(cfs_rq, se, 0); update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; /* * The entity's vruntime has been adjusted, so let's check @@ -3841,15 +3848,15 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } } -void reweight_task(struct task_struct *p, int prio) +static void reweight_task_fair(struct rq *rq, struct task_struct *p, + const struct load_weight *lw) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct load_weight *load = &se->load; - unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight); - load->inv_weight = sched_prio_to_wmult[prio]; + reweight_entity(cfs_rq, se, lw->weight); + load->inv_weight = lw->inv_weight; } static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); @@ -3973,7 +3980,11 @@ static void update_cfs_group(struct sched_entity *se) struct cfs_rq *gcfs_rq = group_cfs_rq(se); long shares; - if (!gcfs_rq) + /* + * When a group becomes empty, preserve its weight. This matters for + * DELAY_DEQUEUE. + */ + if (!gcfs_rq || !gcfs_rq->load.weight) return; if (throttled_hierarchy(gcfs_rq)) @@ -4034,7 +4045,7 @@ static inline bool load_avg_is_decayed(struct sched_avg *sa) * Make sure that rounding and/or propagation of PELT values never * break this. */ - SCHED_WARN_ON(sa->load_avg || + WARN_ON_ONCE(sa->load_avg || sa->util_avg || sa->runnable_avg); @@ -4059,15 +4070,17 @@ static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) { struct cfs_rq *prev_cfs_rq; struct list_head *prev; + struct rq *rq = rq_of(cfs_rq); if (cfs_rq->on_list) { prev = cfs_rq->leaf_cfs_rq_list.prev; } else { - struct rq *rq = rq_of(cfs_rq); - prev = rq->tmp_alone_branch; } + if (prev == &rq->leaf_cfs_rq_list) + return false; + prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); return (prev_cfs_rq->tg->parent == cfs_rq->tg); @@ -4745,7 +4758,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * Track task load average for carrying it to new CPU after migrated, and - * track group sched_entity load average for task_h_load calc in migration + * track group sched_entity load average for task_h_load calculation in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) __update_load_avg_se(now, cfs_rq, se); @@ -4828,7 +4841,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { @@ -4931,13 +4944,6 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, goto done; /* - * To avoid overestimation of actual task utilization, skip updates if - * we cannot grant there is idle time in this CPU. - */ - if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) - return; - - /* * To avoid underestimate of task utilization, skip updates of EWMA if * we cannot grant that thread got all CPU time it wanted. */ @@ -4971,13 +4977,22 @@ done: trace_sched_util_est_se_tp(&p->se); } +static inline unsigned long get_actual_cpu_capacity(int cpu) +{ + unsigned long capacity = arch_scale_cpu_capacity(cpu); + + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); + + return capacity; +} + static inline int util_fits_cpu(unsigned long util, unsigned long uclamp_min, unsigned long uclamp_max, int cpu) { - unsigned long capacity_orig, capacity_orig_thermal; unsigned long capacity = capacity_of(cpu); + unsigned long capacity_orig; bool fits, uclamp_max_fits; /* @@ -4999,7 +5014,7 @@ static inline int util_fits_cpu(unsigned long util, * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it * should fit a little cpu even if there's some pressure. * - * Only exception is for thermal pressure since it has a direct impact + * Only exception is for HW or cpufreq pressure since it has a direct impact * on available OPP of the system. * * We honour it for uclamp_min only as a drop in performance level @@ -5009,7 +5024,6 @@ static inline int util_fits_cpu(unsigned long util, * goal is to cap the task. So it's okay if it's getting less. */ capacity_orig = arch_scale_cpu_capacity(cpu); - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* * We want to force a task to fit a cpu as implied by uclamp_max. @@ -5026,14 +5040,14 @@ static inline int util_fits_cpu(unsigned long util, * | | | | | | | * | | | | | | | * +---------------------------------------- - * cpu0 cpu1 cpu2 + * CPU0 CPU1 CPU2 * * In the above example if a task is capped to a specific performance * point, y, then when: * - * * util = 80% of x then it does not fit on cpu0 and should migrate - * to cpu1 - * * util = 80% of y then it is forced to fit on cpu1 to honour + * * util = 80% of x then it does not fit on CPU0 and should migrate + * to CPU1 + * * util = 80% of y then it is forced to fit on CPU1 to honour * uclamp_max request. * * which is what we're enforcing here. A task always fits if @@ -5064,7 +5078,7 @@ static inline int util_fits_cpu(unsigned long util, * | | | | | | | * | | | | | | | (region c, boosted, util < uclamp_min) * +---------------------------------------- - * cpu0 cpu1 cpu2 + * CPU0 CPU1 CPU2 * * a) If util > uclamp_max, then we're capped, we don't care about * actual fitness value here. We only care if uclamp_max fits @@ -5084,7 +5098,8 @@ static inline int util_fits_cpu(unsigned long util, * handle the case uclamp_min > uclamp_max. */ uclamp_min = min(uclamp_min, uclamp_max); - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) + if (fits && (util < uclamp_min) && + (uclamp_min > get_actual_cpu_capacity(cpu))) return -1; return fits; @@ -5104,15 +5119,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { + int cpu = cpu_of(rq); + if (!sched_asym_cpucap_active()) return; - if (!p || p->nr_cpus_allowed == 1) { - rq->misfit_task_load = 0; - return; - } + /* + * Affinity allows us to go somewhere higher? Or are we on biggest + * available CPU already? Or do we fit into this CPU ? + */ + if (!p || (p->nr_cpus_allowed == 1) || + (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) || + task_fits_cpu(p, cpu)) { - if (task_fits_cpu(p, cpu_of(rq))) { rq->misfit_task_load = 0; return; } @@ -5128,7 +5147,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { - return !cfs_rq->nr_running; + return !cfs_rq->nr_queued; } #define UPDATE_TG 0x0 @@ -5148,7 +5167,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) +static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -5166,13 +5185,30 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ +void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_entity *se = &p->se; + + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + if (attr->sched_runtime) { + se->custom_slice = 1; + se->slice = clamp_t(u64, attr->sched_runtime, + NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ + NSEC_PER_MSEC*100); /* HZ=100 / 10 */ + } else { + se->custom_slice = 0; + se->slice = sysctl_sched_base_slice; + } +} + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { u64 vslice, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; - se->slice = sysctl_sched_base_slice; + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice; vslice = calc_delta_fair(se->slice, se); /* @@ -5183,7 +5219,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ - if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; @@ -5253,8 +5289,14 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->vruntime = vruntime - lag; + if (se->rel_deadline) { + se->deadline += se->vruntime; + se->rel_deadline = 0; + return; + } + /* - * When joining the competition; the exisiting tasks will be, + * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks * off with half a slice to ease into the competition. */ @@ -5270,7 +5312,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); -static inline bool cfs_bandwidth_used(void); +static void +requeue_delayed_entity(struct sched_entity *se); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -5290,7 +5333,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_runnable of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight @@ -5323,7 +5366,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - if (cfs_rq->nr_running == 1) { + if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); if (!throttled_hierarchy(cfs_rq)) { list_add_leaf_cfs_rq(cfs_rq); @@ -5359,24 +5402,94 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); -static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +static void set_delayed(struct sched_entity *se) { - int action = UPDATE_TG; + se->sched_delayed = 1; - if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) - action |= DO_DETACH; + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since dequeue_entities() + * will account it for blocked tasks. + */ + if (!entity_is_task(se)) + return; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + cfs_rq->h_nr_runnable--; + if (cfs_rq_throttled(cfs_rq)) + break; + } +} + +static void clear_delayed(struct sched_entity *se) +{ + se->sched_delayed = 0; /* - * Update run-time statistics of the 'current'. + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since a dequeue has + * already accounted for it or an enqueue of a task + * below it will account for it in enqueue_task_fair(). */ + if (!entity_is_task(se)) + return; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + cfs_rq->h_nr_runnable++; + if (cfs_rq_throttled(cfs_rq)) + break; + } +} + +static inline void finish_delayed_dequeue_entity(struct sched_entity *se) +{ + clear_delayed(se); + if (sched_feat(DELAY_ZERO) && se->vlag > 0) + se->vlag = 0; +} + +static bool +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + bool sleep = flags & DEQUEUE_SLEEP; + int action = UPDATE_TG; + update_curr(cfs_rq); + clear_buddies(cfs_rq, se); + + if (flags & DEQUEUE_DELAYED) { + WARN_ON_ONCE(!se->sched_delayed); + } else { + bool delay = sleep; + /* + * DELAY_DEQUEUE relies on spurious wakeups, special task + * states must not suffer spurious wakeups, excempt them. + */ + if (flags & DEQUEUE_SPECIAL) + delay = false; + + WARN_ON_ONCE(delay && se->sched_delayed); + + if (sched_feat(DELAY_DEQUEUE) && delay && + !entity_eligible(cfs_rq, se)) { + update_load_avg(cfs_rq, se, 0); + set_delayed(se); + return false; + } + } + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_runnable of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. @@ -5386,9 +5499,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_dequeue_fair(cfs_rq, se, flags); - clear_buddies(cfs_rq, se); - update_entity_lag(cfs_rq, se); + if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { + se->deadline -= se->vruntime; + se->rel_deadline = 1; + } + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; @@ -5403,13 +5519,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Now advance min_vruntime if @se was the entity holding it back, * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- ie. we'll be penalized. + * further than we started -- i.e. we'll be penalized. */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); - if (cfs_rq->nr_running == 0) + if (flags & DEQUEUE_DELAYED) + finish_delayed_dequeue_entity(se); + + if (cfs_rq->nr_queued == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); + + return true; } static void @@ -5427,19 +5548,17 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. - */ - se->vlag = se->deadline; + + set_protect_slice(se); } update_stats_curr_start(cfs_rq, se); + WARN_ON_ONCE(cfs_rq->curr); cfs_rq->curr = se; /* * Track our maximum slice length, if the CPU's load is at - * least twice that of our own weight (i.e. dont track it + * least twice that of our own weight (i.e. don't track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && @@ -5455,6 +5574,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); + /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups @@ -5463,16 +5584,29 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * -pick_next_entity(struct cfs_rq *cfs_rq) +pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { + struct sched_entity *se; + /* - * Enabling NEXT_BUDDY will affect latency but not fairness. + * Picking the ->next buddy will affect latency but not fairness. */ - if (sched_feat(NEXT_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) + if (sched_feat(PICK_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { + /* ->next will never be delayed */ + WARN_ON_ONCE(cfs_rq->next->sched_delayed); return cfs_rq->next; + } - return pick_eevdf(cfs_rq); + se = pick_eevdf(cfs_rq); + if (se->sched_delayed) { + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + /* + * Must not reference @se again, see __block_task(). + */ + return NULL; + } + return se; } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -5496,6 +5630,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* in !on_rq case, update occurred at dequeue */ update_load_avg(cfs_rq, prev, 0); } + WARN_ON_ONCE(cfs_rq->curr != prev); cfs_rq->curr = NULL; } @@ -5519,15 +5654,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } - /* - * don't let the period tick interfere with the hrtick preemption - */ - if (!sched_feat(DOUBLE_TICK) && - hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) - return; #endif } @@ -5724,7 +5853,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttled_clock_self = 0; - if (SCHED_WARN_ON((s64)delta < 0)) + if (WARN_ON_ONCE((s64)delta < 0)) delta = 0; cfs_rq->throttled_clock_self_time += delta; @@ -5744,8 +5873,8 @@ static int tg_throttle_down(struct task_group *tg, void *data) cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); - SCHED_WARN_ON(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_running) + WARN_ON_ONCE(cfs_rq->throttled_clock_self); + if (cfs_rq->nr_queued) cfs_rq->throttled_clock_self = rq_clock(rq); } cfs_rq->throttle_count++; @@ -5758,7 +5887,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, dequeue = 1; + long queued_delta, runnable_delta, idle_delta, dequeue = 1; + long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5788,21 +5918,33 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); + int flags; + /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) goto done; - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + /* + * Abuse SPECIAL to avoid delayed dequeue in this instance. + * This avoids teaching dequeue_entities() about throttled + * entities and keeps things relatively simple. + */ + flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; + if (se->sched_delayed) + flags |= DEQUEUE_DELAYED; + dequeue_entity(qcfs_rq, se, flags); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -5821,23 +5963,27 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; } /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, task_delta); + sub_nr_running(rq, queued_delta); + /* Stop the fair server if throttling resulted in no runnable tasks */ + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) + dl_server_stop(&rq->fair_server); done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; - SCHED_WARN_ON(cfs_rq->throttled_clock); - if (cfs_rq->nr_running) + WARN_ON_ONCE(cfs_rq->throttled_clock); + if (cfs_rq->nr_queued) cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -5847,7 +5993,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta; + long queued_delta, runnable_delta, idle_delta; + long rq_h_nr_queued = rq->cfs.h_nr_queued; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -5880,20 +6027,27 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) goto unthrottle_throttle; } - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (se->on_rq) + /* Handle any unfinished DELAY_DEQUEUE business first. */ + if (se->sched_delayed) { + int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; + + dequeue_entity(qcfs_rq, se, flags); + } else if (se->on_rq) break; enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -5907,24 +6061,29 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; } + /* Start the fair server if un-throttling resulted in new runnable tasks */ + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) + dl_server_start(&rq->fair_server); + /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, task_delta); + add_nr_running(rq, queued_delta); unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ - if (rq->curr == rq->idle && rq->cfs.nr_running) + if (rq->curr == rq->idle && rq->cfs.nr_queued) resched_curr(rq); } @@ -5979,7 +6138,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) } /* Already enqueued */ - if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) + if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list))) return; first = list_empty(&rq->cfsb_csd_list); @@ -5998,7 +6157,7 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { lockdep_assert_rq_held(rq_of(cfs_rq)); - if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || + if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) || cfs_rq->runtime_remaining <= 0)) return; @@ -6034,7 +6193,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) goto next; /* By the above checks, this should never be true */ - SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + WARN_ON_ONCE(cfs_rq->runtime_remaining > 0); raw_spin_lock(&cfs_b->lock); runtime = -cfs_rq->runtime_remaining + 1; @@ -6055,7 +6214,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) * We currently only expect to be unthrottling * a single cfs_rq locally. */ - SCHED_WARN_ON(!list_empty(&local_unthrottle)); + WARN_ON_ONCE(!list_empty(&local_unthrottle)); list_add_tail(&cfs_rq->throttled_csd_list, &local_unthrottle); } @@ -6080,7 +6239,7 @@ next: rq_unlock_irqrestore(rq, &rf); } - SCHED_WARN_ON(!list_empty(&local_unthrottle)); + WARN_ON_ONCE(!list_empty(&local_unthrottle)); rcu_read_unlock(); @@ -6225,7 +6384,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued) return; __return_cfs_rq_runtime(cfs_rq); @@ -6397,14 +6556,14 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); /* Add a random offset so that timers interleave */ hrtimer_set_expires(&cfs_b->period_timer, get_random_u32_below(cfs_b->period)); - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->slack_timer.function = sched_cfs_slack_timer; + hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); cfs_b->slack_started = false; } @@ -6496,6 +6655,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) lockdep_assert_rq_held(rq); + // Do not unthrottle for an active CPU + if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask)) + return; + /* * The rq clock has already been updated in the * set_rq_offline(), so we should skip updating @@ -6511,18 +6674,20 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) continue; /* - * clock_task is not advancing so we just need to make sure - * there's some valid quota amount - */ - cfs_rq->runtime_remaining = 1; - /* * Offline rq is schedulable till CPU is completely disabled * in take_cpu_down(), so we prevent new cfs throttling here. */ cfs_rq->runtime_enabled = 0; - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); + if (!cfs_rq_throttled(cfs_rq)) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = 1; + unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); @@ -6549,7 +6714,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) { int cpu = cpu_of(rq); - if (!sched_feat(HZ_BW) || !cfs_bandwidth_used()) + if (!cfs_bandwidth_used()) return; if (!tick_nohz_full_cpu(cpu)) @@ -6571,11 +6736,6 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) #else /* CONFIG_CFS_BANDWIDTH */ -static inline bool cfs_bandwidth_used(void) -{ - return false; -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -6631,15 +6791,15 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - SCHED_WARN_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); - if (rq->cfs.h_nr_running > 1) { + if (rq->cfs.h_nr_queued > 1) { u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; u64 slice = se->slice; s64 delta = slice - ran; if (delta < 0) { - if (task_current(rq, p)) + if (task_current_donor(rq, p)) resched_curr(rq); return; } @@ -6654,12 +6814,12 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) */ static void hrtick_update(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; - if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) return; - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, donor); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -6675,28 +6835,53 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { - unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + unsigned long rq_util_min, rq_util_max; + + if (!sched_energy_enabled()) + return false; + + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); /* Return true only if the utilization doesn't fit CPU's capacity */ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } -static inline void update_overutilized_status(struct rq *rq) +/* + * overutilized value make sense only if EAS is enabled + */ +static inline bool is_rd_overutilized(struct root_domain *rd) { - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { - WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); - } + return !sched_energy_enabled() || READ_ONCE(rd->overutilized); +} + +static inline void set_rd_overutilized(struct root_domain *rd, bool flag) +{ + if (!sched_energy_enabled()) + return; + + WRITE_ONCE(rd->overutilized, flag); + trace_sched_overutilized_tp(rd, flag); +} + +static inline void check_update_overutilized_status(struct rq *rq) +{ + /* + * overutilized field is used for load balancing decisions only + * if energy aware scheduler is being used + */ + + if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) + set_rd_overutilized(rq->rd, 1); } #else -static inline void update_overutilized_status(struct rq *rq) { } +static inline void check_update_overutilized_status(struct rq *rq) { } #endif /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) { - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + return unlikely(rq->nr_running == rq->cfs.h_nr_idle && rq->nr_running); } @@ -6707,6 +6892,37 @@ static int sched_idle_cpu(int cpu) } #endif +static void +requeue_delayed_entity(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * se->sched_delayed should imply: se->on_rq == 1. + * Because a delayed entity is one that is still on + * the runqueue competing until elegibility. + */ + WARN_ON_ONCE(!se->sched_delayed); + WARN_ON_ONCE(!se->on_rq); + + if (sched_feat(DELAY_ZERO)) { + update_entity_lag(cfs_rq, se); + if (se->vlag > 0) { + cfs_rq->nr_queued--; + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->vlag = 0; + place_entity(cfs_rq, se, 0); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; + } + } + + update_load_avg(cfs_rq, se, 0); + clear_delayed(se); +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -6717,8 +6933,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int idle_h_nr_running = task_has_idle_policy(p); + int h_nr_idle = task_has_idle_policy(p); + int h_nr_runnable = 1; int task_new = !(flags & ENQUEUE_WAKEUP); + int rq_h_nr_queued = rq->cfs.h_nr_queued; + u64 slice = 0; /* * The code below (indirectly) updates schedutil which looks at @@ -6726,7 +6945,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * Let's add the task's estimated utilization to the cfs_rq's * estimated utilization, before we update schedutil. */ - util_est_enqueue(&rq->cfs, p); + if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED)) + util_est_enqueue(&rq->cfs, p); + + if (flags & ENQUEUE_DELAYED) { + requeue_delayed_entity(se); + return; + } /* * If in_iowait is set, the code below may not trigger any cpufreq @@ -6736,17 +6961,35 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); + if (task_new && se->sched_delayed) + h_nr_runnable = 0; + for_each_sched_entity(se) { - if (se->on_rq) + if (se->on_rq) { + if (se->sched_delayed) + requeue_delayed_entity(se); break; + } cfs_rq = cfs_rq_of(se); + + /* + * Basically set the slice of group entries to the min_slice of + * their respective cfs_rq. This ensures the group can service + * its entities in the desired time-frame. + */ + if (slice) { + se->slice = slice; + se->custom_slice = 1; + } enqueue_entity(cfs_rq, se, flags); + slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -6762,17 +7005,30 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) se_update_runnable(se); update_cfs_group(se); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; + se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); + slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; } + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { + /* Account for idle runtime */ + if (!rq->nr_running) + dl_server_update_idle_time(rq, rq->curr); + dl_server_start(&rq->fair_server); + } + /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); @@ -6791,7 +7047,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * and the following generally works well enough in practice. */ if (!task_new) - update_overutilized_status(rq); + check_update_overutilized_status(rq); enqueue_throttle: assert_list_leaf_cfs_rq(rq); @@ -6802,36 +7058,61 @@ enqueue_throttle: static void set_next_buddy(struct sched_entity *se); /* - * The dequeue_task method is called before nr_running is - * decreased. We remove the task from the rbtree and - * update the fair scheduling stats: + * Basically dequeue_task_fair(), except it can deal with dequeue_entity() + * failing half-way through and resume the dequeue later. + * + * Returns: + * -1 - dequeue delayed + * 0 - dequeue throttled + * 1 - dequeue complete */ -static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - int task_sleep = flags & DEQUEUE_SLEEP; - int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + int rq_h_nr_queued = rq->cfs.h_nr_queued; + bool task_sleep = flags & DEQUEUE_SLEEP; + bool task_delayed = flags & DEQUEUE_DELAYED; + struct task_struct *p = NULL; + int h_nr_idle = 0; + int h_nr_queued = 0; + int h_nr_runnable = 0; + struct cfs_rq *cfs_rq; + u64 slice = 0; - util_est_dequeue(&rq->cfs, p); + if (entity_is_task(se)) { + p = task_of(se); + h_nr_queued = 1; + h_nr_idle = task_has_idle_policy(p); + if (task_sleep || task_delayed || !se->sched_delayed) + h_nr_runnable = 1; + } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, flags); - cfs_rq->h_nr_running--; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (!dequeue_entity(cfs_rq, se, flags)) { + if (p && &p->se == se) + return -1; + + slice = cfs_rq_min_slice(cfs_rq); + break; + } + + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; + return 0; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + slice = cfs_rq_min_slice(cfs_rq); + /* Avoid re-evaluating load for this entity: */ se = parent_entity(se); /* @@ -6843,6 +7124,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; } flags |= DEQUEUE_SLEEP; + flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); } for_each_sched_entity(se) { @@ -6852,33 +7134,80 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) se_update_runnable(se); update_cfs_group(se); - cfs_rq->h_nr_running--; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; + se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); + slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; - + return 0; } - /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, 1); + sub_nr_running(rq, h_nr_queued); + + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) + dl_server_stop(&rq->fair_server); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; -dequeue_throttle: - util_est_update(&rq->cfs, p, task_sleep); + if (p && task_delayed) { + WARN_ON_ONCE(!task_sleep); + WARN_ON_ONCE(p->on_rq != 1); + + /* Fix-up what dequeue_task_fair() skipped */ + hrtick_update(rq); + + /* + * Fix-up what block_task() skipped. + * + * Must be last, @p might not be valid after this. + */ + __block_task(rq, p); + } + + return 1; +} + +/* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and + * update the fair scheduling stats: + */ +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ + if (!p->se.sched_delayed) + util_est_dequeue(&rq->cfs, p); + + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + if (dequeue_entities(rq, &p->se, flags) < 0) + return false; + + /* + * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). + */ + hrtick_update(rq); + return true; +} + +static inline unsigned int cfs_h_nr_delayed(struct rq *rq) +{ + return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); } #ifdef CONFIG_SMP -/* Working cpumask for: load_balance, load_balance_newidle. */ +/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); @@ -7037,8 +7366,12 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; - if (sync && cpu_rq(this_cpu)->nr_running == 1) - return this_cpu; + if (sync) { + struct rq *rq = cpu_rq(this_cpu); + + if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1) + return this_cpu; + } if (available_idle_cpu(prev_cpu)) return prev_cpu; @@ -7110,13 +7443,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* - * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. + * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group. */ static int -find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -7172,7 +7505,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, +static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { int new_cpu = cpu; @@ -7197,13 +7530,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu); + group = sched_balance_find_dst_group(sd, p, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_group_cpu(group, p, cpu); + new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu); if (new_cpu == cpu) { /* Now try balancing at a lower domain level of 'cpu': */ sd = sd->child; @@ -7471,7 +7804,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) * Look for the CPU with best capacity. */ else if (fits < 0) - cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu)); + cpu_cap = get_actual_cpu_capacity(cpu); /* * First, select CPU which fits better (-1 being better than 0). @@ -7515,7 +7848,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* * On asymmetric system, update task utilization because we will check - * that the task fits with cpu's capacity. + * that the task fits with CPU's capacity. */ if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); @@ -7772,6 +8105,105 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) } /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the IRQ utilization. + * + * The DL bandwidth number OTOH is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long *min, + unsigned long *max) +{ + unsigned long util, irq, scale; + struct rq *rq = cpu_rq(cpu); + + scale = arch_scale_cpu_capacity(cpu); + + /* + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). + */ + irq = cpu_util_irq(rq); + if (unlikely(irq >= scale)) { + if (min) + *min = scale; + if (max) + *max = scale; + return scale; + } + + if (min) { + /* + * The minimum utilization returns the highest level between: + * - the computed DL bandwidth needed with the IRQ pressure which + * steals time to the deadline task. + * - The minimum performance requirement for CFS and/or RT. + */ + *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); + + /* + * When an RT task is runnable and uclamp is not used, we must + * ensure that the task will run at maximum compute capacity. + */ + if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) + *min = max(*min, scale); + } + + /* + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + */ + util = util_cfs + cpu_util_rt(rq); + util += cpu_util_dl(rq); + + /* + * The maximum hint is a soft bandwidth requirement, which can be lower + * than the actual utilization because of uclamp_max requirements. + */ + if (max) + *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); + + if (util >= scale) + return scale; + + /* + * There is still idle time; further improve the number by using the + * IRQ metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: + * + * max - irq + * U' = irq + --------- * U + * max + */ + util = scale_irq_capacity(util, irq, scale); + util += irq; + + return min(scale, util); +} + +unsigned long sched_cpu_util(int cpu) +{ + return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); +} + +/* * energy_env - Utilization landscape for energy estimation. * @task_busy_time: Utilization contribution by the task for which we test the * placement. Given by eenv_task_busy_time(). @@ -7867,8 +8299,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * Performance domain frequency: utilization clamping * must be considered since it affects the selection * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. + * NOTE: in case RT tasks are running, by default the min + * utilization can be max OPP. */ eff_util = effective_cpu_util(cpu, util, &min, &max); @@ -7948,7 +8380,7 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd, * NOTE: Forkees are not accepted in the energy-aware wake-up path because * they don't have any useful utilization data yet and it's not possible to * forecast their impact on energy consumption. Consequently, they will be - * placed by find_idlest_cpu() on the least loaded CPU, which might turn out + * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out * to be energy-inefficient in some use-cases. The alternative would be to * bias new tasks towards specific types of CPUs first, or to try to infer * their util_avg from the parent task, but those heuristics could hurt @@ -7964,15 +8396,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; int prev_fits = -1, best_fits = -1; - unsigned long best_thermal_cap = 0; - unsigned long prev_thermal_cap = 0; + unsigned long best_actual_cap = 0; + unsigned long prev_actual_cap = 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; rcu_read_lock(); pd = rcu_dereference(rd->pd); - if (!pd || READ_ONCE(rd->overutilized)) + if (!pd) goto unlock; /* @@ -7995,7 +8427,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long util_min = p_util_min, util_max = p_util_max; - unsigned long cpu_cap, cpu_thermal_cap, util; + unsigned long cpu_cap, cpu_actual_cap, util; long prev_spare_cap = -1, max_spare_cap = -1; unsigned long rq_util_min, rq_util_max; unsigned long cur_delta, base_energy; @@ -8007,18 +8439,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (cpumask_empty(cpus)) continue; - /* Account thermal pressure for the energy estimation */ + /* Account external pressure for the energy estimation */ cpu = cpumask_first(cpus); - cpu_thermal_cap = arch_scale_cpu_capacity(cpu); - cpu_thermal_cap -= arch_scale_thermal_pressure(cpu); + cpu_actual_cap = get_actual_cpu_capacity(cpu); - eenv.cpu_cap = cpu_thermal_cap; + eenv.cpu_cap = cpu_actual_cap; eenv.pd_cap = 0; for_each_cpu(cpu, cpus) { struct rq *rq = cpu_rq(cpu); - eenv.pd_cap += cpu_thermal_cap; + eenv.pd_cap += cpu_actual_cap; if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; @@ -8039,7 +8470,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { /* * Open code uclamp_rq_util_with() except for - * the clamp() part. Ie: apply max aggregation + * the clamp() part. I.e.: apply max aggregation * only. util_fits_cpu() logic requires to * operate on non clamped util but must use the * max-aggregated uclamp_{min, max}. @@ -8089,7 +8520,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (prev_delta < base_energy) goto unlock; prev_delta -= base_energy; - prev_thermal_cap = cpu_thermal_cap; + prev_actual_cap = cpu_actual_cap; best_delta = min(best_delta, prev_delta); } @@ -8104,7 +8535,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * but best energy cpu has better capacity. */ if ((max_fits < 0) && - (cpu_thermal_cap <= best_thermal_cap)) + (cpu_actual_cap <= best_actual_cap)) continue; cur_delta = compute_energy(&eenv, pd, cpus, p, @@ -8125,14 +8556,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) best_delta = cur_delta; best_energy_cpu = max_spare_cap_cpu; best_fits = max_fits; - best_thermal_cap = cpu_thermal_cap; + best_actual_cap = cpu_actual_cap; } } rcu_read_unlock(); if ((best_fits > prev_fits) || ((best_fits > 0) && (best_delta < prev_delta)) || - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) + ((best_fits < 0) && (best_actual_cap > prev_actual_cap))) target = best_energy_cpu; return target; @@ -8175,7 +8606,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) cpumask_test_cpu(cpu, p->cpus_ptr)) return cpu; - if (sched_energy_enabled()) { + if (!is_rd_overutilized(this_rq()->rd)) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -8213,7 +8644,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) if (unlikely(sd)) { /* Slow path */ - new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); @@ -8256,23 +8687,69 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) static void task_dead_fair(struct task_struct *p) { - remove_entity_load_avg(&p->se); + struct sched_entity *se = &p->se; + + if (se->sched_delayed) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + if (se->sched_delayed) { + update_rq_clock(rq); + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + } + task_rq_unlock(rq, p, &rf); + } + + remove_entity_load_avg(se); +} + +/* + * Set the max capacity the task is allowed to run at for misfit detection. + */ +static void set_task_max_allowed_capacity(struct task_struct *p) +{ + struct asym_cap_data *entry; + + if (!sched_asym_cpucap_active()) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &asym_cap_list, link) { + cpumask_t *cpumask; + + cpumask = cpu_capacity_span(entry); + if (!cpumask_intersects(p->cpus_ptr, cpumask)) + continue; + + p->max_allowed_capacity = entry->capacity; + break; + } + rcu_read_unlock(); +} + +static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx) +{ + set_cpus_allowed_common(p, ctx); + set_task_max_allowed_capacity(p); } static int balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - if (rq->nr_running) + if (sched_fair_runnable(rq)) return 1; - return newidle_balance(rq, rf) != 0; + return sched_balance_newidle(rq, rf) != 0; } +#else +static inline void set_task_max_allowed_capacity(struct task_struct *p) {} #endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { - if (SCHED_WARN_ON(!se->on_rq)) + if (WARN_ON_ONCE(!se->on_rq)) return; if (se_is_idle(se)) return; @@ -8285,9 +8762,9 @@ static void set_next_buddy(struct sched_entity *se) */ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct task_struct *donor = rq->donor; + struct sched_entity *se = &donor->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) @@ -8302,7 +8779,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { + if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { set_next_buddy(pse); } @@ -8316,19 +8793,10 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * prevents us from potentially nominating it as a false LAST_BUDDY * below. */ - if (test_tsk_need_resched(curr)) + if (test_tsk_need_resched(rq->curr)) return; - /* Idle tasks are by definition preempted by non-idle tasks. */ - if (unlikely(task_has_idle_policy(curr)) && - likely(!task_has_idle_policy(p))) - goto preempt; - - /* - * Batch and idle tasks do not preempt non-idle tasks (their preemption - * is driven by the tick): - */ - if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + if (!sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); @@ -8338,19 +8806,41 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int pse_is_idle = se_is_idle(pse); /* - * Preempt an idle group in favor of a non-idle group (and don't preempt + * Preempt an idle entity in favor of a non-idle entity (and don't preempt * in the inverse case). */ - if (cse_is_idle && !pse_is_idle) + if (cse_is_idle && !pse_is_idle) { + /* + * When non-idle entity preempt an idle entity, + * don't give idle entity slice protection. + */ + cancel_protect_slice(se); goto preempt; + } + if (cse_is_idle != pse_is_idle) return; + /* + * BATCH and IDLE tasks do not preempt others. + */ + if (unlikely(!normal_policy(p->policy))) + return; + cfs_rq = cfs_rq_of(se); update_curr(cfs_rq); + /* + * If @p has a shorter slice than current and @p is eligible, override + * current's slice protection in order to allow preemption. + * + * Note that even if @p does not turn out to be the most eligible + * task at this moment, current's slice protection will be lost. + */ + if (do_preempt_short(cfs_rq, pse, se)) + cancel_protect_slice(se); /* - * XXX pick_eevdf(cfs_rq) != se ? + * If @p has become the most eligible task, force preemption. */ if (pick_eevdf(cfs_rq) == pse) goto preempt; @@ -8358,10 +8848,9 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); } -#ifdef CONFIG_SMP static struct task_struct *pick_task_fair(struct rq *rq) { struct sched_entity *se; @@ -8369,99 +8858,62 @@ static struct task_struct *pick_task_fair(struct rq *rq) again: cfs_rq = &rq->cfs; - if (!cfs_rq->nr_running) + if (!cfs_rq->nr_queued) return NULL; do { - struct sched_entity *curr = cfs_rq->curr; + /* Might not have done put_prev_entity() */ + if (cfs_rq->curr && cfs_rq->curr->on_rq) + update_curr(cfs_rq); - /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ - if (curr) { - if (curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; - - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto again; - } + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto again; - se = pick_next_entity(cfs_rq); + se = pick_next_entity(rq, cfs_rq); + if (!se) + goto again; cfs_rq = group_cfs_rq(se); } while (cfs_rq); return task_of(se); } -#endif + +static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; struct task_struct *p; int new_tasks; again: - if (!sched_fair_runnable(rq)) + p = pick_task_fair(rq); + if (!p) goto idle; + se = &p->se; #ifdef CONFIG_FAIR_GROUP_SCHED - if (!prev || prev->sched_class != &fair_sched_class) + if (prev->sched_class != &fair_sched_class) goto simple; + __put_prev_set_next_dl_server(rq, prev, p); + /* * Because of the set_next_buddy() in dequeue_task_fair() it is rather * likely that a next task is from the same cgroup as the current. * * Therefore attempt to avoid putting and setting the entire cgroup * hierarchy, only change the part that actually changes. - */ - - do { - struct sched_entity *curr = cfs_rq->curr; - - /* - * Since we got here without doing put_prev_entity() we also - * have to consider cfs_rq->curr. If it is still a runnable - * entity, update_curr() will update its vruntime, otherwise - * forget we've ever seen it. - */ - if (curr) { - if (curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; - - /* - * This call to check_cfs_rq_runtime() will do the - * throttle and dequeue its entity in the parent(s). - * Therefore the nr_running test will indeed - * be correct. - */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) { - cfs_rq = &rq->cfs; - - if (!cfs_rq->nr_running) - goto idle; - - goto simple; - } - } - - se = pick_next_entity(cfs_rq); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - - p = task_of(se); - - /* + * * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the * least amount of cfs_rqs. */ if (prev != p) { struct sched_entity *pse = &prev->se; + struct cfs_rq *cfs_rq; while (!(cfs_rq = is_same_group(se, pse))) { int se_depth = se->depth; @@ -8479,48 +8931,25 @@ again: put_prev_entity(cfs_rq, pse); set_next_entity(cfs_rq, se); - } - goto done; -simple: -#endif - if (prev) - put_prev_task(rq, prev); - - do { - se = pick_next_entity(cfs_rq); - set_next_entity(cfs_rq, se); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); + __set_next_task_fair(rq, p, true); + } - p = task_of(se); + return p; -done: __maybe_unused; -#ifdef CONFIG_SMP - /* - * Move the next running task to the front of - * the list, so our cfs_tasks list becomes MRU - * one. - */ - list_move(&p->se.group_node, &rq->cfs_tasks); +simple: #endif - - if (hrtick_enabled_fair(rq)) - hrtick_start_fair(rq, p); - - update_misfit_status(p, rq); - sched_fair_update_stop_tick(rq, p); - + put_prev_set_next_task(rq, prev, p); return p; idle: if (!rf) return NULL; - new_tasks = newidle_balance(rq, rf); + new_tasks = sched_balance_newidle(rq, rf); /* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is + * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -8539,15 +8968,34 @@ idle: return NULL; } -static struct task_struct *__pick_next_task_fair(struct rq *rq) +static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) +{ + return pick_next_task_fair(rq, prev, NULL); +} + +static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) +{ + return !!dl_se->rq->cfs.nr_queued; +} + +static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) { - return pick_next_task_fair(rq, NULL, NULL); + return pick_task_fair(dl_se->rq); +} + +void fair_server_init(struct rq *rq) +{ + struct sched_dl_entity *dl_se = &rq->fair_server; + + init_dl_entity(dl_se); + + dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); } /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; @@ -8598,7 +9046,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; - /* Tell the scheduler that we'd really like pse to run next. */ + /* Tell the scheduler that we'd really like se to run next. */ set_next_buddy(se); yield_task_fair(rq); @@ -8656,7 +9104,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) * topology where each level pairs two lower groups (or better). This results * in O(log n) layers. Furthermore we reduce the number of CPUs going up the * tree to only the first of the previous level and we decrease the frequency - * of load-balance at each level inv. proportional to the number of CPUs in + * of load-balance at each level inversely proportional to the number of CPUs in * the groups. * * This yields: @@ -8858,43 +9306,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_NUMA_BALANCING /* - * Returns 1, if task migration degrades locality - * Returns 0, if task migration improves locality i.e migration preferred. - * Returns -1, if task migration is not affected by locality. + * Returns a positive value, if task migration degrades locality. + * Returns 0, if task migration is not affected by locality. + * Returns a negative value, if task migration improves locality i.e migration preferred. */ -static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; if (!static_branch_likely(&sched_numa_balancing)) - return -1; + return 0; if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return -1; + return 0; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return -1; + return 0; /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) { if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) return 1; else - return -1; + return 0; } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return 0; + return -1; /* Leaving a core idle is often worse than degrading locality. */ if (env->idle == CPU_IDLE) - return -1; + return 0; dist = node_distance(src_nid, dst_nid); if (numa_group) { @@ -8905,38 +9353,78 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) dst_weight = task_weight(p, dst_nid, dist); } - return dst_weight < src_weight; + return src_weight - dst_weight; } #else -static inline int migrate_degrades_locality(struct task_struct *p, +static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return -1; + return 0; } #endif /* + * Check whether the task is ineligible on the destination cpu + * + * When the PLACE_LAG scheduling feature is enabled and + * dst_cfs_rq->nr_queued is greater than 1, if the task + * is ineligible, it will also be ineligible when + * it is migrated to the destination cpu. + */ +static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu) +{ + struct cfs_rq *dst_cfs_rq; + +#ifdef CONFIG_FAIR_GROUP_SCHED + dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu]; +#else + dst_cfs_rq = &cpu_rq(dest_cpu)->cfs; +#endif + if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued && + !entity_eligible(task_cfs_rq(p), &p->se)) + return 1; + + return 0; +} + +/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot; + long degrades, hot; lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) + p->sched_task_hot = 0; /* * We do not migrate tasks that are: - * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_ptr, or - * 3) running (obviously), or - * 4) are cache-hot on their current CPU. + * 1) delayed dequeued unless we migrate load, or + * 2) throttled_lb_pair, or + * 3) cannot be migrated to this CPU due to cpus_ptr, or + * 4) running (obviously), or + * 5) are cache-hot on their current CPU. */ + if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) + return 0; + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - /* Disregard pcpu kthreads; they are where they need to be. */ + /* + * We want to prioritize the migration of eligible tasks. + * For ineligible tasks we soft-limit them and only allow + * them to migrate when nr_balance_failed is non-zero to + * avoid load-balancing trying very hard to balance the load. + */ + if (!env->sd->nr_balance_failed && + task_is_ineligible_on_dst_cpu(p, env->dst_cpu)) + return 0; + + /* Disregard percpu kthreads; they are where they need to be. */ if (kthread_is_per_cpu(p)) return 0; @@ -8962,12 +9450,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; /* Prevent to re-select dst_cpu via env's CPUs: */ - for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) { - env->flags |= LBF_DST_PINNED; - env->new_dst_cpu = cpu; - break; - } + cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr); + + if (cpu < nr_cpu_ids) { + env->flags |= LBF_DST_PINNED; + env->new_dst_cpu = cpu; } return 0; @@ -8991,16 +9478,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (env->flags & LBF_ACTIVE_LB) return 1; - tsk_cache_hot = migrate_degrades_locality(p, env); - if (tsk_cache_hot == -1) - tsk_cache_hot = task_hot(p, env); + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); + else + hot = degrades > 0; - if (tsk_cache_hot <= 0 || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot == 1) { - schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->stats.nr_forced_migrations); - } + if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { + if (hot) + p->sched_task_hot = 1; return 1; } @@ -9015,6 +9501,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) { + p->sched_task_hot = 0; + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->stats.nr_forced_migrations); + } + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -9082,16 +9574,12 @@ static int detach_tasks(struct lb_env *env) * We don't want to steal all, otherwise we may be treated likewise, * which could at worst lead to a livelock crash. */ - if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + if (env->idle && env->src_rq->nr_running <= 1) break; env->loop++; - /* - * We've more or less seen every task there is, call it quits - * unless we haven't found any movable task yet. - */ - if (env->loop > env->loop_max && - !(env->flags & LBF_ALL_PINNED)) + /* We've more or less seen every task there is, call it quits */ + if (env->loop > env->loop_max) break; /* take a breather every nr_migrate tasks */ @@ -9179,6 +9667,9 @@ static int detach_tasks(struct lb_env *env) continue; next: + if (p->sched_task_hot) + schedstat_inc(p->stats.nr_failed_migrations_hot); + list_move(&p->se.group_node, tasks); } @@ -9261,7 +9752,7 @@ static inline bool others_have_blocked(struct rq *rq) if (cpu_util_dl(rq)) return true; - if (thermal_load_avg(rq)) + if (hw_load_avg(rq)) return true; if (cpu_util_irq(rq)) @@ -9289,28 +9780,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) { static bool __update_blocked_others(struct rq *rq, bool *done) { - const struct sched_class *curr_class; - u64 now = rq_clock_pelt(rq); - unsigned long thermal_pressure; - bool decayed; + bool updated; /* * update_load_avg() can call cpufreq_update_util(). Make sure that RT, * DL and IRQ signals have been updated before updating CFS. */ - curr_class = rq->curr->sched_class; - - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); - - decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | - update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | - update_irq_load_avg(rq, 0); + updated = update_other_load_avgs(rq); if (others_have_blocked(rq)) *done = false; - return decayed; + return updated; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9331,7 +9812,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq); - if (cfs_rq->nr_running == 0) + if (cfs_rq->nr_queued == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); if (cfs_rq == &rq->cfs) @@ -9423,7 +9904,7 @@ static unsigned long task_h_load(struct task_struct *p) } #endif -static void update_blocked_averages(int cpu) +static void sched_balance_update_blocked_averages(int cpu) { bool decayed = false, done = true; struct rq *rq = cpu_rq(cpu); @@ -9442,25 +9923,25 @@ static void update_blocked_averages(int cpu) rq_unlock_irqrestore(rq, &rf); } -/********** Helpers for find_busiest_group ************************/ +/********** Helpers for sched_balance_find_src_group ************************/ /* - * sg_lb_stats - stats of a sched_group required for load_balancing + * sg_lb_stats - stats of a sched_group required for load-balancing: */ struct sg_lb_stats { - unsigned long avg_load; /*Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long group_capacity; - unsigned long group_util; /* Total utilization over the CPUs of the group */ - unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ - unsigned int sum_nr_running; /* Nr of tasks running in the group */ - unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ - unsigned int idle_cpus; + unsigned long avg_load; /* Avg load over the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long group_capacity; /* Capacity over the CPUs of the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ + unsigned int sum_nr_running; /* Nr of all tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ + unsigned int idle_cpus; /* Nr of idle CPUs in the group */ unsigned int group_weight; enum group_type group_type; - unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ - unsigned int group_smt_balance; /* Task on busy SMT be moved */ - unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -9468,19 +9949,18 @@ struct sg_lb_stats { }; /* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. + * sd_lb_stats - stats of a sched_domain required for load-balancing: */ struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *local; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_capacity; /* Total capacity of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - unsigned int prefer_sibling; /* tasks should go to sibling first */ - - struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ - struct sg_lb_stats local_stat; /* Statistics of the local group */ + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *local; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* Tasks should go to sibling first */ + + struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */ + struct sg_lb_stats local_stat; /* Statistics of the local group */ }; static inline void init_sd_lb_stats(struct sd_lb_stats *sds) @@ -9506,8 +9986,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) static unsigned long scale_rt_capacity(int cpu) { + unsigned long max = get_actual_cpu_capacity(cpu); struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -9519,12 +9999,9 @@ static unsigned long scale_rt_capacity(int cpu) /* * avg_rt.util_avg and avg_dl.util_avg track binary signals * (running and not running) with weights 0 and 1024 respectively. - * avg_thermal.load_avg tracks thermal pressure and the weighted - * average uses the actual delta max capacity(load). */ used = cpu_util_rt(rq); used += cpu_util_dl(rq); - used += thermal_load_avg(rq); if (unlikely(used >= max)) return 1; @@ -9617,16 +10094,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } -/* - * Check whether a rq has a misfit task and if it looks like we can actually - * help that task: we can migrate the task to a CPU of higher capacity, or - * the task's current CPU is heavily pressured. - */ -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +/* Check if the rq has a misfit task */ +static inline bool check_misfit_status(struct rq *rq) { - return rq->misfit_task_load && - (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || - check_cpu_capacity(rq, sd)); + return rq->misfit_task_load; } /* @@ -9650,7 +10121,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). And calculate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditions to allow it + * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -9798,7 +10269,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group (sgs->group_weight - sgs->idle_cpus != 1)) return false; - return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); + return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu)); } /* One group has more than one SMT CPU while the other group does not */ @@ -9815,7 +10286,7 @@ static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) { - if (env->idle == CPU_NOT_IDLE) + if (!env->idle) return false; /* @@ -9839,7 +10310,7 @@ static inline long sibling_imbalance(struct lb_env *env, int ncores_busiest, ncores_local; long imbalance; - if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) + if (!env->idle || !busiest->sum_nr_running) return 0; ncores_busiest = sds->busiest->cores; @@ -9873,7 +10344,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * When there is more than 1 task, the group_overloaded case already * takes care of cpu with reduced capacity */ - if (rq->cfs.h_nr_running != 1) + if (rq->cfs.h_nr_runnable != 1) return false; return check_cpu_capacity(rq, sd); @@ -9885,15 +10356,18 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. - * @sg_status: Holds flag indicating the status of the sched_group + * @sg_overloaded: sched_group is overloaded + * @sg_overutilized: sched_group is overutilized */ static inline void update_sg_lb_stats(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, - int *sg_status) + bool *sg_overloaded, + bool *sg_overutilized) { - int i, nr_running, local_group; + int i, nr_running, local_group, sd_flags = env->sd->flags; + bool balancing_at_rd = !env->sd->parent; memset(sgs, 0, sizeof(*sgs)); @@ -9906,21 +10380,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); - sgs->sum_h_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable; nr_running = rq->nr_running; sgs->sum_nr_running += nr_running; - if (nr_running > 1) - *sg_status |= SG_OVERLOAD; - if (cpu_overutilized(i)) - *sg_status |= SG_OVERUTILIZED; + *sg_overutilized = 1; -#ifdef CONFIG_NUMA_BALANCING - sgs->nr_numa_running += rq->nr_numa_running; - sgs->nr_preferred_running += rq->nr_preferred_running; -#endif /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -9930,17 +10397,27 @@ static inline void update_sg_lb_stats(struct lb_env *env, continue; } + /* Overload indicator is only updated at root domain */ + if (balancing_at_rd && nr_running > 1) + *sg_overloaded = 1; + +#ifdef CONFIG_NUMA_BALANCING + /* Only fbq_classify_group() uses this to classify NUMA groups */ + if (sd_flags & SD_NUMA) { + sgs->nr_numa_running += rq->nr_numa_running; + sgs->nr_preferred_running += rq->nr_preferred_running; + } +#endif if (local_group) continue; - if (env->sd->flags & SD_ASYM_CPUCAPACITY) { + if (sd_flags & SD_ASYM_CPUCAPACITY) { /* Check for a misfit task on the cpu */ if (sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; - *sg_status |= SG_OVERLOAD; + *sg_overloaded = 1; } - } else if ((env->idle != CPU_NOT_IDLE) && - sched_reduced_capacity(rq, env->sd)) { + } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { /* Check for a task running on a CPU with reduced capacity */ if (sgs->group_misfit_task_load < load) sgs->group_misfit_task_load = load; @@ -9952,7 +10429,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; /* Check if dst CPU is idle and preferred to this group */ - if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && + if (!local_group && env->idle && sgs->sum_h_nr_running && sched_group_asym(env, sgs, group)) sgs->group_asym_packing = 1; @@ -10029,7 +10506,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_asym_packing: /* Prefer to move from lowest priority CPU's work */ - return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); + return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu), + READ_ONCE(sg->asym_prefer_cpu)); case group_misfit_task: /* @@ -10090,7 +10568,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, has_spare: /* - * Select not overloaded group with lowest number of idle cpus + * Select not overloaded group with lowest number of idle CPUs * and highest number of running tasks. We could also compare * the spare capacity which is more stable but it can end up * that the group has less spare capacity but finally more idle @@ -10222,7 +10700,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_util += cpu_util_without(i, p); sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); - sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local; nr_running = rq->nr_running - local; sgs->sum_nr_running += nr_running; @@ -10310,13 +10788,13 @@ static bool update_pick_idlest(struct sched_group *idlest, } /* - * find_idlest_group() finds and returns the least busy CPU group within the + * sched_balance_find_dst_group() finds and returns the least busy CPU group within the * domain. * * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; @@ -10564,7 +11042,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; unsigned long sum_util = 0; - int sg_status = 0; + bool sg_overloaded = 0, sg_overutilized = 0; do { struct sg_lb_stats *sgs = &tmp_sgs; @@ -10580,7 +11058,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sds, sg, sgs, &sg_status); + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -10608,19 +11086,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd env->fbq_type = fbq_classify_group(&sds->busiest_stat); if (!env->sd->parent) { - struct root_domain *rd = env->dst_rq->rd; - /* update overload indicator if we are at root domain */ - WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD); + set_rd_overloaded(env->dst_rq->rd, sg_overloaded); /* Update over-utilization (tipping point, U >= 0) indicator */ - WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); - } else if (sg_status & SG_OVERUTILIZED) { - struct root_domain *rd = env->dst_rq->rd; - - WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); + } else if (sg_overutilized) { + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); } update_idle_cpu_scan(env, sum_util); @@ -10710,7 +11182,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * waiting task in this overloaded busiest group. Let's * try to pull it. */ - if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + if (env->idle && env->imbalance == 0) { env->migration_type = migrate_task; env->imbalance = 1; } @@ -10729,7 +11201,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * If there is no overload, we just want to even the number of - * idle cpus. + * idle CPUs. */ env->migration_type = migrate_task; env->imbalance = max_t(long, 0, @@ -10802,7 +11274,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ) / SCHED_CAPACITY_SCALE; } -/******* find_busiest_group() helpers end here *********************/ +/******* sched_balance_find_src_group() helpers end here *********************/ /* * Decision matrix according to the local and busiest group type: @@ -10825,7 +11297,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ /** - * find_busiest_group - Returns the busiest group within the sched_domain + * sched_balance_find_src_group - Returns the busiest group within the sched_domain * if there is an imbalance. * @env: The load balancing environment. * @@ -10834,7 +11306,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * * Return: - The busiest group if imbalance exists. */ -static struct sched_group *find_busiest_group(struct lb_env *env) +static struct sched_group *sched_balance_find_src_group(struct lb_env *env) { struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; @@ -10857,12 +11329,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_misfit_task) goto force_balance; - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) - goto out_balanced; - } + if (!is_rd_overutilized(env->dst_rq->rd) && + rcu_dereference(env->dst_rq->rd->pd)) + goto out_balanced; /* ASYM feature bypasses nice load balance check */ if (busiest->group_type == group_asym_packing) @@ -10925,7 +11394,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; if (busiest->group_type != group_overloaded) { - if (env->idle == CPU_NOT_IDLE) { + if (!env->idle) { /* * If the busiest group is not overloaded (and as a * result the local one too) but this CPU is already @@ -10973,9 +11442,9 @@ out_balanced: } /* - * find_busiest_queue - find the busiest runqueue among the CPUs in the group. + * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group. */ -static struct rq *find_busiest_queue(struct lb_env *env, +static struct rq *sched_balance_find_src_rq(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; @@ -11013,7 +11482,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - nr_running = rq->cfs.h_nr_running; + nr_running = rq->cfs.h_nr_runnable; if (!nr_running) continue; @@ -11133,7 +11602,7 @@ asym_active_balance(struct lb_env *env) * the lower priority @env::dst_cpu help it. Do not follow * CPU priority. */ - return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) && + return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) && (sched_asym_prefer(env->dst_cpu, env->src_cpu) || !sched_use_asym_prio(env->sd, env->src_cpu)); } @@ -11171,8 +11640,8 @@ static int need_active_balance(struct lb_env *env) * because of other sched_class or IRQs if more capacity stays * available on dst_cpu. */ - if ((env->idle != CPU_NOT_IDLE) && - (env->src_rq->cfs.h_nr_running == 1)) { + if (env->idle && + (env->src_rq->cfs.h_nr_runnable == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) return 1; @@ -11252,11 +11721,33 @@ static int should_we_balance(struct lb_env *env) return group_balance_cpu(sg) == env->dst_cpu; } +static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd, + enum cpu_idle_type idle) +{ + if (!schedstat_enabled()) + return; + + switch (env->migration_type) { + case migrate_load: + __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance); + break; + case migrate_util: + __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance); + break; + case migrate_task: + __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance); + break; + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; + } +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ -static int load_balance(int this_cpu, struct rq *this_rq, +static int sched_balance_rq(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { @@ -11288,13 +11779,13 @@ redo: goto out_balanced; } - group = find_busiest_group(&env); + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; } - busiest = find_busiest_queue(&env, group); + busiest = sched_balance_find_src_rq(&env, group); if (!busiest) { schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; @@ -11302,7 +11793,7 @@ redo: WARN_ON_ONCE(busiest == env.dst_rq); - schedstat_add(sd->lb_imbalance[idle], env.imbalance); + update_lb_imbalance_stat(&env, sd, idle); env.src_cpu = busiest->cpu; env.src_rq = busiest; @@ -11312,7 +11803,7 @@ redo: env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* - * Attempt to move tasks. If find_busiest_group has found + * Attempt to move tasks. If sched_balance_find_src_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. @@ -11348,9 +11839,7 @@ more_balance: if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; - /* Stop if we tried all running tasks */ - if (env.loop < busiest->nr_running) - goto more_balance; + goto more_balance; } /* @@ -11427,8 +11916,12 @@ more_balance: * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. + * + * Similarly for migration_misfit which is not related to + * load/util migration, don't pollute nr_balance_failed. */ - if (idle != CPU_NEWLY_IDLE) + if (idle != CPU_NEWLY_IDLE && + env.migration_type != migrate_misfit) sd->nr_balance_failed++; if (need_active_balance(&env)) { @@ -11507,12 +12000,17 @@ out_one_pinned: ld_moved = 0; /* - * newidle_balance() disregards balance intervals, so we could + * sched_balance_newidle() disregards balance intervals, so we could * repeatedly reach this code, which would lead to balance_interval * skyrocketing in a short amount of time. Skip the balance_interval * increase logic to avoid that. + * + * Similarly misfit migration which is not necessarily an indication of + * the system being busy and requires lb to backoff to let it settle + * down. */ - if (env.idle == CPU_NEWLY_IDLE) + if (env.idle == CPU_NEWLY_IDLE || + env.migration_type == migrate_misfit) goto out; /* tune up the balancing interval */ @@ -11645,10 +12143,23 @@ out_unlock: return 0; } -static DEFINE_SPINLOCK(balancing); +/* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); /* - * Scale the max load_balance interval with the number of CPUs in the system. + * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ void update_max_interval(void) @@ -11686,7 +12197,7 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) +static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; @@ -11723,25 +12234,25 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { - if (!spin_trylock(&balancing)) + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) goto out; } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { + if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ - idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; - busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); + idle = idle_cpu(cpu); + busy = !idle && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); } if (need_serialize) - spin_unlock(&balancing); + atomic_set_release(&sched_balance_running, 0); out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; @@ -11780,16 +12291,13 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set - * anywhere yet. */ static inline int find_new_ilb(void) { const struct cpumask *hk_mask; int ilb_cpu; - hk_mask = housekeeping_cpumask(HK_TYPE_MISC); + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { @@ -11807,7 +12315,8 @@ static inline int find_new_ilb(void) * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU * SMP function call (IPI). * - * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). + * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set + * (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -11825,6 +12334,13 @@ static void kick_ilb(unsigned int flags) return; /* + * Don't bother if no new NOHZ balance work items for ilb_cpu, + * i.e. all bits in flags are already set in ilb_cpu. + */ + if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags) + return; + + /* * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets * the first flag owns it; cleared by nohz_csd_func(). */ @@ -11888,7 +12404,7 @@ static void nohz_balancer_kick(struct rq *rq) * If there's a runnable CFS task and the current CPU has reduced * capacity, kick the ILB to see if there's a better CPU to run on: */ - if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { + if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -11901,7 +12417,7 @@ static void nohz_balancer_kick(struct rq *rq) * currently idle; in which case, kick the ILB to move tasks * around. * - * When balancing betwen cores, all the SMT siblings of the + * When balancing between cores, all the SMT siblings of the * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { @@ -11918,7 +12434,7 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU * to run the misfit task on. */ - if (check_misfit_status(rq, sd)) { + if (check_misfit_status(rq)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -11978,7 +12494,7 @@ unlock: void nohz_balance_exit_idle(struct rq *rq) { - SCHED_WARN_ON(rq != this_rq()); + WARN_ON_ONCE(rq != this_rq()); if (likely(!rq->nohz_tick_stopped)) return; @@ -12014,16 +12530,12 @@ void nohz_balance_enter_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - SCHED_WARN_ON(cpu != smp_processor_id()); + WARN_ON_ONCE(cpu != smp_processor_id()); /* If this CPU is going down, then nothing needs to be done: */ if (!cpu_active(cpu)) return; - /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) - return; - /* * Can be set safely without rq->lock held * If a clear happens, it will have evaluated last additions because @@ -12062,7 +12574,7 @@ void nohz_balance_enter_idle(int cpu) out: /* * Each time a cpu enter idle, we assume that it has blocked load and - * enable the periodic update of the load of idle cpus + * enable the periodic update of the load of idle CPUs */ WRITE_ONCE(nohz.has_blocked, 1); } @@ -12080,13 +12592,13 @@ static bool update_nohz_stats(struct rq *rq) if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) return true; - update_blocked_averages(cpu); + sched_balance_update_blocked_averages(cpu); return rq->has_blocked_load; } /* - * Internal function that runs load balance for all idle cpus. The load balance + * Internal function that runs load balance for all idle CPUs. The load balance * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. */ @@ -12101,7 +12613,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) int balance_cpu; struct rq *rq; - SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); + WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); /* * We assume there will be no idle load after this update and clear @@ -12137,7 +12649,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) * work being done for other CPUs. Next load * balancing owner will pick it up. */ - if (need_resched()) { + if (!idle_cpu(this_cpu) && need_resched()) { if (flags & NOHZ_STATS_KICK) has_blocked_load = true; if (flags & NOHZ_NEXT_KICK) @@ -12162,7 +12674,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(rq, CPU_IDLE); + sched_balance_domains(rq, CPU_IDLE); } if (time_after(next_balance, rq->next_balance)) { @@ -12191,7 +12703,7 @@ abort: /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. + * rebalancing for all the CPUs for whom scheduler ticks are stopped. */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { @@ -12222,7 +12734,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * called from this function on (this) CPU that's not yet in the mask. That's * OK because the goal of nohz_run_idle_balance() is to run ILB only for * updating the blocked load of already idle CPUs without waking up one of - * those idle CPUs and outside the preempt disable / irq off phase of the local + * those idle CPUs and outside the preempt disable / IRQ off phase of the local * cpu about to enter idle, because it can take a long time. */ void nohz_run_idle_balance(int cpu) @@ -12233,7 +12745,7 @@ void nohz_run_idle_balance(int cpu) /* * Update the blocked load only if no SCHED_SOFTIRQ is about to happen - * (ie NOHZ_STATS_KICK set) and will do the same. + * (i.e. NOHZ_STATS_KICK set) and will do the same. */ if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); @@ -12243,13 +12755,6 @@ static void nohz_newidle_balance(struct rq *this_rq) { int this_cpu = this_rq->cpu; - /* - * This CPU doesn't want to be disturbed by scheduler - * housekeeping - */ - if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) - return; - /* Will wake up very soon. No time for doing anything else*/ if (this_rq->avg_idle < sysctl_sched_migration_cost) return; @@ -12278,7 +12783,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * newidle_balance is called by schedule() if this_cpu is about to become + * sched_balance_newidle is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: @@ -12286,10 +12791,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; + int continue_balancing = 1; u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0; @@ -12304,8 +12810,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) return 0; /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. + * We must set idle_stamp _before_ calling sched_balance_rq() + * for CPU_NEWLY_IDLE, such that we measure the this duration + * as idle time. */ this_rq->idle_stamp = rq_clock(this_rq); @@ -12326,7 +12833,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (!READ_ONCE(this_rq->rd->overload) || + if (!get_rd_overloaded(this_rq->rd) || (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { if (sd) @@ -12340,11 +12847,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) raw_spin_rq_unlock(this_rq); t0 = sched_clock_cpu(this_cpu); - update_blocked_averages(this_cpu); + sched_balance_update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - int continue_balancing = 1; u64 domain_cost; update_next_balance(sd, &next_balance); @@ -12354,7 +12860,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) if (sd->flags & SD_BALANCE_NEWIDLE) { - pulled_task = load_balance(this_cpu, this_rq, + pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); @@ -12370,8 +12876,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) * Stop searching for tasks to pull if there are * now runnable tasks on this rq. */ - if (pulled_task || this_rq->nr_running > 0 || - this_rq->ttwu_pending) + if (pulled_task || !continue_balancing) break; } rcu_read_unlock(); @@ -12386,11 +12891,11 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) * have been enqueued in the meantime. Since we're not going idle, * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) + if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) + if (this_rq->nr_running != this_rq->cfs.h_nr_queued) pulled_task = -1; out: @@ -12409,19 +12914,21 @@ out: } /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + * This softirq handler is triggered via SCHED_SOFTIRQ from two places: + * + * - directly from the local sched_tick() for periodic load balancing + * + * - indirectly from a remote sched_tick() for NOHZ idle balancing + * through the SMP cross-call nohz_csd_func() */ -static __latent_entropy void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void sched_balance_softirq(void) { struct rq *this_rq = this_rq(); - enum cpu_idle_type idle = this_rq->idle_balance ? - CPU_IDLE : CPU_NOT_IDLE; - + enum cpu_idle_type idle = this_rq->idle_balance; /* - * If this CPU has a pending nohz_balance_kick, then do the + * If this CPU has a pending NOHZ_BALANCE_KICK, then do the * balancing on behalf of the other idle CPUs whose ticks are - * stopped. Do nohz_idle_balance *before* rebalance_domains to + * stopped. Do nohz_idle_balance *before* sched_balance_domains to * give the idle CPUs a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. @@ -12430,14 +12937,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) return; /* normal load balance */ - update_blocked_averages(this_rq->cpu); - rebalance_domains(this_rq, idle); + sched_balance_update_blocked_averages(this_rq->cpu); + sched_balance_domains(this_rq, idle); } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq) +void sched_balance_trigger(struct rq *rq) { /* * Don't need to rebalance while attached to NULL domain or @@ -12502,7 +13009,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check * if we need to give up the CPU. */ - if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && + if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) resched_curr(rq); } @@ -12546,7 +13053,7 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, struct cfs_rq *cfs_rqb; s64 delta; - SCHED_WARN_ON(task_rq(b)->core != rq->core); + WARN_ON_ONCE(task_rq(b)->core != rq->core); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -12621,7 +13128,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); update_misfit_status(curr, rq); - update_overutilized_status(task_rq(curr)); + check_update_overutilized_status(task_rq(curr)); task_tick_core(rq, curr); } @@ -12633,20 +13140,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { - struct sched_entity *se = &p->se, *curr; - struct cfs_rq *cfs_rq; - struct rq *rq = this_rq(); - struct rq_flags rf; - - rq_lock(rq, &rf); - update_rq_clock(rq); - - cfs_rq = task_cfs_rq(current); - curr = cfs_rq->curr; - if (curr) - update_curr(cfs_rq); - place_entity(cfs_rq, se, ENQUEUE_INITIAL); - rq_unlock(rq, &rf); + set_task_max_allowed_capacity(p); } /* @@ -12659,7 +13153,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->cfs.nr_running == 1) + if (rq->cfs.nr_queued == 1) return; /* @@ -12667,7 +13161,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else @@ -12762,27 +13256,26 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) static void switched_to_fair(struct rq *rq, struct task_struct *p) { + WARN_ON_ONCE(p->se.sched_delayed); + attach_task_cfs_rq(p); + set_task_max_allowed_capacity(p); + if (task_on_rq_queued(p)) { /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (task_current(rq, p)) + if (task_current_donor(rq, p)) resched_curr(rq); else wakeup_preempt(rq, p, 0); } } -/* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. - */ -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { struct sched_entity *se = &p->se; @@ -12795,6 +13288,27 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) list_move(&se->group_node, &rq->cfs_tasks); } #endif + if (!first) + return; + + WARN_ON_ONCE(se->sched_delayed); + + if (hrtick_enabled_fair(rq)) + hrtick_start_fair(rq, p); + + update_misfit_status(p, rq); + sched_fair_update_stop_tick(rq, p); +} + +/* + * Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +{ + struct sched_entity *se = &p->se; for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -12803,12 +13317,14 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) /* ensure bandwidth has been allocated on our new cfs_rq */ account_cfs_rq_runtime(cfs_rq, 0); } + + __set_next_task_fair(rq, p, first); } void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; - u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif @@ -12910,28 +13426,35 @@ void online_fair_sched_group(struct task_group *tg) void unregister_fair_sched_group(struct task_group *tg) { - unsigned long flags; - struct rq *rq; int cpu; destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(cpu) { - if (tg->se[cpu]) - remove_entity_load_avg(tg->se[cpu]); + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; + struct sched_entity *se = tg->se[cpu]; + struct rq *rq = cpu_rq(cpu); + + if (se) { + if (se->sched_delayed) { + guard(rq_lock_irqsave)(rq); + if (se->sched_delayed) { + update_rq_clock(rq); + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + } + list_del_leaf_cfs_rq(cfs_rq); + } + remove_entity_load_avg(se); + } /* * Only empty task groups can be destroyed; so we can speculatively * check on_list without danger of it being re-added. */ - if (!tg->cfs_rq[cpu]->on_list) - continue; - - rq = cpu_rq(cpu); - - raw_spin_rq_lock_irqsave(rq, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_rq_unlock_irqrestore(rq, flags); + if (cfs_rq->on_list) { + guard(rq_lock_irqsave)(rq); + list_del_leaf_cfs_rq(cfs_rq); + } } } @@ -13040,7 +13563,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); struct sched_entity *se = tg->se[i]; - struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf; @@ -13051,16 +13574,8 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) goto next_cpu; - if (se->on_rq) { - parent_cfs_rq = cfs_rq_of(se); - if (cfs_rq_is_idle(grp_cfs_rq)) - parent_cfs_rq->idle_nr_running++; - else - parent_cfs_rq->idle_nr_running--; - } - - idle_task_delta = grp_cfs_rq->h_nr_running - - grp_cfs_rq->idle_h_nr_running; + idle_task_delta = grp_cfs_rq->h_nr_queued - + grp_cfs_rq->h_nr_idle; if (!cfs_rq_is_idle(grp_cfs_rq)) idle_task_delta *= -1; @@ -13070,7 +13585,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (!se->on_rq) break; - cfs_rq->idle_h_nr_running += idle_task_delta; + cfs_rq->h_nr_idle += idle_task_delta; /* Already accounted at parent level and above. */ if (cfs_rq_is_idle(cfs_rq)) @@ -13121,13 +13636,13 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = check_preempt_wakeup_fair, + .pick_task = pick_task_fair, .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP .balance = balance_fair, - .pick_task = pick_task_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13135,12 +13650,13 @@ DEFINE_SCHED_CLASS(fair) = { .rq_offline = rq_offline_fair, .task_dead = task_dead_fair, - .set_cpus_allowed = set_cpus_allowed_common, + .set_cpus_allowed = set_cpus_allowed_fair, #endif .task_tick = task_tick_fair, .task_fork = task_fork_fair, + .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, @@ -13162,7 +13678,6 @@ DEFINE_SCHED_CLASS(fair) = { #endif }; -#ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq, *pos; @@ -13196,7 +13711,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ __init void init_sched_fair_class(void) { @@ -13215,7 +13729,7 @@ __init void init_sched_fair_class(void) #endif } - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); + open_softirq(SCHED_SOFTIRQ, sched_balance_softirq); #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 143f55df890b..3c12d9f93331 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -5,8 +5,24 @@ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. */ SCHED_FEAT(PLACE_LAG, true) +/* + * Give new tasks half a slice to ease into the competition. + */ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) +/* + * Preserve relative virtual deadline on 'migration'. + */ +SCHED_FEAT(PLACE_REL_DEADLINE, true) +/* + * Inhibit (wakeup) preemption until the current task has either matched the + * 0-lag point or until is has exhausted it's slice. + */ SCHED_FEAT(RUN_TO_PARITY, true) +/* + * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for + * current. + */ +SCHED_FEAT(PREEMPT_SHORT, true) /* * Prefer to schedule the task we woke last (assuming it failed @@ -16,19 +32,39 @@ SCHED_FEAT(RUN_TO_PARITY, true) SCHED_FEAT(NEXT_BUDDY, false) /* + * Allow completely ignoring cfs_rq->next; which can be set from various + * places: + * - NEXT_BUDDY (wakeup preemption) + * - yield_to_task() + * - cgroup dequeue / pick + */ +SCHED_FEAT(PICK_BUDDY, true) + +/* * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality. */ SCHED_FEAT(CACHE_HOT_BUDDY, true) /* + * Delay dequeueing tasks until they get selected or woken. + * + * By delaying the dequeue for non-eligible tasks, they remain in the + * competition and can burn off their negative lag. When they get selected + * they'll have positive lag by definition. + * + * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0. + */ +SCHED_FEAT(DELAY_DEQUEUE, true) +SCHED_FEAT(DELAY_ZERO, true) + +/* * Allow wakeup-time preemption of the current task: */ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(HRTICK_DL, false) -SCHED_FEAT(DOUBLE_TICK, false) /* * Decrement CPU capacity based on time not spent running tasks @@ -85,5 +121,3 @@ SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(LATENCY_WARN, false) - -SCHED_FEAT(HZ_BW, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6135fbe83d68..2c85c86b455f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -172,19 +172,13 @@ static void cpuidle_idle_call(void) /* * Check if the idle task must be rescheduled. If it is the - * case, exit the function after re-enabling the local irq. + * case, exit the function after re-enabling the local IRQ. */ if (need_resched()) { local_irq_enable(); return; } - /* - * The RCU framework needs to be told that we are entering an idle - * section, so no more rcu read side critical sections and one more - * step to the grace period - */ - if (cpuidle_not_available(drv, dev)) { tick_nohz_idle_stop_tick(); @@ -244,7 +238,7 @@ exit_idle: __current_set_polling(); /* - * It is up to the idle functions to reenable local interrupts + * It is up to the idle functions to re-enable local interrupts */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); @@ -277,7 +271,6 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - rmb(); /* * Interrupts shouldn't be re-enabled from that point on until @@ -320,7 +313,7 @@ static void do_idle(void) rcu_nocb_flush_deferred_wakeup(); /* - * In poll mode we reenable interrupts and spin. Also if we + * In poll mode we re-enable interrupts and spin. Also if we * detected in the wakeup from idle path that the tick * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. @@ -405,8 +398,8 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) cpuidle_use_deepest_state(latency_ns); it.done = 0; - hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - it.timer.function = idle_inject_timer_fn; + hrtimer_setup_on_stack(&it.timer, idle_inject_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); hrtimer_start(&it.timer, ns_to_ktime(duration_ns), HRTIMER_MODE_REL_PINNED_HARD); @@ -456,43 +449,38 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) resched_curr(rq); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + dl_server_update_idle_time(rq, prev); + scx_update_idle(rq, false, true); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { update_idle_core(rq); + scx_update_idle(rq, true, true); schedstat_inc(rq->sched_goidle); + next->se.exec_start = rq_clock_task(rq); } -#ifdef CONFIG_SMP -static struct task_struct *pick_task_idle(struct rq *rq) +struct task_struct *pick_task_idle(struct rq *rq) { + scx_update_idle(rq, true, false); return rq->idle; } -#endif - -struct task_struct *pick_next_task_idle(struct rq *rq) -{ - struct task_struct *next = rq->idle; - - set_next_task_idle(rq, next, true); - - return next; -} /* * It is not legal to sleep in the idle task - print a warning * message if some code attempts to do it: */ -static void +static bool dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) { raw_spin_rq_unlock_irq(rq); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); raw_spin_rq_lock_irq(rq); + return true; } /* @@ -534,13 +522,12 @@ DEFINE_SCHED_CLASS(idle) = { .wakeup_preempt = wakeup_preempt_idle, - .pick_next_task = pick_next_task_idle, + .pick_task = pick_task_idle, .put_prev_task = put_prev_task_idle, .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP .balance = balance_idle, - .pick_task = pick_task_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 5891e715f00d..93b038d48900 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -9,15 +9,9 @@ */ enum hk_flags { - HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), - HK_FLAG_RCU = BIT(HK_TYPE_RCU), - HK_FLAG_MISC = BIT(HK_TYPE_MISC), - HK_FLAG_SCHED = BIT(HK_TYPE_SCHED), - HK_FLAG_TICK = BIT(HK_TYPE_TICK), HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), - HK_FLAG_WQ = BIT(HK_TYPE_WQ), HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), - HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD), + HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE), }; DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); @@ -46,7 +40,7 @@ int housekeeping_any_cpu(enum hk_type type) if (cpu < nr_cpu_ids) return cpu; - cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); + cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask); if (likely(cpu < nr_cpu_ids)) return cpu; /* @@ -97,7 +91,7 @@ void __init housekeeping_init(void) static_branch_enable(&housekeeping_overridden); - if (housekeeping.flags & HK_FLAG_TICK) + if (housekeeping.flags & HK_FLAG_KERNEL_NOISE) sched_tick_offload_init(); for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) { @@ -121,7 +115,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) unsigned int first_cpu; int err = 0; - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) { + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) { if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { pr_warn("Housekeeping: nohz unsupported." " Build with CONFIG_NO_HZ_FULL\n"); @@ -177,7 +171,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) housekeeping_setup_type(type, housekeeping_staging); } - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) tick_nohz_full_setup(non_housekeeping_mask); housekeeping.flags |= flags; @@ -195,8 +189,7 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned long flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | - HK_FLAG_MISC | HK_FLAG_KTHREAD; + flags = HK_FLAG_KERNEL_NOISE; return housekeeping_setup(str, flags); } @@ -210,9 +203,12 @@ static int __init housekeeping_isolcpus_setup(char *str) int len; while (isalpha(*str)) { + /* + * isolcpus=nohz is equivalent to nohz_full. + */ if (!strncmp(str, "nohz,", 5)) { str += 5; - flags |= HK_FLAG_TICK; + flags |= HK_FLAG_KERNEL_NOISE; continue; } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 52c8f8226b0d..c48900b856a2 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -45,7 +45,7 @@ * again, being late doesn't loose the delta, just wrecks the sample. * * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because - * this would add another cross-CPU cacheline miss and atomic operation + * this would add another cross-CPU cache-line miss and atomic operation * to the wakeup path. Instead we increment on whatever CPU the task ran * when it went into uninterruptible state and decrement on whatever CPU * did the wakeup. This means that only the sum of nr_uninterruptible over @@ -62,7 +62,7 @@ EXPORT_SYMBOL(avenrun); /* should be removed */ /** * get_avenrun - get the load average array - * @loads: pointer to dest load array + * @loads: pointer to destination load array * @offset: offset to add * @shift: shift count to shift the result left * @@ -379,7 +379,7 @@ void calc_global_load(void) } /* - * Called from scheduler_tick() to periodically update this CPU's + * Called from sched_tick() to periodically update this CPU's * active count. */ void calc_global_load_tick(struct rq *this_rq) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 63b6cf898220..7a8534a2deff 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -208,8 +208,8 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * se has been already dequeued but cfs_rq->curr still points to it. * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, - * this happens during idle_balance() which calls - * update_blocked_averages(). + * this happens during sched_balance_newidle() which calls + * sched_balance_update_blocked_averages(). * * Also see the comment in accumulate_sum(). */ @@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = grq->h_nr_running + * se_runnable() = grq->h_nr_runnable * * runnable_sum = se_runnable() * runnable = grq->runnable_sum * runnable_avg = runnable_sum @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->h_nr_running, + cfs_rq->h_nr_runnable, cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1); @@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } -#ifdef CONFIG_SCHED_THERMAL_PRESSURE +#ifdef CONFIG_SCHED_HW_PRESSURE /* - * thermal: + * hardware: * * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked * * util_avg and runnable_load_avg are not supported and meaningless. * * Unlike rt/dl utilization tracking that track time spent by a cpu - * running a rt/dl task through util_avg, the average thermal pressure is - * tracked through load_avg. This is because thermal pressure signal is + * running a rt/dl task through util_avg, the average HW pressure is + * tracked through load_avg. This is because HW pressure signal is * time weighted "delta" capacity unlike util_avg which is binary. * "delta capacity" = actual capacity - - * capped capacity a cpu due to a thermal event. + * capped capacity a cpu due to a HW event. */ -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { - if (___update_load_sum(now, &rq->avg_thermal, + if (___update_load_sum(now, &rq->avg_hw, capacity, capacity, capacity)) { - ___update_load_avg(&rq->avg_thermal, 1); - trace_pelt_thermal_tp(rq); + ___update_load_avg(&rq->avg_hw, 1); + trace_pelt_hw_tp(rq); return 1; } @@ -417,7 +417,7 @@ int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* - * irq: + * IRQ: * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum @@ -432,7 +432,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) int ret = 0; /* - * We can't use clock_pelt because irq time is not accounted in + * We can't use clock_pelt because IRQ time is not accounted in * clock_task. Instead we directly scale the running time to * reflect the real amount of computation */ @@ -467,3 +467,23 @@ int update_irq_load_avg(struct rq *rq, u64 running) return ret; } #endif + +/* + * Load avg and utiliztion metrics need to be updated periodically and before + * consumption. This function updates the metrics for all subsystems except for + * the fair class. @rq must be locked and have its clock updated. + */ +bool update_other_load_avgs(struct rq *rq) +{ + u64 now = rq_clock_pelt(rq); + const struct sched_class *curr_class = rq->donor->sched_class; + unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); + + lockdep_assert_rq_held(rq); + + /* hw_pressure doesn't care about invariance */ + return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) | + update_irq_load_avg(rq, 0); +} diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 9e1083465fbc..f4f6a0875c66 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -6,22 +6,23 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); +bool update_other_load_avgs(struct rq *rq); -#ifdef CONFIG_SCHED_THERMAL_PRESSURE -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); +#ifdef CONFIG_SCHED_HW_PRESSURE +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity); -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { - return READ_ONCE(rq->avg_thermal.load_avg); + return READ_ONCE(rq->avg_hw.load_avg); } #else static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } @@ -202,12 +203,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) } static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7b4aa5809c0f..1396674fa722 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -41,7 +41,7 @@ * What it means for a task to be productive is defined differently * for each resource. For IO, productive means a running task. For * memory, productive means a running task that isn't a reclaimer. For - * CPU, productive means an oncpu task. + * CPU, productive means an on-CPU task. * * Naturally, the FULL state doesn't exist for the CPU resource at the * system level, but exist at the cgroup level. At the cgroup level, @@ -49,7 +49,7 @@ * resource which is being used by others outside of the cgroup or * throttled by the cgroup cpu.max configuration. * - * The percentage of wallclock time spent in those compound stall + * The percentage of wall clock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, * where the SOME percentage indicates workload slowdowns and the FULL * percentage indicates reduced CPU utilization: @@ -218,28 +218,32 @@ void __init psi_init(void) group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu) +static u32 test_states(unsigned int *tasks, u32 state_mask) { - switch (state) { - case PSI_IO_SOME: - return unlikely(tasks[NR_IOWAIT]); - case PSI_IO_FULL: - return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]); - case PSI_MEM_SOME: - return unlikely(tasks[NR_MEMSTALL]); - case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && - tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); - case PSI_CPU_SOME: - return unlikely(tasks[NR_RUNNING] > oncpu); - case PSI_CPU_FULL: - return unlikely(tasks[NR_RUNNING] && !oncpu); - case PSI_NONIDLE: - return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || - tasks[NR_RUNNING]; - default: - return false; + const bool oncpu = state_mask & PSI_ONCPU; + + if (tasks[NR_IOWAIT]) { + state_mask |= BIT(PSI_IO_SOME); + if (!tasks[NR_RUNNING]) + state_mask |= BIT(PSI_IO_FULL); + } + + if (tasks[NR_MEMSTALL]) { + state_mask |= BIT(PSI_MEM_SOME); + if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]) + state_mask |= BIT(PSI_MEM_FULL); } + + if (tasks[NR_RUNNING] > oncpu) + state_mask |= BIT(PSI_CPU_SOME); + + if (tasks[NR_RUNNING] && !oncpu) + state_mask |= BIT(PSI_CPU_FULL); + + if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]) + state_mask |= BIT(PSI_NONIDLE); + + return state_mask; } static void get_recent_times(struct psi_group *group, int cpu, @@ -345,7 +349,7 @@ static void collect_percpu_times(struct psi_group *group, /* * Collect the per-cpu time buckets and average them into a - * single time sample that is normalized to wallclock time. + * single time sample that is normalized to wall clock time. * * For averaging, each CPU is weighted by its non-idle time in * the sampling period. This eliminates artifacts from uneven @@ -765,14 +769,15 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) } static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set, u64 now, + unsigned int clear, unsigned int set, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; - enum psi_states s; u32 state_mask; + u64 now; + lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* @@ -785,6 +790,7 @@ static void psi_group_change(struct psi_group *group, int cpu, * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); /* * Start with TSK_ONCPU, which doesn't have a corresponding @@ -841,10 +847,7 @@ static void psi_group_change(struct psi_group *group, int cpu, return; } - for (s = 0; s < NR_PSI_STATES; s++) { - if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU)) - state_mask |= (1 << s); - } + state_mask = test_states(groupc->tasks, state_mask); /* * Since we care about lost potential, a memstall is FULL @@ -898,18 +901,15 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - now = cpu_clock(cpu); - group = task_psi_group(task); do { - psi_group_change(group, cpu, clear, set, now, true); + psi_group_change(group, cpu, clear, set, true); } while ((group = group->parent)); } @@ -918,7 +918,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - u64 now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -935,7 +934,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, break; } - psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + psi_group_change(group, cpu, 0, TSK_ONCPU, true); } while ((group = group->parent)); } @@ -973,7 +972,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, do { if (group == common) break; - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, wake_clock); } while ((group = group->parent)); /* @@ -985,35 +984,47 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, wake_clock); } } } #ifdef CONFIG_IRQ_TIME_ACCOUNTING -void psi_account_irqtime(struct task_struct *task, u32 delta) +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { - int cpu = task_cpu(task); + int cpu = task_cpu(curr); struct psi_group *group; struct psi_group_cpu *groupc; - u64 now; + s64 delta; + u64 irq; - if (static_branch_likely(&psi_disabled)) + if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) return; - if (!task->pid) + if (!curr->pid) return; - now = cpu_clock(cpu); + lockdep_assert_rq_held(rq); + group = task_psi_group(curr); + if (prev && task_psi_group(prev) == group) + return; + + irq = irq_time_read(cpu); + delta = (s64)(irq - rq->psi_irq_time); + if (delta < 0) + return; + rq->psi_irq_time = irq; - group = task_psi_group(task); do { + u64 now; + if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; @@ -1194,7 +1205,7 @@ void psi_cgroup_restart(struct psi_group *group) /* * After we disable psi_group->enabled, we don't actually * stop percpu tasks accounting in each psi_group_cpu, - * instead only stop test_state() loop, record_times() + * instead only stop test_states() loop, record_times() * and averaging worker, see psi_group_change() for details. * * When disable cgroup PSI, this function has nothing to sync @@ -1202,7 +1213,7 @@ void psi_cgroup_restart(struct psi_group *group) * would see !psi_group->enabled and only do task accounting. * * When re-enable cgroup PSI, this function use psi_group_change() - * to get correct state mask from test_state() loop on tasks[], + * to get correct state mask from test_states() loop on tasks[], * and restart groupc->state_start from now, use .clear = .set = 0 * here since no task status really changed. */ @@ -1212,11 +1223,9 @@ void psi_cgroup_restart(struct psi_group *group) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); struct rq_flags rf; - u64 now; rq_lock_irq(rq, &rf); - now = cpu_clock(cpu); - psi_group_change(group, cpu, 0, 0, now, true); + psi_group_change(group, cpu, 0, 0, true); rq_unlock_irq(rq, &rf); } } @@ -1231,6 +1240,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (!irqtime_enabled() && res == PSI_IRQ) + return -EOPNOTSUPP; +#endif + /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); @@ -1426,7 +1440,7 @@ void psi_trigger_destroy(struct psi_trigger *t) group->rtpoll_task, lockdep_is_held(&group->rtpoll_trigger_lock)); rcu_assign_pointer(group->rtpoll_task, NULL); - del_timer(&group->rtpoll_timer); + timer_delete(&group->rtpoll_timer); } } mutex_unlock(&group->rtpoll_trigger_lock); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3261b067b67e..e40422c37033 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -8,10 +8,6 @@ int sched_rr_timeslice = RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); - -struct rt_bandwidth def_rt_bandwidth; - /* * period over which we measure -rt task CPU usage in us. * default: 1s @@ -26,11 +22,11 @@ int sysctl_sched_rt_runtime = 950000; #ifdef CONFIG_SYSCTL static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ; -static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -static struct ctl_table sched_rt_sysctls[] = { +static const struct ctl_table sched_rt_sysctls[] = { { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, @@ -56,7 +52,6 @@ static struct ctl_table sched_rt_sysctls[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, - {} }; static int __init sched_rt_sysctl_init(void) @@ -67,6 +62,41 @@ static int __init sched_rt_sysctl_init(void) late_initcall(sched_rt_sysctl_init); #endif +void init_rt_rq(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + rt_rq->highest_prio.next = MAX_RT_PRIO-1; + rt_rq->overloaded = 0; + plist_head_init(&rt_rq->pushable_tasks); +#endif /* CONFIG_SMP */ + /* We start is dequeued state, because no RT tasks are queued */ + rt_rq->rt_queued = 0; + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + raw_spin_lock_init(&rt_rq->rt_runtime_lock); + rt_rq->tg = &root_task_group; +#endif +} + +#ifdef CONFIG_RT_GROUP_SCHED + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = @@ -98,9 +128,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) raw_spin_lock_init(&rt_b->rt_runtime_lock); - hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_HARD); - rt_b->rt_period_timer.function = sched_rt_period_timer; + hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); } static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) @@ -131,35 +160,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) do_start_rt_bandwidth(rt_b); } -void init_rt_rq(struct rt_rq *rt_rq) -{ - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO-1; - rt_rq->highest_prio.next = MAX_RT_PRIO-1; - rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks); -#endif /* CONFIG_SMP */ - /* We start is dequeued state, because no RT tasks are queued */ - rt_rq->rt_queued = 0; - - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - raw_spin_lock_init(&rt_rq->rt_runtime_lock); -} - -#ifdef CONFIG_RT_GROUP_SCHED static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) { hrtimer_cancel(&rt_b->rt_period_timer); @@ -169,19 +169,21 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_SCHED_DEBUG WARN_ON_ONCE(!rt_entity_is_task(rt_se)); -#endif + return container_of(rt_se, struct task_struct, rt); } static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { + /* Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout */ + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); return rt_rq->rq; } static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) { + WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group); return rt_se->rt_rq; } @@ -189,20 +191,26 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_se->rt_rq; + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); return rt_rq->rq; } void unregister_rt_sched_group(struct task_group *tg) { + if (!rt_group_sched_enabled()) + return; + if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); - } void free_rt_sched_group(struct task_group *tg) { int i; + if (!rt_group_sched_enabled()) + return; + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -247,6 +255,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) struct sched_rt_entity *rt_se; int i; + if (!rt_group_sched_enabled()) + return 1; + tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL); if (!tg->rt_rq) goto err; @@ -254,8 +265,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!tg->rt_se) goto err; - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); + init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0); for_each_possible_cpu(i) { rt_rq = kzalloc_node(sizeof(struct rt_rq), @@ -486,9 +496,6 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - if (!rt_rq->tg) - return RUNTIME_INF; - return rt_rq->rt_runtime; } @@ -501,6 +508,11 @@ typedef struct task_group *rt_rq_iter_t; static inline struct task_group *next_task_group(struct task_group *tg) { + if (!rt_group_sched_enabled()) { + WARN_ON(tg != &root_task_group); + return NULL; + } + do { tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list); @@ -513,9 +525,9 @@ static inline struct task_group *next_task_group(struct task_group *tg) } #define for_each_rt_rq(rt_rq, iter, rq) \ - for (iter = container_of(&task_groups, typeof(*iter), list); \ - (iter = next_task_group(iter)) && \ - (rt_rq = iter->rt_rq[cpu_of(rq)]);) + for (iter = &root_task_group; \ + iter && (rt_rq = iter->rt_rq[cpu_of(rq)]); \ + iter = next_task_group(iter)) #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = rt_se->parent) @@ -530,7 +542,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor; struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; @@ -544,7 +556,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) else if (!on_rt_rq(rt_se)) enqueue_rt_entity(rt_se, 0); - if (rt_rq->highest_prio.curr < curr->prio) + if (rt_rq->highest_prio.curr < donor->prio) resched_curr(rq); } } @@ -605,70 +617,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) return &rt_rq->tg->rt_bandwidth; } -#else /* !CONFIG_RT_GROUP_SCHED */ - -static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) -{ - return rt_rq->rt_runtime; -} - -static inline u64 sched_rt_period(struct rt_rq *rt_rq) -{ - return ktime_to_ns(def_rt_bandwidth.rt_period); -} - -typedef struct rt_rq *rt_rq_iter_t; - -#define for_each_rt_rq(rt_rq, iter, rq) \ - for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - -#define for_each_sched_rt_entity(rt_se) \ - for (; rt_se; rt_se = NULL) - -static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) -{ - return NULL; -} - -static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) -{ - struct rq *rq = rq_of_rt_rq(rt_rq); - - if (!rt_rq->rt_nr_running) - return; - - enqueue_top_rt_rq(rt_rq); - resched_curr(rq); -} - -static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -{ - dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); -} - -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} - -static inline -struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) -{ - return &cpu_rq(cpu)->rt; -} - -static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) -{ - return &def_rt_bandwidth; -} - -#endif /* CONFIG_RT_GROUP_SCHED */ - bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -860,7 +808,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) const struct cpumask *span; span = sched_rt_period_mask(); -#ifdef CONFIG_RT_GROUP_SCHED + /* * FIXME: isolated CPUs should really leave the root task group, * whether they are isolcpus or were isolated via cpusets, lest @@ -872,7 +820,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) */ if (rt_b == &root_task_group.rt_bandwidth) span = cpu_online_mask; -#endif + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); @@ -939,18 +887,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) return idle; } -static inline int rt_se_prio(struct sched_rt_entity *rt_se) -{ -#ifdef CONFIG_RT_GROUP_SCHED - struct rt_rq *rt_rq = group_rt_rq(rt_se); - - if (rt_rq) - return rt_rq->highest_prio.curr; -#endif - - return rt_task_of(rt_se)->prio; -} - static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -994,23 +930,91 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } +#else /* !CONFIG_RT_GROUP_SCHED */ + +typedef struct rt_rq *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +#define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = NULL) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ + return NULL; +} + +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (!rt_rq->rt_nr_running) + return; + + enqueue_top_rt_rq(rt_rq); + resched_curr(rq); +} + +static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +{ + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return false; +} + +static inline const struct cpumask *sched_rt_period_mask(void) +{ + return cpu_online_mask; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + +#ifdef CONFIG_SMP +static void __enable_runtime(struct rq *rq) { } +static void __disable_runtime(struct rq *rq) { } +#endif + +#endif /* CONFIG_RT_GROUP_SCHED */ + +static inline int rt_se_prio(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_RT_GROUP_SCHED + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq) + return rt_rq->highest_prio.curr; +#endif + + return rt_task_of(rt_se)->prio; +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ static void update_curr_rt(struct rq *rq) { - struct task_struct *curr = rq->curr; - struct sched_rt_entity *rt_se = &curr->rt; + struct task_struct *donor = rq->donor; s64 delta_exec; - if (curr->sched_class != &rt_sched_class) + if (donor->sched_class != &rt_sched_class) return; delta_exec = update_curr_common(rq); if (unlikely(delta_exec <= 0)) return; +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity *rt_se = &donor->rt; + if (!rt_bandwidth_enabled()) return; @@ -1029,6 +1033,7 @@ static void update_curr_rt(struct rq *rq) do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } +#endif } static void @@ -1077,13 +1082,12 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); -#ifdef CONFIG_RT_GROUP_SCHED /* * Change rq's cpupri only if rt_rq is the top queue. */ - if (&rq->rt != rt_rq) + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) return; -#endif + if (rq->online && prio < prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); } @@ -1093,13 +1097,12 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); -#ifdef CONFIG_RT_GROUP_SCHED /* * Change rq's cpupri only if rt_rq is the top queue. */ - if (&rq->rt != rt_rq) + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) return; -#endif + if (rq->online && rt_rq->highest_prio.curr != prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } @@ -1136,7 +1139,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) /* * This may have been our highest task, and therefore - * we may have some recomputation to do + * we may have some re-computation to do */ if (prio == prev_prio) { struct rt_prio_array *array = &rt_rq->active; @@ -1167,8 +1170,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; - if (rt_rq->tg) - start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); } static void @@ -1185,7 +1187,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { - start_rt_bandwidth(&def_rt_bandwidth); } static inline @@ -1269,11 +1270,9 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr static inline struct sched_statistics * __schedstats_from_rt_se(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_RT_GROUP_SCHED /* schedstats is not supported for rt group. */ if (!rt_entity_is_task(rt_se)) return NULL; -#endif return &rt_task_of(rt_se)->stats; } @@ -1493,7 +1492,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_pushable_task(rq, p); } -static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) +static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; @@ -1501,6 +1500,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dequeue_rt_entity(rt_se, flags); dequeue_pushable_task(rq, p); + + return true; } /* @@ -1543,7 +1544,7 @@ static int find_lowest_rq(struct task_struct *task); static int select_task_rq_rt(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; struct rq *rq; bool test; @@ -1555,6 +1556,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If the current task on @p's runqueue is an RT task, then @@ -1572,7 +1574,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) * * For equal prio tasks, we just let the scheduler sort it out. * - * Otherwise, just let it ride on the affined RQ and the + * Otherwise, just let it ride on the affine RQ and the * post-schedule router will push the preempted task away * * This test is optimistic, if we get it wrong the load-balancer @@ -1583,8 +1585,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) * systems like big.LITTLE. */ test = curr && - unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + unlikely(rt_task(donor)) && + (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio); if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); @@ -1614,12 +1616,8 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - /* - * Current can't be migrated, useless to reschedule, - * let's hope p can move out. - */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->donor, NULL)) return; /* @@ -1662,7 +1660,9 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) */ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { - if (p->prio < rq->curr->prio) { + struct task_struct *donor = rq->donor; + + if (p->prio < donor->prio) { resched_curr(rq); return; } @@ -1680,7 +1680,7 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) * to move current somewhere else, making room for our non-migratable * task. */ - if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) + if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); #endif } @@ -1705,7 +1705,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f * utilization. We only care of the case where we start to schedule a * rt task */ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->donor->sched_class != &rt_sched_class) update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); rt_queue_push_tasks(rq); @@ -1722,7 +1722,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; - if (SCHED_WARN_ON(list_empty(queue))) + if (WARN_ON_ONCE(list_empty(queue))) return NULL; next = list_entry(queue->next, struct sched_rt_entity, run_list); @@ -1756,17 +1756,7 @@ static struct task_struct *pick_task_rt(struct rq *rq) return p; } -static struct task_struct *pick_next_task_rt(struct rq *rq) -{ - struct task_struct *p = pick_task_rt(rq); - - if (p) - set_next_task_rt(rq, p, true); - - return p; -} - -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next) { struct sched_rt_entity *rt_se = &p->rt; struct rt_rq *rt_rq = &rq->rt; @@ -1791,15 +1781,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_on_cpu(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - - return 0; -} - /* * Return the highest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise @@ -1813,7 +1794,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) return NULL; plist_for_each_entry(p, head, pushable_tasks) { - if (pick_rt_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; } @@ -1913,6 +1894,27 @@ static int find_lowest_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!task_on_rq_queued(p)); + BUG_ON(!rt_task(p)); + + return p; +} + /* Will lock the rq it finds */ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) { @@ -1943,18 +1945,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) /* * We had to unlock the run queue. In * the mean time, task could have - * migrated already or had its affinity changed. - * Also make sure that it wasn't scheduled on its rq. + * migrated already or had its affinity changed, + * therefore check if the task is still at the + * head of the pushable tasks list. * It is possible the task was scheduled, set * "migrate_disabled" and then got preempted, so we must * check the task migration disable flag here too. */ - if (unlikely(task_rq(task) != rq || + if (unlikely(is_migration_disabled(task) || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || - task_on_cpu(rq, task) || - !rt_task(task) || - is_migration_disabled(task) || - !task_on_rq_queued(task))) { + task != pick_next_pushable_task(rq))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1974,26 +1974,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static struct task_struct *pick_next_pushable_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_tasks(rq)) - return NULL; - - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); - - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!rt_task(p)); - - return p; -} - /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task @@ -2018,7 +1998,7 @@ retry: * higher priority than current. If that's the case * just reschedule current. */ - if (unlikely(next_task->prio < rq->curr->prio)) { + if (unlikely(next_task->prio < rq->donor->prio)) { resched_curr(rq); return 0; } @@ -2039,7 +2019,7 @@ retry: * Note that the stoppers are masqueraded as SCHED_FIFO * (cf. sched_set_stop_task()), so we can't rely on rt_task(). */ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->donor->sched_class != &rt_sched_class) return 0; cpu = find_lowest_rq(rq->curr); @@ -2106,9 +2086,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, lowest_rq->cpu); - activate_task(lowest_rq, next_task, 0); + move_queued_task_locked(rq, lowest_rq, next_task); resched_curr(lowest_rq); ret = 1; @@ -2148,14 +2126,14 @@ static void push_rt_tasks(struct rq *rq) * if its the only CPU with multiple RT tasks queued, and a large number * of CPUs scheduling a lower priority task at the same time. * - * Each root domain has its own irq work function that can iterate over + * Each root domain has its own IRQ work function that can iterate over * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT * task must be checked if there's one or many CPUs that are lowering - * their priority, there's a single irq work iterator that will try to + * their priority, there's a single IRQ work iterator that will try to * push off RT tasks that are waiting to run. * * When a CPU schedules a lower priority task, it will kick off the - * irq work iterator that will jump to each CPU with overloaded RT tasks. + * IRQ work iterator that will jump to each CPU with overloaded RT tasks. * As it only takes the first CPU that schedules a lower priority task * to start the process, the rto_start variable is incremented and if * the atomic result is one, then that CPU will try to take the rto_lock. @@ -2163,7 +2141,7 @@ static void push_rt_tasks(struct rq *rq) * CPUs scheduling lower priority tasks. * * All CPUs that are scheduling a lower priority task will increment the - * rt_loop_next variable. This will make sure that the irq work iterator + * rt_loop_next variable. This will make sure that the IRQ work iterator * checks all RT overloaded CPUs whenever a CPU schedules a new lower * priority task, even if the iterator is in the middle of a scan. Incrementing * the rt_loop_next will cause the iterator to perform another scan. @@ -2243,7 +2221,7 @@ static void tell_cpu_to_push(struct rq *rq) * The rto_cpu is updated under the lock, if it has a valid CPU * then the IPI is still running and will continue due to the * update to loop_next, and nothing needs to be done here. - * Otherwise it is finishing up and an ipi needs to be sent. + * Otherwise it is finishing up and an IPI needs to be sent. */ if (rq->rd->rto_cpu < 0) cpu = rto_next_cpu(rq->rd); @@ -2373,15 +2351,13 @@ static void pull_rt_task(struct rq *this_rq) * p if it is lower in priority than the * current task on the run queue */ - if (p->prio < src_rq->curr->prio) + if (p->prio < src_rq->donor->prio) goto skip; if (is_migration_disabled(p)) { push_task = get_push_task(src_rq); } else { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + move_queued_task_locked(src_rq, this_rq, p); resched = true; } /* @@ -2417,9 +2393,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) bool need_to_push = !task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && + (dl_task(rq->donor) || rt_task(rq->donor)) && (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio); + rq->donor->prio <= p->prio); if (need_to_push) push_rt_tasks(rq); @@ -2503,7 +2479,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) + if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } @@ -2518,7 +2494,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we @@ -2544,7 +2520,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * greater than the current running task * then reschedule. */ - if (p->prio < rq->curr->prio) + if (p->prio < rq->donor->prio) resched_curr(rq); } } @@ -2595,7 +2571,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) watchdog(rq, p); /* - * RR tasks need a special form of timeslice management. + * RR tasks need a special form of time-slice management. * FIFO tasks have no timeslices. */ if (p->policy != SCHED_RR) @@ -2635,8 +2611,9 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) { struct rt_rq *rt_rq; -#ifdef CONFIG_RT_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq rt_rq = task_group(p)->rt_rq[cpu]; + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); #else rt_rq = &cpu_rq(cpu)->rt; #endif @@ -2653,13 +2630,12 @@ DEFINE_SCHED_CLASS(rt) = { .wakeup_preempt = wakeup_preempt_rt, - .pick_next_task = pick_next_task_rt, + .pick_task = pick_task_rt, .put_prev_task = put_prev_task_rt, .set_next_task = set_next_task_rt, #ifdef CONFIG_SMP .balance = balance_rt, - .pick_task = pick_task_rt, .select_task_rq = select_task_rq_rt, .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, @@ -2747,6 +2723,9 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) return -EBUSY; + if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group)) + return -EBUSY; + total = to_ratio(period, runtime); /* @@ -2901,8 +2880,8 @@ static int sched_rt_global_constraints(void) int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + /* Don't accept real-time tasks when there is no way for them to run */ + if (rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) return 0; return 1; @@ -2913,19 +2892,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) #ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = &cpu_rq(i)->rt; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = global_rt_runtime(); - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return 0; } #endif /* CONFIG_SYSCTL */ @@ -2945,15 +2911,9 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { - unsigned long flags; - - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } -static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_period, old_runtime; @@ -2961,6 +2921,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, int ret; mutex_lock(&mutex); + sched_domains_mutex_lock(); old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; @@ -2987,12 +2948,13 @@ undo: sysctl_sched_rt_period = old_period; sysctl_sched_rt_runtime = old_runtime; } + sched_domains_mutex_unlock(); mutex_unlock(&mutex); return ret; } -static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -3002,7 +2964,7 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, ret = proc_dointvec(table, write, buffer, lenp, ppos); /* * Make sure that internally we keep jiffies. - * Also, writing zero resets the timeslice to default: + * Also, writing zero resets the time-slice to default: */ if (!ret && write) { sched_rr_timeslice = @@ -3018,7 +2980,6 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, } #endif /* CONFIG_SYSCTL */ -#ifdef CONFIG_SCHED_DEBUG void print_rt_stats(struct seq_file *m, int cpu) { rt_rq_iter_t iter; @@ -3029,4 +2990,3 @@ void print_rt_stats(struct seq_file *m, int cpu) print_rt_rq(m, cpu, rt_rq); rcu_read_unlock(); } -#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ae50f212775e..475bb5998295 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -68,12 +68,19 @@ #include <linux/wait_api.h> #include <linux/wait_bit.h> #include <linux/workqueue_api.h> +#include <linux/delayacct.h> #include <trace/events/power.h> #include <trace/events/sched.h> #include "../workqueue_internal.h" +struct rq; +struct cfs_rq; +struct rt_rq; +struct sched_group; +struct cpuidle_state; + #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> # include <asm/paravirt_api_clock.h> @@ -84,15 +91,6 @@ #include "cpupri.h" #include "cpudeadline.h" -#ifdef CONFIG_SCHED_DEBUG -# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -#else -# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) -#endif - -struct rq; -struct cpuidle_state; - /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -112,14 +110,28 @@ extern int sysctl_sched_rt_runtime; extern int sched_rr_timeslice; /* + * Asymmetric CPU capacity bits + */ +struct asym_cap_data { + struct list_head link; + struct rcu_head rcu; + unsigned long capacity; + unsigned long cpus[]; +}; + +extern struct list_head asym_cap_list; + +#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) + +/* * Helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +#define NS_TO_JIFFIES(time) ((unsigned long)(time) / (NSEC_PER_SEC/HZ)) /* * Increase resolution of nice-level calculations for 64-bit architectures. * The extra resolution improves shares distribution and load balancing of - * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * low-weight task groups (eg. nice +19 on an autogroup), deeper task-group * hierarchies, especially on larger systems. This is not a user-visible change * and does not change the user-interface for setting shares/weights. * @@ -133,12 +145,13 @@ extern int sched_rr_timeslice; #ifdef CONFIG_64BIT # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) -# define scale_load_down(w) \ -({ \ - unsigned long __w = (w); \ - if (__w) \ - __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ - __w; \ +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ }) #else # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) @@ -173,9 +186,19 @@ static inline int idle_policy(int policy) { return policy == SCHED_IDLE; } + +static inline int normal_policy(int policy) +{ +#ifdef CONFIG_SCHED_CLASS_EXT + if (policy == SCHED_EXT) + return true; +#endif + return policy == SCHED_NORMAL; +} + static inline int fair_policy(int policy) { - return policy == SCHED_NORMAL || policy == SCHED_BATCH; + return normal_policy(policy) || policy == SCHED_BATCH; } static inline int rt_policy(int policy) @@ -187,6 +210,7 @@ static inline int dl_policy(int policy) { return policy == SCHED_DEADLINE; } + static inline bool valid_policy(int policy) { return idle_policy(policy) || fair_policy(policy) || @@ -208,11 +232,12 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) static inline void update_avg(u64 *avg, u64 sample) { s64 diff = sample - *avg; + *avg += diff / 8; } @@ -224,6 +249,24 @@ static inline void update_avg(u64 *avg, u64 sample) (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) /* + * cgroup weight knobs should use the common MIN, DFL and MAX values which are + * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it + * maps pretty well onto the shares value used by scheduler and the round-trip + * conversions preserve the original value over the entire range. + */ +static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) +{ + return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); +} + +static inline unsigned long sched_weight_to_cgroup(unsigned long weight) +{ + return clamp_t(unsigned long, + DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + +/* * !! For sched_setattr_nocheck() (kernel) only !! * * This is actually gross. :( @@ -237,7 +280,7 @@ static inline void update_avg(u64 *avg, u64 sample) */ #define SCHED_FLAG_SUGOV 0x10000000 -#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se) { @@ -313,8 +356,8 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern int dl_bw_check_overflow(int cpu); - +extern int dl_bw_deactivate(int cpu); +extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); /* * SCHED_DEADLINE supports servers (nested scheduling) with the following * interface: @@ -340,12 +383,21 @@ extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, - dl_server_pick_f pick); + dl_server_pick_f pick_task); -#ifdef CONFIG_CGROUP_SCHED +extern void dl_server_update_idle_time(struct rq *rq, + struct task_struct *p); +extern void fair_server_init(struct rq *rq); +extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); +extern int dl_server_apply_params(struct sched_dl_entity *dl_se, + u64 runtime, u64 period, bool init); -struct cfs_rq; -struct rt_rq; +static inline bool dl_server_active(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server_active; +} + +#ifdef CONFIG_CGROUP_SCHED extern struct list_head task_groups; @@ -379,20 +431,21 @@ struct cfs_bandwidth { struct task_group { struct cgroup_subsys_state css; +#ifdef CONFIG_GROUP_SCHED_WEIGHT + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each CPU */ struct sched_entity **se; /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; - - /* A positive value indicates that this is a SCHED_IDLE group. */ - int idle; - #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put - * it in its own cacheline separated from the fields above which + * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; @@ -406,6 +459,11 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* */ + u32 scx_weight; +#endif + struct rcu_head rcu; struct list_head list; @@ -430,7 +488,7 @@ struct task_group { }; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD /* @@ -461,6 +519,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) return walk_tg_tree_from(&root_task_group, down, up, data); } +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + extern int tg_nop(struct task_group *tg, void *data); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -503,7 +566,7 @@ extern void sched_online_group(struct task_group *tg, extern void sched_destroy_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +extern void sched_move_task(struct task_struct *tsk, bool for_autogroup); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); @@ -517,11 +580,15 @@ extern void set_task_rq_fair(struct sched_entity *se, static inline void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { } #endif /* CONFIG_SMP */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ +static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } +static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } #endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; + static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } #endif /* CONFIG_CGROUP_SCHED */ @@ -537,8 +604,8 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent * applicable for 32-bits architectures. */ #ifdef CONFIG_64BIT -# define u64_u32_load_copy(var, copy) var -# define u64_u32_store_copy(var, copy, val) (var = val) +# define u64_u32_load_copy(var, copy) var +# define u64_u32_store_copy(var, copy, val) (var = val) #else # define u64_u32_load_copy(var, copy) \ ({ \ @@ -566,31 +633,31 @@ do { \ copy = __val; \ } while (0) #endif -# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) -# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) +# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) +# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) + +struct balance_callback { + struct balance_callback *next; + void (*func)(struct rq *rq); +}; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned int nr_running; - unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int idle_nr_running; /* SCHED_IDLE */ - unsigned int idle_h_nr_running; /* SCHED_IDLE */ + unsigned int nr_queued; + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ s64 avg_vruntime; u64 avg_load; - u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; u64 min_vruntime_fi; #endif -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; -#endif - struct rb_root_cached tasks_timeline; /* @@ -600,10 +667,6 @@ struct cfs_rq { struct sched_entity *curr; struct sched_entity *next; -#ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -#endif - #ifdef CONFIG_SMP /* * CFS load tracking @@ -677,6 +740,47 @@ struct cfs_rq { #endif /* CONFIG_FAIR_GROUP_SCHED */ }; +#ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + /* + * A hotplugged CPU starts scheduling before rq_online_scx(). Track + * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called + * only while the BPF scheduler considers the CPU to be online. + */ + SCX_RQ_ONLINE = 1 << 0, + SCX_RQ_CAN_STOP_TICK = 1 << 1, + SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ + SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ + SCX_RQ_BYPASSING = 1 << 4, + SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ + + SCX_RQ_IN_WAKEUP = 1 << 16, + SCX_RQ_IN_BALANCE = 1 << 17, +}; + +struct scx_rq { + struct scx_dispatch_q local_dsq; + struct list_head runnable_list; /* runnable tasks on this rq */ + struct list_head ddsp_deferred_locals; /* deferred ddsps from enq */ + unsigned long ops_qseq; + u64 extra_enq_flags; /* see move_task_to_local_dsq() */ + u32 nr_running; + u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ + bool cpu_released; + u32 flags; + u64 clock; /* current per-rq clock -- see scx_bpf_now() */ + cpumask_var_t cpus_to_kick; + cpumask_var_t cpus_to_kick_if_idle; + cpumask_var_t cpus_to_preempt; + cpumask_var_t cpus_to_wait; + unsigned long pnt_seq; + struct balance_callback deferred_bal_cb; + struct irq_work deferred_irq_work; + struct irq_work kick_cpus_irq_work; +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static inline int rt_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -701,23 +805,25 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - int overloaded; + bool overloaded; struct plist_head pushable_tasks; #endif /* CONFIG_SMP */ int rt_queued; +#ifdef CONFIG_RT_GROUP_SCHED int rt_throttled; - u64 rt_time; - u64 rt_runtime; + u64 rt_time; /* consumed RT time, goes up in update_curr_rt */ + u64 rt_runtime; /* allotted RT time, "slice" from rt_bandwidth, RT sharing/balancing */ /* Nests inside the rq lock: */ raw_spinlock_t rt_runtime_lock; -#ifdef CONFIG_RT_GROUP_SCHED unsigned int rt_nr_boosted; - struct rq *rq; - struct task_group *tg; + struct rq *rq; /* this is always top-level rq, cache? */ +#endif +#ifdef CONFIG_CGROUP_SCHED + struct task_group *tg; /* this tg has "this" rt_rq on given CPU for runnable entities */ #endif }; @@ -745,7 +851,7 @@ struct dl_rq { u64 next; } earliest_dl; - int overloaded; + bool overloaded; /* * Tasks on this rq that can be pushed away. They are kept in @@ -789,33 +895,42 @@ struct dl_rq { }; #ifdef CONFIG_FAIR_GROUP_SCHED + /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) static inline void se_update_runnable(struct sched_entity *se) { if (!entity_is_task(se)) - se->runnable_weight = se->my_q->h_nr_running; + se->runnable_weight = se->my_q->h_nr_runnable; } static inline long se_runnable(struct sched_entity *se) { + if (se->sched_delayed) + return false; + if (entity_is_task(se)) return !!se->on_rq; else return se->runnable_weight; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ + #define entity_is_task(se) 1 -static inline void se_update_runnable(struct sched_entity *se) {} +static inline void se_update_runnable(struct sched_entity *se) { } static inline long se_runnable(struct sched_entity *se) { + if (se->sched_delayed) + return false; + return !!se->on_rq; } -#endif + +#endif /* !CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP /* @@ -838,10 +953,6 @@ struct perf_domain { struct rcu_head rcu; }; -/* Scheduling group status flags */ -#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ -#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ - /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -862,10 +973,10 @@ struct root_domain { * - More than one runnable task * - Running task is misfit */ - int overload; + bool overloaded; - /* Indicate one or more cpus over-utilized (tipping point) */ - int overutilized; + /* Indicate one or more CPUs over-utilized (tipping point) */ + bool overutilized; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -883,7 +994,7 @@ struct root_domain { * Also, some corner cases, like 'wrap around' is dangerous, but given * that u64 is 'big enough'. So that shouldn't be a concern. */ - u64 visit_gen; + u64 visit_cookie; #ifdef HAVE_RT_PUSH_IPI /* @@ -905,8 +1016,6 @@ struct root_domain { cpumask_var_t rto_mask; struct cpupri cpupri; - unsigned long max_cpu_capacity; - /* * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. Protected by RCU. @@ -920,6 +1029,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd); extern void sched_put_rd(struct root_domain *rd); +static inline int get_rd_overloaded(struct root_domain *rd) +{ + return READ_ONCE(rd->overloaded); +} + +static inline void set_rd_overloaded(struct root_domain *rd, int status) +{ + if (get_rd_overloaded(rd) != status) + WRITE_ONCE(rd->overloaded, status); +} + #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif @@ -969,12 +1089,6 @@ struct uclamp_rq { DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ -struct rq; -struct balance_callback { - struct balance_callback *next; - void (*func)(struct rq *rq); -}; - /* * This is the main, per-CPU runqueue data structure. * @@ -1017,6 +1131,11 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_CLASS_EXT + struct scx_rq scx; +#endif + + struct sched_dl_entity fair_server; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -1032,7 +1151,11 @@ struct rq { */ unsigned int nr_uninterruptible; - struct task_struct __rcu *curr; + union { + struct task_struct __rcu *donor; /* Scheduler context */ + struct task_struct __rcu *curr; /* Execution context */ + }; + struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; @@ -1053,10 +1176,8 @@ struct rq { atomic_t nr_iowait; -#ifdef CONFIG_SCHED_DEBUG u64 last_seen_need_resched_ns; int ticks_without_resched; -#endif #ifdef CONFIG_MEMBARRIER int membarrier_state; @@ -1091,8 +1212,8 @@ struct rq { #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif -#ifdef CONFIG_SCHED_THERMAL_PRESSURE - struct sched_avg avg_thermal; +#ifdef CONFIG_SCHED_HW_PRESSURE + struct sched_avg avg_hw; #endif u64 idle_stamp; u64 avg_idle; @@ -1107,6 +1228,7 @@ struct rq { #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; + u64 psi_irq_time; #endif #ifdef CONFIG_PARAVIRT u64 prev_steal_time; @@ -1124,14 +1246,13 @@ struct rq { call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; - ktime_t hrtick_time; + ktime_t hrtick_time; #endif #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_count; @@ -1146,7 +1267,7 @@ struct rq { #endif #ifdef CONFIG_CPU_IDLE - /* Must be inspected within a rcu lock section */ + /* Must be inspected within a RCU lock section */ struct cpuidle_state *idle_state; #endif @@ -1160,6 +1281,7 @@ struct rq { /* per rq */ struct rq *core; struct task_struct *core_pick; + struct sched_dl_entity *core_dl_server; unsigned int core_enabled; unsigned int core_sched_seq; struct rb_root core_tree; @@ -1208,7 +1330,7 @@ static inline int cpu_of(struct rq *rq) #endif } -#define MDF_PUSH 0x01 +#define MDF_PUSH 0x01 static inline bool is_migration_disabled(struct task_struct *p) { @@ -1227,7 +1349,11 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) -struct sched_group; +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + /* Do nothing */ +} + #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -1263,9 +1389,10 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; } -bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, - bool fi); -void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); +extern bool +cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi); + +extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); /* * Helpers to check if the CPU's core cookie matches with the task's cookie @@ -1333,7 +1460,7 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); extern void sched_core_get(void); extern void sched_core_put(void); -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline bool sched_core_enabled(struct rq *rq) { @@ -1371,7 +1498,25 @@ static inline bool sched_group_cookie_match(struct rq *rq, { return true; } -#endif /* CONFIG_SCHED_CORE */ + +#endif /* !CONFIG_SCHED_CORE */ +#ifdef CONFIG_RT_GROUP_SCHED +# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED +DECLARE_STATIC_KEY_FALSE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_unlikely(&rt_group_sched); +} +# else +DECLARE_STATIC_KEY_TRUE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_likely(&rt_group_sched); +} +# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ +#else +# define rt_group_sched_enabled() false +#endif /* CONFIG_RT_GROUP_SCHED */ static inline void lockdep_assert_rq_held(struct rq *rq) { @@ -1402,8 +1547,10 @@ static inline void raw_spin_rq_unlock_irq(struct rq *rq) static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) { unsigned long flags; + local_irq_save(flags); raw_spin_rq_lock(rq); + return flags; } @@ -1432,9 +1579,10 @@ static inline void update_idle_core(struct rq *rq) { } #endif #ifdef CONFIG_FAIR_GROUP_SCHED + static inline struct task_struct *task_of(struct sched_entity *se) { - SCHED_WARN_ON(!entity_is_task(se)); + WARN_ON_ONCE(!entity_is_task(se)); return container_of(se, struct task_struct, se); } @@ -1455,9 +1603,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ -#define task_of(_se) container_of(_se, struct task_struct, se) +#define task_of(_se) container_of(_se, struct task_struct, se) static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p) { @@ -1477,7 +1625,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) { return NULL; } -#endif + +#endif /* !CONFIG_FAIR_GROUP_SCHED */ extern void update_rq_clock(struct rq *rq); @@ -1514,7 +1663,7 @@ static inline void assert_clock_updated(struct rq *rq) * The only reason for not seeing a clock update since the * last rq_pin_lock() is if we're currently skipping updates. */ - SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP); } static inline u64 rq_clock(struct rq *rq) @@ -1533,24 +1682,6 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -/** - * By default the decay is the default pelt decay period. - * The decay shift can change the decay period in - * multiples of 32. - * Decay shift Decay period(ms) - * 0 32 - * 1 64 - * 2 128 - * 3 256 - * 4 512 - */ -extern int sched_thermal_decay_shift; - -static inline u64 rq_clock_thermal(struct rq *rq) -{ - return rq_clock_task(rq) >> sched_thermal_decay_shift; -} - static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_rq_held(rq); @@ -1579,7 +1710,7 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq) static inline void rq_clock_start_loop_update(struct rq *rq) { lockdep_assert_rq_held(rq); - SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP); rq->clock_update_flags |= RQCF_ACT_SKIP; } @@ -1592,18 +1723,48 @@ static inline void rq_clock_stop_loop_update(struct rq *rq) struct rq_flags { unsigned long flags; struct pin_cookie cookie; -#ifdef CONFIG_SCHED_DEBUG /* * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the * current pin context is stashed here in case it needs to be * restored in rq_repin_lock(). */ unsigned int clock_update_flags; -#endif }; extern struct balance_callback balance_push_callback; +#ifdef CONFIG_SCHED_CLASS_EXT +extern const struct sched_class ext_sched_class; + +DECLARE_STATIC_KEY_FALSE(__scx_enabled); /* SCX BPF scheduler loaded */ +DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */ + +#define scx_enabled() static_branch_unlikely(&__scx_enabled) +#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) + +static inline void scx_rq_clock_update(struct rq *rq, u64 clock) +{ + if (!scx_enabled()) + return; + WRITE_ONCE(rq->scx.clock, clock); + smp_store_release(&rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID); +} + +static inline void scx_rq_clock_invalidate(struct rq *rq) +{ + if (!scx_enabled()) + return; + WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); +} + +#else /* !CONFIG_SCHED_CLASS_EXT */ +#define scx_enabled() false +#define scx_switched_all() false + +static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {} +static inline void scx_rq_clock_invalidate(struct rq *rq) {} +#endif /* !CONFIG_SCHED_CLASS_EXT */ + /* * Lockdep annotation that avoids accidental unlocks; it's like a * sticky/continuous lockdep_assert_held(). @@ -1618,22 +1779,19 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); -#ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; #ifdef CONFIG_SMP - SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); -#endif + WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); #endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SCHED_DEBUG if (rq->clock_update_flags > RQCF_ACT_SKIP) rf->clock_update_flags = RQCF_UPDATED; -#endif + scx_rq_clock_invalidate(rq); lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); } @@ -1641,17 +1799,17 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { lockdep_repin_lock(__rq_lockp(rq), rf->cookie); -#ifdef CONFIG_SCHED_DEBUG /* * Restore the value we stashed in @rf for this pin context. */ rq->clock_update_flags |= rf->clock_update_flags; -#endif } +extern struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); +extern struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); @@ -1678,48 +1836,42 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, task_rq_unlock(_T->rq, _T->lock, &_T->rf), struct rq *rq; struct rq_flags rf) -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock_irqsave(rq, rf->flags); rq_pin_lock(rq, rf); } -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock_irq(rq); rq_pin_lock(rq, rf); } -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock(rq); rq_pin_lock(rq, rf); } -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock_irqrestore(rq, rf->flags); } -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock_irq(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock_irq(rq); } -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); @@ -1741,8 +1893,7 @@ DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq, rq_unlock_irqrestore(_T->lock, &_T->rf), struct rq_flags rf) -static inline struct rq * -this_rq_lock_irq(struct rq_flags *rf) +static inline struct rq *this_rq_lock_irq(struct rq_flags *rf) __acquires(rq->lock) { struct rq *rq; @@ -1750,15 +1901,18 @@ this_rq_lock_irq(struct rq_flags *rf) local_irq_disable(); rq = this_rq(); rq_lock(rq, rf); + return rq; } #ifdef CONFIG_NUMA + enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, NUMA_BACKPLANE, }; + extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); @@ -1767,18 +1921,23 @@ extern void sched_update_numa(int cpu, bool online); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -#else + +#else /* !CONFIG_NUMA: */ + static inline void sched_init_numa(int offline_node) { } static inline void sched_update_numa(int cpu, bool online) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } + static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) { return nr_cpu_ids; } -#endif + +#endif /* !CONFIG_NUMA */ #ifdef CONFIG_NUMA_BALANCING + /* The regions in numa_faults array from task_struct */ enum numa_faults_stats { NUMA_MEM = 0, @@ -1786,17 +1945,21 @@ enum numa_faults_stats { NUMA_MEMBUF, NUMA_CPUBUF }; + extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); -#else + +#else /* !CONFIG_NUMA_BALANCING: */ + static inline void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) { } -#endif /* CONFIG_NUMA_BALANCING */ + +#endif /* !CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SMP @@ -1821,8 +1984,7 @@ queue_balance_callback(struct rq *rq, } #define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) + rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex)) /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -1893,6 +2055,7 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + extern struct static_key_false sched_asym_cpucapacity; extern struct static_key_false sched_cluster_active; @@ -1913,9 +2076,7 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ -#ifdef CONFIG_SCHED_DEBUG int id; -#endif unsigned long cpumask[]; /* Balance mask */ }; @@ -1955,17 +2116,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) extern int group_balance_cpu(struct sched_group *sg); -#ifdef CONFIG_SCHED_DEBUG -void update_sched_domain_debugfs(void); -void dirty_sched_domain_sysctl(int cpu); -#else -static inline void update_sched_domain_debugfs(void) -{ -} -static inline void dirty_sched_domain_sysctl(int cpu) -{ -} -#endif +extern void update_sched_domain_debugfs(void); +extern void dirty_sched_domain_sysctl(int cpu); extern int sched_update_scaling(void); @@ -1975,35 +2127,8 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) return cpu_possible_mask; /* &init_task.cpus_mask */ return p->user_cpus_ptr; } -#endif /* CONFIG_SMP */ - -#include "stats.h" - -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) - -extern void __sched_core_account_forceidle(struct rq *rq); - -static inline void sched_core_account_forceidle(struct rq *rq) -{ - if (schedstat_enabled()) - __sched_core_account_forceidle(rq); -} - -extern void __sched_core_tick(struct rq *rq); -static inline void sched_core_tick(struct rq *rq) -{ - if (sched_core_enabled(rq) && schedstat_enabled()) - __sched_core_tick(rq); -} - -#else - -static inline void sched_core_account_forceidle(struct rq *rq) {} - -static inline void sched_core_tick(struct rq *rq) {} - -#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SMP */ #ifdef CONFIG_CGROUP_SCHED @@ -2040,20 +2165,28 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_RT_GROUP_SCHED + /* + * p->rt.rt_rq is NULL initially and it is easier to assign + * root_task_group's rt_rq than switching in rt_rq_of_se() + * Clobbers tg(!) + */ + if (!rt_group_sched_enabled()) + tg = &root_task_group; p->rt.rt_rq = tg->rt_rq[cpu]; p->rt.parent = tg->rt_se[cpu]; #endif } -#else /* CONFIG_CGROUP_SCHED */ +#else /* !CONFIG_CGROUP_SCHED: */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } + static inline struct task_group *task_group(struct task_struct *p) { return NULL; } -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* !CONFIG_CGROUP_SCHED */ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -2071,13 +2204,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) } /* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + * Tunables: */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug const -#endif #define SCHED_FEAT(name, enabled) \ __SCHED_FEAT_##name , @@ -2089,15 +2217,14 @@ enum { #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG - /* * To support run-time toggling of sched features, all the translation units * (but core.c) reference the sysctl_sched_features defined in core.c. */ -extern const_debug unsigned int sysctl_sched_features; +extern __read_mostly unsigned int sysctl_sched_features; #ifdef CONFIG_JUMP_LABEL + #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -2110,29 +2237,11 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !CONFIG_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL: */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* CONFIG_JUMP_LABEL */ - -#else /* !SCHED_DEBUG */ - -/* - * Each translation unit has its own copy of sysctl_sched_features to allow - * constants propagation at compile time and compiler optimization based on - * features default. - */ -#define SCHED_FEAT(name, enabled) \ - (1UL << __SCHED_FEAT_##name) * enabled | -static const_debug __maybe_unused unsigned int sysctl_sched_features = -#include "features.h" - 0; -#undef SCHED_FEAT - -#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) - -#endif /* SCHED_DEBUG */ +#endif /* !CONFIG_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; @@ -2150,11 +2259,25 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +/* + * Is p the current execution context? + */ static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; } +/* + * Is p the current scheduling context? + * + * Note that it might be the current execution context at the same time if + * rq->curr == rq->donor == p. + */ +static inline int task_current_donor(struct rq *rq, struct task_struct *p) +{ + return rq->donor == p; +} + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP @@ -2166,7 +2289,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p) static inline int task_on_rq_queued(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_QUEUED; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED; } static inline int task_on_rq_migrating(struct task_struct *p) @@ -2175,13 +2298,14 @@ static inline int task_on_rq_migrating(struct task_struct *p) } /* Wake flags. The first three directly map to some SD flag value */ -#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ -#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ -#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ -#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ -#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ -#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2224,14 +2348,17 @@ extern const u32 sched_prio_to_wmult[40]; * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup + * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * */ -#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_SPECIAL 0x10 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2247,13 +2374,15 @@ extern const u32 sched_prio_to_wmult[40]; #endif #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 +#define ENQUEUE_DELAYED 0x200 +#define ENQUEUE_RQ_SELECTED 0x400 #define RETRY_TASK ((void *)-1UL) struct affinity_context { - const struct cpumask *new_mask; - struct cpumask *user_mask; - unsigned int flags; + const struct cpumask *new_mask; + struct cpumask *user_mask; + unsigned int flags; }; extern s64 update_curr_common(struct rq *rq); @@ -2265,23 +2394,31 @@ struct sched_class { #endif void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); bool (*yield_to_task)(struct rq *rq, struct task_struct *p); void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); - struct task_struct *(*pick_next_task)(struct rq *rq); + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + struct task_struct *(*pick_task)(struct rq *rq); + /* + * Optional! When implemented pick_next_task() should be equivalent to: + * + * next = pick_task(); + * if (next) { + * put_prev_task(prev); + * set_next_task_first(next); + * } + */ + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); - void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); #ifdef CONFIG_SMP - int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); - struct task_struct * (*pick_task)(struct rq *rq); - void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); @@ -2303,8 +2440,11 @@ struct sched_class { * cannot assume the switched_from/switched_to pair is serialized by * rq->lock. They are however serialized by p->pi_lock. */ + void (*switching_to) (struct rq *this_rq, struct task_struct *task); void (*switched_from)(struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*reweight_task)(struct rq *this_rq, struct task_struct *task, + const struct load_weight *lw); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio); @@ -2324,8 +2464,8 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { - WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev); + WARN_ON_ONCE(rq->donor != prev); + prev->sched_class->put_prev_task(rq, prev, NULL); } static inline void set_next_task(struct rq *rq, struct task_struct *next) @@ -2333,6 +2473,30 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) next->sched_class->set_next_task(rq, next, false); } +static inline void +__put_prev_set_next_dl_server(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + prev->dl_server = NULL; + next->dl_server = rq->dl_server; + rq->dl_server = NULL; +} + +static inline void put_prev_set_next_task(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + WARN_ON_ONCE(rq->curr != prev); + + __put_prev_set_next_dl_server(rq, prev, next); + + if (next == prev) + return; + + prev->sched_class->put_prev_task(rq, prev, next); + next->sched_class->set_next_task(rq, next, true); +} /* * Helper to define a sched_class instance; each one is placed in a separate @@ -2353,19 +2517,41 @@ const struct sched_class name##_sched_class \ extern struct sched_class __sched_class_highest[]; extern struct sched_class __sched_class_lowest[]; +extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; + +/* + * Iterate only active classes. SCX can take over all fair tasks or be + * completely disabled. If the former, skip fair. If the latter, skip SCX. + */ +static inline const struct sched_class *next_active_class(const struct sched_class *class) +{ + class++; +#ifdef CONFIG_SCHED_CLASS_EXT + if (scx_switched_all() && class == &fair_sched_class) + class++; + if (!scx_enabled() && class == &ext_sched_class) + class++; +#endif + return class; +} + #define for_class_range(class, _from, _to) \ for (class = (_from); class < (_to); class++) #define for_each_class(class) \ for_class_range(class, __sched_class_highest, __sched_class_lowest) -#define sched_class_above(_a, _b) ((_a) < (_b)) +#define for_active_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = next_active_class(class)) -extern const struct sched_class stop_sched_class; -extern const struct sched_class dl_sched_class; -extern const struct sched_class rt_sched_class; -extern const struct sched_class fair_sched_class; -extern const struct sched_class idle_sched_class; +#define for_each_active_class(class) \ + for_active_class_range(class, __sched_class_highest, __sched_class_lowest) + +#define sched_class_above(_a, _b) ((_a) < (_b)) static inline bool sched_stop_runnable(struct rq *rq) { @@ -2384,11 +2570,11 @@ static inline bool sched_rt_runnable(struct rq *rq) static inline bool sched_fair_runnable(struct rq *rq) { - return rq->cfs.nr_running > 0; + return rq->cfs.nr_queued > 0; } extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_next_task_idle(struct rq *rq); +extern struct task_struct *pick_task_idle(struct rq *rq); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 @@ -2399,13 +2585,37 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq); extern void update_group_capacity(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq); +extern void sched_balance_trigger(struct rq *rq); +extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx); extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); +static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) +{ + /* When not in the task's cpumask, no point in looking further. */ + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + + /* Can @cpu run a user thread? */ + if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p)) + return false; + + return true; +} + +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + /* + * See do_set_cpus_allowed() above for the rcu_head usage. + */ + int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); + + return kmalloc_node(size, GFP_KERNEL, node); +} + static inline struct task_struct *get_push_task(struct rq *rq) { - struct task_struct *p = rq->curr; + struct task_struct *p = rq->donor; lockdep_assert_rq_held(rq); @@ -2424,9 +2634,28 @@ static inline struct task_struct *get_push_task(struct rq *rq) extern int push_cpu_stop(void *arg); -#endif +#else /* !CONFIG_SMP: */ + +static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) +{ + return true; +} + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + struct affinity_context *ctx) +{ + return set_cpus_allowed_ptr(p, ctx->new_mask); +} + +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + return NULL; +} + +#endif /* !CONFIG_SMP */ #ifdef CONFIG_CPU_IDLE + static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) { @@ -2435,11 +2664,13 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); return rq->idle_state; } -#else + +#else /* !CONFIG_CPU_IDLE: */ + static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) { @@ -2449,7 +2680,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } -#endif + +#endif /* !CONFIG_CPU_IDLE */ extern void schedule_idle(void); asmlinkage void schedule_user(void); @@ -2462,12 +2694,10 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void reweight_task(struct task_struct *p, int prio); - extern void resched_curr(struct rq *rq); +extern void resched_curr_lazy(struct rq *rq); extern void resched_cpu(int cpu); -extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); @@ -2478,7 +2708,8 @@ extern void init_dl_entity(struct sched_dl_entity *dl_se); #define RATIO_SHIFT 8 #define MAX_BW_BITS (64 - BW_SHIFT) #define MAX_BW ((1ULL << MAX_BW_BITS) - 1) -unsigned long to_ratio(u64 period, u64 runtime); + +extern unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); extern void post_init_entity_util_avg(struct task_struct *p); @@ -2504,10 +2735,10 @@ static inline void sched_update_tick_dependency(struct rq *rq) else tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } -#else +#else /* !CONFIG_NO_HZ_FULL: */ static inline int sched_tick_offload_init(void) { return 0; } static inline void sched_update_tick_dependency(struct rq *rq) { } -#endif +#endif /* !CONFIG_NO_HZ_FULL */ static inline void add_nr_running(struct rq *rq, unsigned count) { @@ -2519,10 +2750,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } #ifdef CONFIG_SMP - if (prev_nr < 2 && rq->nr_running >= 2) { - if (!READ_ONCE(rq->rd->overload)) - WRITE_ONCE(rq->rd->overload, 1); - } + if (prev_nr < 2 && rq->nr_running >= 2) + set_rd_overloaded(rq->rd, 1); #endif sched_update_tick_dependency(rq); @@ -2539,23 +2768,65 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } +static inline void __block_task(struct rq *rq, struct task_struct *p) +{ + if (p->sched_contributes_to_load) + rq->nr_uninterruptible++; + + if (p->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + + /* + * The moment this write goes through, ttwu() can swoop in and migrate + * this task, rendering our rq->__lock ineffective. + * + * __schedule() try_to_wake_up() + * LOCK rq->__lock LOCK p->pi_lock + * pick_next_task() + * pick_next_task_fair() + * pick_next_entity() + * dequeue_entities() + * __block_task() + * RELEASE p->on_rq = 0 if (p->on_rq && ...) + * break; + * + * ACQUIRE (after ctrl-dep) + * + * cpu = select_task_rq(); + * set_task_cpu(p, cpu); + * ttwu_queue() + * ttwu_do_activate() + * LOCK rq->__lock + * activate_task() + * STORE p->on_rq = 1 + * UNLOCK rq->__lock + * + * Callers must ensure to not reference @p after this -- we no longer + * own it. + */ + smp_store_release(&p->on_rq, 0); +} + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT -#define SCHED_NR_MIGRATE_BREAK 8 +# define SCHED_NR_MIGRATE_BREAK 8 #else -#define SCHED_NR_MIGRATE_BREAK 32 +# define SCHED_NR_MIGRATE_BREAK 32 #endif -extern const_debug unsigned int sysctl_sched_nr_migrate; -extern const_debug unsigned int sysctl_sched_migration_cost; +extern __read_mostly unsigned int sysctl_sched_nr_migrate; +extern __read_mostly unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_base_slice; -#ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; @@ -2566,7 +2837,6 @@ extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_hot_threshold; -#endif #ifdef CONFIG_SCHED_HRTICK @@ -2596,9 +2866,9 @@ static inline int hrtick_enabled_dl(struct rq *rq) return hrtick_enabled(rq); } -void hrtick_start(struct rq *rq, u64 delay); +extern void hrtick_start(struct rq *rq, u64 delay); -#else +#else /* !CONFIG_SCHED_HRTICK: */ static inline int hrtick_enabled_fair(struct rq *rq) { @@ -2615,13 +2885,10 @@ static inline int hrtick_enabled(struct rq *rq) return 0; } -#endif /* CONFIG_SCHED_HRTICK */ +#endif /* !CONFIG_SCHED_HRTICK */ #ifndef arch_scale_freq_tick -static __always_inline -void arch_scale_freq_tick(void) -{ -} +static __always_inline void arch_scale_freq_tick(void) { } #endif #ifndef arch_scale_freq_capacity @@ -2642,7 +2909,6 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif -#ifdef CONFIG_SCHED_DEBUG /* * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to * acquire rq lock instead of rq_lock(). So at the end of these two functions @@ -2657,14 +2923,11 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); #endif } -#else -static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} -#endif -#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ -__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ -static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ -{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ +#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ +__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ +static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ +{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ _lock; return _t; } #ifdef CONFIG_SMP @@ -2718,7 +2981,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return 1; } -#else +#else /* !CONFIG_PREEMPTION: */ /* * Unfair double_lock_balance: Optimizes throughput at the expense of * latency by eliminating extra atomic operations when the locks are @@ -2749,7 +3012,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return 1; } -#endif /* CONFIG_PREEMPTION */ +#endif /* !CONFIG_PREEMPTION */ /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. @@ -2825,9 +3088,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) extern void set_rq_online (struct rq *rq); extern void set_rq_offline(struct rq *rq); + extern bool sched_smp_initialized; -#else /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ /* * double_rq_lock - safely lock two runqueues @@ -2861,7 +3125,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } -#endif +#endif /* !CONFIG_SMP */ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), @@ -2871,7 +3135,6 @@ extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); -#ifdef CONFIG_SCHED_DEBUG extern bool sched_debug_verbose; extern void print_cfs_stats(struct seq_file *m, int cpu); @@ -2883,15 +3146,11 @@ extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); #ifdef CONFIG_NUMA_BALANCING -extern void -show_numa_stats(struct task_struct *p, struct seq_file *m); +extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, - unsigned long tpf, unsigned long gsf, unsigned long gpf); + unsigned long tpf, unsigned long gsf, unsigned long gpf); #endif /* CONFIG_NUMA_BALANCING */ -#else -static inline void resched_latency_warn(int cpu, u64 latency) {} -#endif /* CONFIG_SCHED_DEBUG */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); @@ -2901,12 +3160,13 @@ extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON + #define NOHZ_BALANCE_KICK_BIT 0 #define NOHZ_STATS_KICK_BIT 1 #define NOHZ_NEWILB_KICK_BIT 2 #define NOHZ_NEXT_KICK_BIT 3 -/* Run rebalance_domains() */ +/* Run sched_balance_domains() */ #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) /* Update blocked load */ #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) @@ -2915,14 +3175,14 @@ extern void cfs_bandwidth_usage_dec(void); /* Update nohz.next_balance */ #define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT) -#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) -#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) extern void nohz_balance_exit_idle(struct rq *rq); -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline void nohz_balance_exit_idle(struct rq *rq) { } -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_run_idle_balance(int cpu); @@ -2930,7 +3190,36 @@ extern void nohz_run_idle_balance(int cpu); static inline void nohz_run_idle_balance(int cpu) { } #endif +#include "stats.h" + +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) + +extern void __sched_core_account_forceidle(struct rq *rq); + +static inline void sched_core_account_forceidle(struct rq *rq) +{ + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); +} + +extern void __sched_core_tick(struct rq *rq); + +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); +} + +#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ + +static inline void sched_core_account_forceidle(struct rq *rq) { } + +static inline void sched_core_tick(struct rq *rq) { } + +#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ + #ifdef CONFIG_IRQ_TIME_ACCOUNTING + struct irqtime { u64 total; u64 tick_delta; @@ -2939,6 +3228,12 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); +extern int sched_clock_irqtime; + +static inline int irqtime_enabled(void) +{ + return sched_clock_irqtime; +} /* * Returns the irqtime minus the softirq time computed by ksoftirqd. @@ -2958,9 +3253,18 @@ static inline u64 irq_time_read(int cpu) return total; } + +#else + +static inline int irqtime_enabled(void) +{ + return 0; +} + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ + DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** @@ -2994,9 +3298,9 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) if (data) data->func(data, rq_clock(rq), flags); } -#else -static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -#endif /* CONFIG_CPU_FREQ */ +#else /* !CONFIG_CPU_FREQ: */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } +#endif /* !CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant @@ -3007,6 +3311,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_SMP + unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); @@ -3049,11 +3354,40 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } -#endif + +#else /* !CONFIG_SMP */ +static inline bool update_other_load_avgs(struct rq *rq) { return false; } +#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK + unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} + +/* + * Enabling static branches would get the cpus_read_lock(), + * check whether uclamp_is_used before enable it to avoid always + * calling cpus_read_lock(). Because we never disable this + * static key once enable it. + */ +static inline void sched_uclamp_enable(void) +{ + if (!uclamp_is_used()) + static_branch_enable(&sched_uclamp_used); +} + static inline unsigned long uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { @@ -3077,7 +3411,7 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) unsigned long rq_util; unsigned long max_util; - if (!static_branch_likely(&sched_uclamp_used)) + if (!uclamp_is_used()) return false; rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); @@ -3086,21 +3420,39 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; } -/* - * When uclamp is compiled in, the aggregation at rq level is 'turned off' - * by default in the fast path and only gets turned on once userspace performs - * an operation that requires it. - * - * Returns true if userspace opted-in to use uclamp and aggregation at rq level - * hence is active. - */ -static inline bool uclamp_is_used(void) +#define for_each_clamp_id(clamp_id) \ + for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) + +extern unsigned int sysctl_sched_uclamp_util_min_rt_default; + + +static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { - return static_branch_likely(&sched_uclamp_used); + if (clamp_id == UCLAMP_MIN) + return 0; + return SCHED_CAPACITY_SCALE; +} + +/* Integer rounded range for each bucket */ +#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) + +static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) +{ + return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); +} + +static inline void +uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined) +{ + uc_se->value = value; + uc_se->bucket_id = uclamp_bucket_id(value); + uc_se->user_defined = user_defined; } -#else /* CONFIG_UCLAMP_TASK */ -static inline unsigned long uclamp_eff_value(struct task_struct *p, - enum uclamp_id clamp_id) + +#else /* !CONFIG_UCLAMP_TASK: */ + +static inline unsigned long +uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -3115,8 +3467,10 @@ static inline bool uclamp_is_used(void) return false; } -static inline unsigned long uclamp_rq_get(struct rq *rq, - enum uclamp_id clamp_id) +static inline void sched_uclamp_enable(void) {} + +static inline unsigned long +uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -3124,8 +3478,8 @@ static inline unsigned long uclamp_rq_get(struct rq *rq, return SCHED_CAPACITY_SCALE; } -static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, - unsigned int value) +static inline void +uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, unsigned int value) { } @@ -3133,9 +3487,11 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) { return false; } -#endif /* CONFIG_UCLAMP_TASK */ + +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + static inline unsigned long cpu_util_irq(struct rq *rq) { return READ_ONCE(rq->avg_irq.util_avg); @@ -3150,7 +3506,9 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned return util; } -#else + +#else /* !CONFIG_HAVE_SCHED_AVG_IRQ: */ + static inline unsigned long cpu_util_irq(struct rq *rq) { return 0; @@ -3161,7 +3519,10 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned { return util; } -#endif + +#endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */ + +extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -3174,16 +3535,16 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } -extern struct cpufreq_governor schedutil_gov; - #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #define perf_domain_span(pd) NULL + static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ #ifdef CONFIG_MEMBARRIER + /* * The scheduler provides memory barriers required by membarrier between: * - prior user-space memory accesses and store to rq->membarrier_state, @@ -3205,13 +3566,16 @@ static inline void membarrier_switch_mm(struct rq *rq, WRITE_ONCE(rq->membarrier_state, membarrier_state); } -#else + +#else /* !CONFIG_MEMBARRIER :*/ + static inline void membarrier_switch_mm(struct rq *rq, struct mm_struct *prev_mm, struct mm_struct *next_mm) { } -#endif + +#endif /* !CONFIG_MEMBARRIER */ #ifdef CONFIG_SMP static inline bool is_per_cpu_kthread(struct task_struct *p) @@ -3236,6 +3600,7 @@ extern int preempt_dynamic_mode; extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID @@ -3263,7 +3628,7 @@ static inline void __mm_cid_put(struct mm_struct *mm, int cid) * be held to transition to other states. * * State transitions synchronized with cmpxchg or try_cmpxchg need to be - * consistent across cpus, which prevents use of this_cpu_cmpxchg. + * consistent across CPUs, which prevents use of this_cpu_cmpxchg. */ static inline void mm_cid_put_lazy(struct task_struct *t) { @@ -3311,25 +3676,62 @@ static inline void mm_cid_put(struct mm_struct *mm) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } -static inline int __mm_cid_try_get(struct mm_struct *mm) +static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { - struct cpumask *cpumask; - int cid; + struct cpumask *cidmask = mm_cidmask(mm); + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid, max_nr_cid, allowed_max_nr_cid; - cpumask = mm_cidmask(mm); /* + * After shrinking the number of threads or reducing the number + * of allowed cpus, reduce the value of max_nr_cid so expansion + * of cid allocation will preserve cache locality if the number + * of threads or allowed cpus increase again. + */ + max_nr_cid = atomic_read(&mm->max_nr_cid); + while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), + atomic_read(&mm->mm_users))), + max_nr_cid > allowed_max_nr_cid) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ + if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { + max_nr_cid = allowed_max_nr_cid; + break; + } + } + /* Try to re-use recent cid. This improves cache locality. */ + cid = __this_cpu_read(pcpu_cid->recent_cid); + if (!mm_cid_is_unset(cid) && cid < max_nr_cid && + !cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + /* + * Expand cid allocation if the maximum number of concurrency + * IDs allocated (max_nr_cid) is below the number cpus allowed + * and number of threads. Expanding cid allocation as much as + * possible improves cache locality. + */ + cid = max_nr_cid; + while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ + if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) + continue; + if (!cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + } + /* + * Find the first available concurrency id. * Retry finding first zero bit if the mask is temporarily * filled. This only happens during concurrent remote-clear * which owns a cid without holding a rq lock. */ for (;;) { - cid = cpumask_first_zero(cpumask); - if (cid < nr_cpu_ids) + cid = cpumask_first_zero(cidmask); + if (cid < READ_ONCE(mm->nr_cpus_allowed)) break; cpu_relax(); } - if (cpumask_test_and_set_cpu(cid, cpumask)) + if (cpumask_test_and_set_cpu(cid, cidmask)) return -1; + return cid; } @@ -3345,7 +3747,8 @@ static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) WRITE_ONCE(pcpu_cid->time, rq->clock); } -static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { int cid; @@ -3355,13 +3758,13 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * guarantee forward progress. */ if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto end; raw_spin_lock(&cid_lock); } else { raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto unlock; } @@ -3381,7 +3784,7 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * all newcoming allocations observe the use_cid_lock flag set. */ do { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); cpu_relax(); } while (cid < 0); /* @@ -3394,10 +3797,12 @@ unlock: raw_spin_unlock(&cid_lock); end: mm_cid_snapshot_time(rq, mm); + return cid; } -static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; struct cpumask *cpumask; @@ -3414,8 +3819,10 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } - cid = __mm_cid_get(rq, mm); + cid = __mm_cid_get(rq, t, mm); __this_cpu_write(pcpu_cid->cid, cid); + __this_cpu_write(pcpu_cid->recent_cid, cid); + return cid; } @@ -3467,18 +3874,115 @@ static inline void switch_mm_cid(struct rq *rq, prev->mm_cid = -1; } if (next->mm_cid_active) - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); } -#else +#else /* !CONFIG_SCHED_MM_CID: */ static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } static inline void init_sched_mm_cid(struct task_struct *t) { } -#endif +#endif /* !CONFIG_SCHED_MM_CID */ extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SMP +static inline +void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) +{ + lockdep_assert_rq_held(src_rq); + lockdep_assert_rq_held(dst_rq); + + deactivate_task(src_rq, task, 0); + set_task_cpu(task, dst_rq->cpu); + activate_task(dst_rq, task, 0); +} + +static inline +bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_on_cpu(rq, p) && + cpumask_test_cpu(cpu, &p->cpus_mask)) + return true; + + return false; +} +#endif + +#ifdef CONFIG_RT_MUTEXES + +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) +{ + if (pi_task) + prio = min(prio, pi_task->prio); + + return prio; +} + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + return __rt_effective_prio(pi_task, prio); +} + +#else /* !CONFIG_RT_MUTEXES: */ + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + return prio; +} + +#endif /* !CONFIG_RT_MUTEXES */ + +extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); +extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); +extern const struct sched_class *__setscheduler_class(int policy, int prio); +extern void set_load_weight(struct task_struct *p, bool update_load); +extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class); +extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); + +#ifdef CONFIG_SMP +extern struct balance_callback *splice_balance_callbacks(struct rq *rq); +extern void balance_callbacks(struct rq *rq, struct balance_callback *head); +#else + +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) +{ + return NULL; +} + +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) +{ +} + +#endif + +#ifdef CONFIG_SCHED_CLASS_EXT +/* + * Used by SCX in the enable/disable paths to move tasks between sched_classes + * and establish invariants. + */ +struct sched_enq_and_set_ctx { + struct task_struct *p; + int queue_flags; + bool queued; + bool running; +}; + +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx); +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#include "ext.h" #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 857f837f52cb..4346fd81c31f 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -92,16 +92,6 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, trace_sched_stat_blocked(p, delta); - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(p), - delta >> 20); - } account_scheduler_latency(p, delta >> 10, 0); } } @@ -113,7 +103,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 15 +#define SCHEDSTAT_VERSION 17 static int show_schedstat(struct seq_file *seq, void *v) { @@ -148,15 +138,17 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - seq_printf(seq, "domain%d %*pb", dcount++, + seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name, cpumask_pr_args(sched_domain_span(sd))); - for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %u %u %u %u %u %u %u %u", + for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { + seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u", sd->lb_count[itype], sd->lb_balanced[itype], sd->lb_failed[itype], - sd->lb_imbalance[itype], + sd->lb_imbalance_load[itype], + sd->lb_imbalance_util[itype], + sd->lb_imbalance_task[itype], + sd->lb_imbalance_misfit[itype], sd->lb_gained[itype], sd->lb_hot_gained[itype], sd->lb_nobusyq[itype], diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 38f3698f5e5b..452826df6ae1 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -110,49 +110,84 @@ __schedstats_from_se(struct sched_entity *se) void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); -void psi_account_irqtime(struct task_struct *task, u32 delta); - +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); +#else +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} +#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, - * where a task's runnable state changes, and requeues, where a task - * and its state are being moved between CPUs and runqueues. + * where a task's runnable state changes, and migrations, where a task + * and its runnable state are being moved between CPUs and runqueues. + * + * A notable case is a task whose dequeue is delayed. PSI considers + * those sleeping, but because they are still on the runqueue they can + * go through migration requeues. In this case, *sleeping* states need + * to be transferred. */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) +static inline void psi_enqueue(struct task_struct *p, int flags) { - int clear = 0, set = TSK_RUNNING; + int clear = 0, set = 0; if (static_branch_likely(&psi_disabled)) return; - if (p->in_memstall) - set |= TSK_MEMSTALL_RUNNING; + /* Same runqueue, nothing changed for psi */ + if (flags & ENQUEUE_RESTORE) + return; + + /* psi_sched_switch() will handle the flags */ + if (task_on_cpu(task_rq(p), p)) + return; - if (!wakeup) { + if (p->se.sched_delayed) { + /* CPU migration of "sleeping" task */ + WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED)); if (p->in_memstall) set |= TSK_MEMSTALL; + if (p->in_iowait) + set |= TSK_IOWAIT; + } else if (flags & ENQUEUE_MIGRATED) { + /* CPU migration of runnable task */ + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING; } else { + /* Wakeup of new or sleeping task */ if (p->in_iowait) clear |= TSK_IOWAIT; + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; } psi_task_change(p, clear, set); } -static inline void psi_dequeue(struct task_struct *p, bool sleep) +static inline void psi_dequeue(struct task_struct *p, int flags) { if (static_branch_likely(&psi_disabled)) return; + /* Same runqueue, nothing changed for psi */ + if (flags & DEQUEUE_SAVE) + return; + /* * A voluntary sleep is a dequeue followed by a task switch. To * avoid walking all ancestors twice, psi_task_switch() handles * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. * Do nothing here. */ - if (sleep) + if (flags & DEQUEUE_SLEEP) return; + /* + * When migrating a task to another CPU, clear all psi + * state. The enqueue callback above will work it out. + */ psi_task_change(p, p->psi_flags, 0); } @@ -186,13 +221,14 @@ static inline void psi_sched_switch(struct task_struct *prev, } #else /* CONFIG_PSI */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} -static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_enqueue(struct task_struct *p, bool migrate) {} +static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} -static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO @@ -212,14 +248,17 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) delta = rq_clock(rq) - t->sched_info.last_queued; t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; - + if (delta > t->sched_info.max_run_delay) + t->sched_info.max_run_delay = delta; + if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) + t->sched_info.min_run_delay = delta; rq_sched_info_dequeue(rq, delta); } /* * Called when a task finally hits the CPU. We can now calculate how * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. + * can keep stats on how long its time-slice is. */ static void sched_info_arrive(struct rq *rq, struct task_struct *t) { @@ -234,6 +273,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; + if (delta > t->sched_info.max_run_delay) + t->sched_info.max_run_delay = delta; + if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) + t->sched_info.min_run_delay = delta; rq_sched_info_arrive(rq, delta); } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index b1b8fe61c532..058dd42e3d9b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -41,26 +41,17 @@ static struct task_struct *pick_task_stop(struct rq *rq) return rq->stop; } -static struct task_struct *pick_next_task_stop(struct rq *rq) -{ - struct task_struct *p = pick_task_stop(rq); - - if (p) - set_next_task_stop(rq, p, true); - - return p; -} - static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); } -static void +static bool dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + return true; } static void yield_task_stop(struct rq *rq) @@ -68,7 +59,7 @@ static void yield_task_stop(struct rq *rq) BUG(); /* the stop task should never yield, its pointless. */ } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next) { update_curr_common(rq); } @@ -111,13 +102,12 @@ DEFINE_SCHED_CLASS(stop) = { .wakeup_preempt = wakeup_preempt_stop, - .pick_next_task = pick_next_task_stop, + .pick_task = pick_task_stop, .put_prev_task = put_prev_task_stop, .set_next_task = set_next_task_stop, #ifdef CONFIG_SMP .balance = balance_stop, - .pick_task = pick_task_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c new file mode 100644 index 000000000000..547c1f05b667 --- /dev/null +++ b/kernel/sched/syscalls.c @@ -0,0 +1,1595 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kernel/sched/syscalls.c + * + * Core kernel scheduler syscalls related code + * + * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 1998-2024 Ingo Molnar, Red Hat + */ +#include <linux/sched.h> +#include <linux/cpuset.h> +#include <linux/sched/debug.h> + +#include <uapi/linux/sched/types.h> + +#include "sched.h" +#include "autogroup.h" + +static inline int __normal_prio(int policy, int rt_prio, int nice) +{ + int prio; + + if (dl_policy(policy)) + prio = MAX_DL_PRIO - 1; + else if (rt_policy(policy)) + prio = MAX_RT_PRIO - 1 - rt_prio; + else + prio = NICE_TO_PRIO(nice); + + return prio; +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(struct task_struct *p) +{ + return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio)); +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(struct task_struct *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_or_dl_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +void set_user_nice(struct task_struct *p, long nice) +{ + bool queued, running; + struct rq *rq; + int old_prio; + + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + CLASS(task_rq_lock, rq_guard)(p); + rq = rq_guard.rq; + + update_rq_clock(rq); + + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it won't have any effect on scheduling until the task is + * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: + */ + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { + p->static_prio = NICE_TO_PRIO(nice); + return; + } + + queued = task_on_rq_queued(p); + running = task_current_donor(rq, p); + if (queued) + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + if (running) + put_prev_task(rq, p); + + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_next_task(rq, p); + + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + p->sched_class->prio_changed(rq, p, old_prio); +} +EXPORT_SYMBOL(set_user_nice); + +/* + * is_nice_reduction - check if nice value is an actual reduction + * + * Similar to can_nice() but does not perform a capability check. + * + * @p: task + * @nice: nice value + */ +static bool is_nice_reduction(const struct task_struct *p, const int nice) +{ + /* Convert nice value [19,-20] to rlimit style value [1,40]: */ + int nice_rlim = nice_to_rlimit(nice); + + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); +} + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ + return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE); +} + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +SYSCALL_DEFINE1(nice, int, increment) +{ + long nice, retval; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); + nice = task_nice(current) + increment; + + nice = clamp_val(nice, MIN_NICE, MAX_NICE); + if (increment < 0 && !can_nice(current, nice)) + return -EPERM; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * Return: The priority value as seen by users in /proc. + * + * sched policy return value kernel prio user prio/nice + * + * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19] + * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99] + * deadline -101 -1 0 + */ +int task_prio(const struct task_struct *p) +{ + return p->prio - MAX_RT_PRIO; +} + +/** + * idle_cpu - is a given CPU idle currently? + * @cpu: the processor in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle) + return 0; + + if (rq->nr_running) + return 0; + +#ifdef CONFIG_SMP + if (rq->ttwu_pending) + return 0; +#endif + + return 1; +} + +/** + * available_idle_cpu - is a given CPU idle for enqueuing work. + * @cpu: the CPU in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int available_idle_cpu(int cpu) +{ + if (!idle_cpu(cpu)) + return 0; + + if (vcpu_is_preempted(cpu)) + return 0; + + return 1; +} + +/** + * idle_task - return the idle task for a given CPU. + * @cpu: the processor in question. + * + * Return: The idle task for the CPU @cpu. + */ +struct task_struct *idle_task(int cpu) +{ + return cpu_rq(cpu)->idle; +} + +#ifdef CONFIG_SCHED_CORE +int sched_core_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (sched_core_enabled(rq) && rq->curr == rq->idle) + return 1; + + return idle_cpu(cpu); +} + +#endif + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + * + * The task of @pid, if found. %NULL otherwise. + */ +static struct task_struct *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_vpid(pid) : current; +} + +static struct task_struct *find_get_task(pid_t pid) +{ + struct task_struct *p; + guard(rcu)(); + + p = find_process_by_pid(pid); + if (likely(p)) + get_task_struct(p); + + return p; +} + +DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T), + find_get_task(pid), pid_t pid) + +/* + * sched_setparam() passes in -1 for its policy, to let the functions + * it calls know not to change it. + */ +#define SETPARAM_POLICY -1 + +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) +{ + int policy = attr->sched_policy; + + if (policy == SETPARAM_POLICY) + policy = p->policy; + + p->policy = policy; + + if (dl_policy(policy)) + __setparam_dl(p, attr); + else if (fair_policy(policy)) + __setparam_fair(p, attr); + + /* rt-policy tasks do not have a timerslack */ + if (rt_or_dl_task_policy(p)) { + p->timer_slack_ns = 0; + } else if (p->timer_slack_ns == 0) { + /* when switching back to non-rt policy, restore timerslack */ + p->timer_slack_ns = p->default_timer_slack_ns; + } + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when + * !rt_policy. Always setting this ensures that things like + * getparam()/getattr() don't report silly values for !rt tasks. + */ + p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); + set_load_weight(p, true); +} + +/* + * Check the target process has a UID that matches the current process's: + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + guard(rcu)(); + + pcred = __task_cred(p); + return (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); +} + +#ifdef CONFIG_UCLAMP_TASK + +static int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + int util_min = p->uclamp_req[UCLAMP_MIN].value; + int util_max = p->uclamp_req[UCLAMP_MAX].value; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + util_min = attr->sched_util_min; + + if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + util_max = attr->sched_util_max; + + if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (util_min != -1 && util_max != -1 && util_min > util_max) + return -EINVAL; + + /* + * We have valid uclamp attributes; make sure uclamp is enabled. + * + * We need to do that here, because enabling static branches is a + * blocking operation which obviously cannot be done while holding + * scheduler locks. + */ + sched_uclamp_enable(); + + return 0; +} + +static bool uclamp_reset(const struct sched_attr *attr, + enum uclamp_id clamp_id, + struct uclamp_se *uc_se) +{ + /* Reset on sched class change for a non user-defined clamp value. */ + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && + !uc_se->user_defined) + return true; + + /* Reset on sched_util_{min,max} == -1. */ + if (clamp_id == UCLAMP_MIN && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min == -1) { + return true; + } + + if (clamp_id == UCLAMP_MAX && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max == -1) { + return true; + } + + return false; +} + +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) +{ + enum uclamp_id clamp_id; + + for_each_clamp_id(clamp_id) { + struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; + unsigned int value; + + if (!uclamp_reset(attr, clamp_id, uc_se)) + continue; + + /* + * RT by default have a 100% boost value that could be modified + * at runtime. + */ + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) + value = sysctl_sched_uclamp_util_min_rt_default; + else + value = uclamp_none(clamp_id); + + uclamp_se_set(uc_se, value, false); + + } + + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) + return; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min != -1) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], + attr->sched_util_min, true); + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max != -1) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], + attr->sched_util_max, true); + } +} + +#else /* !CONFIG_UCLAMP_TASK: */ + +static inline int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + return -EOPNOTSUPP; +} +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) { } +#endif + +/* + * Allow unprivileged RT tasks to decrease priority. + * Only issue a capable test if needed and only once to avoid an audit + * event on permitted non-privileged operations: + */ +static int user_check_sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + int policy, int reset_on_fork) +{ + if (fair_policy(policy)) { + if (attr->sched_nice < task_nice(p) && + !is_nice_reduction(p, attr->sched_nice)) + goto req_priv; + } + + if (rt_policy(policy)) { + unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); + + /* Can't set/change the rt policy: */ + if (policy != p->policy && !rlim_rtprio) + goto req_priv; + + /* Can't increase priority: */ + if (attr->sched_priority > p->rt_priority && + attr->sched_priority > rlim_rtprio) + goto req_priv; + } + + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + goto req_priv; + + /* + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. + */ + if (task_has_idle_policy(p) && !idle_policy(policy)) { + if (!is_nice_reduction(p, task_nice(p))) + goto req_priv; + } + + /* Can't change other user's priorities: */ + if (!check_same_owner(p)) + goto req_priv; + + /* Normal users shall not reset the sched_reset_on_fork flag: */ + if (p->sched_reset_on_fork && !reset_on_fork) + goto req_priv; + + return 0; + +req_priv: + if (!capable(CAP_SYS_NICE)) + return -EPERM; + + return 0; +} + +int __sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + bool user, bool pi) +{ + int oldpolicy = -1, policy = attr->sched_policy; + int retval, oldprio, newprio, queued, running; + const struct sched_class *prev_class, *next_class; + struct balance_callback *head; + struct rq_flags rf; + int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq *rq; + bool cpuset_locked = false; + + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); +recheck: + /* Double check policy once rq lock held: */ + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; + policy = oldpolicy = p->policy; + } else { + reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); + + if (!valid_policy(policy)) + return -EINVAL; + } + + if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) + return -EINVAL; + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLE is 0. + */ + if (attr->sched_priority > MAX_RT_PRIO-1) + return -EINVAL; + if ((dl_policy(policy) && !__checkparam_dl(attr)) || + (rt_policy(policy) != (attr->sched_priority != 0))) + return -EINVAL; + + if (user) { + retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork); + if (retval) + return retval; + + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return -EINVAL; + + retval = security_task_setscheduler(p); + if (retval) + return retval; + } + + /* Update task specific "requested" clamps */ + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { + retval = uclamp_validate(p, attr); + if (retval) + return retval; + } + + /* + * SCHED_DEADLINE bandwidth accounting relies on stable cpusets + * information. + */ + if (dl_policy(policy) || dl_policy(p->policy)) { + cpuset_locked = true; + cpuset_lock(); + } + + /* + * Make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + * + * To be able to change p->policy safely, the appropriate + * runqueue lock must be held. + */ + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + + /* + * Changing the policy of the stop threads its a very bad idea: + */ + if (p == rq->stop) { + retval = -EINVAL; + goto unlock; + } + + retval = scx_check_setscheduler(p, policy); + if (retval) + goto unlock; + + /* + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork. + */ + if (unlikely(policy == p->policy)) { + if (fair_policy(policy) && + (attr->sched_nice != task_nice(p) || + (attr->sched_runtime != p->se.slice))) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; + if (dl_policy(policy) && dl_param_changed(p, attr)) + goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; + + p->sched_reset_on_fork = reset_on_fork; + retval = 0; + goto unlock; + } +change: + + if (user) { +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Do not allow real-time tasks into groups that have no runtime + * assigned. + */ + if (rt_group_sched_enabled() && + rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { + retval = -EPERM; + goto unlock; + } +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_SMP + if (dl_bandwidth_enabled() && dl_policy(policy) && + !(attr->sched_flags & SCHED_FLAG_SUGOV)) { + cpumask_t *span = rq->rd->span; + + /* + * Don't allow tasks with an affinity mask smaller than + * the entire root_domain to become SCHED_DEADLINE. We + * will also fail if there's no bandwidth available. + */ + if (!cpumask_subset(span, p->cpus_ptr) || + rq->rd->dl_bw.bw == 0) { + retval = -EPERM; + goto unlock; + } + } +#endif + } + + /* Re-check policy now with rq lock held: */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, p, &rf); + if (cpuset_locked) + cpuset_unlock(); + goto recheck; + } + + /* + * If setscheduling to SCHED_DEADLINE (or changing the parameters + * of a SCHED_DEADLINE task) we need to check if enough bandwidth + * is available. + */ + if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { + retval = -EBUSY; + goto unlock; + } + + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice); + if (pi) { + /* + * Take priority boosted tasks into account. If the new + * effective priority is unchanged, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + newprio = rt_effective_prio(p, newprio); + if (newprio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; + } + + prev_class = p->sched_class; + next_class = __setscheduler_class(policy, newprio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + + queued = task_on_rq_queued(p); + running = task_current_donor(rq, p); + if (queued) + dequeue_task(rq, p, queue_flags); + if (running) + put_prev_task(rq, p); + + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + p->sched_class = next_class; + p->prio = newprio; + } + __setscheduler_uclamp(p, attr); + check_class_changing(rq, p, prev_class); + + if (queued) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; + + enqueue_task(rq, p, queue_flags); + } + if (running) + set_next_task(rq, p); + + check_class_changed(rq, p, prev_class, oldprio); + + /* Avoid rq from going away on us: */ + preempt_disable(); + head = splice_balance_callbacks(rq); + task_rq_unlock(rq, p, &rf); + + if (pi) { + if (cpuset_locked) + cpuset_unlock(); + rt_mutex_adjust_pi(p); + } + + /* Run balance callbacks after we've adjusted the PI chain: */ + balance_callbacks(rq, head); + preempt_enable(); + + return 0; + +unlock: + task_rq_unlock(rq, p, &rf); + if (cpuset_locked) + cpuset_unlock(); + return retval; +} + +static int _sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool check) +{ + struct sched_attr attr = { + .sched_policy = policy, + .sched_priority = param->sched_priority, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + + if (p->se.custom_slice) + attr.sched_runtime = p->se.slice; + + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + policy &= ~SCHED_RESET_ON_FORK; + attr.sched_policy = policy; + } + + return __sched_setscheduler(p, &attr, check, true); +} +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Use sched_set_fifo(), read its comment. + * + * Return: 0 on success. An error code otherwise. + * + * NOTE that the task may be already dead. + */ +int sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param) +{ + return _sched_setscheduler(p, policy, param, true); +} + +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, true, true); +} + +int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, false, true); +} +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); + +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. + * + * Return: 0 on success. An error code otherwise. + */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + const struct sched_param *param) +{ + return _sched_setscheduler(p, policy, param, false); +} + +/* + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally + * incapable of resource management, which is the one thing an OS really should + * be doing. + * + * This is of course the reason it is limited to privileged users only. + * + * Worse still; it is fundamentally impossible to compose static priority + * workloads. You cannot take two correctly working static prio workloads + * and smash them together and still expect them to work. + * + * For this reason 'all' FIFO tasks the kernel creates are basically at: + * + * MAX_RT_PRIO / 2 + * + * The administrator _MUST_ configure the system, the kernel simply doesn't + * know enough information to make a sensible choice. + */ +void sched_set_fifo(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo); + +/* + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. + */ +void sched_set_fifo_low(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo_low); + +void sched_set_normal(struct task_struct *p, int nice) +{ + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + .sched_nice = nice, + }; + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_normal); + +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + struct sched_param lparam; + + if (unlikely(!param || pid < 0)) + return -EINVAL; + if (copy_from_user(&lparam, param, sizeof(struct sched_param))) + return -EFAULT; + + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; + + return sched_setscheduler(p, policy, &lparam); +} + +/* + * Mimics kernel/events/core.c perf_copy_attr(). + */ +static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) +{ + u32 size; + int ret; + + /* Zero the full structure, so that a short copy will be nice: */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + /* ABI compatibility quirk: */ + if (!size) + size = SCHED_ATTR_SIZE_VER0; + if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) + goto err_size; + + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; + } + + if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + + /* + * XXX: Do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? + */ + attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); + + return 0; + +err_size: + put_user(sizeof(*attr), &uattr->size); + return -E2BIG; +} + +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ + if (task_has_dl_policy(p)) { + __getparam_dl(p, attr); + } else if (task_has_rt_policy(p)) { + attr->sched_priority = p->rt_priority; + } else { + attr->sched_nice = task_nice(p); + attr->sched_runtime = p->se.slice; + } +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) +{ + if (policy < 0) + return -EINVAL; + + return do_sched_setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) +{ + return do_sched_setscheduler(pid, SETPARAM_POLICY, param); +} + +/** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @flags: for future extension. + */ +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) +{ + struct sched_attr attr; + int retval; + + if (unlikely(!uattr || pid < 0 || flags)) + return -EINVAL; + + retval = sched_copy_attr(uattr, &attr); + if (retval) + return retval; + + if ((int)attr.sched_policy < 0) + return -EINVAL; + if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) + attr.sched_policy = SETPARAM_POLICY; + + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; + + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); + + return sched_setattr(p, &attr); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + * + * Return: On success, the policy of the thread. Otherwise, a negative error + * code. + */ +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) +{ + struct task_struct *p; + int retval; + + if (pid < 0) + return -EINVAL; + + guard(rcu)(); + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (!retval) { + retval = p->policy; + if (p->sched_reset_on_fork) + retval |= SCHED_RESET_ON_FORK; + } + return retval; +} + +/** + * sys_sched_getparam - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + * + * Return: On success, 0 and the RT priority is in @param. Otherwise, an error + * code. + */ +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) +{ + struct sched_param lp = { .sched_priority = 0 }; + struct task_struct *p; + int retval; + + if (unlikely(!param || pid < 0)) + return -EINVAL; + + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; + } + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @usize: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. + */ +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, usize, unsigned int, flags) +{ + struct sched_attr kattr = { }; + struct task_struct *p; + int retval; + + if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || + usize < SCHED_ATTR_SIZE_VER0 || flags)) + return -EINVAL; + + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + kattr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; + +#ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ + kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; +#endif + } + + kattr.size = min(usize, sizeof(kattr)); + return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); +} + +#ifdef CONFIG_SMP +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) +{ + /* + * If the task isn't a deadline task or admission control is + * disabled then we don't care about affinity changes. + */ + if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) + return 0; + + /* + * The special/sugov task isn't part of regular bandwidth/admission + * control so let userspace change affinities. + */ + if (dl_entity_is_special(&p->dl)) + return 0; + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ + guard(rcu)(); + if (!cpumask_subset(task_rq(p)->rd->span, mask)) + return -EBUSY; + + return 0; +} +#endif /* CONFIG_SMP */ + +int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) +{ + int retval; + cpumask_var_t cpus_allowed, new_mask; + + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) + return -ENOMEM; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, ctx->new_mask, cpus_allowed); + + ctx->new_mask = new_mask; + ctx->flags |= SCA_CHECK; + + retval = dl_task_check_affinity(p, new_mask); + if (retval) + goto out_free_new_mask; + + retval = __set_cpus_allowed_ptr(p, ctx); + if (retval) + goto out_free_new_mask; + + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset update. + * Just reset the cpumask to the cpuset's cpus_allowed. + */ + cpumask_copy(new_mask, cpus_allowed); + + /* + * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr() + * will restore the previous user_cpus_ptr value. + * + * In the unlikely event a previous user_cpus_ptr exists, + * we need to further restrict the mask to what is allowed + * by that old user_cpus_ptr. + */ + if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) { + bool empty = !cpumask_and(new_mask, new_mask, + ctx->user_mask); + + if (empty) + cpumask_copy(new_mask, cpus_allowed); + } + __set_cpus_allowed_ptr(p, ctx); + retval = -EINVAL; + } + +out_free_new_mask: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + struct affinity_context ac; + struct cpumask *user_mask; + int retval; + + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; + + if (p->flags & PF_NO_SETAFFINITY) + return -EINVAL; + + if (!check_same_owner(p)) { + guard(rcu)(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) + return -EPERM; + } + + retval = security_task_setscheduler(p); + if (retval) + return retval; + + /* + * With non-SMP configs, user_cpus_ptr/user_mask isn't used and + * alloc_user_cpus_ptr() returns NULL. + */ + user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); + if (user_mask) { + cpumask_copy(user_mask, in_mask); + } else if (IS_ENABLED(CONFIG_SMP)) { + return -ENOMEM; + } + + ac = (struct affinity_context){ + .new_mask = in_mask, + .user_mask = user_mask, + .flags = SCA_USER, + }; + + retval = __sched_setaffinity(p, &ac); + kfree(ac.user_mask); + + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + struct cpumask *new_mask) +{ + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the CPU affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new CPU mask + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + cpumask_var_t new_mask; + int retval; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; +} + +long sched_getaffinity(pid_t pid, struct cpumask *mask) +{ + struct task_struct *p; + int retval; + + guard(rcu)(); + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + guard(raw_spinlock_irqsave)(&p->pi_lock); + cpumask_and(mask, &p->cpus_mask, cpu_active_mask); + + return 0; +} + +/** + * sys_sched_getaffinity - get the CPU affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current CPU mask + * + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise. + */ +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + int ret; + cpumask_var_t mask; + + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) + return -EINVAL; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + unsigned int retlen = min(len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen)) + ret = -EFAULT; + else + ret = retlen; + } + free_cpumask_var(mask); + + return ret; +} + +static void do_sched_yield(void) +{ + struct rq_flags rf; + struct rq *rq; + + rq = this_rq_lock_irq(&rf); + + schedstat_inc(rq->yld_count); + current->sched_class->yield_task(rq); + + preempt_disable(); + rq_unlock_irq(rq, &rf); + sched_preempt_enable_no_resched(); + + schedule(); +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. + * + * Return: 0. + */ +SYSCALL_DEFINE0(sched_yield) +{ + do_sched_yield(); + return 0; +} + +/** + * yield - yield the current processor to other threads. + * + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, it's already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + do_sched_yield(); +} +EXPORT_SYMBOL(yield); + +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Return: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. + */ +int __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *curr = current; + struct rq *rq, *p_rq; + int yielded = 0; + + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { + rq = this_rq(); + +again: + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) + return -ESRCH; + + guard(double_rq_lock)(rq, p_rq); + if (task_rq(p) != p_rq) + goto again; + + if (!curr->sched_class->yield_to_task) + return 0; + + if (curr->sched_class != p->sched_class) + return 0; + + if (task_on_cpu(p_rq, p) || !task_is_running(p)) + return 0; + + yielded = curr->sched_class->yield_to_task(rq, p); + if (yielded) { + schedstat_inc(rq->yld_count); + /* + * Make p's CPU reschedule; pick_next_entity + * takes care of fairness. + */ + if (preempt && rq != p_rq) + resched_curr(p_rq); + } + } + + if (yielded) + schedule(); + + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * Return: On success, this syscall returns the maximum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. + */ +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_RT_PRIO-1; + break; + case SCHED_DEADLINE: + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + case SCHED_EXT: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * Return: On success, this syscall returns the minimum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. + */ +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_DEADLINE: + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + case SCHED_EXT: + ret = 0; + } + return ret; +} + +static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) +{ + unsigned int time_slice = 0; + int retval; + + if (pid < 0) + return -EINVAL; + + scoped_guard (rcu) { + struct task_struct *p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + scoped_guard (task_rq_lock, p) { + struct rq *rq = scope.rq; + if (p->sched_class->get_rr_interval) + time_slice = p->sched_class->get_rr_interval(rq, p); + } + } + + jiffies_to_timespec64(time_slice, t); + return 0; +} + +/** + * sys_sched_rr_get_interval - return the default time-slice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the time-slice value. + * + * this syscall writes the default time-slice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the time-slice is in @interval. Otherwise, + * an error code. + */ +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct __kernel_timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = put_timespec64(&t, interval); + + return retval; +} + +#ifdef CONFIG_COMPAT_32BIT_TIME +SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, + struct old_timespec32 __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = put_old_timespec32(&t, interval); + return retval; +} +#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 99ea5986038c..b958fe48e020 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -6,13 +6,19 @@ #include <linux/bsearch.h> DEFINE_MUTEX(sched_domains_mutex); +void sched_domains_mutex_lock(void) +{ + mutex_lock(&sched_domains_mutex); +} +void sched_domains_mutex_unlock(void) +{ + mutex_unlock(&sched_domains_mutex); +} /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; -#ifdef CONFIG_SCHED_DEBUG - static int __init sched_debug_setup(char *str) { sched_debug_verbose = true; @@ -151,15 +157,6 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) break; } } -#else /* !CONFIG_SCHED_DEBUG */ - -# define sched_debug_verbose 0 -# define sched_domain_debug(sd, cpu) do { } while (0) -static inline bool sched_debug(void) -{ - return false; -} -#endif /* CONFIG_SCHED_DEBUG */ /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | @@ -215,8 +212,6 @@ static bool sched_energy_update; static bool sched_is_eas_possible(const struct cpumask *cpu_mask) { bool any_asym_capacity = false; - struct cpufreq_policy *policy; - struct cpufreq_governor *gov; int i; /* EAS is enabled for asymmetric CPU capacity topologies. */ @@ -251,25 +246,12 @@ static bool sched_is_eas_possible(const struct cpumask *cpu_mask) return false; } - /* Do not attempt EAS if schedutil is not being used. */ - for_each_cpu(i, cpu_mask) { - policy = cpufreq_cpu_get(i); - if (!policy) { - if (sched_debug()) { - pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d", - cpumask_pr_args(cpu_mask), i); - } - return false; - } - gov = policy->governor; - cpufreq_cpu_put(policy); - if (gov != &schedutil_gov) { - if (sched_debug()) { - pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n", - cpumask_pr_args(cpu_mask)); - } - return false; + if (!cpufreq_ready_for_eas(cpu_mask)) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n", + cpumask_pr_args(cpu_mask)); } + return false; } return true; @@ -285,7 +267,7 @@ void rebuild_sched_domains_energy(void) } #ifdef CONFIG_PROC_SYSCTL -static int sched_energy_aware_handler(struct ctl_table *table, int write, +static int sched_energy_aware_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; @@ -312,7 +294,7 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write, return ret; } -static struct ctl_table sched_energy_aware_sysctls[] = { +static const struct ctl_table sched_energy_aware_sysctls[] = { { .procname = "sched_energy_aware", .data = &sysctl_sched_energy_aware, @@ -322,7 +304,6 @@ static struct ctl_table sched_energy_aware_sysctls[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - {} }; static int __init sched_energy_aware_sysctl_init(void) @@ -502,7 +483,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_clear_cpu(rq->cpu, old_rd->span); /* - * If we dont want to free the old_rd yet then + * If we don't want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function: */ @@ -517,6 +498,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); + /* + * Because the rq is not a task, dl_add_task_root_domain() did not + * move the fair server bw to the rd if it already started. + * Add it now. + */ + if (rq->fair_server.dl_server) + __dl_server_attach_root(&rq->fair_server, rq); + rq_unlock_irqrestore(rq, &rf); if (old_rd) @@ -553,7 +542,7 @@ static int init_rootdomain(struct root_domain *rd) rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif - rd->visit_gen = 0; + rd->visit_cookie = 0; init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; @@ -1177,7 +1166,7 @@ fail: * uniquely identify each group (for a given domain): * * - The first is the balance_cpu (see should_we_balance() and the - * load-balance blub in fair.c); for each group we only want 1 CPU to + * load-balance blurb in fair.c); for each group we only want 1 CPU to * continue balancing at a higher domain. * * - The second is the sched_group_capacity; we want all identical groups @@ -1329,14 +1318,63 @@ next: update_group_capacity(sd, cpu); } -/* - * Asymmetric CPU capacity bits - */ -struct asym_cap_data { - struct list_head link; - unsigned long capacity; - unsigned long cpus[]; -}; +#ifdef CONFIG_SMP + +/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ +void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ + int asym_prefer_cpu = cpu; + struct sched_domain *sd; + + guard(rcu)(); + + for_each_domain(cpu, sd) { + struct sched_group *sg; + int group_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + continue; + + /* + * Groups of overlapping domain are replicated per NUMA + * node and will require updating "asym_prefer_cpu" on + * each local copy. + * + * If you are hitting this warning, consider moving + * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" + * which is shared by all the overlapping groups. + */ + WARN_ON_ONCE(sd->flags & SD_OVERLAP); + + sg = sd->groups; + if (cpu != sg->asym_prefer_cpu) { + /* + * Since the parent is a superset of the current group, + * if the cpu is not the "asym_prefer_cpu" at the + * current level, it cannot be the preferred CPU at a + * higher levels either. + */ + if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu)) + return; + + WRITE_ONCE(sg->asym_prefer_cpu, cpu); + continue; + } + + /* Ranking has improved; CPU is still the preferred one. */ + if (new_prio >= old_prio) + continue; + + for_each_cpu(group_cpu, sched_group_span(sg)) { + if (sched_asym_prefer(group_cpu, asym_prefer_cpu)) + asym_prefer_cpu = group_cpu; + } + + WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); + } +} + +#endif /* CONFIG_SMP */ /* * Set of available CPUs grouped by their corresponding capacities @@ -1344,9 +1382,7 @@ struct asym_cap_data { * capacity. * The lifespan of data is unlimited. */ -static LIST_HEAD(asym_cap_list); - -#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) +LIST_HEAD(asym_cap_list); /* * Verify whether there is any CPU capacity asymmetry in a given sched domain. @@ -1386,21 +1422,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span, } +static void free_asym_cap_entry(struct rcu_head *head) +{ + struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu); + kfree(entry); +} + static inline void asym_cpu_capacity_update_data(int cpu) { unsigned long capacity = arch_scale_cpu_capacity(cpu); - struct asym_cap_data *entry = NULL; + struct asym_cap_data *insert_entry = NULL; + struct asym_cap_data *entry; + /* + * Search if capacity already exits. If not, track which the entry + * where we should insert to keep the list ordered descending. + */ list_for_each_entry(entry, &asym_cap_list, link) { if (capacity == entry->capacity) goto done; + else if (!insert_entry && capacity > entry->capacity) + insert_entry = list_prev_entry(entry, link); } entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL); if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n")) return; entry->capacity = capacity; - list_add(&entry->link, &asym_cap_list); + + /* If NULL then the new capacity is the smallest, add last. */ + if (!insert_entry) + list_add_tail_rcu(&entry->link, &asym_cap_list); + else + list_add_rcu(&entry->link, &insert_entry->link); done: __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); } @@ -1423,8 +1477,8 @@ static void asym_cpu_capacity_scan(void) list_for_each_entry_safe(entry, next, &asym_cap_list, link) { if (cpumask_empty(cpu_capacity_span(entry))) { - list_del(&entry->link); - kfree(entry); + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); } } @@ -1434,8 +1488,8 @@ static void asym_cpu_capacity_scan(void) */ if (list_is_singular(&asym_cap_list)) { entry = list_first_entry(&asym_cap_list, typeof(*entry), link); - list_del(&entry->link); - kfree(entry); + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); } } @@ -1468,7 +1522,7 @@ static void set_domain_attribute(struct sched_domain *sd, } else request = attr->relax_domain_level; - if (sd->level > request) { + if (sd->level >= request) { /* Turn off idle balance on this domain: */ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } @@ -1621,9 +1675,7 @@ sd_init(struct sched_domain_topology_level *tl, .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, -#ifdef CONFIG_SCHED_DEBUG .name = tl->name, -#endif }; sd_span = sched_domain_span(sd); @@ -1847,7 +1899,7 @@ void sched_init_numa(int offline_node) struct cpumask ***masks; /* - * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * O(nr_nodes^2) de-duplicating selection sort -- in order to find the * unique distances in the node_distance() table. */ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); @@ -2089,7 +2141,7 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu) for (i = 0; i < sched_domains_numa_levels; i++) { if (!masks[i][j]) break; - cpu = cpumask_any_and(cpus, masks[i][j]); + cpu = cpumask_any_and_distribute(cpus, masks[i][j]); if (cpu < nr_cpu_ids) { found = cpu; break; @@ -2263,9 +2315,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; -#ifdef CONFIG_SCHED_DEBUG sgc->id = j; -#endif *per_cpu_ptr(sdd->sgc, j) = sgc; } @@ -2324,10 +2374,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); -#endif /* Fixup, ensure @sd has at least @child CPUs. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), @@ -2342,37 +2390,54 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve /* * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for - * any two given CPUs at this (non-NUMA) topology level. + * any two given CPUs on non-NUMA topology levels. */ -static bool topology_span_sane(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, int cpu) +static bool topology_span_sane(const struct cpumask *cpu_map) { - int i; + struct sched_domain_topology_level *tl; + struct cpumask *covered, *id_seen; + int cpu; - /* NUMA levels are allowed to overlap */ - if (tl->flags & SDTL_OVERLAP) - return true; + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + id_seen = sched_domains_tmpmask2; - /* - * Non-NUMA levels cannot partially overlap - they must be either - * completely equal or completely disjoint. Otherwise we can end up - * breaking the sched_group lists - i.e. a later get_group() pass - * breaks the linking done for an earlier span. - */ - for_each_cpu(i, cpu_map) { - if (i == cpu) + for_each_sd_topology(tl) { + + /* NUMA levels are allowed to overlap */ + if (tl->flags & SDTL_OVERLAP) continue; + + cpumask_clear(covered); + cpumask_clear(id_seen); + /* - * We should 'and' all those masks with 'cpu_map' to exactly - * match the topology we're about to build, but that can only - * remove CPUs, which only lessens our ability to detect - * overlaps + * Non-NUMA levels cannot partially overlap - they must be either + * completely equal or completely disjoint. Otherwise we can end up + * breaking the sched_group lists - i.e. a later get_group() pass + * breaks the linking done for an earlier span. */ - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && - cpumask_intersects(tl->mask(cpu), tl->mask(i))) - return false; - } + for_each_cpu(cpu, cpu_map) { + const struct cpumask *tl_cpu_mask = tl->mask(cpu); + int id; + /* lowest bit set in this mask is used as a unique id */ + id = cpumask_first(tl_cpu_mask); + + if (cpumask_test_cpu(id, id_seen)) { + /* First CPU has already been seen, ensure identical spans */ + if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) + return false; + } else { + /* First CPU hasn't been seen before, ensure it's a completely new span */ + if (cpumask_intersects(tl_cpu_mask, covered)) + return false; + + cpumask_or(covered, covered, tl_cpu_mask); + cpumask_set_cpu(id, id_seen); + } + } + } return true; } @@ -2405,9 +2470,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = NULL; for_each_sd_topology(tl) { - if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) - goto error; - sd = build_sched_domain(tl, cpu_map, attr, sd, i); has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; @@ -2421,6 +2483,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + if (WARN_ON(!topology_span_sane(cpu_map))) + goto error; + /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { @@ -2507,16 +2572,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { - unsigned long capacity; - rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); - capacity = arch_scale_cpu_capacity(i); - /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (capacity > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, capacity); - cpu_attach_domain(sd, d.rd, i); if (lowest_flag_domain(i, SD_CLUSTER)) @@ -2530,10 +2588,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_cluster) static_branch_inc_cpuslocked(&sched_cluster_active); - if (rq && sched_debug_verbose) { - pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", - cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); - } + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); ret = 0; error: @@ -2681,7 +2737,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock and sched_domains_mutex held */ -void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], +static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { bool __maybe_unused has_eas = false; @@ -2713,19 +2769,8 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_cur[i], doms_new[j]) && - dattrs_equal(dattr_cur, i, dattr_new, j)) { - struct root_domain *rd; - - /* - * This domain won't be destroyed and as such - * its dl_bw->total_bw needs to be cleared. It - * will be recomputed in function - * update_tasks_root_domain(). - */ - rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; - dl_clear_root_domain(rd); + dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; - } } /* No match - a current sched domain not in new doms_new[] */ detach_destroy_domains(doms_cur[i]); @@ -2755,7 +2800,7 @@ match2: } #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - /* Build perf. domains: */ + /* Build perf domains: */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !sched_energy_update; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && @@ -2764,7 +2809,7 @@ match2: goto match3; } } - /* No match - add perf. domains for a new rd */ + /* No match - add perf domains for a new rd */ has_eas |= build_perf_domains(doms_new[i]); match3: ; @@ -2782,6 +2827,7 @@ match3: ndoms_cur = ndoms_new; update_sched_domain_debugfs(); + dl_rebuild_rd_accounting(); } /* @@ -2790,7 +2836,7 @@ match3: void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); } diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 0b1cd985dc27..b410b61cec95 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -9,7 +9,7 @@ static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; -wait_queue_head_t *bit_waitqueue(void *word, int bit) +wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit) { const int shift = BITS_PER_LONG == 32 ? 5 : 6; unsigned long val = (unsigned long)word << shift | bit; @@ -33,7 +33,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync EXPORT_SYMBOL(wake_bit_function); /* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * To allow interruptible waiting and asynchronous (i.e. non-blocking) * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are * permitted return codes. Nonzero return codes halt waiting and return. */ @@ -55,7 +55,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ } EXPORT_SYMBOL(__wait_on_bit); -int __sched out_of_line_wait_on_bit(void *word, int bit, +int __sched out_of_line_wait_on_bit(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -66,7 +66,7 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, EXPORT_SYMBOL(out_of_line_wait_on_bit); int __sched out_of_line_wait_on_bit_timeout( - void *word, int bit, wait_bit_action_f *action, + unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode, unsigned long timeout) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -108,7 +108,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry } EXPORT_SYMBOL(__wait_on_bit_lock); -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, +int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -118,7 +118,7 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); -void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) +void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); @@ -128,23 +128,31 @@ void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) EXPORT_SYMBOL(__wake_up_bit); /** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * wake_up_bit - wake up waiters on a bit + * @word: the address containing the bit being waited on + * @bit: the bit at that address being waited on * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. + * Wake up any process waiting in wait_on_bit() or similar for the + * given bit to be cleared. * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_atomic(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. + * The wake-up is sent to tasks in a waitqueue selected by hash from a + * shared pool. Only those tasks on that queue which have requested + * wake_up on this specific address and bit will be woken, and only if the + * bit is clear. + * + * In order for this to function properly there must be a full memory + * barrier after the bit is cleared and before this function is called. + * If the bit was cleared atomically, such as a by clear_bit() then + * smb_mb__after_atomic() can be used, othwewise smb_mb() is needed. + * If the bit was cleared with a fully-ordered operation, no further + * barrier is required. + * + * Normally the bit should be cleared by an operation with RELEASE + * semantics so that any changes to memory made before the bit is + * cleared are guaranteed to be visible after the matching wait_on_bit() + * completes. */ -void wake_up_bit(void *word, int bit) +void wake_up_bit(unsigned long *word, int bit) { __wake_up_bit(bit_waitqueue(word, bit), word, bit); } @@ -188,6 +196,36 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int } EXPORT_SYMBOL(init_wait_var_entry); +/** + * wake_up_var - wake up waiters on a variable (kernel address) + * @var: the address of the variable being waited on + * + * Wake up any process waiting in wait_var_event() or similar for the + * given variable to change. wait_var_event() can be waiting for an + * arbitrary condition to be true and associates that condition with an + * address. Calling wake_up_var() suggests that the condition has been + * made true, but does not strictly require the condtion to use the + * address given. + * + * The wake-up is sent to tasks in a waitqueue selected by hash from a + * shared pool. Only those tasks on that queue which have requested + * wake_up on this specific address will be woken. + * + * In order for this to function properly there must be a full memory + * barrier after the variable is updated (or more accurately, after the + * condition waited on has been made to be true) and before this function + * is called. If the variable was updated atomically, such as a by + * atomic_dec() then smb_mb__after_atomic() can be used. If the + * variable was updated by a fully ordered operation such as + * atomic_dec_and_test() then no extra barrier is required. Otherwise + * smb_mb() is needed. + * + * Normally the variable should be updated (the condition should be made + * to be true) by an operation with RELEASE semantics such as + * smp_store_release() so that any changes to memory made before the + * variable was updated are guaranteed to be visible after the matching + * wait_var_event() completes. + */ void wake_up_var(void *var) { __wake_up_bit(__var_waitqueue(var), var, -1); @@ -228,20 +266,6 @@ __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - io_schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_io_timeout); - void __init wait_bit_init(void) { int i; |