diff options
Diffstat (limited to 'kernel/sched')
40 files changed, 1080 insertions, 1271 deletions
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2b331822c7e7..cdea931aae30 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -4,6 +4,9 @@ * Auto-group scheduling implementation: */ +#include "autogroup.h" +#include "sched.h" + unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; @@ -25,9 +28,9 @@ static void __init sched_autogroup_sysctl_init(void) { register_sysctl_init("kernel", sched_autogroup_sysctls); } -#else +#else /* !CONFIG_SYSCTL: */ #define sched_autogroup_sysctl_init() do { } while (0) -#endif +#endif /* !CONFIG_SYSCTL */ void __init autogroup_init(struct task_struct *init_task) { @@ -108,7 +111,7 @@ static inline struct autogroup *autogroup_create(void) free_rt_sched_group(tg); tg->rt_se = root_task_group.rt_se; tg->rt_rq = root_task_group.rt_rq; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ tg->autogroup = ag; sched_online_group(tg, &root_task_group); diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 90d69f2c5eaf..06c82b2bdfb5 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -2,6 +2,8 @@ #ifndef _KERNEL_SCHED_AUTOGROUP_H #define _KERNEL_SCHED_AUTOGROUP_H +#include "sched.h" + #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { @@ -41,7 +43,7 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) extern int autogroup_path(struct task_group *tg, char *buf, int buflen); -#else /* !CONFIG_SCHED_AUTOGROUP */ +#else /* !CONFIG_SCHED_AUTOGROUP: */ static inline void autogroup_init(struct task_struct *init_task) { } static inline void autogroup_free(struct task_group *tg) { } @@ -61,6 +63,6 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) return 0; } -#endif /* CONFIG_SCHED_AUTOGROUP */ +#endif /* !CONFIG_SCHED_AUTOGROUP */ #endif /* _KERNEL_SCHED_AUTOGROUP_H */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 72d97aa8b726..c4a488e67aa7 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -50,11 +50,9 @@ #include "idle.c" #include "rt.c" +#include "cpudeadline.c" -#ifdef CONFIG_SMP -# include "cpudeadline.c" -# include "pelt.c" -#endif +#include "pelt.c" #include "cputime.c" #include "deadline.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index bf9d8db94b70..e2cf3b08d4e9 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -80,11 +80,10 @@ #include "wait_bit.c" #include "wait.c" -#ifdef CONFIG_SMP -# include "cpupri.c" -# include "stop_task.c" -# include "topology.c" -#endif +#include "cpupri.c" +#include "stop_task.c" + +#include "topology.c" #ifdef CONFIG_SCHED_CORE # include "core_sched.c" diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index a09655b48140..f5e6dd6a6b3a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -54,6 +54,9 @@ * */ +#include <linux/sched/clock.h> +#include "sched.h" + /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. @@ -471,7 +474,7 @@ notrace void sched_clock_idle_wakeup_event(void) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */ void __init sched_clock_init(void) { @@ -489,7 +492,7 @@ notrace u64 sched_clock_cpu(int cpu) return sched_clock(); } -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * Running clock - returns the time that has elapsed while a guest has been diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 3561ab533dd4..19ee702273c0 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -13,6 +13,11 @@ * Waiting for completion is a typically sync point, but not an exclusion point. */ +#include <linux/linkage.h> +#include <linux/sched/debug.h> +#include <linux/completion.h> +#include "sched.h" + static void complete_with_flags(struct completion *x, int wake_flags) { unsigned long flags; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8988d38d46a3..3ec00d08d46a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -69,8 +69,8 @@ #include <linux/livepatch_sched.h> #ifdef CONFIG_PREEMPT_DYNAMIC -# ifdef CONFIG_GENERIC_ENTRY -# include <linux/entry-common.h> +# ifdef CONFIG_GENERIC_IRQ_ENTRY +# include <linux/irq-entry-common.h> # endif #endif @@ -96,6 +96,7 @@ #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" #include "../smpboot.h" +#include "../locking/mutex.h" EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -119,6 +120,35 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#ifdef CONFIG_SCHED_PROXY_EXEC +DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); +static int __init setup_proxy_exec(char *str) +{ + bool proxy_enable = true; + + if (*str && kstrtobool(str + 1, &proxy_enable)) { + pr_warn("Unable to parse sched_proxy_exec=\n"); + return 0; + } + + if (proxy_enable) { + pr_info("sched_proxy_exec enabled via boot arg\n"); + static_branch_enable(&__sched_proxy_exec); + } else { + pr_info("sched_proxy_exec disabled via boot arg\n"); + static_branch_disable(&__sched_proxy_exec); + } + return 1; +} +#else +static int __init setup_proxy_exec(char *str) +{ + pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n"); + return 0; +} +#endif +__setup("sched_proxy_exec", setup_proxy_exec); + /* * Debugging: various feature bits * @@ -481,13 +511,13 @@ void sched_core_put(void) schedule_work(&_work); } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* need a wrapper since we may need to trace from modules */ EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); @@ -650,7 +680,6 @@ void raw_spin_rq_unlock(struct rq *rq) raw_spin_unlock(rq_lockp(rq)); } -#ifdef CONFIG_SMP /* * double_rq_lock - safely lock two runqueues */ @@ -667,7 +696,6 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) double_rq_clock_clear_update(rq1, rq2); } -#endif /* * __task_rq_lock - lock the rq @p resides on. @@ -853,8 +881,6 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } -#ifdef CONFIG_SMP - static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; @@ -899,33 +925,12 @@ void hrtick_start(struct rq *rq, u64 delay) smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); } -#else -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and IRQs disabled - */ -void hrtick_start(struct rq *rq, u64 delay) -{ - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. - */ - delay = max_t(u64, delay, 10000LL); - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED_HARD); -} - -#endif /* CONFIG_SMP */ - static void hrtick_rq_init(struct rq *rq) { -#ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); -#endif hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } -#else /* CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_clear(struct rq *rq) { } @@ -933,7 +938,7 @@ static inline void hrtick_clear(struct rq *rq) static inline void hrtick_rq_init(struct rq *rq) { } -#endif /* CONFIG_SCHED_HRTICK */ +#endif /* !CONFIG_SCHED_HRTICK */ /* * try_cmpxchg based fetch_or() macro so it works for different integer types: @@ -949,7 +954,7 @@ static inline void hrtick_rq_init(struct rq *rq) _val; \ }) -#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +#ifdef TIF_POLLING_NRFLAG /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, * this avoids any races wrt polling state changes and thereby avoids @@ -988,13 +993,11 @@ static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) return true; } -#ifdef CONFIG_SMP static inline bool set_nr_if_polling(struct task_struct *p) { return false; } #endif -#endif static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) { @@ -1167,7 +1170,6 @@ void resched_cpu(int cpu) raw_spin_rq_unlock_irqrestore(rq, flags); } -#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy CPU for migrating timers @@ -1374,10 +1376,8 @@ bool sched_can_stop_tick(struct rq *rq) return true; } #endif /* CONFIG_NO_HZ_FULL */ -#endif /* CONFIG_SMP */ -#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ - (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) /* * Iterate task_group tree rooted at *from, calling @down when first entering a * node and @up when leaving it for the final time. @@ -1971,7 +1971,7 @@ undo: sysctl_sched_uclamp_util_min_rt_default = old_min_rt; return result; } -#endif +#endif /* CONFIG_SYSCTL */ static void uclamp_fork(struct task_struct *p) { @@ -2037,13 +2037,13 @@ static void __init init_uclamp(void) } } -#else /* !CONFIG_UCLAMP_TASK */ +#else /* !CONFIG_UCLAMP_TASK: */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } -#endif /* CONFIG_UCLAMP_TASK */ +#endif /* !CONFIG_UCLAMP_TASK */ bool sched_task_on_rq(struct task_struct *p) { @@ -2353,8 +2353,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state return ncsw; } -#ifdef CONFIG_SMP - static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); @@ -2936,8 +2934,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag struct set_affinity_pending my_pending = { }, *pending = NULL; bool stop_pending, complete = false; - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + /* + * Can the task run on the task's current CPU? If so, we're done + * + * We are also done if the task is the current donor, boosting a lock- + * holding proxy, (and potentially has been migrated outside its + * current or previous affinity mask) + */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) || + (task_current_donor(rq, p) && !task_current(rq, p))) { struct task_struct *push_task = NULL; if ((flags & SCA_MIGRATE_ENABLE) && @@ -3305,6 +3310,8 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) WARN_ON_ONCE(ret); } +#ifdef CONFIG_SMP + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { unsigned int state = READ_ONCE(p->__state); @@ -3358,14 +3365,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +#endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { - __schedstat_inc(p->stats.numa_task_swapped); - count_vm_numa_event(NUMA_TASK_SWAP); - count_memcg_event_mm(p->mm, NUMA_TASK_SWAP); - if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; struct rq_flags srf, drf; @@ -3661,17 +3665,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else /* CONFIG_SMP */ - -static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } - -static inline bool rq_has_pinned_tasks(struct rq *rq) -{ - return false; -} - -#endif /* !CONFIG_SMP */ - static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -3682,7 +3675,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) rq = this_rq(); -#ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); __schedstat_inc(p->stats.nr_wakeups_local); @@ -3702,7 +3694,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_MIGRATED) __schedstat_inc(p->stats.nr_wakeups_migrate); -#endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); __schedstat_inc(p->stats.nr_wakeups); @@ -3731,13 +3722,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (p->sched_contributes_to_load) rq->nr_uninterruptible--; -#ifdef CONFIG_SMP if (wake_flags & WF_RQ_SELECTED) en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else -#endif if (p->in_iowait) { delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); @@ -3748,7 +3737,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, ttwu_do_wakeup(p); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Our task @p is fully woken up and running; so it's safe to @@ -3770,7 +3758,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } -#endif } /* @@ -3824,7 +3811,6 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) return ret; } -#ifdef CONFIG_SMP void sched_ttwu_pending(void *arg) { struct llist_node *llist = arg; @@ -3891,7 +3877,9 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); +#ifdef CONFIG_SMP __smp_call_single_queue(cpu, &p->wake_entry.llist); +#endif } void wake_up_if_idle(int cpu) @@ -3943,6 +3931,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) if (!scx_allow_ttwu_queue(p)) return false; +#ifdef CONFIG_SMP + if (p->sched_class == &stop_sched_class) + return false; +#endif + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. @@ -3992,15 +3985,6 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } -#else /* !CONFIG_SMP */ - -static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -{ - return false; -} - -#endif /* CONFIG_SMP */ - static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); @@ -4256,7 +4240,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) break; -#ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -4335,9 +4318,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } -#else - cpu = task_cpu(p); -#endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); } @@ -4370,14 +4350,12 @@ static bool __task_needs_rq_lock(struct task_struct *p) if (p->on_rq) return true; -#ifdef CONFIG_SMP /* * Ensure the task has finished __schedule() and will not be referenced * anymore. Again, see try_to_wake_up() for a longer comment. */ smp_rmb(); smp_cond_load_acquire(&p->on_cpu, !VAL); -#endif return false; } @@ -4533,10 +4511,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); -#ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; -#endif init_sched_mm_cid(p); } @@ -4599,8 +4575,8 @@ static int sysctl_numa_balancing(const struct ctl_table *table, int write, } return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SCHEDSTATS @@ -4787,14 +4763,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) p->on_cpu = 0; -#endif init_task_preempt_count(p); -#ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); -#endif + return 0; } @@ -4871,7 +4844,6 @@ void wake_up_new_task(struct task_struct *p) raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); -#ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: * - cpus_ptr can change in the fork path @@ -4883,7 +4855,6 @@ void wake_up_new_task(struct task_struct *p) p->recent_used_cpu = task_cpu(p); rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); -#endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); post_init_entity_util_avg(p); @@ -4891,7 +4862,6 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, wake_flags); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Nothing relies on rq->lock after this, so it's fine to @@ -4901,7 +4871,6 @@ void wake_up_new_task(struct task_struct *p) p->sched_class->task_woken(rq, p); rq_repin_lock(rq, &rf); } -#endif task_rq_unlock(rq, p, &rf); } @@ -4978,7 +4947,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, __fire_sched_out_preempt_notifiers(curr, next); } -#else /* !CONFIG_PREEMPT_NOTIFIERS */ +#else /* !CONFIG_PREEMPT_NOTIFIERS: */ static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { @@ -4990,11 +4959,10 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, { } -#endif /* CONFIG_PREEMPT_NOTIFIERS */ +#endif /* !CONFIG_PREEMPT_NOTIFIERS */ static inline void prepare_task(struct task_struct *next) { -#ifdef CONFIG_SMP /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. @@ -5003,12 +4971,10 @@ static inline void prepare_task(struct task_struct *next) * its ordering comment. */ WRITE_ONCE(next->on_cpu, 1); -#endif } static inline void finish_task(struct task_struct *prev) { -#ifdef CONFIG_SMP /* * This must be the very last reference to @prev from this CPU. After * p->on_cpu is cleared, the task can be moved to a different CPU. We @@ -5021,11 +4987,8 @@ static inline void finish_task(struct task_struct *prev) * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); -#endif } -#ifdef CONFIG_SMP - static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); @@ -5107,14 +5070,6 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head) } } -#else - -static inline void __balance_callbacks(struct rq *rq) -{ -} - -#endif - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { @@ -5502,8 +5457,6 @@ unsigned int nr_iowait(void) return sum; } -#ifdef CONFIG_SMP - /* * sched_exec - execve() is a valuable balancing opportunity, because at * this point the task has the smallest effective memory and cache footprint. @@ -5527,8 +5480,6 @@ void sched_exec(void) stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } -#endif - DEFINE_PER_CPU(struct kernel_stat, kstat); DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); @@ -5563,7 +5514,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) struct rq *rq; u64 ns; -#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) +#ifdef CONFIG_64BIT /* * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. @@ -5690,12 +5641,10 @@ void sched_tick(void) if (donor->flags & PF_WQ_WORKER) wq_worker_tick(donor); -#ifdef CONFIG_SMP if (!scx_switched_all()) { rq->idle_balance = idle_cpu(cpu); sched_balance_trigger(rq); } -#endif } #ifdef CONFIG_NO_HZ_FULL @@ -5835,10 +5784,10 @@ int __init sched_tick_offload_init(void) return 0; } -#else /* !CONFIG_NO_HZ_FULL */ +#else /* !CONFIG_NO_HZ_FULL: */ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } -#endif +#endif /* !CONFIG_NO_HZ_FULL */ #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) @@ -6553,7 +6502,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu) rq->core = rq; } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_cpu_starting(unsigned int cpu) {} static inline void sched_core_cpu_deactivate(unsigned int cpu) {} @@ -6565,7 +6514,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return __pick_next_task(rq, prev, rf); } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* * Constants for the sched_mode argument of __schedule(). @@ -6581,11 +6530,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Helper function for __schedule() * - * If a task does not have signals pending, deactivate it - * Otherwise marks the task's __state as RUNNING + * Tries to deactivate the task, unless the should_block arg + * is false or if a signal is pending. In the case a signal + * is pending, marks the task's __state as RUNNING (and clear + * blocked_on). */ static bool try_to_block_task(struct rq *rq, struct task_struct *p, - unsigned long *task_state_p) + unsigned long *task_state_p, bool should_block) { unsigned long task_state = *task_state_p; int flags = DEQUEUE_NOCLOCK; @@ -6596,6 +6547,16 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, return false; } + /* + * We check should_block after signal_pending because we + * will want to wake the task in that case. But if + * should_block is false, its likely due to the task being + * blocked on a mutex, and we want to keep it on the runqueue + * to be selectable for proxy-execution. + */ + if (!should_block) + return false; + p->sched_contributes_to_load = (task_state & TASK_UNINTERRUPTIBLE) && !(task_state & TASK_NOLOAD) && @@ -6619,6 +6580,194 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, return true; } +#ifdef CONFIG_SCHED_PROXY_EXEC +static inline struct task_struct *proxy_resched_idle(struct rq *rq) +{ + put_prev_set_next_task(rq, rq->donor, rq->idle); + rq_set_donor(rq, rq->idle); + set_tsk_need_resched(rq->idle); + return rq->idle; +} + +static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) +{ + unsigned long state = READ_ONCE(donor->__state); + + /* Don't deactivate if the state has been changed to TASK_RUNNING */ + if (state == TASK_RUNNING) + return false; + /* + * Because we got donor from pick_next_task(), it is *crucial* + * that we call proxy_resched_idle() before we deactivate it. + * As once we deactivate donor, donor->on_rq is set to zero, + * which allows ttwu() to immediately try to wake the task on + * another rq. So we cannot use *any* references to donor + * after that point. So things like cfs_rq->curr or rq->donor + * need to be changed from next *before* we deactivate. + */ + proxy_resched_idle(rq); + return try_to_block_task(rq, donor, &state, true); +} + +static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor) +{ + if (!__proxy_deactivate(rq, donor)) { + /* + * XXX: For now, if deactivation failed, set donor + * as unblocked, as we aren't doing proxy-migrations + * yet (more logic will be needed then). + */ + donor->blocked_on = NULL; + } + return NULL; +} + +/* + * Find runnable lock owner to proxy for mutex blocked donor + * + * Follow the blocked-on relation: + * task->blocked_on -> mutex->owner -> task... + * + * Lock order: + * + * p->pi_lock + * rq->lock + * mutex->wait_lock + * + * Returns the task that is going to be used as execution context (the one + * that is actually going to be run on cpu_of(rq)). + */ +static struct task_struct * +find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) +{ + struct task_struct *owner = NULL; + int this_cpu = cpu_of(rq); + struct task_struct *p; + struct mutex *mutex; + + /* Follow blocked_on chain. */ + for (p = donor; task_is_blocked(p); p = owner) { + mutex = p->blocked_on; + /* Something changed in the chain, so pick again */ + if (!mutex) + return NULL; + /* + * By taking mutex->wait_lock we hold off concurrent mutex_unlock() + * and ensure @owner sticks around. + */ + guard(raw_spinlock)(&mutex->wait_lock); + + /* Check again that p is blocked with wait_lock held */ + if (mutex != __get_task_blocked_on(p)) { + /* + * Something changed in the blocked_on chain and + * we don't know if only at this level. So, let's + * just bail out completely and let __schedule() + * figure things out (pick_again loop). + */ + return NULL; + } + + owner = __mutex_owner(mutex); + if (!owner) { + __clear_task_blocked_on(p, mutex); + return p; + } + + if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { + /* XXX Don't handle blocked owners/delayed dequeue yet */ + return proxy_deactivate(rq, donor); + } + + if (task_cpu(owner) != this_cpu) { + /* XXX Don't handle migrations yet */ + return proxy_deactivate(rq, donor); + } + + if (task_on_rq_migrating(owner)) { + /* + * One of the chain of mutex owners is currently migrating to this + * CPU, but has not yet been enqueued because we are holding the + * rq lock. As a simple solution, just schedule rq->idle to give + * the migration a chance to complete. Much like the migrate_task + * case we should end up back in find_proxy_task(), this time + * hopefully with all relevant tasks already enqueued. + */ + return proxy_resched_idle(rq); + } + + /* + * Its possible to race where after we check owner->on_rq + * but before we check (owner_cpu != this_cpu) that the + * task on another cpu was migrated back to this cpu. In + * that case it could slip by our checks. So double check + * we are still on this cpu and not migrating. If we get + * inconsistent results, try again. + */ + if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu) + return NULL; + + if (owner == p) { + /* + * It's possible we interleave with mutex_unlock like: + * + * lock(&rq->lock); + * find_proxy_task() + * mutex_unlock() + * lock(&wait_lock); + * donor(owner) = current->blocked_donor; + * unlock(&wait_lock); + * + * wake_up_q(); + * ... + * ttwu_runnable() + * __task_rq_lock() + * lock(&wait_lock); + * owner == p + * + * Which leaves us to finish the ttwu_runnable() and make it go. + * + * So schedule rq->idle so that ttwu_runnable() can get the rq + * lock and mark owner as running. + */ + return proxy_resched_idle(rq); + } + /* + * OK, now we're absolutely sure @owner is on this + * rq, therefore holding @rq->lock is sufficient to + * guarantee its existence, as per ttwu_remote(). + */ + } + + WARN_ON_ONCE(owner && !owner->on_rq); + return owner; +} +#else /* SCHED_PROXY_EXEC */ +static struct task_struct * +find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) +{ + WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n"); + return donor; +} +#endif /* SCHED_PROXY_EXEC */ + +static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner) +{ + if (!sched_proxy_exec()) + return; + /* + * pick_next_task() calls set_next_task() on the chosen task + * at some point, which ensures it is not push/pullable. + * However, the chosen/donor task *and* the mutex owner form an + * atomic pair wrt push/pull. + * + * Make sure owner we run is not pushable. Unfortunately we can + * only deal with that by means of a dequeue/enqueue cycle. :-/ + */ + dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE); + enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE); +} + /* * __schedule() is the main scheduler function. * @@ -6731,15 +6880,31 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - try_to_block_task(rq, prev, &prev_state); + /* + * We pass task_is_blocked() as the should_block arg + * in order to keep mutex-blocked tasks on the runqueue + * for slection with proxy-exec (without proxy-exec + * task_is_blocked() will always be false). + */ + try_to_block_task(rq, prev, &prev_state, + !task_is_blocked(prev)); switch_count = &prev->nvcsw; } - next = pick_next_task(rq, prev, &rf); +pick_again: + next = pick_next_task(rq, rq->donor, &rf); rq_set_donor(rq, next); + if (unlikely(task_is_blocked(next))) { + next = find_proxy_task(rq, next, &rf); + if (!next) + goto pick_again; + if (next == rq->idle) + goto keep_resched; + } picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); +keep_resched: rq->last_seen_need_resched_ns = 0; is_switch = prev != next; @@ -6750,6 +6915,10 @@ picked: * changes to task_struct made by pick_next_task(). */ RCU_INIT_POINTER(rq->curr, next); + + if (!task_current_donor(rq, next)) + proxy_tag_curr(rq, next); + /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -6784,6 +6953,10 @@ picked: /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + /* In case next was already curr but just got blocked_donor */ + if (!task_current_donor(rq, next)) + proxy_tag_curr(rq, next); + rq_unpin_lock(rq, &rf); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); @@ -6992,14 +7165,14 @@ NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_dynamic_enabled -#define preempt_schedule_dynamic_enabled preempt_schedule -#define preempt_schedule_dynamic_disabled NULL -#endif +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# ifndef preempt_schedule_dynamic_enabled +# define preempt_schedule_dynamic_enabled preempt_schedule +# define preempt_schedule_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); void __sched notrace dynamic_preempt_schedule(void) { @@ -7009,8 +7182,8 @@ void __sched notrace dynamic_preempt_schedule(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule); EXPORT_SYMBOL(dynamic_preempt_schedule); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /** * preempt_schedule_notrace - preempt_schedule called by tracing @@ -7065,14 +7238,14 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_notrace_dynamic_enabled -#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace -#define preempt_schedule_notrace_dynamic_disabled NULL -#endif +# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# ifndef preempt_schedule_notrace_dynamic_enabled +# define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +# define preempt_schedule_notrace_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); void __sched notrace dynamic_preempt_schedule_notrace(void) { @@ -7082,7 +7255,7 @@ void __sched notrace dynamic_preempt_schedule_notrace(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); -#endif +# endif #endif #endif /* CONFIG_PREEMPTION */ @@ -7301,7 +7474,7 @@ out_unlock: preempt_enable(); } -#endif +#endif /* CONFIG_RT_MUTEXES */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) @@ -7332,17 +7505,17 @@ EXPORT_SYMBOL(__cond_resched); #endif #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define cond_resched_dynamic_enabled __cond_resched -#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# define cond_resched_dynamic_enabled __cond_resched +# define cond_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(cond_resched); -#define might_resched_dynamic_enabled __cond_resched -#define might_resched_dynamic_disabled ((void *)&__static_call_return0) +# define might_resched_dynamic_enabled __cond_resched +# define might_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(might_resched); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { @@ -7360,8 +7533,8 @@ int __sched dynamic_might_resched(void) return __cond_resched(); } EXPORT_SYMBOL(dynamic_might_resched); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -7427,9 +7600,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); #ifdef CONFIG_PREEMPT_DYNAMIC -#ifdef CONFIG_GENERIC_ENTRY -#include <linux/entry-common.h> -#endif +# ifdef CONFIG_GENERIC_IRQ_ENTRY +# include <linux/irq-entry-common.h> +# endif /* * SC:cond_resched @@ -7484,37 +7657,37 @@ int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { -#ifndef CONFIG_PREEMPT_RT +# ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; -#endif +# endif if (!strcmp(str, "full")) return preempt_dynamic_full; -#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY +# ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY if (!strcmp(str, "lazy")) return preempt_dynamic_lazy; -#endif +# endif return -EINVAL; } -#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) -#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) -#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) -#else -#error "Unsupported PREEMPT_DYNAMIC mechanism" -#endif +# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +# define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +# define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) +# else +# error "Unsupported PREEMPT_DYNAMIC mechanism" +# endif static DEFINE_MUTEX(sched_dynamic_mutex); @@ -7618,7 +7791,7 @@ static void __init preempt_dynamic_init(void) } } -#define PREEMPT_MODEL_ACCESSOR(mode) \ +# define PREEMPT_MODEL_ACCESSOR(mode) \ bool preempt_model_##mode(void) \ { \ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ @@ -7663,7 +7836,7 @@ const char *preempt_model_str(void) if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { seq_buf_printf(&s, "(%s)%s", - preempt_dynamic_mode > 0 ? + preempt_dynamic_mode >= 0 ? preempt_modes[preempt_dynamic_mode] : "undef", brace ? "}" : ""); return seq_buf_str(&s); @@ -7819,12 +7992,10 @@ void show_state_filter(unsigned int state_filter) */ void __init init_idle(struct task_struct *idle, int cpu) { -#ifdef CONFIG_SMP struct affinity_context ac = (struct affinity_context) { .new_mask = cpumask_of(cpu), .flags = 0, }; -#endif struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -7840,13 +8011,11 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); -#ifdef CONFIG_SMP /* * No validation and serialization required at boot time and for * setting up the idle tasks of not yet online CPUs. */ set_cpus_allowed_common(idle, &ac); -#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the CPU isn't yet set to this CPU so the @@ -7865,9 +8034,7 @@ void __init init_idle(struct task_struct *idle, int cpu) rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; -#ifdef CONFIG_SMP idle->on_cpu = 1; -#endif raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); @@ -7880,13 +8047,9 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -#endif } -#ifdef CONFIG_SMP - int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -7934,9 +8097,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; - __schedstat_inc(p->stats.numa_task_migrated); - count_vm_numa_event(NUMA_TASK_MIGRATE); - count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE); + /* TODO: This is not properly updating schedstats */ + trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } @@ -8123,7 +8285,7 @@ static void balance_hotplug_wait(void) TASK_UNINTERRUPTIBLE); } -#else +#else /* !CONFIG_HOTPLUG_CPU: */ static inline void balance_push(struct rq *rq) { @@ -8137,7 +8299,7 @@ static inline void balance_hotplug_wait(void) { } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* !CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) { @@ -8446,7 +8608,7 @@ int sched_cpu_dying(unsigned int cpu) sched_core_cpu_dying(cpu); return 0; } -#endif +#endif /* CONFIG_HOTPLUG_CPU */ void __init sched_init_smp(void) { @@ -8470,6 +8632,8 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + sched_init_dl_servers(); + sched_smp_initialized = true; } @@ -8480,13 +8644,6 @@ static int __init migration_init(void) } early_initcall(migration_init); -#else -void __init sched_init_smp(void) -{ - sched_init_granularity(); -} -#endif /* CONFIG_SMP */ - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -8512,9 +8669,7 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ -#ifdef CONFIG_SMP BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); -#endif BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); @@ -8557,9 +8712,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } -#ifdef CONFIG_SMP init_defrootdomain(); -#endif #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, @@ -8620,7 +8773,6 @@ void __init sched_init(void) rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif -#ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = SCHED_CAPACITY_SCALE; @@ -8646,7 +8798,6 @@ void __init sched_init(void) #ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); #endif -#endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); fair_server_init(rq); @@ -8694,10 +8845,9 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; -#ifdef CONFIG_SMP idle_thread_set_boot_cpu(); + balance_push_set(smp_processor_id(), false); -#endif init_sched_fair_class(); init_sched_ext_class(); @@ -8830,7 +8980,7 @@ void __cant_sleep(const char *file, int line, int preempt_offset) } EXPORT_SYMBOL_GPL(__cant_sleep); -#ifdef CONFIG_SMP +# ifdef CONFIG_SMP void __cant_migrate(const char *file, int line) { static unsigned long prev_jiffy; @@ -8861,8 +9011,8 @@ void __cant_migrate(const char *file, int line) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_migrate); -#endif -#endif +# endif /* CONFIG_SMP */ +#endif /* CONFIG_DEBUG_ATOMIC_SLEEP */ #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) @@ -8902,7 +9052,7 @@ void normalize_rt_tasks(void) #endif /* CONFIG_MAGIC_SYSRQ */ -#if defined(CONFIG_KGDB_KDB) +#ifdef CONFIG_KGDB_KDB /* * These functions are only useful for KDB. * @@ -8926,7 +9076,7 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } -#endif /* defined(CONFIG_KGDB_KDB) */ +#endif /* CONFIG_KGDB_KDB */ #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ @@ -9422,47 +9572,23 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, #ifdef CONFIG_CFS_BANDWIDTH static DEFINE_MUTEX(cfs_constraints_mutex); -const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ -/* More than 203 days if BW_SHIFT equals 20. */ -static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; - static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); -static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, - u64 burst) +static int tg_set_cfs_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) { int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + u64 period, quota, burst; - if (tg == &root_task_group) - return -EINVAL; - - /* - * Ensure we have at some amount of bandwidth every period. This is - * to prevent reaching a state of large arrears when throttled via - * entity_tick() resulting in prolonged exit starvation. - */ - if (quota < min_cfs_quota_period || period < min_cfs_quota_period) - return -EINVAL; - - /* - * Likewise, bound things on the other side by preventing insane quota - * periods. This also allows us to normalize in computing quota - * feasibility. - */ - if (period > max_cfs_quota_period) - return -EINVAL; + period = (u64)period_us * NSEC_PER_USEC; - /* - * Bound quota to defend quota against overflow during bandwidth shift. - */ - if (quota != RUNTIME_INF && quota > max_cfs_runtime) - return -EINVAL; + if (quota_us == RUNTIME_INF) + quota = RUNTIME_INF; + else + quota = (u64)quota_us * NSEC_PER_USEC; - if (quota != RUNTIME_INF && (burst > quota || - burst + quota > max_cfs_runtime)) - return -EINVAL; + burst = (u64)burst_us * NSEC_PER_USEC; /* * Prevent race between setting of cfs_rq->runtime_enabled and @@ -9517,28 +9643,22 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, return 0; } -static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static u64 tg_get_cfs_period(struct task_group *tg) { - u64 quota, period, burst; + u64 cfs_period_us; - period = ktime_to_ns(tg->cfs_bandwidth.period); - burst = tg->cfs_bandwidth.burst; - if (cfs_quota_us < 0) - quota = RUNTIME_INF; - else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) - quota = (u64)cfs_quota_us * NSEC_PER_USEC; - else - return -EINVAL; + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); + do_div(cfs_period_us, NSEC_PER_USEC); - return tg_set_cfs_bandwidth(tg, period, quota, burst); + return cfs_period_us; } -static long tg_get_cfs_quota(struct task_group *tg) +static u64 tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; if (tg->cfs_bandwidth.quota == RUNTIME_INF) - return -1; + return RUNTIME_INF; quota_us = tg->cfs_bandwidth.quota; do_div(quota_us, NSEC_PER_USEC); @@ -9546,45 +9666,7 @@ static long tg_get_cfs_quota(struct task_group *tg) return quota_us; } -static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) -{ - u64 quota, period, burst; - - if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - period = (u64)cfs_period_us * NSEC_PER_USEC; - quota = tg->cfs_bandwidth.quota; - burst = tg->cfs_bandwidth.burst; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static long tg_get_cfs_period(struct task_group *tg) -{ - u64 cfs_period_us; - - cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); - do_div(cfs_period_us, NSEC_PER_USEC); - - return cfs_period_us; -} - -static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) -{ - u64 quota, period, burst; - - if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - burst = (u64)cfs_burst_us * NSEC_PER_USEC; - period = ktime_to_ns(tg->cfs_bandwidth.period); - quota = tg->cfs_bandwidth.quota; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static long tg_get_cfs_burst(struct task_group *tg) +static u64 tg_get_cfs_burst(struct task_group *tg) { u64 burst_us; @@ -9594,42 +9676,6 @@ static long tg_get_cfs_burst(struct task_group *tg) return burst_us; } -static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_quota(css_tg(css)); -} - -static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, - struct cftype *cftype, s64 cfs_quota_us) -{ - return tg_set_cfs_quota(css_tg(css), cfs_quota_us); -} - -static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_period(css_tg(css)); -} - -static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_period_us) -{ - return tg_set_cfs_period(css_tg(css), cfs_period_us); -} - -static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_burst(css_tg(css)); -} - -static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_burst_us) -{ - return tg_set_cfs_burst(css_tg(css), cfs_burst_us); -} - struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -9762,6 +9808,126 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } + +const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ +static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_bw_runtime_us = MAX_BW; + +static void tg_bandwidth(struct task_group *tg, + u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) +{ + if (period_us_p) + *period_us_p = tg_get_cfs_period(tg); + if (quota_us_p) + *quota_us_p = tg_get_cfs_quota(tg); + if (burst_us_p) + *burst_us_p = tg_get_cfs_burst(tg); +} + +static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 period_us; + + tg_bandwidth(css_tg(css), &period_us, NULL, NULL); + return period_us; +} + +static int tg_set_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) +{ + const u64 max_usec = U64_MAX / NSEC_PER_USEC; + + if (tg == &root_task_group) + return -EINVAL; + + /* Values should survive translation to nsec */ + if (period_us > max_usec || + (quota_us != RUNTIME_INF && quota_us > max_usec) || + burst_us > max_usec) + return -EINVAL; + + /* + * Ensure we have some amount of bandwidth every period. This is to + * prevent reaching a state of large arrears when throttled via + * entity_tick() resulting in prolonged exit starvation. + */ + if (quota_us < min_bw_quota_period_us || + period_us < min_bw_quota_period_us) + return -EINVAL; + + /* + * Likewise, bound things on the other side by preventing insane quota + * periods. This also allows us to normalize in computing quota + * feasibility. + */ + if (period_us > max_bw_quota_period_us) + return -EINVAL; + + /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us) + return -EINVAL; + + if (quota_us != RUNTIME_INF && (burst_us > quota_us || + burst_us + quota_us > max_bw_runtime_us)) + return -EINVAL; + + return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); +} + +static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 quota_us; + + tg_bandwidth(css_tg(css), NULL, "a_us, NULL); + return quota_us; /* (s64)RUNTIME_INF becomes -1 */ +} + +static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 burst_us; + + tg_bandwidth(css_tg(css), NULL, NULL, &burst_us); + return burst_us; +} + +static int cpu_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 period_us) +{ + struct task_group *tg = css_tg(css); + u64 quota_us, burst_us; + + tg_bandwidth(tg, NULL, "a_us, &burst_us); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} + +static int cpu_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 quota_us) +{ + struct task_group *tg = css_tg(css); + u64 period_us, burst_us; + + if (quota_us < 0) + quota_us = RUNTIME_INF; + + tg_bandwidth(tg, &period_us, NULL, &burst_us); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} + +static int cpu_burst_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 burst_us) +{ + struct task_group *tg = css_tg(css); + u64 period_us, quota_us; + + tg_bandwidth(tg, &period_us, "a_us, NULL); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} #endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9807,7 +9973,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, scx_group_set_idle(css_tg(css), idle); return ret; } -#endif +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -9824,19 +9990,19 @@ static struct cftype cpu_legacy_files[] = { #endif #ifdef CONFIG_CFS_BANDWIDTH { - .name = "cfs_quota_us", - .read_s64 = cpu_cfs_quota_read_s64, - .write_s64 = cpu_cfs_quota_write_s64, + .name = "cfs_period_us", + .read_u64 = cpu_period_read_u64, + .write_u64 = cpu_period_write_u64, }, { - .name = "cfs_period_us", - .read_u64 = cpu_cfs_period_read_u64, - .write_u64 = cpu_cfs_period_write_u64, + .name = "cfs_quota_us", + .read_s64 = cpu_quota_read_s64, + .write_s64 = cpu_quota_write_s64, }, { .name = "cfs_burst_us", - .read_u64 = cpu_cfs_burst_read_u64, - .write_u64 = cpu_cfs_burst_write_u64, + .read_u64 = cpu_burst_read_u64, + .write_u64 = cpu_burst_write_u64, }, { .name = "stat", @@ -9935,7 +10101,7 @@ static int cpu_extra_stat_show(struct seq_file *sf, cfs_b->nr_periods, cfs_b->nr_throttled, throttled_usec, cfs_b->nr_burst, burst_usec); } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ return 0; } @@ -10033,22 +10199,20 @@ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, } /* caller should put the current value in *@periodp before calling */ -static int __maybe_unused cpu_period_quota_parse(char *buf, - u64 *periodp, u64 *quotap) +static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p, + u64 *quota_us_p) { char tok[21]; /* U64_MAX */ - if (sscanf(buf, "%20s %llu", tok, periodp) < 1) + if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1) return -EINVAL; - *periodp *= NSEC_PER_USEC; - - if (sscanf(tok, "%llu", quotap)) - *quotap *= NSEC_PER_USEC; - else if (!strcmp(tok, "max")) - *quotap = RUNTIME_INF; - else - return -EINVAL; + if (sscanf(tok, "%llu", quota_us_p) < 1) { + if (!strcmp(tok, "max")) + *quota_us_p = RUNTIME_INF; + else + return -EINVAL; + } return 0; } @@ -10057,8 +10221,10 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, static int cpu_max_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); + u64 period_us, quota_us; - cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); + tg_bandwidth(tg, &period_us, "a_us, NULL); + cpu_period_quota_print(sf, period_us, quota_us); return 0; } @@ -10066,17 +10232,16 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct task_group *tg = css_tg(of_css(of)); - u64 period = tg_get_cfs_period(tg); - u64 burst = tg->cfs_bandwidth.burst; - u64 quota; + u64 period_us, quota_us, burst_us; int ret; - ret = cpu_period_quota_parse(buf, &period, "a); + tg_bandwidth(tg, &period_us, NULL, &burst_us); + ret = cpu_period_quota_parse(buf, &period_us, "a_us); if (!ret) - ret = tg_set_cfs_bandwidth(tg, period, quota, burst); + ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us); return ret ?: nbytes; } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ static struct cftype cpu_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -10109,10 +10274,10 @@ static struct cftype cpu_files[] = { { .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_cfs_burst_read_u64, - .write_u64 = cpu_cfs_burst_write_u64, + .read_u64 = cpu_burst_read_u64, + .write_u64 = cpu_burst_write_u64, }, -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_UCLAMP_TASK_GROUP { .name = "uclamp.min", @@ -10126,7 +10291,7 @@ static struct cftype cpu_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, -#endif +#endif /* CONFIG_UCLAMP_TASK_GROUP */ { } /* terminate */ }; @@ -10147,7 +10312,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .threaded = true, }; -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ void dump_cpu_task(int cpu) { @@ -10733,7 +10898,7 @@ void sched_mm_cid_fork(struct task_struct *t) WARN_ON_ONCE(!t->mm || t->mm_cid != -1); t->mm_cid_active = 1; } -#endif +#endif /* CONFIG_SCHED_MM_CID */ #ifdef CONFIG_SCHED_CLASS_EXT void sched_deq_and_put_task(struct task_struct *p, int queue_flags, @@ -10768,4 +10933,4 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) if (ctx->running) set_next_task(rq, ctx->p); } -#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index c4606ca89210..9ede71ecba7f 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -4,6 +4,8 @@ * A simple wrapper around refcount. An allocated sched_core_cookie's * address is used to compute the cookie of the task. */ +#include "sched.h" + struct sched_core_cookie { refcount_t refcnt; }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0de9dda09949..23a56ba12d81 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -6,6 +6,8 @@ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include <linux/sched/cputime.h> +#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 95baa12a1029..cdd740b3f774 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -6,6 +6,7 @@ * * Author: Juri Lelli <j.lelli@sssup.it> */ +#include "sched.h" static inline int parent(int i) { diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 0adeda93b5fb..11c0f1faa7e1 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/types.h> +#include <linux/spinlock.h> #define IDX_INVALID -1 @@ -15,7 +17,6 @@ struct cpudl { struct cpudl_item *elements; }; -#ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); void cpudl_clear(struct cpudl *cp, int cpu); @@ -23,4 +24,3 @@ int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); -#endif /* CONFIG_SMP */ diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 5252fb191fae..742fb9e62e1a 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,6 +5,7 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ +#include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 461242ec958a..0ab5f9d4bc59 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -5,6 +5,8 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ +#include <uapi/linux/sched/types.h> +#include "sched.h" #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) @@ -380,9 +382,9 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) sg_cpu->saved_idle_calls = idle_calls; return ret; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * Make sugov_should_update_freq() ignore the rate limit when DL diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 42c40cfdf836..76a9ac5eb794 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -22,6 +22,7 @@ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. */ +#include "sched.h" /* * p->rt_priority p->prio newpri cpupri diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index d6cba0020064..6f562088c056 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/atomic.h> +#include <linux/cpumask.h> +#include <linux/sched/rt.h> #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) @@ -17,7 +20,6 @@ struct cpupri { int *cpu_to_pri; }; -#ifdef CONFIG_SMP int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, @@ -26,4 +28,3 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 6dab4854c6c0..7097de2c8cda 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,6 +2,9 @@ /* * Simple CPU accounting cgroup controller */ +#include <linux/sched/cputime.h> +#include <linux/tsacct_kern.h> +#include "sched.h" #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #include <asm/cputime.h> @@ -88,7 +91,7 @@ static u64 irqtime_tick_accounted(u64 maxtime) return delta; } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static u64 irqtime_tick_accounted(u64 dummy) { @@ -241,7 +244,7 @@ void __account_forceidle_time(struct task_struct *p, u64 delta) task_group_account_field(p, CPUTIME_FORCEIDLE, delta); } -#endif +#endif /* CONFIG_SCHED_CORE */ /* * When a guest is interrupted for a longer amount of time, missed clock @@ -262,7 +265,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime) return steal; } -#endif +#endif /* CONFIG_PARAVIRT */ return 0; } @@ -288,7 +291,7 @@ static inline u64 read_sum_exec_runtime(struct task_struct *t) { return t->se.sum_exec_runtime; } -#else +#else /* !CONFIG_64BIT: */ static u64 read_sum_exec_runtime(struct task_struct *t) { u64 ns; @@ -301,7 +304,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) return ns; } -#endif +#endif /* !CONFIG_64BIT */ /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live @@ -411,11 +414,11 @@ static void irqtime_account_idle_ticks(int ticks) { irqtime_account_process_tick(current, 0, ticks); } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, int nr_ticks) { } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * Use precise platform statistics if available: diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ad45a8fea245..e2d51f4306b3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,10 @@ */ #include <linux/cpuset.h> +#include <linux/sched/clock.h> +#include <uapi/linux/sched/types.h> +#include "sched.h" +#include "pelt.h" /* * Default limits for DL period; on the top end we guard against small util @@ -51,7 +55,7 @@ static int __init sched_dl_sysctl_init(void) return 0; } late_initcall(sched_dl_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static bool dl_server(struct sched_dl_entity *dl_se) { @@ -99,7 +103,7 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return pi_of(dl_se) != dl_se; } -#else +#else /* !CONFIG_RT_MUTEXES: */ static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) { return dl_se; @@ -109,9 +113,8 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return false; } -#endif +#endif /* !CONFIG_RT_MUTEXES */ -#ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), @@ -191,35 +194,6 @@ void __dl_update(struct dl_bw *dl_b, s64 bw) rq->dl.extra_bw += bw; } } -#else -static inline struct dl_bw *dl_bw_of(int i) -{ - return &cpu_rq(i)->dl.dl_bw; -} - -static inline int dl_bw_cpus(int i) -{ - return 1; -} - -static inline unsigned long dl_bw_capacity(int i) -{ - return SCHED_CAPACITY_SCALE; -} - -bool dl_bw_visited(int cpu, u64 cookie) -{ - return false; -} - -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); - - dl->extra_bw += bw; -} -#endif static inline void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) @@ -552,23 +526,17 @@ void init_dl_rq(struct dl_rq *dl_rq) { dl_rq->root = RB_ROOT_CACHED; -#ifdef CONFIG_SMP /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; -#else - init_dl_bw(&dl_rq->dl_bw); -#endif dl_rq->running_bw = 0; dl_rq->this_bw = 0; init_dl_rq_bw_ratio(dl_rq); } -#ifdef CONFIG_SMP - static inline int dl_overloaded(struct rq *rq) { return atomic_read(&rq->rd->dlo_count); @@ -753,37 +721,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p return later_rq; } -#else - -static inline -void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline -void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline void deadline_queue_push_tasks(struct rq *rq) -{ -} - -static inline void deadline_queue_pull_task(struct rq *rq) -{ -} -#endif /* CONFIG_SMP */ - static void enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); @@ -824,6 +761,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + update_rq_clock(rq); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); @@ -1195,7 +1134,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) static void __push_dl_task(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, check if we need * to kick someone away. @@ -1209,12 +1147,13 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) push_dl_task(rq); rq_repin_lock(rq, rf); } -#endif } /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; +static bool dl_server_stopped(struct sched_dl_entity *dl_se); + static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) { struct rq *rq = rq_of_dl_se(dl_se); @@ -1234,6 +1173,7 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ if (!dl_se->server_has_tasks(dl_se)) { replenish_dl_entity(dl_se); + dl_server_stopped(dl_se); return HRTIMER_NORESTART; } @@ -1339,7 +1279,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) goto unlock; } -#ifdef CONFIG_SMP if (unlikely(!rq->online)) { /* * If the runqueue is no longer available, migrate the @@ -1356,7 +1295,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * there. */ } -#endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->donor)) @@ -1504,7 +1442,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 if (dl_entity_is_special(dl_se)) return; - scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + scaled_delta_exec = delta_exec; + if (!dl_server(dl_se)) + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); dl_se->runtime -= scaled_delta_exec; @@ -1598,7 +1538,7 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } /* @@ -1611,7 +1551,7 @@ throttle: */ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) { - s64 delta_exec, scaled_delta_exec; + s64 delta_exec; if (!rq->fair_server.dl_defer) return; @@ -1624,9 +1564,7 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) if (delta_exec < 0) return; - scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); - - rq->fair_server.runtime -= scaled_delta_exec; + rq->fair_server.runtime -= delta_exec; if (rq->fair_server.runtime < 0) { rq->fair_server.dl_defer_running = 0; @@ -1639,31 +1577,17 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) + if (dl_se->dl_runtime) { + dl_se->dl_server_idle = 0; update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + } } void dl_server_start(struct sched_dl_entity *dl_se) { struct rq *rq = dl_se->rq; - /* - * XXX: the apply do not work fine at the init phase for the - * fair server because things are not yet set. We need to improve - * this before getting generic. - */ - if (!dl_server(dl_se)) { - u64 runtime = 50 * NSEC_PER_MSEC; - u64 period = 1000 * NSEC_PER_MSEC; - - dl_server_apply_params(dl_se, runtime, period, 1); - - dl_se->dl_server = 1; - dl_se->dl_defer = 1; - setup_new_dl_entity(dl_se); - } - - if (!dl_se->dl_runtime) + if (!dl_server(dl_se) || dl_se->dl_server_active) return; dl_se->dl_server_active = 1; @@ -1674,7 +1598,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) void dl_server_stop(struct sched_dl_entity *dl_se) { - if (!dl_se->dl_runtime) + if (!dl_server(dl_se) || !dl_server_active(dl_se)) return; dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); @@ -1684,6 +1608,20 @@ void dl_server_stop(struct sched_dl_entity *dl_se) dl_se->dl_server_active = 0; } +static bool dl_server_stopped(struct sched_dl_entity *dl_se) +{ + if (!dl_se->dl_server_active) + return false; + + if (dl_se->dl_server_idle) { + dl_server_stop(dl_se); + return true; + } + + dl_se->dl_server_idle = 1; + return false; +} + void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task) @@ -1693,6 +1631,32 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_se->server_pick_task = pick_task; } +void sched_init_dl_servers(void) +{ + int cpu; + struct rq *rq; + struct sched_dl_entity *dl_se; + + for_each_online_cpu(cpu) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + rq = cpu_rq(cpu); + + guard(rq_lock_irq)(rq); + + dl_se = &rq->fair_server; + + WARN_ON(dl_server(dl_se)); + + dl_server_apply_params(dl_se, runtime, period, 1); + + dl_se->dl_server = 1; + dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); + } +} + void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) { u64 new_bw = dl_se->dl_bw; @@ -1844,8 +1808,6 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) #define __node_2_dle(node) \ rb_entry((node), struct sched_dl_entity, rb_node) -#ifdef CONFIG_SMP - static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) { struct rq *rq = rq_of_dl_rq(dl_rq); @@ -1881,13 +1843,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) } } -#else - -static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} -static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} - -#endif /* CONFIG_SMP */ - static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { @@ -2166,6 +2121,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (dl_server(&p->dl)) return; + if (task_is_blocked(p)) + return; + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -2214,8 +2172,6 @@ static void yield_task_dl(struct rq *rq) rq_clock_skip_update(rq); } -#ifdef CONFIG_SMP - static inline bool dl_task_is_earliest_deadline(struct task_struct *p, struct rq *rq) { @@ -2345,7 +2301,6 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Only called when both the current and waking task are -deadline @@ -2359,7 +2314,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, return; } -#ifdef CONFIG_SMP /* * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... @@ -2367,7 +2321,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); -#endif /* CONFIG_SMP */ } #ifdef CONFIG_SCHED_HRTICK @@ -2375,11 +2328,11 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { hrtick_start(rq, dl_se->runtime); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { @@ -2435,7 +2388,7 @@ again: if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - if (dl_server_active(dl_se)) { + if (!dl_server_stopped(dl_se)) { dl_se->dl_yielded = 1; update_curr_dl_se(rq, dl_se, 0); } @@ -2465,6 +2418,10 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); + + if (task_is_blocked(p)) + return; + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -2500,8 +2457,6 @@ static void task_fork_dl(struct task_struct *p) */ } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define DL_MAX_TRIES 3 @@ -2976,7 +2931,14 @@ void dl_clear_root_domain(struct root_domain *rd) int i; guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); + + /* + * Reset total_bw to zero and extra_bw to max_bw so that next + * loop will add dl-servers contributions back properly, + */ rd->dl_bw.total_bw = 0; + for_each_cpu(i, rd->span) + cpu_rq(i)->dl.extra_bw = cpu_rq(i)->dl.max_bw; /* * dl_servers are not tasks. Since dl_add_task_root_domain ignores @@ -2995,8 +2957,6 @@ void dl_clear_root_domain_cpu(int cpu) dl_clear_root_domain(cpu_rq(cpu)->rd); } -#endif /* CONFIG_SMP */ - static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* @@ -3069,10 +3029,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } if (rq->donor != p) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); -#endif if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else @@ -3092,7 +3050,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, if (!task_on_rq_queued(p)) return; -#ifdef CONFIG_SMP /* * This might be too much, but unfortunately * we don't have the old deadline value, and @@ -3121,13 +3078,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) resched_curr(rq); } -#else - /* - * We don't know if p has a earlier or later deadline, so let's blindly - * set a (maybe not needed) rescheduling point. - */ - resched_curr(rq); -#endif } #ifdef CONFIG_SCHED_CORE @@ -3149,7 +3099,6 @@ DEFINE_SCHED_CLASS(dl) = { .put_prev_task = put_prev_task_dl, .set_next_task = set_next_task_dl, -#ifdef CONFIG_SMP .balance = balance_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, @@ -3158,7 +3107,6 @@ DEFINE_SCHED_CLASS(dl) = { .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, .find_lock_rq = find_lock_later_rq, -#endif .task_tick = task_tick_dl, .task_fork = task_fork_dl, @@ -3242,6 +3190,9 @@ void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + for_each_possible_cpu(cpu) + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); + for_each_possible_cpu(cpu) { rcu_read_lock_sched(); @@ -3257,7 +3208,6 @@ void sched_dl_do_global(void) raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); } } @@ -3458,7 +3408,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) return false; } -#ifdef CONFIG_SMP int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -3570,7 +3519,6 @@ void dl_bw_free(int cpu, u64 dl_bw) { dl_bw_manage(dl_bw_req_free, cpu, dl_bw); } -#endif void print_dl_stats(struct seq_file *m, int cpu) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 9d71baf08075..3f06ab84d53f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,6 +6,9 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ +#include <linux/debugfs.h> +#include <linux/nmi.h> +#include "sched.h" /* * This allows printing both to /sys/kernel/debug/sched/debug and @@ -90,10 +93,10 @@ static void sched_feat_enable(int i) { static_key_enable_cpuslocked(&sched_feat_keys[i]); } -#else +#else /* !CONFIG_JUMP_LABEL: */ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; -#endif /* CONFIG_JUMP_LABEL */ +#endif /* !CONFIG_JUMP_LABEL */ static int sched_feat_set(char *cmp) { @@ -166,8 +169,6 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; -#ifdef CONFIG_SMP - static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -214,8 +215,6 @@ static const struct file_operations sched_scaling_fops = { .release = single_release, }; -#endif /* SMP */ - #ifdef CONFIG_PREEMPT_DYNAMIC static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, @@ -283,7 +282,6 @@ static const struct file_operations sched_dynamic_fops = { __read_mostly bool sched_debug_verbose; -#ifdef CONFIG_SMP static struct dentry *sd_dentry; @@ -311,9 +309,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, return result; } -#else -#define sched_verbose_write debugfs_write_file_bool -#endif static const struct file_operations sched_verbose_fops = { .read = debugfs_read_file_bool, @@ -512,7 +507,6 @@ static __init int sched_init_debug(void) debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -#ifdef CONFIG_SMP debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); @@ -520,7 +514,6 @@ static __init int sched_init_debug(void) sched_domains_mutex_lock(); update_sched_domain_debugfs(); sched_domains_mutex_unlock(); -#endif #ifdef CONFIG_NUMA_BALANCING numa = debugfs_create_dir("numa_balancing", debugfs_sched); @@ -530,7 +523,7 @@ static __init int sched_init_debug(void) debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); -#endif +#endif /* CONFIG_NUMA_BALANCING */ debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); @@ -540,8 +533,6 @@ static __init int sched_init_debug(void) } late_initcall(sched_init_debug); -#ifdef CONFIG_SMP - static cpumask_var_t sd_sysctl_cpus; static int sd_flags_show(struct seq_file *m, void *v) @@ -652,8 +643,6 @@ void dirty_sched_domain_sysctl(int cpu) __cpumask_set_cpu(cpu, sd_sysctl_cpus); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { @@ -690,18 +679,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } P(se->load.weight); -#ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); P(se->avg.runnable_avg); -#endif #undef PN_SCHEDSTAT #undef PN #undef P_SCHEDSTAT #undef P } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED static DEFINE_SPINLOCK(sched_debug_lock); @@ -854,7 +841,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", @@ -874,8 +860,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_CFS_BANDWIDTH SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); @@ -929,11 +914,7 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) PU(dl_nr_running); -#ifdef CONFIG_SMP dl_bw = &cpu_rq(cpu)->rd->dl_bw; -#else - dl_bw = &dl_rq->dl_bw; -#endif SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); @@ -951,9 +932,9 @@ static void print_cpu(struct seq_file *m, int cpu) SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } -#else +#else /* !CONFIG_X86: */ SEQ_printf(m, "cpu#%d\n", cpu); -#endif +#endif /* !CONFIG_X86 */ #define P(x) \ do { \ @@ -976,12 +957,10 @@ do { \ #undef P #undef PN -#ifdef CONFIG_SMP #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P64(avg_idle); P64(max_idle_balance_cost); #undef P64 -#endif #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { @@ -1163,7 +1142,7 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); -#endif +#endif /* CONFIG_NUMA_BALANCING */ } void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, @@ -1210,10 +1189,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_failed_migrations_running); P_SCHEDSTAT(nr_failed_migrations_hot); P_SCHEDSTAT(nr_forced_migrations); -#ifdef CONFIG_NUMA_BALANCING - P_SCHEDSTAT(numa_task_migrated); - P_SCHEDSTAT(numa_task_swapped); -#endif P_SCHEDSTAT(nr_wakeups); P_SCHEDSTAT(nr_wakeups_sync); P_SCHEDSTAT(nr_wakeups_migrate); @@ -1251,7 +1226,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); -#ifdef CONFIG_SMP P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); @@ -1260,13 +1234,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_avg); P(se.avg.last_update_time); PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED); -#endif #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value); __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); -#endif +#endif /* CONFIG_UCLAMP_TASK */ P(policy); P(prio); if (task_has_dl_policy(p)) { diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index b498d867ba21..7dd5cbcb7a06 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1272,7 +1272,8 @@ static inline struct rq *scx_locked_rq(void) #define SCX_CALL_OP(sch, mask, op, rq, args...) \ do { \ - update_locked_rq(rq); \ + if (rq) \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ (sch)->ops.op(args); \ @@ -1280,14 +1281,16 @@ do { \ } else { \ (sch)->ops.op(args); \ } \ - update_locked_rq(NULL); \ + if (rq) \ + update_locked_rq(NULL); \ } while (0) #define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \ ({ \ __typeof__((sch)->ops.op(args)) __ret; \ \ - update_locked_rq(rq); \ + if (rq) \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ __ret = (sch)->ops.op(args); \ @@ -1295,7 +1298,8 @@ do { \ } else { \ __ret = (sch)->ops.op(args); \ } \ - update_locked_rq(NULL); \ + if (rq) \ + update_locked_rq(NULL); \ __ret; \ }) diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 6d29d3cbc670..001fb88a8481 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -903,7 +903,7 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * selection optimizations and simply check whether the previously * used CPU is idle and within the allowed cpumask. */ - if (p->nr_cpus_allowed == 1) { + if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) { if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && scx_idle_test_and_clear_cpu(prev_cpu)) cpu = prev_cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a14da5396fb..b173a059315c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -88,7 +88,6 @@ static int __init setup_sched_thermal_decay_shift(char *str) } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); -#ifdef CONFIG_SMP /* * For asym packing, by default the lower numbered CPU has higher priority. */ @@ -111,7 +110,6 @@ int __weak arch_asym_cpu_priority(int cpu) * (default: ~5%) */ #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) -#endif #ifdef CONFIG_CFS_BANDWIDTH /* @@ -162,7 +160,7 @@ static int __init sched_fair_sysctl_init(void) return 0; } late_initcall(sched_fair_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static inline void update_load_add(struct load_weight *lw, unsigned long inc) { @@ -471,7 +469,7 @@ static int se_is_idle(struct sched_entity *se) return cfs_rq_is_idle(group_cfs_rq(se)); } -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -517,7 +515,7 @@ static int se_is_idle(struct sched_entity *se) return task_has_idle_policy(task_of(se)); } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); @@ -884,23 +882,44 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) } /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. + * Set the vruntime up to which an entity can run before looking + * for another entity to pick. + * In case of run to parity, we use the shortest slice of the enqueued + * entities to set the protected period. + * When run to parity is disabled, we give a minimum quantum to the running + * entity to ensure progress. */ -static inline void set_protect_slice(struct sched_entity *se) +static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - se->vlag = se->deadline; + u64 slice = normalized_sysctl_sched_base_slice; + u64 vprot = se->deadline; + + if (sched_feat(RUN_TO_PARITY)) + slice = cfs_rq_min_slice(cfs_rq); + + slice = min(slice, se->slice); + if (slice != se->slice) + vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se)); + + se->vprot = vprot; +} + +static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 slice = cfs_rq_min_slice(cfs_rq); + + se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se)); } static inline bool protect_slice(struct sched_entity *se) { - return se->vlag == se->deadline; + return ((s64)(se->vprot - se->vruntime) > 0); } static inline void cancel_protect_slice(struct sched_entity *se) { if (protect_slice(se)) - se->vlag = se->deadline + 1; + se->vprot = se->vruntime; } /* @@ -922,7 +941,7 @@ static inline void cancel_protect_slice(struct sched_entity *se) * * Which allows tree pruning through eligibility. */ -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se = __pick_first_entity(cfs_rq); @@ -939,7 +958,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) + if (curr && protect && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -983,6 +1002,11 @@ found: return best; } +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +{ + return __pick_eevdf(cfs_rq, true); +} + struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -996,7 +1020,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ -#ifdef CONFIG_SMP int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1008,7 +1031,6 @@ int sched_update_scaling(void) return 0; } -#endif static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1041,7 +1063,6 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) } #include "pelt.h" -#ifdef CONFIG_SMP static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); @@ -1131,34 +1152,40 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } -#else /* !CONFIG_SMP */ -void init_entity_runnable_average(struct sched_entity *se) -{ -} -void post_init_entity_util_avg(struct task_struct *p) -{ -} -static void update_tg_load_avg(struct cfs_rq *cfs_rq) -{ -} -#endif /* CONFIG_SMP */ - -static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) +static s64 update_se(struct rq *rq, struct sched_entity *se) { u64 now = rq_clock_task(rq); s64 delta_exec; - delta_exec = now - curr->exec_start; + delta_exec = now - se->exec_start; if (unlikely(delta_exec <= 0)) return delta_exec; - curr->exec_start = now; - curr->sum_exec_runtime += delta_exec; + se->exec_start = now; + if (entity_is_task(se)) { + struct task_struct *donor = task_of(se); + struct task_struct *running = rq->curr; + /* + * If se is a task, we account the time against the running + * task, as w/ proxy-exec they may not be the same. + */ + running->se.exec_start = now; + running->se.sum_exec_runtime += delta_exec; + + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); + + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); + } else { + /* If not task, account the time against donor se */ + se->sum_exec_runtime += delta_exec; + } if (schedstat_enabled()) { struct sched_statistics *stats; - stats = __schedstats_from_se(curr); + stats = __schedstats_from_se(se); __schedstat_set(stats->exec_max, max(delta_exec, stats->exec_max)); } @@ -1166,58 +1193,12 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) return delta_exec; } -static inline void update_curr_task(struct task_struct *p, s64 delta_exec) -{ - trace_sched_stat_runtime(p, delta_exec); - account_group_exec_runtime(p, delta_exec); - cgroup_account_cputime(p, delta_exec); -} - -static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - if (!sched_feat(PREEMPT_SHORT)) - return false; - - if (curr->vlag == curr->deadline) - return false; - - return !entity_eligible(cfs_rq, curr); -} - -static inline bool do_preempt_short(struct cfs_rq *cfs_rq, - struct sched_entity *pse, struct sched_entity *se) -{ - if (!sched_feat(PREEMPT_SHORT)) - return false; - - if (pse->slice >= se->slice) - return false; - - if (!entity_eligible(cfs_rq, pse)) - return false; - - if (entity_before(pse, se)) - return true; - - if (!entity_eligible(cfs_rq, se)) - return true; - - return false; -} - /* * Used by other classes to account runtime. */ s64 update_curr_common(struct rq *rq) { - struct task_struct *donor = rq->donor; - s64 delta_exec; - - delta_exec = update_curr_se(rq, &donor->se); - if (likely(delta_exec > 0)) - update_curr_task(donor, delta_exec); - - return delta_exec; + return update_se(rq, &rq->donor->se); } /* @@ -1225,6 +1206,12 @@ s64 update_curr_common(struct rq *rq) */ static void update_curr(struct cfs_rq *cfs_rq) { + /* + * Note: cfs_rq->curr corresponds to the task picked to + * run (ie: rq->donor.se) which due to proxy-exec may + * not necessarily be the actual task running + * (rq->curr.se). This is easy to confuse! + */ struct sched_entity *curr = cfs_rq->curr; struct rq *rq = rq_of(cfs_rq); s64 delta_exec; @@ -1233,7 +1220,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (unlikely(!curr)) return; - delta_exec = update_curr_se(rq, curr); + delta_exec = update_se(rq, curr); if (unlikely(delta_exec <= 0)) return; @@ -1242,10 +1229,6 @@ static void update_curr(struct cfs_rq *cfs_rq) update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { - struct task_struct *p = task_of(curr); - - update_curr_task(p, delta_exec); - /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on @@ -1265,7 +1248,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (cfs_rq->nr_queued == 1) return; - if (resched || did_preempt_short(cfs_rq, curr)) { + if (resched || !protect_slice(curr)) { resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } @@ -2114,12 +2097,12 @@ static inline int numa_idle_core(int idle_core, int cpu) return idle_core; } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline int numa_idle_core(int idle_core, int cpu) { return idle_core; } -#endif +#endif /* !CONFIG_SCHED_SMT */ /* * Gather all necessary information to make NUMA balancing placement @@ -3673,7 +3656,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) p->numa_scan_period = task_scan_start(p); } -#else +#else /* !CONFIG_NUMA_BALANCING: */ + static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } @@ -3690,20 +3674,18 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu) { } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* !CONFIG_NUMA_BALANCING */ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { struct rq *rq = rq_of(cfs_rq); account_numa_enqueue(rq, task_of(se)); list_add(&se->group_node, &rq->cfs_tasks); } -#endif cfs_rq->nr_queued++; } @@ -3711,12 +3693,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } -#endif cfs_rq->nr_queued--; } @@ -3768,7 +3748,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) *ptr -= min_t(typeof(*ptr), *ptr, _val); \ } while (0) -#ifdef CONFIG_SMP static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3785,12 +3764,6 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } -#else -static inline void -enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -#endif static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -3822,13 +3795,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_load_set(&se->load, weight); -#ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); -#endif enqueue_load_avg(cfs_rq, se); if (se->on_rq) { @@ -3863,7 +3834,6 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP /* * All this does is approximate the hierarchical proportion which includes that * global sum we all love to hate. @@ -3970,7 +3940,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } -#endif /* CONFIG_SMP */ /* * Recomputes the group entity based on the current state of its group @@ -3991,20 +3960,16 @@ static void update_cfs_group(struct sched_entity *se) if (throttled_hierarchy(gcfs_rq)) return; -#ifndef CONFIG_SMP - shares = READ_ONCE(gcfs_rq->tg->shares); -#else shares = calc_group_shares(gcfs_rq); -#endif if (unlikely(se->load.weight != shares)) reweight_entity(cfs_rq_of(se), se, shares); } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_cfs_group(struct sched_entity *se) { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { @@ -4029,7 +3994,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } } -#ifdef CONFIG_SMP static inline bool load_avg_is_decayed(struct sched_avg *sa) { if (sa->load_sum) @@ -4481,7 +4445,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) return true; } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} @@ -4494,7 +4458,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_NO_HZ_COMMON static inline void migrate_se_pelt_lag(struct sched_entity *se) @@ -4575,9 +4539,9 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se) __update_load_avg_blocked_se(now, se); } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static void migrate_se_pelt_lag(struct sched_entity *se) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages @@ -5144,48 +5108,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } -#else /* CONFIG_SMP */ - -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - return !cfs_rq->nr_queued; -} - -#define UPDATE_TG 0x0 -#define SKIP_AGE_LOAD 0x0 -#define DO_ATTACH 0x0 -#define DO_DETACH 0x0 - -static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) -{ - cfs_rq_util_change(cfs_rq, 0); -} - -static inline void remove_entity_load_avg(struct sched_entity *se) {} - -static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline void -detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} - -static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) -{ - return 0; -} - -static inline void -util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} -static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - -#endif /* CONFIG_SMP */ - void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) { struct sched_entity *se = &p->se; @@ -5253,7 +5175,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) * = (W*V + w_i*(V - vl_i)) / (W + w_i) * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) - * = (V*(W + w_i) - w_i*l) / (W + w_i) + * = (V*(W + w_i) - w_i*vl_i) / (W + w_i) * = V - w_i*vl_i / (W + w_i) * * And the actual lag after adding an entity with vl_i is: @@ -5550,7 +5472,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - set_protect_slice(se); + set_protect_slice(cfs_rq, se); } update_stats_curr_start(cfs_rq, se); @@ -5685,7 +5607,7 @@ void cfs_bandwidth_usage_dec(void) { static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } -#else /* CONFIG_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL: */ static bool cfs_bandwidth_used(void) { return true; @@ -5693,16 +5615,7 @@ static bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) {} void cfs_bandwidth_usage_dec(void) {} -#endif /* CONFIG_JUMP_LABEL */ - -/* - * default period for cfs group bandwidth. - * default: 0.1s, units: nanoseconds - */ -static inline u64 default_cfs_period(void) -{ - return 100000000ULL; -} +#endif /* !CONFIG_JUMP_LABEL */ static inline u64 sched_cfs_bandwidth_slice(void) { @@ -5889,7 +5802,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long queued_delta, runnable_delta, idle_delta, dequeue = 1; - long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5973,10 +5885,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, queued_delta); - - /* Stop the fair server if throttling resulted in no runnable tasks */ - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) - dl_server_stop(&rq->fair_server); done: /* * Note: distribution will already see us throttled via the @@ -6088,7 +5996,6 @@ unthrottle_throttle: resched_curr(rq); } -#ifdef CONFIG_SMP static void __cfsb_csd_unthrottle(void *arg) { struct cfs_rq *cursor, *tmp; @@ -6147,12 +6054,6 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) if (first) smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); } -#else -static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -{ - unthrottle_cfs_rq(cfs_rq); -} -#endif static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { @@ -6490,8 +6391,6 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } -extern const u64 max_cfs_quota_period; - static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = @@ -6518,7 +6417,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) * to fail. */ new = old * 2; - if (new < max_cfs_quota_period) { + if (new < max_bw_quota_period_us * NSEC_PER_USEC) { cfs_b->period = ns_to_ktime(new); cfs_b->quota *= 2; cfs_b->burst *= 2; @@ -6552,7 +6451,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren raw_spin_lock_init(&cfs_b->lock); cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; - cfs_b->period = ns_to_ktime(default_cfs_period()); + cfs_b->period = us_to_ktime(default_bw_period_us()); cfs_b->burst = 0; cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; @@ -6608,7 +6507,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * guaranteed at this point that no additional cfs_rq of this group can * join a CSD list. */ -#ifdef CONFIG_SMP for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); unsigned long flags; @@ -6620,7 +6518,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) __cfsb_csd_unthrottle(rq); local_irq_restore(flags); } -#endif } /* @@ -6733,9 +6630,9 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) if (cfs_task_bw_constrained(p)) tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } -#endif +#endif /* CONFIG_NO_HZ_FULL */ -#else /* CONFIG_CFS_BANDWIDTH */ +#else /* !CONFIG_CFS_BANDWIDTH: */ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } @@ -6777,7 +6674,7 @@ bool cfs_task_bw_constrained(struct task_struct *p) return false; } #endif -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* !CONFIG_CFS_BANDWIDTH */ #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} @@ -6822,7 +6719,7 @@ static void hrtick_update(struct rq *rq) hrtick_start_fair(rq, donor); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -6831,9 +6728,8 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p) static inline void hrtick_update(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { unsigned long rq_util_min, rq_util_max; @@ -6875,9 +6771,6 @@ static inline void check_update_overutilized_status(struct rq *rq) if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized(rq->rd, 1); } -#else -static inline void check_update_overutilized_status(struct rq *rq) { } -#endif /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) @@ -6886,12 +6779,10 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } -#ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { return sched_idle_rq(cpu_rq(cpu)); } -#endif static void requeue_delayed_entity(struct sched_entity *se) @@ -7070,7 +6961,6 @@ static void set_next_buddy(struct sched_entity *se); static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { bool was_sched_idle = sched_idle_rq(rq); - int rq_h_nr_queued = rq->cfs.h_nr_queued; bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; struct task_struct *p = NULL; @@ -7154,9 +7044,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) sub_nr_running(rq, h_nr_queued); - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) - dl_server_stop(&rq->fair_server); - /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; @@ -7206,8 +7093,6 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq) return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); } -#ifdef CONFIG_SMP - /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); @@ -7677,7 +7562,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; } -#else /* CONFIG_SCHED_SMT */ +#else /* !CONFIG_SCHED_SMT: */ static inline void set_idle_cores(int cpu, int val) { @@ -7698,7 +7583,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd return -1; } -#endif /* CONFIG_SCHED_SMT */ +#endif /* !CONFIG_SCHED_SMT */ /* * Scan the LLC domain for idle CPUs; this is dynamically regulated by @@ -8743,9 +8628,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return sched_balance_newidle(rq, rf) != 0; } -#else -static inline void set_task_max_allowed_capacity(struct task_struct *p) {} -#endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) { @@ -8767,6 +8649,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int struct sched_entity *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; + bool do_preempt_short = false; if (unlikely(se == pse)) return; @@ -8815,7 +8698,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * When non-idle entity preempt an idle entity, * don't give idle entity slice protection. */ - cancel_protect_slice(se); + do_preempt_short = true; goto preempt; } @@ -8833,22 +8716,24 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int /* * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption. - * - * Note that even if @p does not turn out to be the most eligible - * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se)) - cancel_protect_slice(se); + do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice); /* * If @p has become the most eligible task, force preemption. */ - if (pick_eevdf(cfs_rq) == pse) + if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) goto preempt; + if (sched_feat(RUN_TO_PARITY) && do_preempt_short) + update_protect_slice(cfs_rq, se); + return; preempt: + if (do_preempt_short) + cancel_protect_slice(se); + resched_curr_lazy(rq); } @@ -8939,7 +8824,7 @@ again: return p; simple: -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ put_prev_set_next_task(rq, prev, p); return p; @@ -9055,7 +8940,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) return true; } -#ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods. * @@ -9357,13 +9241,13 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) return src_weight - dst_weight; } -#else +#else /* !CONFIG_NUMA_BALANCING: */ static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { return 0; } -#endif +#endif /* !CONFIG_NUMA_BALANCING */ /* * Check whether the task is ineligible on the destination cpu @@ -9407,7 +9291,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) throttled_lb_pair, or * 3) cannot be migrated to this CPU due to cpus_ptr, or * 4) running (obviously), or - * 5) are cache-hot on their current CPU. + * 5) are cache-hot on their current CPU, or + * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled) */ if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; @@ -9429,6 +9314,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (kthread_is_per_cpu(p)) return 0; + if (task_is_blocked(p)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; @@ -9464,7 +9352,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_on_cpu(env->src_rq, p)) { + if (task_on_cpu(env->src_rq, p) || + task_current_donor(env->src_rq, p)) { schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -9508,6 +9397,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) schedstat_inc(p->stats.nr_forced_migrations); } + WARN_ON(task_current(env->src_rq, p)); + WARN_ON(task_current_donor(env->src_rq, p)); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -9772,12 +9664,12 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) if (!has_blocked) rq->has_blocked_load = 0; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ static bool __update_blocked_others(struct rq *rq, bool *done) { @@ -9886,7 +9778,7 @@ static unsigned long task_h_load(struct task_struct *p) return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, cfs_rq_load_avg(cfs_rq) + 1); } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq = &rq->cfs; @@ -9903,7 +9795,7 @@ static unsigned long task_h_load(struct task_struct *p) { return p->se.avg.load_avg; } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void sched_balance_update_blocked_averages(int cpu) { @@ -10048,9 +9940,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu) min_capacity = ULONG_MAX; max_capacity = 0; - if (child->flags & SD_OVERLAP) { + if (child->flags & SD_NUMA) { /* - * SD_OVERLAP domains cannot assume that child groups + * SD_NUMA domains cannot assume that child groups * span the current group. */ @@ -10063,7 +9955,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } } else { /* - * !SD_OVERLAP domains can assume that child groups + * !SD_NUMA domains can assume that child groups * span the current group. */ @@ -10616,7 +10508,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) return remote; return all; } -#else +#else /* !CONFIG_NUMA_BALANCING: */ static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { return all; @@ -10626,7 +10518,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) { return regular; } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* !CONFIG_NUMA_BALANCING */ struct sg_lb_stats; @@ -12174,8 +12066,14 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. + * + * sched_balance_newidle() bumps the cost whenever newidle + * balance fails, and we don't want things to grow out of + * control. Use the sysctl_sched_migration_cost as the upper + * limit, plus a litle extra to avoid off by ones. */ - sd->max_newidle_lb_cost = cost; + sd->max_newidle_lb_cost = + min(cost, sysctl_sched_migration_cost + 200); sd->last_decay_max_lb_cost = jiffies; } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { /* @@ -12772,7 +12670,7 @@ static void nohz_newidle_balance(struct rq *this_rq) atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline void nohz_balancer_kick(struct rq *rq) { } static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) @@ -12781,7 +12679,7 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle } static inline void nohz_newidle_balance(struct rq *this_rq) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * sched_balance_newidle is called by schedule() if this_cpu is about to become @@ -12867,10 +12765,17 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - update_newidle_cost(sd, domain_cost); - curr_cost += domain_cost; t0 = t1; + + /* + * Failing newidle means it is not effective; + * bump the cost so we end up doing less of it. + */ + if (!pulled_task) + domain_cost = (3 * sd->max_newidle_lb_cost) / 2; + + update_newidle_cost(sd, domain_cost); } /* @@ -12978,8 +12883,6 @@ static void rq_offline_fair(struct rq *rq) clear_tg_offline_cfs_rqs(rq); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_SCHED_CORE static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) @@ -13076,10 +12979,10 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, cfs_rqa = sea->cfs_rq; cfs_rqb = seb->cfs_rq; -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ cfs_rqa = &task_rq(a)->cfs; cfs_rqb = &task_rq(b)->cfs; -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ /* * Find delta after normalizing se's vruntime with its cfs_rq's @@ -13103,9 +13006,9 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) #endif return throttled_hierarchy(cfs_rq); } -#else +#else /* !CONFIG_SCHED_CORE: */ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} -#endif +#endif /* !CONFIG_SCHED_CORE */ /* * scheduler tick hitting a task of our scheduling class. @@ -13199,15 +13102,14 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) list_add_leaf_cfs_rq(cfs_rq); } } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static void propagate_entity_cfs_rq(struct sched_entity *se) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_SMP /* * In case the task sched_avg hasn't been attached: * - A forked task which hasn't been woken up by wake_up_new_task(). @@ -13216,7 +13118,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) */ if (!se->avg.last_update_time) return; -#endif /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); @@ -13280,7 +13181,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs { struct sched_entity *se = &p->se; -#ifdef CONFIG_SMP if (task_on_rq_queued(p)) { /* * Move the next running task to the front of the list, so our @@ -13288,7 +13188,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs */ list_move(&se->group_node, &rq->cfs_tasks); } -#endif if (!first) return; @@ -13326,9 +13225,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13343,10 +13240,8 @@ static void task_change_group_fair(struct task_struct *p) detach_task_cfs_rq(p); -#ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; -#endif set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } @@ -13642,7 +13537,6 @@ DEFINE_SCHED_CLASS(fair) = { .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, -#ifdef CONFIG_SMP .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13652,7 +13546,6 @@ DEFINE_SCHED_CLASS(fair) = { .task_dead = task_dead_fair, .set_cpus_allowed = set_cpus_allowed_fair, -#endif .task_tick = task_tick_fair, .task_fork = task_fork_fair, @@ -13715,7 +13608,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { -#ifdef CONFIG_SMP int i; for_each_possible_cpu(i) { @@ -13737,6 +13629,4 @@ __init void init_sched_fair_class(void) nohz.next_blocked = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif -#endif /* SMP */ - } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2c85c86b455f..c39b089d4f09 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -6,6 +6,11 @@ * (NOTE: these are not related to SCHED_IDLE batch scheduled * tasks which are handled in sched/fair.c ) */ +#include <linux/cpuidle.h> +#include <linux/suspend.h> +#include <linux/livepatch.h> +#include "sched.h" +#include "smp.h" /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; @@ -47,7 +52,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) return 1; } __setup("hlt", cpu_idle_nopoll_setup); -#endif +#endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */ static noinline int __cpuidle cpu_idle_poll(void) { @@ -95,10 +100,10 @@ static inline void cond_tick_broadcast_exit(void) if (static_branch_unlikely(&arch_needs_tick_broadcast)) tick_broadcast_exit(); } -#else +#else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */ static inline void cond_tick_broadcast_enter(void) { } static inline void cond_tick_broadcast_exit(void) { } -#endif +#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */ /** * default_idle_call - Default CPU idle routine. @@ -427,7 +432,6 @@ void cpu_startup_entry(enum cpuhp_state state) * idle-task scheduling class. */ -#ifdef CONFIG_SMP static int select_task_rq_idle(struct task_struct *p, int cpu, int flags) { @@ -439,7 +443,6 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return WARN_ON_ONCE(1); } -#endif /* * Idle tasks are unconditionally rescheduled: @@ -526,11 +529,9 @@ DEFINE_SCHED_CLASS(idle) = { .put_prev_task = put_prev_task_idle, .set_next_task = set_next_task_idle, -#ifdef CONFIG_SMP .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_idle, diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 93b038d48900..a4cf17b1fab0 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -7,6 +7,8 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ +#include <linux/sched/isolation.h> +#include "sched.h" enum hk_flags { HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index c48900b856a2..b601e0243d0e 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,6 +6,8 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ +#include <linux/sched/nohz.h> +#include "sched.h" /* * Global load-average calculations @@ -80,7 +82,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) long nr_active, delta = 0; nr_active = this_rq->nr_running - adjust; - nr_active += (int)this_rq->nr_uninterruptible; + nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; @@ -333,12 +335,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 809194cd779f..62fba83b7bb1 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -4,6 +4,8 @@ * * membarrier system call */ +#include <uapi/linux/membarrier.h> +#include "sched.h" /* * For documentation purposes, here are some membarrier ordering diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 7a8534a2deff..fa83bbaf4f3e 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -23,6 +23,7 @@ * Move PELT related code from fair.c into this pelt.c file * Author: Vincent Guittot <vincent.guittot@linaro.org> */ +#include "pelt.h" /* * Approximate: @@ -413,7 +414,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) return 0; } -#endif +#endif /* CONFIG_SCHED_HW_PRESSURE */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* @@ -466,7 +467,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) return ret; } -#endif +#endif /* CONFIG_HAVE_SCHED_AVG_IRQ */ /* * Load avg and utiliztion metrics need to be updated periodically and before diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index f4f6a0875c66..62c3fa543c0f 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,4 +1,8 @@ -#ifdef CONFIG_SMP +// SPDX-License-Identifier: GPL-2.0 +#ifndef _KERNEL_SCHED_PELT_H +#define _KERNEL_SCHED_PELT_H +#include "sched.h" + #include "sched-pelt.h" int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); @@ -15,7 +19,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return READ_ONCE(rq->avg_hw.load_avg); } -#else +#else /* !CONFIG_SCHED_HW_PRESSURE: */ static inline int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { @@ -26,7 +30,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return 0; } -#endif +#endif /* !CONFIG_SCHED_HW_PRESSURE */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); @@ -174,63 +178,12 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } -#else +#else /* !CONFIG_CFS_BANDWIDTH: */ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { return rq_clock_pelt(rq_of(cfs_rq)); } -#endif - -#else - -static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) -{ - return 0; -} - -static inline int -update_rt_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_dl_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) -{ - return 0; -} - -static inline u64 hw_load_avg(struct rq *rq) -{ - return 0; -} - -static inline int -update_irq_load_avg(struct rq *rq, u64 running) -{ - return 0; -} - -static inline u64 rq_clock_pelt(struct rq *rq) -{ - return rq_clock_task(rq); -} - -static inline void -update_rq_clock_pelt(struct rq *rq, s64 delta) { } - -static inline void -update_idle_rq_clock_pelt(struct rq *rq) { } - -static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } -#endif - +#endif /* !CONFIG_CFS_BANDWIDTH */ +#endif /* _KERNEL_SCHED_PELT_H */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ad04a5c3162a..2024c1d36402 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,6 +136,10 @@ * cost-wise, yet way more sensitive and accurate than periodic * sampling of the aggregate task states would be. */ +#include <linux/sched/clock.h> +#include <linux/workqueue.h> +#include <linux/psi.h> +#include "sched.h" static int psi_bug __read_mostly; @@ -172,6 +176,28 @@ struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; +static DEFINE_PER_CPU(seqcount_t, psi_seq); + +static inline void psi_write_begin(int cpu) +{ + write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline void psi_write_end(int cpu) +{ + write_seqcount_end(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline u32 psi_read_begin(int cpu) +{ + return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline bool psi_read_retry(int cpu, u32 seq) +{ + return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq); +} + static void psi_avgs_work(struct work_struct *work); static void poll_timer_fn(struct timer_list *t); @@ -182,7 +208,7 @@ static void group_init(struct psi_group *group) group->enabled = true; for_each_possible_cpu(cpu) - seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + seqcount_init(per_cpu_ptr(&psi_seq, cpu)); group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; mutex_init(&group->avgs_lock); @@ -262,14 +288,14 @@ static void get_recent_times(struct psi_group *group, int cpu, /* Snapshot a coherent view of the CPU state */ do { - seq = read_seqcount_begin(&groupc->seq); + seq = psi_read_begin(cpu); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); state_mask = groupc->state_mask; state_start = groupc->state_start; if (cpu == current_cpu) memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); - } while (read_seqcount_retry(&groupc->seq, seq)); + } while (psi_read_retry(cpu, seq)); /* Calculate state time deltas against the previous snapshot */ for (s = 0; s < NR_PSI_STATES; s++) { @@ -768,31 +794,21 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) groupc->times[PSI_NONIDLE] += delta; } +#define for_each_group(iter, group) \ + for (typeof(group) iter = group; iter; iter = iter->parent) + static void psi_group_change(struct psi_group *group, int cpu, unsigned int clear, unsigned int set, - bool wake_clock) + u64 now, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; u32 state_mask; - u64 now; lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we update the task counts according to the state - * change requested through the @clear and @set bits. - * - * Then if the cgroup PSI stats accounting enabled, we - * assess the aggregate resource states this CPU's tasks - * have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - */ - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - - /* * Start with TSK_ONCPU, which doesn't have a corresponding * task count - it's just a boolean flag directly encoded in * the state mask. Clear, set, or carry the current state if @@ -843,7 +859,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); return; } @@ -864,8 +879,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); - if (state_mask & group->rtpoll_states) psi_schedule_rtpoll_work(group, 1, false); @@ -900,24 +913,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set) void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); - struct psi_group *group; + u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - group = task_psi_group(task); - do { - psi_group_change(group, cpu, clear, set, true); - } while ((group = group->parent)); + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(task)) + psi_group_change(group, cpu, clear, set, now, true); + psi_write_end(cpu); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep) { - struct psi_group *group, *common = NULL; + struct psi_group *common = NULL; int cpu = task_cpu(prev); + u64 now; + + psi_write_begin(cpu); + now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -926,16 +944,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, * ancestors with @prev, those will already have @prev's * TSK_ONCPU bit set, and we can stop the iteration there. */ - group = task_psi_group(next); - do { - if (per_cpu_ptr(group->pcpu, cpu)->state_mask & - PSI_ONCPU) { + for_each_group(group, task_psi_group(next)) { + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + if (groupc->state_mask & PSI_ONCPU) { common = group; break; } - - psi_group_change(group, cpu, 0, TSK_ONCPU, true); - } while ((group = group->parent)); + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + } } if (prev->pid) { @@ -968,12 +985,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, psi_flags_change(prev, clear, set); - group = task_psi_group(prev); - do { + for_each_group(group, task_psi_group(prev)) { if (group == common) break; - psi_group_change(group, cpu, clear, set, wake_clock); - } while ((group = group->parent)); + psi_group_change(group, cpu, clear, set, now, wake_clock); + } /* * TSK_ONCPU is handled up to the common ancestor. If there are @@ -983,20 +999,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, */ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, wake_clock); + for_each_group(group, common) + psi_group_change(group, cpu, clear, set, now, wake_clock); } } + psi_write_end(cpu); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { int cpu = task_cpu(curr); - struct psi_group *group; struct psi_group_cpu *groupc; s64 delta; u64 irq; + u64 now; if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) return; @@ -1005,8 +1022,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; lockdep_assert_rq_held(rq); - group = task_psi_group(curr); - if (prev && task_psi_group(prev) == group) + if (prev && task_psi_group(prev) == task_psi_group(curr)) return; irq = irq_time_read(cpu); @@ -1015,27 +1031,24 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; rq->psi_irq_time = irq; - do { - u64 now; + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(curr)) { if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; - write_seqcount_end(&groupc->seq); - if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) psi_schedule_rtpoll_work(group, 1, false); - } while ((group = group->parent)); + } + psi_write_end(cpu); } -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /** * psi_memstall_enter - mark the beginning of a memory stall section @@ -1221,12 +1234,14 @@ void psi_cgroup_restart(struct psi_group *group) return; for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; + u64 now; + + guard(rq_lock_irq)(cpu_rq(cpu)); - rq_lock_irq(rq, &rf); - psi_group_change(group, cpu, 0, 0, true); - rq_unlock_irq(rq, &rf); + psi_write_begin(cpu); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + psi_write_end(cpu); } } #endif /* CONFIG_CGROUPS */ @@ -1651,7 +1666,7 @@ static const struct proc_ops psi_irq_proc_ops = { .proc_poll = psi_fop_poll, .proc_release = psi_fop_release, }; -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ static int __init psi_proc_init(void) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e40422c37033..7936d4333731 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,6 +4,9 @@ * policies) */ +#include "sched.h" +#include "pelt.h" + int sched_rr_timeslice = RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; @@ -60,7 +63,7 @@ static int __init sched_rt_sysctl_init(void) return 0; } late_initcall(sched_rt_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ void init_rt_rq(struct rt_rq *rt_rq) { @@ -75,12 +78,10 @@ void init_rt_rq(struct rt_rq *rt_rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->highest_prio.next = MAX_RT_PRIO-1; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); -#endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -291,7 +292,7 @@ err: return 0; } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #define rt_entity_is_task(rt_se) (1) @@ -327,9 +328,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_SMP +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { @@ -430,21 +429,6 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) } } -#else - -static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void rt_queue_push_tasks(struct rq *rq) -{ -} -#endif /* CONFIG_SMP */ - static void enqueue_top_rt_rq(struct rt_rq *rt_rq); static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); @@ -485,12 +469,12 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) return cpu_cap >= min(min_cap, max_cap); } -#else +#else /* !CONFIG_UCLAMP_TASK: */ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) { return true; } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_RT_GROUP_SCHED @@ -594,17 +578,10 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } -#ifdef CONFIG_SMP static inline const struct cpumask *sched_rt_period_mask(void) { return this_rq()->rd->span; } -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) @@ -625,7 +602,6 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) rt_rq->rt_time < rt_b->rt_runtime); } -#ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. */ @@ -798,9 +774,6 @@ static void balance_runtime(struct rt_rq *rt_rq) raw_spin_lock(&rt_rq->rt_runtime_lock); } } -#else /* !CONFIG_SMP */ -static inline void balance_runtime(struct rt_rq *rt_rq) {} -#endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -930,7 +903,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ typedef struct rt_rq *rt_rq_iter_t; @@ -977,12 +950,10 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) return &cpu_rq(cpu)->rt; } -#ifdef CONFIG_SMP static void __enable_runtime(struct rq *rq) { } static void __disable_runtime(struct rq *rq) { } -#endif -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline int rt_se_prio(struct sched_rt_entity *rt_se) { @@ -1033,7 +1004,7 @@ static void update_curr_rt(struct rq *rq) do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } static void @@ -1075,8 +1046,6 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) cpufreq_update_util(rq, 0); } -#if defined CONFIG_SMP - static void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { @@ -1107,16 +1076,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } -#else /* CONFIG_SMP */ - -static inline -void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} -static inline -void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} - -#endif /* CONFIG_SMP */ - -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED static void inc_rt_prio(struct rt_rq *rt_rq, int prio) { @@ -1155,13 +1114,6 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) dec_rt_prio_smp(rt_rq, prio, prev_prio); } -#else - -static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} -static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} - -#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED static void @@ -1182,7 +1134,7 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) @@ -1192,7 +1144,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static inline void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) @@ -1488,6 +1440,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); + if (task_is_blocked(p)) + return; + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1538,7 +1493,6 @@ static void yield_task_rt(struct rq *rq) requeue_task_rt(rq, rq->curr, 0); } -#ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); static int @@ -1653,7 +1607,6 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Preempt the current task with a newly woken task if needed: @@ -1667,7 +1620,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) return; } -#ifdef CONFIG_SMP /* * If: * @@ -1682,7 +1634,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) */ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); -#endif } static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) @@ -1768,6 +1719,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); + if (task_is_blocked(p)) + return; /* * The previous task needs to be made eligible for pushing * if it is still active @@ -1776,8 +1729,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s enqueue_pushable_task(rq, p); } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define RT_MAX_TRIES 3 @@ -2451,7 +2402,6 @@ void __init init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } -#endif /* CONFIG_SMP */ /* * When switching a task to RT, we may overload the runqueue @@ -2475,10 +2425,8 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * then see if we can move to another run queue. */ if (task_on_rq_queued(p)) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); -#endif /* CONFIG_SMP */ if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } @@ -2495,7 +2443,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) return; if (task_current_donor(rq, p)) { -#ifdef CONFIG_SMP /* * If our priority decreases while running, we * may need to pull tasks to this runqueue. @@ -2509,11 +2456,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) */ if (p->prio > rq->rt.highest_prio.curr) resched_curr(rq); -#else - /* For UP simply resched on drop of prio */ - if (oldprio < p->prio) - resched_curr(rq); -#endif /* CONFIG_SMP */ } else { /* * This task is not running, but if it is @@ -2549,9 +2491,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) } } } -#else +#else /* !CONFIG_POSIX_TIMERS: */ static inline void watchdog(struct rq *rq, struct task_struct *p) { } -#endif +#endif /* !CONFIG_POSIX_TIMERS */ /* * scheduler tick hitting a task of our scheduling class. @@ -2620,7 +2562,7 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) return rt_rq_throttled(rt_rq); } -#endif +#endif /* CONFIG_SCHED_CORE */ DEFINE_SCHED_CLASS(rt) = { @@ -2634,7 +2576,6 @@ DEFINE_SCHED_CLASS(rt) = { .put_prev_task = put_prev_task_rt, .set_next_task = set_next_task_rt, -#ifdef CONFIG_SMP .balance = balance_rt, .select_task_rq = select_task_rq_rt, .set_cpus_allowed = set_cpus_allowed_common, @@ -2643,7 +2584,6 @@ DEFINE_SCHED_CLASS(rt) = { .task_woken = task_woken_rt, .switched_from = switched_from_rt, .find_lock_rq = find_lock_lowest_rq, -#endif .task_tick = task_tick_rt, @@ -2887,7 +2827,7 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) return 1; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) @@ -2895,7 +2835,7 @@ static int sched_rt_global_constraints(void) return 0; } #endif /* CONFIG_SYSCTL */ -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) @@ -2951,6 +2891,12 @@ undo: sched_domains_mutex_unlock(); mutex_unlock(&mutex); + /* + * After changing maximum available bandwidth for DEADLINE, we need to + * recompute per root domain and per cpus variables accordingly. + */ + rebuild_sched_domains(); + return ret; } diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index c529706bed11..6803cfec7a1e 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ +#include <linux/types.h> static const u32 runnable_avg_yN_inv[] __maybe_unused = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 475bb5998295..d3f33d10c58c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -69,6 +69,7 @@ #include <linux/wait_bit.h> #include <linux/workqueue_api.h> #include <linux/delayacct.h> +#include <linux/mmu_context.h> #include <trace/events/power.h> #include <trace/events/sched.h> @@ -384,6 +385,7 @@ extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task); +extern void sched_init_dl_servers(void); extern void dl_server_update_idle_time(struct rq *rq, struct task_struct *p); @@ -401,6 +403,19 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se) extern struct list_head task_groups; +#ifdef CONFIG_CFS_BANDWIDTH +extern const u64 max_bw_quota_period_us; + +/* + * default period for group bandwidth. + * default: 0.1s, units: microseconds + */ +static inline u64 default_bw_period_us(void) +{ + return 100000ULL; +} +#endif /* CONFIG_CFS_BANDWIDTH */ + struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH raw_spinlock_t lock; @@ -424,7 +439,7 @@ struct cfs_bandwidth { int nr_burst; u64 throttled_time; u64 burst_time; -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ }; /* Task group related information */ @@ -442,15 +457,13 @@ struct task_group { /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; -#ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; @@ -531,7 +544,7 @@ extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void free_fair_sched_group(struct task_group *tg) { } static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { @@ -539,7 +552,7 @@ static inline int alloc_fair_sched_group(struct task_group *tg, struct task_grou } static inline void online_fair_sched_group(struct task_group *tg) { } static inline void unregister_fair_sched_group(struct task_group *tg) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, @@ -573,25 +586,20 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern int sched_group_set_idle(struct task_group *tg, long idle); -#ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -#else /* !CONFIG_SMP */ -static inline void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next) { } -#endif /* CONFIG_SMP */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ -#else /* CONFIG_CGROUP_SCHED */ +#else /* !CONFIG_CGROUP_SCHED: */ struct cfs_bandwidth { }; static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* !CONFIG_CGROUP_SCHED */ extern void unregister_rt_sched_group(struct task_group *tg); extern void free_rt_sched_group(struct task_group *tg); @@ -667,7 +675,6 @@ struct cfs_rq { struct sched_entity *curr; struct sched_entity *next; -#ifdef CONFIG_SMP /* * CFS load tracking */ @@ -699,7 +706,6 @@ struct cfs_rq { u64 last_h_load_update; struct sched_entity *h_load_next; #endif /* CONFIG_FAIR_GROUP_SCHED */ -#endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ @@ -796,19 +802,13 @@ struct rt_rq { struct rt_prio_array active; unsigned int rt_nr_running; unsigned int rr_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ -#ifdef CONFIG_SMP int next; /* next highest */ -#endif } highest_prio; -#endif -#ifdef CONFIG_SMP bool overloaded; struct plist_head pushable_tasks; -#endif /* CONFIG_SMP */ int rt_queued; #ifdef CONFIG_RT_GROUP_SCHED @@ -839,7 +839,6 @@ struct dl_rq { unsigned int dl_nr_running; -#ifdef CONFIG_SMP /* * Deadline values of the currently executing and the * earliest ready task on this rq. Caching these facilitates @@ -859,9 +858,7 @@ struct dl_rq { * of the leftmost (earliest deadline) element. */ struct rb_root_cached pushable_dl_tasks_root; -#else - struct dl_bw dl_bw; -#endif + /* * "Active utilization" for this runqueue: increased when a * task wakes up (becomes TASK_RUNNING) and decreased when a @@ -932,7 +929,6 @@ static inline long se_runnable(struct sched_entity *se) #endif /* !CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP /* * XXX we want to get rid of these helpers and use the full load resolution. */ @@ -1008,7 +1004,7 @@ struct root_domain { /* These atomics are updated outside of a lock */ atomic_t rto_loop_next; atomic_t rto_loop_start; -#endif +#endif /* HAVE_RT_PUSH_IPI */ /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -1043,7 +1039,6 @@ static inline void set_rd_overloaded(struct root_domain *rd, int status) #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK /* @@ -1107,18 +1102,14 @@ struct rq { unsigned int numa_migrate_on; #endif #ifdef CONFIG_NO_HZ_COMMON -#ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; call_single_data_t nohz_csd; -#endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_SMP unsigned int ttwu_pending; -#endif u64 nr_switches; #ifdef CONFIG_UCLAMP_TASK @@ -1149,12 +1140,17 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned int nr_uninterruptible; + unsigned long nr_uninterruptible; +#ifdef CONFIG_SCHED_PROXY_EXEC + struct task_struct __rcu *donor; /* Scheduling context */ + struct task_struct __rcu *curr; /* Execution context */ +#else union { struct task_struct __rcu *donor; /* Scheduler context */ struct task_struct __rcu *curr; /* Execution context */ }; +#endif struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; @@ -1183,7 +1179,6 @@ struct rq { int membarrier_state; #endif -#ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -1224,7 +1219,6 @@ struct rq { #ifdef CONFIG_HOTPLUG_CPU struct rcuwait hotplug_wait; #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1242,9 +1236,7 @@ struct rq { long calc_load_active; #ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP call_single_data_t hrtick_csd; -#endif struct hrtimer hrtick_timer; ktime_t hrtick_time; #endif @@ -1271,9 +1263,7 @@ struct rq { struct cpuidle_state *idle_state; #endif -#ifdef CONFIG_SMP unsigned int nr_pinned; -#endif unsigned int push_busy; struct cpu_stop_work push_work; @@ -1294,12 +1284,12 @@ struct rq { unsigned int core_forceidle_seq; unsigned int core_forceidle_occupation; u64 core_forceidle_start; -#endif +#endif /* CONFIG_SCHED_CORE */ /* Scratch cpumask to be temporarily used under rq_lock */ cpumask_var_t scratch_mask; -#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) +#ifdef CONFIG_CFS_BANDWIDTH call_single_data_t cfsb_csd; struct list_head cfsb_csd_list; #endif @@ -1313,32 +1303,24 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return cfs_rq->rq; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { return container_of(cfs_rq, struct rq, cfs); } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline int cpu_of(struct rq *rq) { -#ifdef CONFIG_SMP return rq->cpu; -#else - return 0; -#endif } #define MDF_PUSH 0x01 static inline bool is_migration_disabled(struct task_struct *p) { -#ifdef CONFIG_SMP return p->migration_disabled; -#else - return false; -#endif } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -1349,10 +1331,17 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +#ifdef CONFIG_SCHED_PROXY_EXEC +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + rcu_assign_pointer(rq->donor, t); +} +#else static inline void rq_set_donor(struct rq *rq, struct task_struct *t) { /* Do nothing */ } +#endif #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -1500,6 +1489,7 @@ static inline bool sched_group_cookie_match(struct rq *rq, } #endif /* !CONFIG_SCHED_CORE */ + #ifdef CONFIG_RT_GROUP_SCHED # ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED DECLARE_STATIC_KEY_FALSE(rt_group_sched); @@ -1507,16 +1497,16 @@ static inline bool rt_group_sched_enabled(void) { return static_branch_unlikely(&rt_group_sched); } -# else +# else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */ DECLARE_STATIC_KEY_TRUE(rt_group_sched); static inline bool rt_group_sched_enabled(void) { return static_branch_likely(&rt_group_sched); } -# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ -#else +# endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ # define rt_group_sched_enabled() false -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline void lockdep_assert_rq_held(struct rq *rq) { @@ -1574,9 +1564,9 @@ static inline void update_idle_core(struct rq *rq) __update_idle_core(rq); } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline void update_idle_core(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_SMT */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1757,7 +1747,7 @@ static inline void scx_rq_clock_invalidate(struct rq *rq) WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); } -#else /* !CONFIG_SCHED_CLASS_EXT */ +#else /* !CONFIG_SCHED_CLASS_EXT: */ #define scx_enabled() false #define scx_switched_all() false @@ -1781,9 +1771,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -#ifdef CONFIG_SMP WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); -#endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) @@ -1961,8 +1949,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) #endif /* !CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_SMP - static inline void queue_balance_callback(struct rq *rq, struct balance_callback *head, @@ -2128,8 +2114,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) return p->user_cpus_ptr; } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_CGROUP_SCHED /* @@ -2174,7 +2158,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) tg = &root_task_group; p->rt.rt_rq = tg->rt_rq[cpu]; p->rt.parent = tg->rt_se[cpu]; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } #else /* !CONFIG_CGROUP_SCHED: */ @@ -2200,7 +2184,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; -#endif +#endif /* CONFIG_SMP */ } /* @@ -2278,13 +2262,17 @@ static inline int task_current_donor(struct rq *rq, struct task_struct *p) return rq->donor == p; } +static inline bool task_is_blocked(struct task_struct *p) +{ + if (!sched_proxy_exec()) + return false; + + return !!p->blocked_on; +} + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { -#ifdef CONFIG_SMP return p->on_cpu; -#else - return task_current(rq, p); -#endif } static inline int task_on_rq_queued(struct task_struct *p) @@ -2307,11 +2295,9 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ -#ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); static_assert(WF_FORK == SD_BALANCE_FORK); static_assert(WF_TTWU == SD_BALANCE_WAKE); -#endif /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -2367,11 +2353,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_HEAD 0x10 #define ENQUEUE_REPLENISH 0x20 -#ifdef CONFIG_SMP #define ENQUEUE_MIGRATED 0x40 -#else -#define ENQUEUE_MIGRATED 0x00 -#endif #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 @@ -2416,7 +2398,6 @@ struct sched_class { void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); -#ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); @@ -2429,7 +2410,6 @@ struct sched_class { void (*rq_offline)(struct rq *rq); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); -#endif void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); @@ -2487,7 +2467,7 @@ static inline void put_prev_set_next_task(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - WARN_ON_ONCE(rq->curr != prev); + WARN_ON_ONCE(rq->donor != prev); __put_prev_set_next_dl_server(rq, prev, next); @@ -2581,8 +2561,6 @@ extern struct task_struct *pick_task_idle(struct rq *rq); #define SCA_MIGRATE_ENABLE 0x04 #define SCA_USER 0x08 -#ifdef CONFIG_SMP - extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void sched_balance_trigger(struct rq *rq); @@ -2634,26 +2612,6 @@ static inline struct task_struct *get_push_task(struct rq *rq) extern int push_cpu_stop(void *arg); -#else /* !CONFIG_SMP: */ - -static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) -{ - return true; -} - -static inline int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx) -{ - return set_cpus_allowed_ptr(p, ctx->new_mask); -} - -static inline cpumask_t *alloc_user_cpus_ptr(int node) -{ - return NULL; -} - -#endif /* !CONFIG_SMP */ - #ifdef CONFIG_CPU_IDLE static inline void idle_set_state(struct rq *rq, @@ -2749,10 +2707,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) call_trace_sched_update_nr_running(rq, count); } -#ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) set_rd_overloaded(rq->rd, 1); -#endif sched_update_tick_dependency(rq); } @@ -2918,10 +2874,7 @@ unsigned long arch_scale_freq_capacity(int cpu) static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); - /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ -#ifdef CONFIG_SMP rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); -#endif } #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ @@ -2930,8 +2883,6 @@ static inline class_##name##_t class_##name##_constructor(type *lock, type *lock { class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ _lock; return _t; } -#ifdef CONFIG_SMP - static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) { #ifdef CONFIG_SCHED_CORE @@ -2954,7 +2905,7 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) /* * __sched_core_flip() relies on SMT having cpu-id lock order. */ -#endif +#endif /* CONFIG_SCHED_CORE */ return rq1->cpu < rq2->cpu; } @@ -3091,42 +3042,6 @@ extern void set_rq_offline(struct rq *rq); extern bool sched_smp_initialized; -#else /* !CONFIG_SMP: */ - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - WARN_ON_ONCE(!irqs_disabled()); - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_lock(rq1); - __acquire(rq2->lock); /* Fake it out ;) */ - double_rq_clock_clear_update(rq1, rq2); -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_unlock(rq1); - __release(rq2->lock); -} - -#endif /* !CONFIG_SMP */ - DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), double_rq_unlock(_T->lock, _T->lock2)) @@ -3145,6 +3060,7 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); + #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void @@ -3184,7 +3100,7 @@ extern void nohz_balance_exit_idle(struct rq *rq); static inline void nohz_balance_exit_idle(struct rq *rq) { } #endif /* !CONFIG_NO_HZ_COMMON */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON extern void nohz_run_idle_balance(int cpu); #else static inline void nohz_run_idle_balance(int cpu) { } @@ -3254,14 +3170,14 @@ static inline u64 irq_time_read(int cpu) return total; } -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline int irqtime_enabled(void) { return 0; } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ @@ -3310,8 +3226,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } # define arch_scale_freq_invariant() false #endif -#ifdef CONFIG_SMP - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); @@ -3355,10 +3269,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } -#else /* !CONFIG_SMP */ -static inline bool update_other_load_avgs(struct rq *rq) { return false; } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); @@ -3535,13 +3445,13 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } -#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ #define perf_domain_span(pd) NULL static inline bool sched_energy_enabled(void) { return false; } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #ifdef CONFIG_MEMBARRIER @@ -3567,7 +3477,7 @@ static inline void membarrier_switch_mm(struct rq *rq, WRITE_ONCE(rq->membarrier_state, membarrier_state); } -#else /* !CONFIG_MEMBARRIER :*/ +#else /* !CONFIG_MEMBARRIER: */ static inline void membarrier_switch_mm(struct rq *rq, struct mm_struct *prev_mm, @@ -3577,7 +3487,6 @@ static inline void membarrier_switch_mm(struct rq *rq, #endif /* !CONFIG_MEMBARRIER */ -#ifdef CONFIG_SMP static inline bool is_per_cpu_kthread(struct task_struct *p) { if (!(p->flags & PF_KTHREAD)) @@ -3588,7 +3497,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } -#endif extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); @@ -3887,7 +3795,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); -#ifdef CONFIG_SMP static inline void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) { @@ -3908,7 +3815,6 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) return false; } -#endif #ifdef CONFIG_RT_MUTEXES @@ -3949,21 +3855,8 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio); -#ifdef CONFIG_SMP extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#else - -static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) -{ - return NULL; -} - -static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) -{ -} - -#endif #ifdef CONFIG_SCHED_CLASS_EXT /* diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 21ac44428bb0..7f151d96dba9 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -1,8 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_SCHED_SMP_H +#define _KERNEL_SCHED_SMP_H + /* * Scheduler internal SMP callback types and methods between the scheduler * and other internal parts of the core kernel: */ +#include <linux/types.h> extern void sched_ttwu_pending(void *arg); @@ -13,3 +18,5 @@ extern void flush_smp_call_function_queue(void); #else static inline void flush_smp_call_function_queue(void) { } #endif + +#endif /* _KERNEL_SCHED_SMP_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 4346fd81c31f..d1c9429a4ac5 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -2,6 +2,7 @@ /* * /proc/schedstat implementation */ +#include "sched.h" void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) @@ -114,10 +115,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "timestamp %lu\n", jiffies); } else { struct rq *rq; -#ifdef CONFIG_SMP struct sched_domain *sd; int dcount = 0; -#endif cpu = (unsigned long)(v - 2); rq = cpu_rq(cpu); @@ -132,7 +131,6 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); -#ifdef CONFIG_SMP /* domain-specific stats */ rcu_read_lock(); for_each_domain(cpu, sd) { @@ -163,7 +161,6 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->ttwu_move_balance); } rcu_read_unlock(); -#endif } return 0; } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 452826df6ae1..26f3fd4d34ce 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -112,10 +112,10 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -220,7 +220,7 @@ static inline void psi_sched_switch(struct task_struct *prev, psi_task_switch(prev, next, sleep); } -#else /* CONFIG_PSI */ +#else /* !CONFIG_PSI: */ static inline void psi_enqueue(struct task_struct *p, bool migrate) {} static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} @@ -229,7 +229,7 @@ static inline void psi_sched_switch(struct task_struct *prev, bool sleep) {} static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /* CONFIG_PSI */ +#endif /* !CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO /* @@ -334,6 +334,6 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n # define sched_info_enqueue(rq, t) do { } while (0) # define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) -#endif /* CONFIG_SCHED_INFO */ +#endif /* !CONFIG_SCHED_INFO */ #endif /* _KERNEL_STATS_H */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 058dd42e3d9b..2d4e279f05ee 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,8 +7,8 @@ * * See kernel/stop_machine.c */ +#include "sched.h" -#ifdef CONFIG_SMP static int select_task_rq_stop(struct task_struct *p, int cpu, int flags) { @@ -20,7 +20,6 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return sched_stop_runnable(rq); } -#endif /* CONFIG_SMP */ static void wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) @@ -106,11 +105,9 @@ DEFINE_SCHED_CLASS(stop) = { .put_prev_task = put_prev_task_stop, .set_next_task = set_next_task_stop, -#ifdef CONFIG_SMP .balance = balance_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_stop, diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 72505cd3b60a..0fef6496c4c8 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,6 +2,7 @@ /* * <linux/swait.h> (simple wait queues ) implementation: */ +#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 547c1f05b667..77ae87f36e84 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -174,7 +174,7 @@ SYSCALL_DEFINE1(nice, int, increment) return 0; } -#endif +#endif /* __ARCH_WANT_SYS_NICE */ /** * task_prio - return the priority value of a given task. @@ -209,10 +209,8 @@ int idle_cpu(int cpu) if (rq->nr_running) return 0; -#ifdef CONFIG_SMP if (rq->ttwu_pending) return 0; -#endif return 1; } @@ -255,8 +253,7 @@ int sched_core_idle_cpu(int cpu) return idle_cpu(cpu); } - -#endif +#endif /* CONFIG_SCHED_CORE */ /** * find_process_by_pid - find a process with a matching PID value. @@ -448,7 +445,7 @@ static inline int uclamp_validate(struct task_struct *p, } static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ /* * Allow unprivileged RT tasks to decrease priority. @@ -642,7 +639,6 @@ change: goto unlock; } #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_SMP if (dl_bandwidth_enabled() && dl_policy(policy) && !(attr->sched_flags & SCHED_FLAG_SUGOV)) { cpumask_t *span = rq->rd->span; @@ -658,7 +654,6 @@ change: goto unlock; } } -#endif } /* Re-check policy now with rq lock held: */ @@ -1120,7 +1115,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); } -#ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { /* @@ -1149,7 +1143,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) return 0; } -#endif /* CONFIG_SMP */ int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) { @@ -1242,7 +1235,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); if (user_mask) { cpumask_copy(user_mask, in_mask); - } else if (IS_ENABLED(CONFIG_SMP)) { + } else { return -ENOMEM; } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b958fe48e020..977e133bb8a4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,7 +3,9 @@ * Scheduler topology setup/handling methods */ +#include <linux/sched/isolation.h> #include <linux/bsearch.h> +#include "sched.h" DEFINE_MUTEX(sched_domains_mutex); void sched_domains_mutex_lock(void) @@ -87,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!(sd->flags & SD_OVERLAP) && + if (!(sd->flags & SD_NUMA) && cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); @@ -100,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, group->sgc->id, cpumask_pr_args(sched_group_span(group))); - if ((sd->flags & SD_OVERLAP) && + if ((sd->flags & SD_NUMA) && !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(group_balance_mask(group))); @@ -313,7 +315,7 @@ static int __init sched_energy_aware_sysctl_init(void) } late_initcall(sched_energy_aware_sysctl_init); -#endif +#endif /* CONFIG_PROC_SYSCTL */ static void free_pd(struct perf_domain *pd) { @@ -449,9 +451,9 @@ free: return false; } -#else +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ static void free_pd(struct perf_domain *pd) { } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ static void free_rootdomain(struct rcu_head *rcu) { @@ -1318,8 +1320,6 @@ next: update_group_capacity(sd, cpu); } -#ifdef CONFIG_SMP - /* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) { @@ -1344,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" * which is shared by all the overlapping groups. */ - WARN_ON_ONCE(sd->flags & SD_OVERLAP); + WARN_ON_ONCE(sd->flags & SD_NUMA); sg = sd->groups; if (cpu != sg->asym_prefer_cpu) { @@ -1374,8 +1374,6 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) } } -#endif /* CONFIG_SMP */ - /* * Set of available CPUs grouped by their corresponding capacities * Each list entry contains a CPU mask reflecting CPUs that share the same @@ -1598,7 +1596,7 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; -#endif +#endif /* CONFIG_NUMA */ /* * SD_flags allowed in topology descriptions. @@ -1714,7 +1712,7 @@ sd_init(struct sched_domain_topology_level *tl, SD_WAKE_AFFINE); } -#endif +#endif /* CONFIG_NUMA */ } else { sd->cache_nice_tries = 1; } @@ -1739,17 +1737,17 @@ sd_init(struct sched_domain_topology_level *tl, */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, + SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), #endif - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + SDTL_INIT(cpu_cpu_mask, NULL, PKG), { NULL, }, }; @@ -2010,23 +2008,14 @@ void sched_init_numa(int offline_node) /* * Add the NUMA identity distance, aka single NODE. */ - tl[i++] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .numa_level = 0, - SD_INIT_NAME(NODE) - }; + tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE); /* * .. and append 'j' levels of NUMA goodness. */ for (j = 1; j < nr_levels; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; + tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); + tl[i].numa_level = j; } sched_domain_topology_saved = sched_domain_topology; @@ -2337,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sd) { sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) + if (sd && (sd->flags & SD_NUMA)) free_sched_groups(sd->groups, 0); kfree(*per_cpu_ptr(sdd->sd, j)); } @@ -2403,9 +2392,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map) id_seen = sched_domains_tmpmask2; for_each_sd_topology(tl) { + int tl_common_flags = 0; + + if (tl->sd_flags) + tl_common_flags = (*tl->sd_flags)(); /* NUMA levels are allowed to overlap */ - if (tl->flags & SDTL_OVERLAP) + if (tl_common_flags & SD_NUMA) continue; cpumask_clear(covered); @@ -2476,8 +2469,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP) - sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } @@ -2490,7 +2481,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { sd->span_weight = cpumask_weight(sched_domain_span(sd)); - if (sd->flags & SD_OVERLAP) { + if (sd->flags & SD_NUMA) { if (build_overlap_sched_groups(sd, i)) goto error; } else { diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 51e38f5f4701..a6f00012c72e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,6 +4,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ +#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index b410b61cec95..1088d3b7012c 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/sched/debug.h> +#include "sched.h" + /* * The implementation of the wait_bit*() and related waiting APIs: */ |