diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 632 |
1 files changed, 534 insertions, 98 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8f360326861e..84758f34cdb0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6,6 +6,10 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ +#define CREATE_TRACE_POINTS +#include <trace/events/sched.h> +#undef CREATE_TRACE_POINTS + #include "sched.h" #include <linux/nospec.h> @@ -23,9 +27,6 @@ #include "pelt.h" #include "smp.h" -#define CREATE_TRACE_POINTS -#include <trace/events/sched.h> - /* * Export tracepoints that act as a bare tracehook (ie: have no trace event * associated with them) to allow external modules to probe them. @@ -36,6 +37,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -75,6 +79,100 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; + +/* + * Serialization rules: + * + * Lock order: + * + * p->pi_lock + * rq->lock + * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) + * + * rq1->lock + * rq2->lock where: rq1 < rq2 + * + * Regular state: + * + * Normal scheduling state is serialized by rq->lock. __schedule() takes the + * local CPU's rq->lock, it optionally removes the task from the runqueue and + * always looks at the local rq data structures to find the most elegible task + * to run next. + * + * Task enqueue is also under rq->lock, possibly taken from another CPU. + * Wakeups from another LLC domain might use an IPI to transfer the enqueue to + * the local CPU to avoid bouncing the runqueue state around [ see + * ttwu_queue_wakelist() ] + * + * Task wakeup, specifically wakeups that involve migration, are horribly + * complicated to avoid having to take two rq->locks. + * + * Special state: + * + * System-calls and anything external will use task_rq_lock() which acquires + * both p->pi_lock and rq->lock. As a consequence the state they change is + * stable while holding either lock: + * + * - sched_setaffinity()/ + * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed + * - set_user_nice(): p->se.load, p->*prio + * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, + * p->se.load, p->rt_priority, + * p->dl.dl_{runtime, deadline, period, flags, bw, density} + * - sched_setnuma(): p->numa_preferred_nid + * - sched_move_task()/ + * cpu_cgroup_fork(): p->sched_task_group + * - uclamp_update_active() p->uclamp* + * + * p->state <- TASK_*: + * + * is changed locklessly using set_current_state(), __set_current_state() or + * set_special_state(), see their respective comments, or by + * try_to_wake_up(). This latter uses p->pi_lock to serialize against + * concurrent self. + * + * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: + * + * is set by activate_task() and cleared by deactivate_task(), under + * rq->lock. Non-zero indicates the task is runnable, the special + * ON_RQ_MIGRATING state is used for migration without holding both + * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). + * + * p->on_cpu <- { 0, 1 }: + * + * is set by prepare_task() and cleared by finish_task() such that it will be + * set before p is scheduled-in and cleared after p is scheduled-out, both + * under rq->lock. Non-zero indicates the task is running on its CPU. + * + * [ The astute reader will observe that it is possible for two tasks on one + * CPU to have ->on_cpu = 1 at the same time. ] + * + * task_cpu(p): is changed by set_task_cpu(), the rules are: + * + * - Don't call set_task_cpu() on a blocked task: + * + * We don't care what CPU we're not running on, this simplifies hotplug, + * the CPU assignment of blocked tasks isn't required to be valid. + * + * - for try_to_wake_up(), called under p->pi_lock: + * + * This allows try_to_wake_up() to only take one rq->lock, see its comment. + * + * - for migration called under rq->lock: + * [ see task_on_rq_migrating() in task_rq_lock() ] + * + * o move_queued_task() + * o detach_task() + * + * - for migration called under double_rq_lock(): + * + * o __migrate_swap_task() + * o push_rt_task() / pull_rt_task() + * o push_dl_task() / pull_dl_task() + * o dl_task_offline_migration() + * + */ + /* * __task_rq_lock - lock the rq @p resides on. */ @@ -791,9 +889,46 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +/* + * By default RT tasks run at the maximum performance point/capacity of the + * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to + * SCHED_CAPACITY_SCALE. + * + * This knob allows admins to change the default behavior when uclamp is being + * used. In battery powered devices, particularly, running at the maximum + * capacity and frequency will increase energy consumption and shorten the + * battery life. + * + * This knob only affects RT tasks that their uclamp_se->user_defined == false. + * + * This knob will not override the system default sched_util_clamp_min defined + * above. + */ +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; + /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; +/* + * This static key is used to reduce the uclamp overhead in the fast path. It + * primarily disables the call to uclamp_rq_{inc, dec}() in + * enqueue/dequeue_task(). + * + * This allows users to continue to enable uclamp in their kernel config with + * minimum uclamp overhead in the fast path. + * + * As soon as userspace modifies any of the uclamp knobs, the static key is + * enabled, since we have an actual users that make use of uclamp + * functionality. + * + * The knobs that would enable this static key are: + * + * * A task modifying its uclamp value with sched_setattr(). + * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs. + * * An admin modifying the cgroup cpu.uclamp.{min, max} + */ +DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); + /* Integer rounded range for each bucket */ #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) @@ -873,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static void __uclamp_update_util_min_rt_default(struct task_struct *p) +{ + unsigned int default_util_min; + struct uclamp_se *uc_se; + + lockdep_assert_held(&p->pi_lock); + + uc_se = &p->uclamp_req[UCLAMP_MIN]; + + /* Only sync if user didn't override the default */ + if (uc_se->user_defined) + return; + + default_util_min = sysctl_sched_uclamp_util_min_rt_default; + uclamp_se_set(uc_se, default_util_min, false); +} + +static void uclamp_update_util_min_rt_default(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + + if (!rt_task(p)) + return; + + /* Protect updates to p->uclamp_* */ + rq = task_rq_lock(p, &rf); + __uclamp_update_util_min_rt_default(p); + task_rq_unlock(rq, p, &rf); +} + +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + rcu_read_lock(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); + rcu_read_unlock(); +} + static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { @@ -990,10 +1183,38 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, lockdep_assert_held(&rq->lock); + /* + * If sched_uclamp_used was enabled after task @p was enqueued, + * we could end up with unbalanced call to uclamp_rq_dec_id(). + * + * In this case the uc_se->active flag should be false since no uclamp + * accounting was performed at enqueue time and we can just return + * here. + * + * Need to be careful of the following enqeueue/dequeue ordering + * problem too + * + * enqueue(taskA) + * // sched_uclamp_used gets enabled + * enqueue(taskB) + * dequeue(taskA) + * // Must not decrement bukcet->tasks here + * dequeue(taskB) + * + * where we could end up with stale data in uc_se and + * bucket[uc_se->bucket_id]. + * + * The following check here eliminates the possibility of such race. + */ + if (unlikely(!uc_se->active)) + return; + bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* @@ -1021,6 +1242,15 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1036,6 +1266,15 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1114,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; - int old_min, old_max; + int old_min, old_max, old_min_rt; int result; mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; + old_min_rt = sysctl_sched_uclamp_util_min_rt_default; result = proc_dointvec(table, write, buffer, lenp, ppos); if (result) @@ -1128,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, goto done; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || - sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || + sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) { + result = -EINVAL; goto undo; } @@ -1144,8 +1386,15 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, update_root_tg = true; } - if (update_root_tg) + if (update_root_tg) { + static_branch_enable(&sched_uclamp_used); uclamp_update_root_tg(); + } + + if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { + static_branch_enable(&sched_uclamp_used); + uclamp_sync_util_min_rt_default(); + } /* * We update all RUNNABLE tasks only when task groups are in use. @@ -1158,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; + sysctl_sched_uclamp_util_min_rt_default = old_min_rt; done: mutex_unlock(&uclamp_mutex); @@ -1180,6 +1430,15 @@ static int uclamp_validate(struct task_struct *p, if (upper_bound > SCHED_CAPACITY_SCALE) return -EINVAL; + /* + * We have valid uclamp attributes; make sure uclamp is enabled. + * + * We need to do that here, because enabling static branches is a + * blocking operation which obviously cannot be done while holding + * scheduler locks. + */ + static_branch_enable(&sched_uclamp_used); + return 0; } @@ -1194,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p, */ for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; - unsigned int clamp_value = uclamp_none(clamp_id); /* Keep using defined clamps across class changes */ if (uc_se->user_defined) continue; - /* By default, RT tasks always get 100% boost */ + /* + * RT by default have a 100% boost value that could be modified + * at runtime. + */ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - clamp_value = uclamp_none(UCLAMP_MAX); + __uclamp_update_util_min_rt_default(p); + else + uclamp_se_set(uc_se, uclamp_none(clamp_id), false); - uclamp_se_set(uc_se, clamp_value, false); } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) @@ -1225,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; + /* + * We don't need to hold task_rq_lock() when updating p->uclamp_* here + * as the task is still at its early fork stages. + */ for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1237,19 +1503,33 @@ static void uclamp_fork(struct task_struct *p) } } +static void uclamp_post_fork(struct task_struct *p) +{ + uclamp_update_util_min_rt_default(p); +} + +static void __init init_uclamp_rq(struct rq *rq) +{ + enum uclamp_id clamp_id; + struct uclamp_rq *uc_rq = rq->uclamp; + + for_each_clamp_id(clamp_id) { + uc_rq[clamp_id] = (struct uclamp_rq) { + .value = uclamp_none(clamp_id) + }; + } + + rq->uclamp_flags = 0; +} + static void __init init_uclamp(void) { struct uclamp_se uc_max = {}; enum uclamp_id clamp_id; int cpu; - mutex_init(&uclamp_mutex); - - for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, - sizeof(struct uclamp_rq)*UCLAMP_CNT); - cpu_rq(cpu)->uclamp_flags = 0; - } + for_each_possible_cpu(cpu) + init_uclamp_rq(cpu_rq(cpu)); for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], @@ -1278,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } +static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -1311,9 +1592,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags) { - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - enqueue_task(rq, p, flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1323,9 +1601,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - dequeue_task(rq, p, flags); } @@ -1410,20 +1685,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - const struct sched_class *class; - - if (p->sched_class == rq->curr->sched_class) { + if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, p, flags); - } else { - for_each_class(class) { - if (class == rq->curr->sched_class) - break; - if (class == p->sched_class) { - resched_curr(rq); - break; - } - } - } + else if (p->sched_class > rq->curr->sched_class) + resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In @@ -1474,8 +1739,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, { lockdep_assert_held(&rq->lock); - WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); - dequeue_task(rq, p, DEQUEUE_NOCLOCK); + deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); @@ -1483,8 +1747,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq_lock(rq, rf); BUG_ON(task_cpu(p) != new_cpu); - enqueue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); return rq; @@ -1637,7 +1900,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(p->cpus_ptr, new_mask)) + if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; /* @@ -2236,10 +2499,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, lockdep_assert_held(&rq->lock); -#ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; #endif @@ -2249,12 +2512,31 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, } /* - * Called in case the task @p isn't fully descheduled from its runqueue, - * in this case we must do a remote wakeup. Its a 'light' wakeup though, - * since all we need to do is flip p->state to TASK_RUNNING, since - * the task is still ->on_rq. + * Consider @p being inside a wait loop: + * + * for (;;) { + * set_current_state(TASK_UNINTERRUPTIBLE); + * + * if (CONDITION) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * between set_current_state() and schedule(). In this case @p is still + * runnable, so all that needs doing is change p->state back to TASK_RUNNING in + * an atomic manner. + * + * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq + * then schedule() must still happen and p->state can be changed to + * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we + * need to do a full wakeup with enqueue. + * + * Returns: %true when the wakeup is done, + * %false otherwise. */ -static int ttwu_remote(struct task_struct *p, int wake_flags) +static int ttwu_runnable(struct task_struct *p, int wake_flags) { struct rq_flags rf; struct rq *rq; @@ -2293,8 +2575,15 @@ void sched_ttwu_pending(void *arg) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - llist_for_each_entry_safe(p, t, llist, wake_entry) + llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { + if (WARN_ON_ONCE(p->on_cpu)) + smp_cond_load_acquire(&p->on_cpu, !VAL); + + if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) + set_task_cpu(p, cpu_of(rq)); + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); + } rq_unlock_irqrestore(rq, &rf); } @@ -2322,7 +2611,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); - __smp_call_single_queue(cpu, &p->wake_entry); + __smp_call_single_queue(cpu, &p->wake_entry.llist); } void wake_up_if_idle(int cpu) @@ -2369,7 +2658,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) * the soon-to-be-idle CPU as the current CPU is likely busy. * nr_running is checked to avoid unnecessary task stacking. */ - if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1) + if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) return true; return false; @@ -2378,6 +2667,9 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { + if (WARN_ON_ONCE(cpu == smp_processor_id())) + return false; + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; @@ -2385,6 +2677,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } + +#else /* !CONFIG_SMP */ + +static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + return false; +} + #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2392,10 +2692,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; -#if defined(CONFIG_SMP) if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; -#endif rq_lock(rq, &rf); update_rq_clock(rq); @@ -2451,8 +2749,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * migration. However the means are completely different as there is no lock * chain to provide order. Instead we do: * - * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_load_acquire(!X->on_cpu) + * 1) smp_store_release(X->on_cpu, 0) -- finish_task() + * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * * Example: * @@ -2492,15 +2790,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * If (@state & @p->state) @p->state = TASK_RUNNING. + * Conceptually does: + * + * If (@state & @p->state) @p->state = TASK_RUNNING. * * If the task was not queued/runnable, also place it back on a runqueue. * - * Atomic against schedule() which would dequeue a task, also see - * set_current_state(). + * This function is atomic against schedule() which would dequeue the task. + * + * It issues a full memory barrier before accessing @p->state, see the comment + * with set_current_state(). + * + * Uses p->pi_lock to serialize against concurrent wake-ups. + * + * Relies on p->pi_lock stabilizing: + * - p->sched_class + * - p->cpus_ptr + * - p->sched_task_group + * in order to do migration, see its use of select_task_rq()/set_task_cpu(). + * + * Tries really hard to only take one task_rq(p)->lock for performance. + * Takes rq->lock in: + * - ttwu_runnable() -- old rq, unavoidable, see comment there; + * - ttwu_queue() -- new rq, for enqueue of the task; + * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. * - * This function executes a full memory barrier before accessing the task - * state; see set_current_state(). + * As a consequence we race really badly with just about everything. See the + * many memory barriers and their comments for details. * * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. @@ -2516,7 +2832,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) * == smp_processor_id()'. Together this means we can special - * case the whole 'p->on_rq && ttwu_remote()' case below + * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * * In particular: @@ -2528,7 +2844,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) goto out; success = 1; - cpu = task_cpu(p); trace_sched_waking(p); p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -2538,8 +2853,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. + * reordered with p->state check below. This pairs with smp_store_mb() + * in set_current_state() that the waiting thread does. */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); @@ -2550,7 +2865,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* We're going to change ->state: */ success = 1; - cpu = task_cpu(p); /* * Ensure we load p->on_rq _after_ p->state, otherwise it would @@ -2575,7 +2889,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (p->on_rq && ttwu_remote(p, wake_flags)) + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) goto unlock; if (p->in_iowait) { @@ -2584,9 +2898,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #ifdef CONFIG_SMP - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -2605,8 +2916,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). */ - smp_rmb(); + smp_acquire__after_ctrl_dep(); + + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + p->state = TASK_WAKING; /* * If the owning (remote) CPU is still in the middle of schedule() with @@ -2614,8 +2937,21 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * which potentially sends an IPI instead of spinning on p->on_cpu to * let the waker make forward progress. This is safe because IRQs are * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. */ - if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ)) + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) goto unlock; /* @@ -2635,6 +2971,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } +#else + cpu = task_cpu(p); #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2642,7 +2980,7 @@ unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); out: if (success) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, task_cpu(p), wake_flags); preempt_enable(); return success; @@ -2763,7 +3101,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP - p->wake_entry_type = CSD_TYPE_TTWU; + p->wake_entry.u_flags = CSD_TYPE_TTWU; #endif } @@ -2939,6 +3277,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); + rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -2963,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } +void sched_post_fork(struct task_struct *p) +{ + uclamp_post_fork(p); +} + unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) @@ -3003,6 +3347,7 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); + rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); @@ -3119,8 +3464,10 @@ static inline void prepare_task(struct task_struct *next) /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. + * + * See the ttwu() WF_ON_CPU case and its ordering comment. */ - next->on_cpu = 1; + WRITE_ONCE(next->on_cpu, 1); #endif } @@ -3128,8 +3475,9 @@ static inline void finish_task(struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely + * This must be the very last reference to @prev from this CPU. After + * p->on_cpu is cleared, the task can be moved to a different CPU. We + * must ensure this doesn't happen until the switch is completely * finished. * * In particular, the load of prev->state in finish_task_switch() must @@ -3628,17 +3976,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -DEFINE_PER_CPU(unsigned long, thermal_pressure); - -void arch_set_thermal_pressure(struct cpumask *cpus, - unsigned long th_pressure) -{ - int cpu; - - for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -} - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -4001,8 +4338,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * higher scheduling class, because otherwise those loose the * opportunity to pull in more work from other CPUs. */ - if (likely((prev->sched_class == &idle_sched_class || - prev->sched_class == &fair_sched_class) && + if (likely(prev->sched_class <= &fair_sched_class && rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); @@ -4074,6 +4410,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; @@ -4093,9 +4430,16 @@ static void __sched notrace __schedule(bool preempt) /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). + * done by the caller to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) * - * The membarrier system call requires a full memory barrier + * Also, the membarrier system call requires a full memory barrier * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); @@ -4106,10 +4450,38 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (signal_pending_state(prev->state, prev)) { + + /* + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; + if (!preempt && prev_state) { + if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); if (prev->in_iowait) { @@ -4421,6 +4793,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -4533,7 +4906,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) */ if (dl_prio(prio)) { if (!dl_prio(p->normal_prio) || - (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; queue_flag |= ENQUEUE_REPLENISH; } else @@ -5122,6 +5496,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, * @policy: new policy. * @param: structure containing the new RT priority. * + * Use sched_set_fifo(), read its comment. + * * Return: 0 on success. An error code otherwise. * * NOTE that the task may be already dead. @@ -5131,13 +5507,11 @@ int sched_setscheduler(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, true); } -EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setattr(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, true, true); } -EXPORT_SYMBOL_GPL(sched_setattr); int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { @@ -5162,7 +5536,51 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); + +/* + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally + * incapable of resource management, which is the one thing an OS really should + * be doing. + * + * This is of course the reason it is limited to privileged users only. + * + * Worse still; it is fundamentally impossible to compose static priority + * workloads. You cannot take two correctly working static prio workloads + * and smash them together and still expect them to work. + * + * For this reason 'all' FIFO tasks the kernel creates are basically at: + * + * MAX_RT_PRIO / 2 + * + * The administrator _MUST_ configure the system, the kernel simply doesn't + * know enough information to make a sensible choice. + */ +void sched_set_fifo(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo); + +/* + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. + */ +void sched_set_fifo_low(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo_low); + +void sched_set_normal(struct task_struct *p, int nice) +{ + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + .sched_nice = nice, + }; + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_normal); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) @@ -5453,6 +5871,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_nice = task_nice(p); #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif @@ -5810,7 +6233,7 @@ again: if (task_running(p_rq, p) || p->state) goto out_unlock; - yielded = curr->sched_class->yield_to_task(rq, p, preempt); + yielded = curr->sched_class->yield_to_task(rq, p); if (yielded) { schedstat_inc(rq->yld_count); /* @@ -6644,6 +7067,14 @@ void __init sched_init(void) unsigned long ptr = 0; int i; + /* Make sure the linker didn't screw up */ + BUG_ON(&idle_sched_class + 1 != &fair_sched_class || + &fair_sched_class + 1 != &rt_sched_class || + &rt_sched_class + 1 != &dl_sched_class); +#ifdef CONFIG_SMP + BUG_ON(&dl_sched_class + 1 != &stop_sched_class); +#endif + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7365,6 +7796,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; + static_branch_enable(&sched_uclamp_used); + mutex_lock(&uclamp_mutex); rcu_read_lock(); @@ -8052,4 +8485,7 @@ const u32 sched_prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#undef CREATE_TRACE_POINTS +void call_trace_sched_update_nr_running(struct rq *rq, int count) +{ + trace_sched_update_nr_running_tp(rq, count); +} |