diff options
Diffstat (limited to 'kernel/sched/rt.c')
| -rw-r--r-- | kernel/sched/rt.c | 2101 |
1 files changed, 1516 insertions, 585 deletions
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 01970c8e64df..f1867fe8e5c5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1,35 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ #include "sched.h" - -#include <linux/slab.h> +#include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; +/* More than 4 hours if BW_SHIFT equals 20. */ +static const u64 max_rt_runtime = MAX_BW; -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); +/* + * period over which we measure -rt task CPU usage in us. + * default: 1s + */ +int sysctl_sched_rt_period = 1000000; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + +#ifdef CONFIG_SYSCTL +static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ; +static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +static const struct ctl_table sched_rt_sysctls[] = { + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_NEG_ONE, + .extra2 = (void *)&sysctl_sched_rt_period, + }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sysctl_sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, +}; + +static int __init sched_rt_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_rt_sysctls); + return 0; +} +late_initcall(sched_rt_sysctl_init); +#endif /* CONFIG_SYSCTL */ + +void init_rt_rq(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + rt_rq->highest_prio.next = MAX_RT_PRIO-1; + rt_rq->overloaded = 0; + plist_head_init(&rt_rq->pushable_tasks); + /* We start is dequeued state, because no RT tasks are queued */ + rt_rq->rt_queued = 0; + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + raw_spin_lock_init(&rt_rq->rt_runtime_lock); + rt_rq->tg = &root_task_group; +#endif +} -struct rt_bandwidth def_rt_bandwidth; +#ifdef CONFIG_RT_GROUP_SCHED + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; int idle = 0; + int overrun; + raw_spin_lock(&rt_b->rt_runtime_lock); for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - + overrun = hrtimer_forward_now(timer, rt_b->rt_period); if (!overrun) break; + raw_spin_unlock(&rt_b->rt_runtime_lock); idle = do_sched_rt_period_timer(rt_b, overrun); + raw_spin_lock(&rt_b->rt_runtime_lock); } + if (idle) + rt_b->rt_period_active = 0; + raw_spin_unlock(&rt_b->rt_runtime_lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -41,52 +129,38 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) raw_spin_lock_init(&rt_b->rt_runtime_lock); - hrtimer_init(&rt_b->rt_period_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rt_b->rt_period_timer.function = sched_rt_period_timer; + hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) { - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); - start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); + if (!rt_b->rt_period_active) { + rt_b->rt_period_active = 1; + /* + * SCHED_DEADLINE updates the bandwidth, as a run away + * RT task with a DL task could hog a CPU. But DL does + * not reset the period. If a deadline task was running + * without an RT task running, it can cause RT tasks to + * throttle when they start up. Kick the timer right away + * to update the period. + */ + hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); + hrtimer_start_expires(&rt_b->rt_period_timer, + HRTIMER_MODE_ABS_PINNED_HARD); + } raw_spin_unlock(&rt_b->rt_runtime_lock); } -void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->highest_prio.next = MAX_RT_PRIO; - rt_rq->rt_nr_migratory = 0; - rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks); -#endif + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - raw_spin_lock_init(&rt_rq->rt_runtime_lock); + do_start_rt_bandwidth(rt_b); } -#ifdef CONFIG_RT_GROUP_SCHED static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) { hrtimer_cancel(&rt_b->rt_period_timer); @@ -96,28 +170,47 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_SCHED_DEBUG WARN_ON_ONCE(!rt_entity_is_task(rt_se)); -#endif + return container_of(rt_se, struct task_struct, rt); } static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { + /* Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout */ + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); return rt_rq->rq; } static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) { + WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group); return rt_se->rt_rq; } -void free_rt_sched_group(struct task_group *tg) +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { - int i; + struct rt_rq *rt_rq = rt_se->rt_rq; + + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); + return rt_rq->rq; +} + +void unregister_rt_sched_group(struct task_group *tg) +{ + if (!rt_group_sched_enabled()) + return; if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); +} + +void free_rt_sched_group(struct task_group *tg) +{ + int i; + + if (!rt_group_sched_enabled()) + return; for_each_possible_cpu(i) { if (tg->rt_rq) @@ -136,7 +229,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->rt_nr_boosted = 0; rt_rq->rq = rq; rt_rq->tg = tg; @@ -163,15 +256,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) struct sched_rt_entity *rt_se; int i; - tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); + if (!rt_group_sched_enabled()) + return 1; + + tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL); if (!tg->rt_rq) goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); + tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL); if (!tg->rt_se) goto err; - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); + init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0); for_each_possible_cpu(i) { rt_rq = kzalloc_node(sizeof(struct rt_rq), @@ -184,7 +279,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_rt_rq(rt_rq, cpu_rq(i)); + init_rt_rq(rt_rq); rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } @@ -197,7 +292,7 @@ err: return 0; } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #define rt_entity_is_task(rt_se) (1) @@ -211,23 +306,35 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) return container_of(rt_rq, struct rq, rt); } -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { struct task_struct *p = rt_task_of(rt_se); - struct rq *rq = task_rq(p); + + return task_rq(p); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + struct rq *rq = rq_of_rt_se(rt_se); return &rq->rt; } +void unregister_rt_sched_group(struct task_group *tg) { } + void free_rt_sched_group(struct task_group *tg) { } int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_SMP +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + return rq->online && rq->rt.highest_prio.curr > prev->prio; +} static inline int rt_overloaded(struct rq *rq) { @@ -246,8 +353,10 @@ static inline void rt_set_overload(struct rq *rq) * if we should look at the mask. It would be a shame * if we looked at the mask, but the mask was not * updated yet. + * + * Matched by the barrier in pull_rt_task(). */ - wmb(); + smp_wmb(); atomic_inc(&rq->rd->rto_count); } @@ -261,56 +370,28 @@ static inline void rt_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } -static void update_rt_migration(struct rt_rq *rt_rq) +static inline int has_pushable_tasks(struct rq *rq) { - if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { - if (!rt_rq->overloaded) { - rt_set_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 1; - } - } else if (rt_rq->overloaded) { - rt_clear_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 0; - } + return !plist_head_empty(&rq->rt.pushable_tasks); } -static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - return; - - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory++; +static DEFINE_PER_CPU(struct balance_callback, rt_push_head); +static DEFINE_PER_CPU(struct balance_callback, rt_pull_head); - update_rt_migration(rt_rq); -} +static void push_rt_tasks(struct rq *); +static void pull_rt_task(struct rq *); -static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +static inline void rt_queue_push_tasks(struct rq *rq) { - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) + if (!has_pushable_tasks(rq)) return; - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory--; - - update_rt_migration(rt_rq); + queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); } -static inline int has_pushable_tasks(struct rq *rq) +static inline void rt_queue_pull_task(struct rq *rq) { - return !plist_head_empty(&rq->rt.pushable_tasks); + queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); } static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) @@ -322,6 +403,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) /* Update the highest prio pushable task */ if (p->prio < rq->rt.highest_prio.next) rq->rt.highest_prio.next = p->prio; + + if (!rq->rt.overloaded) { + rt_set_overload(rq); + rq->rt.overloaded = 1; + } } static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) @@ -333,44 +419,67 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) p = plist_first_entry(&rq->rt.pushable_tasks, struct task_struct, pushable_tasks); rq->rt.highest_prio.next = p->prio; - } else - rq->rt.highest_prio.next = MAX_RT_PRIO; + } else { + rq->rt.highest_prio.next = MAX_RT_PRIO-1; + + if (rq->rt.overloaded) { + rt_clear_overload(rq); + rq->rt.overloaded = 0; + } + } } -#else +static void enqueue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); -static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +static inline int on_rt_rq(struct sched_rt_entity *rt_se) { + return rt_se->on_rq; } -static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +#ifdef CONFIG_UCLAMP_TASK +/* + * Verify the fitness of task @p to run on @cpu taking into account the uclamp + * settings. + * + * This check is only important for heterogeneous systems where uclamp_min value + * is higher than the capacity of a @cpu. For non-heterogeneous system this + * function will always return true. + * + * The function will return true if the capacity of the @cpu is >= the + * uclamp_min and false otherwise. + * + * Note that uclamp_min will be clamped to uclamp_max if uclamp_min + * > uclamp_max. + */ +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) { -} + unsigned int min_cap; + unsigned int max_cap; + unsigned int cpu_cap; -static inline -void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} + /* Only heterogeneous systems can benefit from this check */ + if (!sched_asym_cpucap_active()) + return true; -static inline -void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} + min_cap = uclamp_eff_value(p, UCLAMP_MIN); + max_cap = uclamp_eff_value(p, UCLAMP_MAX); -#endif /* CONFIG_SMP */ + cpu_cap = arch_scale_cpu_capacity(cpu); -static inline int on_rt_rq(struct sched_rt_entity *rt_se) + return cpu_cap >= min(min_cap, max_cap); +} +#else /* !CONFIG_UCLAMP_TASK: */ +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) { - return !list_empty(&rt_se->run_list); + return true; } +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_RT_GROUP_SCHED static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - if (!rt_rq->tg) - return RUNTIME_INF; - return rt_rq->rt_runtime; } @@ -383,6 +492,11 @@ typedef struct task_group *rt_rq_iter_t; static inline struct task_group *next_task_group(struct task_group *tg) { + if (!rt_group_sched_enabled()) { + WARN_ON(tg != &root_task_group); + return NULL; + } + do { tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list); @@ -395,9 +509,9 @@ static inline struct task_group *next_task_group(struct task_group *tg) } #define for_each_rt_rq(rt_rq, iter, rq) \ - for (iter = container_of(&task_groups, typeof(*iter), list); \ - (iter = next_task_group(iter)) && \ - (rt_rq = iter->rt_rq[cpu_of(rq)]);) + for (iter = &root_task_group; \ + iter && (rt_rq = iter->rt_rq[cpu_of(rq)]); \ + iter = next_task_group(iter)) #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = rt_se->parent) @@ -407,23 +521,27 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor; + struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; - int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + int cpu = cpu_of(rq); rt_se = rt_rq->tg->rt_se[cpu]; if (rt_rq->rt_nr_running) { - if (rt_se && !on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); - if (rt_rq->highest_prio.curr < curr->prio) - resched_task(curr); + if (!rt_se) + enqueue_top_rt_rq(rt_rq); + else if (!on_rt_rq(rt_se)) + enqueue_rt_entity(rt_se, 0); + + if (rt_rq->highest_prio.curr < donor->prio) + resched_curr(rq); } } @@ -434,8 +552,13 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; - if (rt_se && on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + if (!rt_se) { + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); + } + else if (on_rt_rq(rt_se)) + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -455,17 +578,10 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } -#ifdef CONFIG_SMP static inline const struct cpumask *sched_rt_period_mask(void) { return this_rq()->rd->span; } -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) @@ -478,73 +594,22 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) return &rt_rq->tg->rt_bandwidth; } -#else /* !CONFIG_RT_GROUP_SCHED */ - -static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) -{ - return rt_rq->rt_runtime; -} - -static inline u64 sched_rt_period(struct rt_rq *rt_rq) -{ - return ktime_to_ns(def_rt_bandwidth.rt_period); -} - -typedef struct rt_rq *rt_rq_iter_t; - -#define for_each_rt_rq(rt_rq, iter, rq) \ - for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - -#define for_each_sched_rt_entity(rt_se) \ - for (; rt_se; rt_se = NULL) - -static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) -{ - return NULL; -} - -static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) -{ - if (rt_rq->rt_nr_running) - resched_task(rq_of_rt_rq(rt_rq)->curr); -} - -static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -{ -} - -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} - -static inline -struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) { - return &cpu_rq(cpu)->rt; -} + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); -static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) -{ - return &def_rt_bandwidth; + return (hrtimer_active(&rt_b->rt_period_timer) || + rt_rq->rt_time < rt_b->rt_runtime); } -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. */ -static int do_balance_runtime(struct rt_rq *rt_rq) +static void do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; - int i, weight, more = 0; + int i, weight; u64 rt_period; weight = cpumask_weight(rd->span); @@ -562,7 +627,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) /* * Either all rqs have inf runtime and there's nothing to steal * or __disable_runtime() below sets a specific rq to inf to - * indicate its been disabled and disalow stealing. + * indicate its been disabled and disallow stealing. */ if (iter->rt_runtime == RUNTIME_INF) goto next; @@ -578,7 +643,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq) diff = rt_period - rt_rq->rt_runtime; iter->rt_runtime -= diff; rt_rq->rt_runtime += diff; - more = 1; if (rt_rq->rt_runtime == rt_period) { raw_spin_unlock(&iter->rt_runtime_lock); break; @@ -588,8 +652,6 @@ next: raw_spin_unlock(&iter->rt_runtime_lock); } raw_spin_unlock(&rt_b->rt_runtime_lock); - - return more; } /* @@ -661,7 +723,7 @@ static void __disable_runtime(struct rq *rq) * We cannot be left wanting - that would mean some runtime * leaked out of the system. */ - BUG_ON(want); + WARN_ON_ONCE(want); balanced: /* * Disable all the borrow logic by pretending we have inf @@ -671,6 +733,9 @@ balanced: rt_rq->rt_throttled = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock); + + /* Make rt_rq available for pick_next_task() */ + sched_rt_rq_enqueue(rt_rq); } } @@ -698,27 +763,17 @@ static void __enable_runtime(struct rq *rq) } } -static int balance_runtime(struct rt_rq *rt_rq) +static void balance_runtime(struct rt_rq *rt_rq) { - int more = 0; - if (!sched_feat(RT_RUNTIME_SHARE)) - return more; + return; if (rt_rq->rt_time > rt_rq->rt_runtime) { raw_spin_unlock(&rt_rq->rt_runtime_lock); - more = do_balance_runtime(rt_rq); + do_balance_runtime(rt_rq); raw_spin_lock(&rt_rq->rt_runtime_lock); } - - return more; -} -#else /* !CONFIG_SMP */ -static inline int balance_runtime(struct rt_rq *rt_rq) -{ - return 0; } -#endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -726,7 +781,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) const struct cpumask *span; span = sched_rt_period_mask(); -#ifdef CONFIG_RT_GROUP_SCHED + /* * FIXME: isolated CPUs should really leave the root task group, * whether they are isolcpus or were isolated via cpusets, lest @@ -738,13 +793,29 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) */ if (rt_b == &root_task_group.rt_bandwidth) span = cpu_online_mask; -#endif + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + struct rq_flags rf; + int skip; + + /* + * When span == cpu_online_mask, taking each rq->lock + * can be time-consuming. Try to avoid it when possible. + */ + raw_spin_lock(&rt_rq->rt_runtime_lock); + if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF) + rt_rq->rt_runtime = rt_b->rt_runtime; + skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (skip) + continue; + + rq_lock(rq, &rf); + update_rq_clock(rq); - raw_spin_lock(&rq->lock); if (rt_rq->rt_time) { u64 runtime; @@ -758,11 +829,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) enqueue = 1; /* - * Force a clock update if the CPU was idle, - * lest wakeup -> unthrottle time accumulate. + * When we're idle and a woken (rt) task is + * throttled wakeup_preempt() will set + * skip_update and the time between the wakeup + * and this unthrottle will get accounted as + * 'runtime'. */ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq->skip_clock_update = -1; + rq_clock_cancel_skipupdate(rq); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; @@ -777,7 +851,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (enqueue) sched_rt_rq_enqueue(rt_rq); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) @@ -786,18 +860,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) return idle; } -static inline int rt_se_prio(struct sched_rt_entity *rt_se) -{ -#ifdef CONFIG_RT_GROUP_SCHED - struct rt_rq *rt_rq = group_rt_rq(rt_se); - - if (rt_rq) - return rt_rq->highest_prio.curr; -#endif - - return rt_task_of(rt_se)->prio; -} - static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -821,14 +883,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { - static bool once = false; - rt_rq->rt_throttled = 1; - - if (!once) { - once = true; - printk_sched("sched: RT throttling activated\n"); - } + printk_deferred_once("sched: RT throttling activated\n"); } else { /* * In case we did anyway, make it go away, @@ -847,58 +903,160 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } +#else /* !CONFIG_RT_GROUP_SCHED: */ + +typedef struct rt_rq *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +#define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = NULL) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ + return NULL; +} + +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (!rt_rq->rt_nr_running) + return; + + enqueue_top_rt_rq(rt_rq); + resched_curr(rq); +} + +static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +{ + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return false; +} + +static inline const struct cpumask *sched_rt_period_mask(void) +{ + return cpu_online_mask; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + +static void __enable_runtime(struct rq *rq) { } +static void __disable_runtime(struct rq *rq) { } + +#endif /* !CONFIG_RT_GROUP_SCHED */ + +static inline int rt_se_prio(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_RT_GROUP_SCHED + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq) + return rt_rq->highest_prio.curr; +#endif + + return rt_task_of(rt_se)->prio; +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ static void update_curr_rt(struct rq *rq) { - struct task_struct *curr = rq->curr; - struct sched_rt_entity *rt_se = &curr->rt; - struct rt_rq *rt_rq = rt_rq_of_se(rt_se); - u64 delta_exec; + struct task_struct *donor = rq->donor; + s64 delta_exec; - if (curr->sched_class != &rt_sched_class) + if (donor->sched_class != &rt_sched_class) return; - delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + delta_exec = update_curr_common(rq); + if (unlikely(delta_exec <= 0)) return; - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); - - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); - - sched_rt_avg_update(rq, delta_exec); +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity *rt_se = &donor->rt; if (!rt_bandwidth_enabled()) return; for_each_sched_rt_entity(rt_se) { - rt_rq = rt_rq_of_se(rt_se); + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + int exceeded; if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + exceeded = sched_rt_runtime_exceeded(rt_rq); + if (exceeded) + resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (exceeded) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } +#endif /* CONFIG_RT_GROUP_SCHED */ +} + +static void +dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (!rt_rq->rt_queued) + return; + + BUG_ON(!rq->nr_running); + + sub_nr_running(rq, count); + rt_rq->rt_queued = 0; + } -#if defined CONFIG_SMP +static void +enqueue_top_rt_rq(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + BUG_ON(&rq->rt != rt_rq); + + if (rt_rq->rt_queued) + return; + + if (rt_rq_throttled(rt_rq)) + return; + + if (rt_rq->rt_nr_running) { + add_nr_running(rq, rt_rq->rt_nr_running); + rt_rq->rt_queued = 1; + } + + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq, 0); +} static void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); + /* + * Change rq's cpupri only if rt_rq is the top queue. + */ + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) + return; + if (rq->online && prio < prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); } @@ -908,20 +1066,16 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); + /* + * Change rq's cpupri only if rt_rq is the top queue. + */ + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) + return; + if (rq->online && rt_rq->highest_prio.curr != prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } -#else /* CONFIG_SMP */ - -static inline -void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} -static inline -void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} - -#endif /* CONFIG_SMP */ - -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED static void inc_rt_prio(struct rt_rq *rt_rq, int prio) { @@ -944,7 +1098,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) /* * This may have been our highest task, and therefore - * we may have some recomputation to do + * we may have some re-computation to do */ if (prio == prev_prio) { struct rt_prio_array *array = &rt_rq->active; @@ -953,19 +1107,13 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) sched_find_first_bit(array->bitmap); } - } else - rt_rq->highest_prio.curr = MAX_RT_PRIO; + } else { + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + } dec_rt_prio_smp(rt_rq, prio, prev_prio); } -#else - -static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} -static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} - -#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED static void @@ -974,8 +1122,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; - if (rt_rq->tg) - start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); } static void @@ -987,18 +1134,42 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { - start_rt_bandwidth(&def_rt_bandwidth); } static inline void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ + +static inline +unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + + if (group_rq) + return group_rq->rt_nr_running; + else + return 1; +} + +static inline +unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + struct task_struct *tsk; + + if (group_rq) + return group_rq->rr_nr_running; + + tsk = rt_task_of(rt_se); + + return (tsk->policy == SCHED_RR) ? 1 : 0; +} static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) @@ -1006,10 +1177,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) int prio = rt_se_prio(rt_se); WARN_ON(!rt_prio(prio)); - rt_rq->rt_nr_running++; + rt_rq->rt_nr_running += rt_se_nr_running(rt_se); + rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); inc_rt_prio(rt_rq, prio); - inc_rt_migration(rt_se, rt_rq); inc_rt_group(rt_se, rt_rq); } @@ -1018,14 +1189,141 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); - rt_rq->rt_nr_running--; + rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); + rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); - dec_rt_migration(rt_se, rt_rq); dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static inline struct sched_statistics * +__schedstats_from_rt_se(struct sched_rt_entity *rt_se) +{ + /* schedstats is not supported for rt group. */ + if (!rt_entity_is_task(rt_se)) + return NULL; + + return &rt_task_of(rt_se)->stats; +} + +static inline void +update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int flags) +{ + if (!schedstat_enabled()) + return; + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper_rt(rt_rq, rt_se); +} + +static inline void +update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int flags) +{ + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + if ((flags & DEQUEUE_SLEEP) && p) { + unsigned int state; + + state = READ_ONCE(p->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(p->stats.sleep_start, + rq_clock(rq_of_rt_rq(rt_rq))); + + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(p->stats.block_start, + rq_clock(rq_of_rt_rq(rt_rq))); + } +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1038,26 +1336,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. */ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1066,38 +1375,53 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; + unsigned int rt_nr_running; for_each_sched_rt_entity(rt_se) { rt_se->back = back; back = rt_se; } + rt_nr_running = rt_rq_of_se(back)->rt_nr_running; + for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } + + dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running); } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { - dequeue_rt_stack(rt_se); + struct rq *rq = rq_of_rt_se(rt_se); + + update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags); + + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); + enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { - dequeue_rt_stack(rt_se); + struct rq *rq = rq_of_rt_se(rt_se); + + update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags); + + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } + enqueue_top_rt_rq(&rq->rt); } /* @@ -1111,24 +1435,28 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + check_schedstat_required(); + update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se); + + enqueue_rt_entity(rt_se, flags); + + if (task_is_blocked(p)) + return; if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - inc_nr_running(rq); } -static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) +static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, flags); dequeue_pushable_task(rq, p); - dec_nr_running(rq); + return true; } /* @@ -1162,32 +1490,27 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) static void yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, rq->curr, 0); + requeue_task_rt(rq, rq->donor, 0); } -#ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; struct rq *rq; - int cpu; - - cpu = task_cpu(p); - - if (p->nr_cpus_allowed == 1) - goto out; + bool test; /* For anything but wake ups, just return the task_cpu */ - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (!(flags & (WF_TTWU | WF_FORK))) goto out; rq = cpu_rq(cpu); rcu_read_lock(); - curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If the current task on @p's runqueue is an RT task, then @@ -1205,21 +1528,40 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) * * For equal prio tasks, we just let the scheduler sort it out. * - * Otherwise, just let it ride on the affined RQ and the + * Otherwise, just let it ride on the affine RQ and the * post-schedule router will push the preempted task away * * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. + * + * We take into account the capacity of the CPU to ensure it fits the + * requirement of the task - which is only important on heterogeneous + * systems like big.LITTLE. */ - if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio) && - (p->nr_cpus_allowed > 1)) { + test = curr && + unlikely(rt_task(donor)) && + (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio); + + if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); - if (target != -1) + /* + * Bail out if we were forcing a migration to find a better + * fitting CPU but our search failed. + */ + if (!test && target != -1 && !rt_task_fits_capacity(p, target)) + goto out_unlock; + + /* + * Don't bother moving it if the destination CPU is + * not running a lower priority task. + */ + if (target != -1 && + p->prio < cpu_rq(target)->rt.highest_prio.curr) cpu = target; } + +out_unlock: rcu_read_unlock(); out: @@ -1228,38 +1570,56 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->nr_cpus_allowed == 1) + if (rq->curr->nr_cpus_allowed == 1 || + !cpupri_find(&rq->rd->cpupri, rq->donor, NULL)) return; - if (p->nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, NULL)) - return; - - if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. + */ + if (p->nr_cpus_allowed != 1 && + cpupri_find(&rq->rd->cpupri, p, NULL)) return; /* - * There appears to be other cpus that can accept - * current and none to run 'p', so lets reschedule - * to try and push current away: + * There appear to be other CPUs that can accept + * the current task but none can run 'p', so lets reschedule + * to try and push the current task away: */ requeue_task_rt(rq, p, 1); - resched_task(rq->curr); + resched_curr(rq); } -#endif /* CONFIG_SMP */ +static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +{ + if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_rt_task(rq); + rq_repin_lock(rq, rf); + } + + return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); +} /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) +static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { - if (p->prio < rq->curr->prio) { - resched_task(rq->curr); + struct task_struct *donor = rq->donor; + + if (p->prio < donor->prio) { + resched_curr(rq); return; } -#ifdef CONFIG_SMP /* * If: * @@ -1272,13 +1632,37 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag * to move current somewhere else, making room for our non-migratable * task. */ - if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) + if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); -#endif } -static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, - struct rt_rq *rt_rq) +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) +{ + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; + + p->se.exec_start = rq_clock_task(rq); + if (on_rt_rq(&p->rt)) + update_stats_wait_end_rt(rt_rq, rt_se); + + /* The running task is never eligible for pushing */ + dequeue_pushable_task(rq, p); + + if (!first) + return; + + /* + * If prev task was rt, put_prev_task() has already updated the + * utilization. We only care of the case where we start to schedule a + * rt task + */ + if (rq->donor->sched_class != &rt_sched_class) + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + rt_queue_push_tasks(rq); +} + +static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) { struct rt_prio_array *array = &rt_rq->active; struct sched_rt_entity *next = NULL; @@ -1289,6 +1673,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; + if (WARN_ON_ONCE(list_empty(queue))) + return NULL; next = list_entry(queue->next, struct sched_rt_entity, run_list); return next; @@ -1297,52 +1683,44 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; - struct task_struct *p; - struct rt_rq *rt_rq; - - rt_rq = &rq->rt; - - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) - return NULL; + struct rt_rq *rt_rq = &rq->rt; do { - rt_se = pick_next_rt_entity(rq, rt_rq); - BUG_ON(!rt_se); + rt_se = pick_next_rt_entity(rt_rq); + if (unlikely(!rt_se)) + return NULL; rt_rq = group_rt_rq(rt_se); } while (rt_rq); - p = rt_task_of(rt_se); - p->se.exec_start = rq_clock_task(rq); - - return p; + return rt_task_of(rt_se); } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf) { - struct task_struct *p = _pick_next_task_rt(rq); + struct task_struct *p; - /* The running task is never eligible for pushing */ - if (p) - dequeue_pushable_task(rq, p); + if (!sched_rt_runnable(rq)) + return NULL; -#ifdef CONFIG_SMP - /* - * We detect this state here so that we can avoid taking the RQ - * lock again later if there is no need to push - */ - rq->post_schedule = has_pushable_tasks(rq); -#endif + p = _pick_next_task_rt(rq); return p; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next) { + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; + + if (on_rt_rq(&p->rt)) + update_stats_wait_start_rt(rt_rq, rt_se); + update_curr_rt(rq); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); + + if (task_is_blocked(p)) + return; /* * The previous task needs to be made eligible for pushing * if it is still active @@ -1351,22 +1729,12 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) enqueue_pushable_task(rq, p); } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_running(rq, p) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) - return 1; - return 0; -} - /* * Return the highest pushable rq's task, which is suitable to be executed - * on the cpu, NULL otherwise + * on the CPU, NULL otherwise */ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) { @@ -1377,7 +1745,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) return NULL; plist_for_each_entry(p, head, pushable_tasks) { - if (pick_rt_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; } @@ -1389,9 +1757,10 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + int ret; /* Make sure the mask is initialized first */ if (unlikely(!lowest_mask)) @@ -1400,15 +1769,30 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + /* + * If we're on asym system ensure we consider the different capacities + * of the CPUs when searching for the lowest_mask. + */ + if (sched_asym_cpucap_active()) { + + ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, + task, lowest_mask, + rt_task_fits_capacity); + } else { + + ret = cpupri_find(&task_rq(task)->rd->cpupri, + task, lowest_mask); + } + + if (!ret) return -1; /* No targets found */ /* - * At this point we have built a mask of cpus representing the + * At this point we have built a mask of CPUs representing the * lowest priority tasks in the system. Now we want to elect * the best one based on our affinity and topology. * - * We prioritize the last cpu that the task executed on since + * We prioritize the last CPU that the task executed on since * it is most likely cache-hot in that location. */ if (cpumask_test_cpu(cpu, lowest_mask)) @@ -1416,7 +1800,7 @@ static int find_lowest_rq(struct task_struct *task) /* * Otherwise, we consult the sched_domains span maps to figure - * out which cpu is logically closest to our hot cache data. + * out which CPU is logically closest to our hot cache data. */ if (!cpumask_test_cpu(this_cpu, lowest_mask)) this_cpu = -1; /* Skip this_cpu opt if not among lowest */ @@ -1436,8 +1820,8 @@ static int find_lowest_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(lowest_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(lowest_mask, + sched_domain_span(sd)); if (best_cpu < nr_cpu_ids) { rcu_read_unlock(); return best_cpu; @@ -1454,12 +1838,34 @@ static int find_lowest_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(lowest_mask); + cpu = cpumask_any_distribute(lowest_mask); if (cpu < nr_cpu_ids) return cpu; + return -1; } +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!task_on_rq_queued(p)); + BUG_ON(!rt_task(p)); + + return p; +} + /* Will lock the rq it finds */ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) { @@ -1475,19 +1881,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) lowest_rq = cpu_rq(cpu); + if (lowest_rq->rt.highest_prio.curr <= task->prio) { + /* + * Target rq has tasks of equal or higher priority, + * retrying does not release any lock and is unlikely + * to yield a different result. + */ + lowest_rq = NULL; + break; + } + /* if the prio of this runqueue changed, try again */ if (double_lock_balance(rq, lowest_rq)) { /* * We had to unlock the run queue. In * the mean time, task could have - * migrated already or had its affinity changed. - * Also make sure that it wasn't scheduled on its rq. + * migrated already or had its affinity changed, + * therefore check if the task is still at the + * head of the pushable tasks list. + * It is possible the task was scheduled, set + * "migrate_disabled" and then got preempted, so we must + * check the task migration disable flag here too. */ - if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(lowest_rq->cpu, - tsk_cpus_allowed(task)) || - task_running(rq, task) || - !task->on_rq)) { + if (unlikely(is_migration_disabled(task) || + !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || + task != pick_next_pushable_task(rq))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1507,32 +1925,12 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static struct task_struct *pick_next_pushable_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_tasks(rq)) - return NULL; - - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); - - BUG_ON(!p->on_rq); - BUG_ON(!rt_task(p)); - - return p; -} - /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task * of lesser priority. */ -static int push_rt_task(struct rq *rq) +static int push_rt_task(struct rq *rq, bool pull) { struct task_struct *next_task; struct rq *lowest_rq; @@ -1546,21 +1944,61 @@ static int push_rt_task(struct rq *rq) return 0; retry: - if (unlikely(next_task == rq->curr)) { - WARN_ON(1); - return 0; - } - /* * It's possible that the next_task slipped in of * higher priority than current. If that's the case * just reschedule current. */ - if (unlikely(next_task->prio < rq->curr->prio)) { - resched_task(rq->curr); + if (unlikely(next_task->prio < rq->donor->prio)) { + resched_curr(rq); + return 0; + } + + if (is_migration_disabled(next_task)) { + struct task_struct *push_task = NULL; + int cpu; + + if (!pull || rq->push_busy) + return 0; + + /* + * Invoking find_lowest_rq() on anything but an RT task doesn't + * make sense. Per the above priority check, curr has to + * be of higher priority than next_task, so no need to + * reschedule when bailing out. + * + * Note that the stoppers are masqueraded as SCHED_FIFO + * (cf. sched_set_stop_task()), so we can't rely on rt_task(). + */ + if (rq->donor->sched_class != &rt_sched_class) + return 0; + + cpu = find_lowest_rq(rq->curr); + if (cpu == -1 || cpu == rq->cpu) + return 0; + + /* + * Given we found a CPU with lower priority than @next_task, + * therefore it should be running. However we cannot migrate it + * to this other CPU, instead attempt to push the current + * running task on this CPU away. + */ + push_task = get_push_task(rq); + if (push_task) { + preempt_disable(); + raw_spin_rq_unlock(rq); + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + push_task, &rq->push_work); + preempt_enable(); + raw_spin_rq_lock(rq); + } + return 0; } + if (WARN_ON(next_task == rq->curr)) + return 0; + /* We might release rq lock */ get_task_struct(next_task); @@ -1577,12 +2015,12 @@ retry: * pushing. */ task = pick_next_pushable_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task hasn't migrated, and is still the next * eligible task, but we failed to find a run-queue * to push it to. Do not retry in this case, since - * other cpus will pull from us when ready. + * other CPUs will pull from us when ready. */ goto out; } @@ -1599,15 +2037,11 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, lowest_rq->cpu); - activate_task(lowest_rq, next_task, 0); + move_queued_task_locked(rq, lowest_rq, next_task); + resched_curr(lowest_rq); ret = 1; - resched_task(lowest_rq->curr); - double_unlock_balance(rq, lowest_rq); - out: put_task_struct(next_task); @@ -1617,18 +2051,209 @@ out: static void push_rt_tasks(struct rq *rq) { /* push_rt_task will return true if it moved an RT */ - while (push_rt_task(rq)) + while (push_rt_task(rq, false)) ; } -static int pull_rt_task(struct rq *this_rq) +#ifdef HAVE_RT_PUSH_IPI + +/* + * When a high priority task schedules out from a CPU and a lower priority + * task is scheduled in, a check is made to see if there's any RT tasks + * on other CPUs that are waiting to run because a higher priority RT task + * is currently running on its CPU. In this case, the CPU with multiple RT + * tasks queued on it (overloaded) needs to be notified that a CPU has opened + * up that may be able to run one of its non-running queued RT tasks. + * + * All CPUs with overloaded RT tasks need to be notified as there is currently + * no way to know which of these CPUs have the highest priority task waiting + * to run. Instead of trying to take a spinlock on each of these CPUs, + * which has shown to cause large latency when done on machines with many + * CPUs, sending an IPI to the CPUs to have them push off the overloaded + * RT tasks waiting to run. + * + * Just sending an IPI to each of the CPUs is also an issue, as on large + * count CPU machines, this can cause an IPI storm on a CPU, especially + * if its the only CPU with multiple RT tasks queued, and a large number + * of CPUs scheduling a lower priority task at the same time. + * + * Each root domain has its own IRQ work function that can iterate over + * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT + * task must be checked if there's one or many CPUs that are lowering + * their priority, there's a single IRQ work iterator that will try to + * push off RT tasks that are waiting to run. + * + * When a CPU schedules a lower priority task, it will kick off the + * IRQ work iterator that will jump to each CPU with overloaded RT tasks. + * As it only takes the first CPU that schedules a lower priority task + * to start the process, the rto_start variable is incremented and if + * the atomic result is one, then that CPU will try to take the rto_lock. + * This prevents high contention on the lock as the process handles all + * CPUs scheduling lower priority tasks. + * + * All CPUs that are scheduling a lower priority task will increment the + * rt_loop_next variable. This will make sure that the IRQ work iterator + * checks all RT overloaded CPUs whenever a CPU schedules a new lower + * priority task, even if the iterator is in the middle of a scan. Incrementing + * the rt_loop_next will cause the iterator to perform another scan. + * + */ +static int rto_next_cpu(struct root_domain *rd) { - int this_cpu = this_rq->cpu, ret = 0, cpu; - struct task_struct *p; + int next; + int cpu; + + /* + * When starting the IPI RT pushing, the rto_cpu is set to -1, + * rt_next_cpu() will simply return the first CPU found in + * the rto_mask. + * + * If rto_next_cpu() is called with rto_cpu is a valid CPU, it + * will return the next CPU found in the rto_mask. + * + * If there are no more CPUs left in the rto_mask, then a check is made + * against rto_loop and rto_loop_next. rto_loop is only updated with + * the rto_lock held, but any CPU may increment the rto_loop_next + * without any locking. + */ + for (;;) { + + /* When rto_cpu is -1 this acts like cpumask_first() */ + cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); + + rd->rto_cpu = cpu; + + if (cpu < nr_cpu_ids) + return cpu; + + rd->rto_cpu = -1; + + /* + * ACQUIRE ensures we see the @rto_mask changes + * made prior to the @next value observed. + * + * Matches WMB in rt_set_overload(). + */ + next = atomic_read_acquire(&rd->rto_loop_next); + + if (rd->rto_loop == next) + break; + + rd->rto_loop = next; + } + + return -1; +} + +static inline bool rto_start_trylock(atomic_t *v) +{ + return !atomic_cmpxchg_acquire(v, 0, 1); +} + +static inline void rto_start_unlock(atomic_t *v) +{ + atomic_set_release(v, 0); +} + +static void tell_cpu_to_push(struct rq *rq) +{ + int cpu = -1; + + /* Keep the loop going if the IPI is currently active */ + atomic_inc(&rq->rd->rto_loop_next); + + /* Only one CPU can initiate a loop at a time */ + if (!rto_start_trylock(&rq->rd->rto_loop_start)) + return; + + raw_spin_lock(&rq->rd->rto_lock); + + /* + * The rto_cpu is updated under the lock, if it has a valid CPU + * then the IPI is still running and will continue due to the + * update to loop_next, and nothing needs to be done here. + * Otherwise it is finishing up and an IPI needs to be sent. + */ + if (rq->rd->rto_cpu < 0) + cpu = rto_next_cpu(rq->rd); + + raw_spin_unlock(&rq->rd->rto_lock); + + rto_start_unlock(&rq->rd->rto_loop_start); + + if (cpu >= 0) { + /* Make sure the rd does not get freed while pushing */ + sched_get_rd(rq->rd); + irq_work_queue_on(&rq->rd->rto_push_work, cpu); + } +} + +/* Called from hardirq context */ +void rto_push_irq_work_func(struct irq_work *work) +{ + struct root_domain *rd = + container_of(work, struct root_domain, rto_push_work); + struct rq *rq; + int cpu; + + rq = this_rq(); + + /* + * We do not need to grab the lock to check for has_pushable_tasks. + * When it gets updated, a check is made if a push is possible. + */ + if (has_pushable_tasks(rq)) { + raw_spin_rq_lock(rq); + while (push_rt_task(rq, true)) + ; + raw_spin_rq_unlock(rq); + } + + raw_spin_lock(&rd->rto_lock); + + /* Pass the IPI to the next rt overloaded queue */ + cpu = rto_next_cpu(rd); + + raw_spin_unlock(&rd->rto_lock); + + if (cpu < 0) { + sched_put_rd(rd); + return; + } + + /* Try the next RT overloaded CPU */ + irq_work_queue_on(&rd->rto_push_work, cpu); +} +#endif /* HAVE_RT_PUSH_IPI */ + +static void pull_rt_task(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu, cpu; + bool resched = false; + struct task_struct *p, *push_task; struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); - if (likely(!rt_overloaded(this_rq))) - return 0; + if (likely(!rt_overload_count)) + return; + + /* + * Match the barrier from rt_set_overloaded; this guarantees that if we + * see overloaded we must also see the rto_mask bit. + */ + smp_rmb(); + + /* If we are the only overloaded CPU do nothing */ + if (rt_overload_count == 1 && + cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) + return; + +#ifdef HAVE_RT_PUSH_IPI + if (sched_feat(RT_PUSH_IPI)) { + tell_cpu_to_push(this_rq); + return; + } +#endif for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) @@ -1652,6 +2277,7 @@ static int pull_rt_task(struct rq *this_rq) * double_lock_balance, and another CPU could * alter this_rq */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -1666,24 +2292,25 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * There's a chance that p is higher in priority - * than what's currently running on its cpu. - * This is just that p is wakeing up and hasn't + * than what's currently running on its CPU. + * This is just that p is waking up and hasn't * had a chance to schedule. We only pull * p if it is lower in priority than the * current task on the run queue */ - if (p->prio < src_rq->curr->prio) + if (p->prio < src_rq->donor->prio) goto skip; - ret = 1; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + move_queued_task_locked(src_rq, this_rq, p); + resched = true; + } /* * We continue with the search, just in * case there's an even higher prio task @@ -1693,21 +2320,19 @@ static int pull_rt_task(struct rq *this_rq) } skip: double_unlock_balance(this_rq, src_rq); - } - return ret; -} - -static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) - pull_rt_task(rq); -} + if (push_task) { + preempt_disable(); + raw_spin_rq_unlock(this_rq); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + preempt_enable(); + raw_spin_rq_lock(this_rq); + } + } -static void post_schedule_rt(struct rq *rq) -{ - push_rt_tasks(rq); + if (resched) + resched_curr(this_rq); } /* @@ -1716,53 +2341,15 @@ static void post_schedule_rt(struct rq *rq) */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && - !test_tsk_need_resched(rq->curr) && - has_pushable_tasks(rq) && - p->nr_cpus_allowed > 1 && - rt_task(rq->curr) && - (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio)) - push_rt_tasks(rq); -} - -static void set_cpus_allowed_rt(struct task_struct *p, - const struct cpumask *new_mask) -{ - struct rq *rq; - int weight; - - BUG_ON(!rt_task(p)); - - if (!p->on_rq) - return; - - weight = cpumask_weight(new_mask); - - /* - * Only update if the process changes its state from whether it - * can migrate or not. - */ - if ((p->nr_cpus_allowed > 1) == (weight > 1)) - return; - - rq = task_rq(p); - - /* - * The process used to be able to migrate OR it can now migrate - */ - if (weight <= 1) { - if (!task_current(rq, p)) - dequeue_pushable_task(rq, p); - BUG_ON(!rq->rt.rt_nr_migratory); - rq->rt.rt_nr_migratory--; - } else { - if (!task_current(rq, p)) - enqueue_pushable_task(rq, p); - rq->rt.rt_nr_migratory++; - } + bool need_to_push = !task_on_cpu(rq, p) && + !test_tsk_need_resched(rq->curr) && + p->nr_cpus_allowed > 1 && + (dl_task(rq->donor) || rt_task(rq->donor)) && + (rq->curr->nr_cpus_allowed < 2 || + rq->donor->prio <= p->prio); - update_rt_migration(&rq->rt); + if (need_to_push) + push_rt_tasks(rq); } /* Assumes rq->lock is held */ @@ -1800,14 +2387,13 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!p->on_rq || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; - if (pull_rt_task(rq)) - resched_task(rq->curr); + rt_queue_pull_task(rq); } -void init_sched_rt_class(void) +void __init init_sched_rt_class(void) { unsigned int i; @@ -1816,7 +2402,6 @@ void init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } -#endif /* CONFIG_SMP */ /* * When switching a task to RT, we may overload the runqueue @@ -1825,24 +2410,25 @@ void init_sched_rt_class(void) */ static void switched_to_rt(struct rq *rq, struct task_struct *p) { - int check_resched = 1; + /* + * If we are running, update the avg_rt tracking, as the running time + * will now on be accounted into the latter. + */ + if (task_current(rq, p)) { + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + return; + } /* - * If we are already running, then there's nothing - * that needs to be done. But if we are not running - * we may need to preempt the current running task. - * If that current running task is also an RT task + * If we are not running we may need to preempt the current + * running task. If that current running task is also an RT task * then see if we can move to another run queue. */ - if (p->on_rq && rq->curr != p) { -#ifdef CONFIG_SMP - if (rq->rt.overloaded && push_rt_task(rq) && - /* Don't resched if we changed runqueues */ - rq != task_rq(p)) - check_resched = 0; -#endif /* CONFIG_SMP */ - if (check_resched && p->prio < rq->curr->prio) - resched_task(rq->curr); + if (task_on_rq_queued(p)) { + if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + rt_queue_push_tasks(rq); + if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) + resched_curr(rq); } } @@ -1851,43 +2437,40 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * us to initiate a push or pull. */ static void -prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio) { - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; - if (rq->curr == p) { -#ifdef CONFIG_SMP + if (p->prio == oldprio) + return; + + if (task_current_donor(rq, p)) { /* * If our priority decreases while running, we * may need to pull tasks to this runqueue. */ if (oldprio < p->prio) - pull_rt_task(rq); + rt_queue_pull_task(rq); + /* * If there's a higher priority task waiting to run - * then reschedule. Note, the above pull_rt_task - * can release the rq lock and p could migrate. - * Only reschedule if p is still on the same runqueue. + * then reschedule. */ - if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) - resched_task(p); -#else - /* For UP simply resched on drop of prio */ - if (oldprio < p->prio) - resched_task(p); -#endif /* CONFIG_SMP */ + if (p->prio > rq->rt.highest_prio.curr) + resched_curr(rq); } else { /* * This task is not running, but if it is * greater than the current running task * then reschedule. */ - if (p->prio < rq->curr->prio) - resched_task(rq->curr); + if (p->prio < rq->donor->prio) + resched_curr(rq); } } +#ifdef CONFIG_POSIX_TIMERS static void watchdog(struct rq *rq, struct task_struct *p) { unsigned long soft, hard; @@ -1905,21 +2488,35 @@ static void watchdog(struct rq *rq, struct task_struct *p) } next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); - if (p->rt.timeout > next) - p->cputime_expires.sched_exp = p->se.sum_exec_runtime; + if (p->rt.timeout > next) { + posix_cputimers_rt_watchdog(&p->posix_cputimers, + p->se.sum_exec_runtime); + } } } +#else /* !CONFIG_POSIX_TIMERS: */ +static inline void watchdog(struct rq *rq, struct task_struct *p) { } +#endif /* !CONFIG_POSIX_TIMERS */ +/* + * scheduler tick hitting a task of our scheduling class. + * + * NOTE: This function can be called remotely by the tick offload that + * goes along full dynticks. Therefore no local assumption can be made + * and everything must be accessed through the @rq and @curr passed in + * parameters. + */ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); watchdog(rq, p); /* - * RR tasks need a special form of timeslice management. + * RR tasks need a special form of time-slice management. * FIFO tasks have no timeslices. */ if (p->policy != SCHED_RR) @@ -1931,28 +2528,18 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) p->rt.time_slice = sched_rr_timeslice; /* - * Requeue to the end of queue if we (and all of our ancestors) are the - * only element on the queue + * Requeue to the end of queue if we (and all of our ancestors) are not + * the only element on the queue */ for_each_sched_rt_entity(rt_se) { if (rt_se->run_list.prev != rt_se->run_list.next) { requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + resched_curr(rq); return; } } } -static void set_curr_task_rt(struct rq *rq) -{ - struct task_struct *p = rq->curr; - - p->se.exec_start = rq_clock_task(rq); - - /* The running task is never eligible for pushing */ - dequeue_pushable_task(rq, p); -} - static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* @@ -1964,40 +2551,385 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -const struct sched_class rt_sched_class = { - .next = &fair_sched_class, +#ifdef CONFIG_SCHED_CORE +static int task_is_throttled_rt(struct task_struct *p, int cpu) +{ + struct rt_rq *rt_rq; + +#ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq + rt_rq = task_group(p)->rt_rq[cpu]; + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); +#else + rt_rq = &cpu_rq(cpu)->rt; +#endif + + return rt_rq_throttled(rt_rq); +} +#endif /* CONFIG_SCHED_CORE */ + +DEFINE_SCHED_CLASS(rt) = { + + .queue_mask = 4, + .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, - .check_preempt_curr = check_preempt_curr_rt, + .wakeup_preempt = wakeup_preempt_rt, - .pick_next_task = pick_next_task_rt, + .pick_task = pick_task_rt, .put_prev_task = put_prev_task_rt, + .set_next_task = set_next_task_rt, -#ifdef CONFIG_SMP + .balance = balance_rt, .select_task_rq = select_task_rq_rt, - - .set_cpus_allowed = set_cpus_allowed_rt, + .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, - .pre_schedule = pre_schedule_rt, - .post_schedule = post_schedule_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, -#endif + .find_lock_rq = find_lock_lowest_rq, - .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, .get_rr_interval = get_rr_interval_rt, - .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + .prio_changed = prio_changed_rt, + + .update_curr = update_curr_rt, + +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_rt, +#endif + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif +}; + +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex); + +static inline int tg_has_rt_tasks(struct task_group *tg) +{ + struct task_struct *task; + struct css_task_iter it; + int ret = 0; + + /* + * Autogroups do not have RT tasks; see autogroup_create(). + */ + if (task_group_is_autogroup(tg)) + return 0; + + css_task_iter_start(&tg->css, 0, &it); + while (!ret && (task = css_task_iter_next(&it))) + ret |= rt_task(task); + css_task_iter_end(&it); + + return ret; +} + +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; }; -#ifdef CONFIG_SCHED_DEBUG -extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); +static int tg_rt_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; + + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + /* + * Ensure we don't starve existing RT tasks if runtime turns zero. + */ + if (rt_bandwidth_enabled() && !runtime && + tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); + } + + if (sum > total) + return -EINVAL; + + return 0; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + int ret; + + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + rcu_read_lock(); + ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); + rcu_read_unlock(); + + return ret; +} + +static int tg_set_rt_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) +{ + int i, err = 0; + + /* + * Disallowing the root group RT runtime is BAD, it would disallow the + * kernel creating (and or operating) RT threads. + */ + if (tg == &root_task_group && rt_runtime == 0) + return -EINVAL; + + /* No period doesn't make any sense. */ + if (rt_period == 0) + return -EINVAL; + + /* + * Bound quota to defend quota against overflow during bandwidth shift. + */ + if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) + return -EINVAL; + + mutex_lock(&rt_constraints_mutex); + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) + goto unlock; + + raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); +unlock: + mutex_unlock(&rt_constraints_mutex); + + return err; +} + +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_bandwidth.rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} + +int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) +{ + u64 rt_runtime, rt_period; + + if (rt_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + + rt_period = rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +#ifdef CONFIG_SYSCTL +static int sched_rt_global_constraints(void) +{ + int ret = 0; + + mutex_lock(&rt_constraints_mutex); + ret = __rt_schedulable(NULL, 0, 0); + mutex_unlock(&rt_constraints_mutex); + + return ret; +} +#endif /* CONFIG_SYSCTL */ + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept real-time tasks when there is no way for them to run */ + if (rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + +#else /* !CONFIG_RT_GROUP_SCHED: */ + +#ifdef CONFIG_SYSCTL +static int sched_rt_global_constraints(void) +{ + return 0; +} +#endif /* CONFIG_SYSCTL */ +#endif /* !CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_SYSCTL +static int sched_rt_global_validate(void) +{ + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || + ((u64)sysctl_sched_rt_runtime * + NSEC_PER_USEC > max_rt_runtime))) + return -EINVAL; + + return 0; +} + +static void sched_rt_do_global(void) +{ +} + +static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + int ret; + + mutex_lock(&mutex); + sched_domains_mutex_lock(); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_validate(); + if (ret) + goto undo; + + ret = sched_dl_global_validate(); + if (ret) + goto undo; + + ret = sched_rt_global_constraints(); + if (ret) + goto undo; + + sched_rt_do_global(); + sched_dl_do_global(); + } + if (0) { +undo: + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } + sched_domains_mutex_unlock(); + mutex_unlock(&mutex); + + /* + * After changing maximum available bandwidth for DEADLINE, we need to + * recompute per root domain and per cpus variables accordingly. + */ + rebuild_sched_domains(); + + return ret; +} + +static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* + * Make sure that internally we keep jiffies. + * Also, writing zero resets the time-slice to default: + */ + if (!ret && write) { + sched_rr_timeslice = + sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : + msecs_to_jiffies(sysctl_sched_rr_timeslice); + + if (sysctl_sched_rr_timeslice <= 0) + sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE); + } + mutex_unlock(&mutex); + + return ret; +} +#endif /* CONFIG_SYSCTL */ void print_rt_stats(struct seq_file *m, int cpu) { @@ -2009,4 +2941,3 @@ void print_rt_stats(struct seq_file *m, int cpu) print_rt_rq(m, cpu, rt_rq); rcu_read_unlock(); } -#endif /* CONFIG_SCHED_DEBUG */ |
