diff options
Diffstat (limited to 'kernel/sched/deadline.c')
-rw-r--r-- | kernel/sched/deadline.c | 1979 |
1 files changed, 1380 insertions, 599 deletions
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f63f337c7147..ff4df16b5186 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -15,13 +15,52 @@ * Michael Trimarchi <michael@amarulasolutions.com>, * Fabio Checconi <fchecconi@gmail.com> */ -#include "sched.h" -#include "pelt.h" -struct dl_bandwidth def_dl_bandwidth; +#include <linux/cpuset.h> + +/* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting ridiculously long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ +#ifdef CONFIG_SYSCTL +static const struct ctl_table sched_dl_sysctls[] = { + { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)&sysctl_sched_dl_period_min, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra2 = (void *)&sysctl_sched_dl_period_max, + }, +}; + +static int __init sched_dl_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_dl_sysctls); + return 0; +} +late_initcall(sched_dl_sysctl_init); +#endif + +static bool dl_server(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server; +} static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { + BUG_ON(dl_server(dl_se)); return container_of(dl_se, struct task_struct, dl); } @@ -30,12 +69,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) return container_of(dl_rq, struct rq, dl); } -static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = task_rq(p); + struct rq *rq = dl_se->rq; + + if (!dl_server(dl_se)) + rq = task_rq(dl_task_of(dl_se)); - return &rq->dl; + return rq; +} + +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ + return &rq_of_dl_se(dl_se)->dl; } static inline int on_dl_rq(struct sched_dl_entity *dl_se) @@ -43,6 +89,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_RT_MUTEXES +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se->pi_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return pi_of(dl_se) != dl_se; +} +#else +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return false; +} +#endif + #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { @@ -54,15 +122,75 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; + int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); + + if (cpumask_subset(rd->span, cpu_active_mask)) + return cpumask_weight(rd->span); + + cpus = 0; + for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; return cpus; } + +static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) +{ + unsigned long cap = 0; + int i; + + for_each_cpu_and(i, mask, cpu_active_mask) + cap += arch_scale_cpu_capacity(i); + + return cap; +} + +/* + * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity + * of the CPU the task is running on rather rd's \Sum CPU capacity. + */ +static inline unsigned long dl_bw_capacity(int i) +{ + if (!sched_asym_cpucap_active() && + arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) { + return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; + } else { + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + return __dl_bw_capacity(cpu_rq(i)->rd->span); + } +} + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + struct root_domain *rd = cpu_rq(cpu)->rd; + + if (rd->visit_gen == gen) + return true; + + rd->visit_gen = gen; + return false; +} + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -73,14 +201,53 @@ static inline int dl_bw_cpus(int i) { return 1; } + +static inline unsigned long dl_bw_capacity(int i) +{ + return SCHED_CAPACITY_SCALE; +} + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + return false; +} + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} #endif static inline +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); +} + +static inline bool +__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; +} + +static inline void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); @@ -93,7 +260,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) @@ -107,7 +274,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ } @@ -117,7 +284,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) @@ -153,33 +320,63 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) __sub_running_bw(dl_se->dl_bw, dl_rq); } -static void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) { - struct rq *rq; - - BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); - - if (task_on_rq_queued(p)) - return; + if (dl_se->dl_non_contending) { + sub_running_bw(dl_se, &rq->dl); + dl_se->dl_non_contending = 0; - rq = task_rq(p); - if (p->dl.dl_non_contending) { - sub_running_bw(&p->dl, &rq->dl); - p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } - __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __sub_rq_bw(dl_se->dl_bw, &rq->dl); __add_rq_bw(new_bw, &rq->dl); } +static __always_inline +void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer) +{ + /* + * If the timer callback was running (hrtimer_try_to_cancel == -1), + * it will eventually call put_task_struct(). + */ + if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); +} + +static __always_inline +void cancel_replenish_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->dl_timer); +} + +static __always_inline +void cancel_inactive_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->inactive_timer); +} + +static void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); + + if (task_on_rq_queued(p)) + return; + + dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); +} + +static void __dl_clear_params(struct sched_dl_entity *dl_se); + /* * The utilization of a task cannot be immediately removed from * the rq active utilization (running_bw) when the task blocks. @@ -190,7 +387,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) * fires. * * If the task wakes up again before the inactive timer fires, - * the timer is cancelled, whereas if the task wakes up after the + * the timer is canceled, whereas if the task wakes up after the * inactive timer fired (and running_bw has been decreased) the * task's utilization has to be added to running_bw again. * A flag in the deadline scheduling entity (dl_non_contending) @@ -234,12 +431,11 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). */ -static void task_non_contending(struct task_struct *p) +static void task_non_contending(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->inactive_timer; - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct rq *rq = rq_of_dl_se(dl_se); + struct dl_rq *dl_rq = &rq->dl; s64 zerolag_time; /* @@ -269,24 +465,33 @@ static void task_non_contending(struct task_struct *p) * utilization now, instead of starting a timer */ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { - if (dl_task(p)) + if (dl_server(dl_se)) { sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || p->state == TASK_DEAD) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - - if (p->state == TASK_DEAD) - sub_rq_bw(&p->dl, &rq->dl); - raw_spin_lock(&dl_b->lock); - __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); - __dl_clear_params(p); - raw_spin_unlock(&dl_b->lock); + } else { + struct task_struct *p = dl_task_of(dl_se); + + if (dl_task(p)) + sub_running_bw(dl_se, dl_rq); + + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (READ_ONCE(p->__state) == TASK_DEAD) + sub_rq_bw(dl_se, &rq->dl); + raw_spin_lock(&dl_b->lock); + __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); + raw_spin_unlock(&dl_b->lock); + __dl_clear_params(dl_se); + } } return; } dl_se->dl_non_contending = 1; - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD); } @@ -308,13 +513,12 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) dl_se->dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) - put_task_struct(dl_task_of(dl_se)); + cancel_inactive_timer(dl_se); } else { /* * Since "dl_non_contending" is not set, the @@ -327,31 +531,20 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) } } -static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - struct sched_dl_entity *dl_se = &p->dl; - - return dl_rq->root.rb_leftmost == &dl_se->rb_node; + return rb_first_cached(&dl_rq->root) == &dl_se->rb_node; } static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); -void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) -{ - raw_spin_lock_init(&dl_b->dl_runtime_lock); - dl_b->dl_period = period; - dl_b->dl_runtime = runtime; -} - void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); - raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); if (global_rt_runtime() == RUNTIME_INF) dl_b->bw = -1; else dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); - raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); dl_b->total_bw = 0; } @@ -363,7 +556,6 @@ void init_dl_rq(struct dl_rq *dl_rq) /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; - dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else @@ -407,37 +599,17 @@ static inline void dl_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); } -static void update_dl_migration(struct dl_rq *dl_rq) -{ - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { - if (!dl_rq->overloaded) { - dl_set_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 1; - } - } else if (dl_rq->overloaded) { - dl_clear_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 0; - } -} +#define __node_2_pdl(node) \ + rb_entry((node), struct task_struct, pushable_dl_tasks) -static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b) { - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory++; - - update_dl_migration(dl_rq); + return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); } -static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +static inline int has_pushable_dl_tasks(struct rq *rq) { - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory--; - - update_dl_migration(dl_rq); + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); } /* @@ -446,69 +618,52 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) */ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { - struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct task_struct *entry; - bool leftmost = true; - - BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct task_struct, - pushable_dl_tasks); - if (dl_entity_preempt(&p->dl, &entry->dl)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = false; - } - } + struct rb_node *leftmost; + WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); + + leftmost = rb_add_cached(&p->pushable_dl_tasks, + &rq->dl.pushable_dl_tasks_root, + __pushable_less); if (leftmost) - dl_rq->earliest_dl.next = p->dl.deadline; + rq->dl.earliest_dl.next = p->dl.deadline; - rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color_cached(&p->pushable_dl_tasks, - &dl_rq->pushable_dl_tasks_root, leftmost); + if (!rq->dl.overloaded) { + dl_set_overload(rq); + rq->dl.overloaded = 1; + } } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; + struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root; + struct rb_node *leftmost; if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { - struct rb_node *next_node; - - next_node = rb_next(&p->pushable_dl_tasks); - if (next_node) { - dl_rq->earliest_dl.next = rb_entry(next_node, - struct task_struct, pushable_dl_tasks)->dl.deadline; - } - } + leftmost = rb_erase_cached(&p->pushable_dl_tasks, root); + if (leftmost) + dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; - rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); -} -static inline int has_pushable_dl_tasks(struct rq *rq) -{ - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); + if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) { + dl_clear_overload(rq); + rq->dl.overloaded = 0; + } } static int push_dl_task(struct rq *rq); static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) { - return dl_task(prev); + return rq->online && dl_task(prev); } -static DEFINE_PER_CPU(struct callback_head, dl_push_head); -static DEFINE_PER_CPU(struct callback_head, dl_pull_head); +static DEFINE_PER_CPU(struct balance_callback, dl_push_head); +static DEFINE_PER_CPU(struct balance_callback, dl_pull_head); static void push_dl_tasks(struct rq *); static void pull_dl_task(struct rq *); @@ -547,7 +702,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * Failed to find any suitable CPU. * The task will never come back! */ - BUG_ON(dl_bandwidth_enabled()); + WARN_ON_ONCE(dl_bandwidth_enabled()); /* * If admission control is disabled we @@ -578,7 +733,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p } /* - * And we finally need to fixup root_domain(s) bandwidth accounting, + * And we finally need to fix up root_domain(s) bandwidth accounting, * since p is still hanging out in the old (now moved to default) root * domain. */ @@ -620,15 +775,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { } -static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) -{ - return false; -} - -static inline void pull_dl_task(struct rq *rq) -{ -} - static inline void deadline_queue_push_tasks(struct rq *rq) { } @@ -638,9 +784,28 @@ static inline void deadline_queue_pull_task(struct rq *rq) } #endif /* CONFIG_SMP */ +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags); +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags); + +static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + struct rq *rq) +{ + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; + + /* + * If it is a deferred reservation, and the server + * is not handling an starvation case, defer it. + */ + if (dl_se->dl_defer && !dl_se->dl_defer_running) { + dl_se->dl_throttled = 1; + dl_se->dl_defer_armed = 1; + } +} /* * We are being explicitly informed that a new instance is starting, @@ -659,7 +824,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(dl_se->dl_boosted); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -675,10 +840,12 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; - dl_se->runtime = dl_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } +static int start_dl_timer(struct sched_dl_entity *dl_se); +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); + /* * Pure Earliest Deadline First (EDF) scheduling does not deal with the * possibility of a entity lasting more than what it declared, and thus @@ -697,21 +864,27 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * could happen are, typically, a entity voluntarily trying to overcome its * runtime, or it just underestimated it during sched_setattr(). */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_se->dl_runtime <= 0); + WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. + * + * Or, it could be the case of a deferred reservation that + * was not able to consume its runtime in background and + * reached this point with current u > U. + * + * In both cases, set a new period. */ - if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + if (dl_se->dl_deadline == 0 || + (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded && dl_se->runtime > 0) @@ -724,8 +897,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * arbitrary large. */ while (dl_se->runtime <= 0) { - dl_se->deadline += pi_se->dl_period; - dl_se->runtime += pi_se->dl_runtime; + dl_se->deadline += pi_of(dl_se)->dl_period; + dl_se->runtime += pi_of(dl_se)->dl_runtime; } /* @@ -739,14 +912,51 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } if (dl_se->dl_yielded) dl_se->dl_yielded = 0; if (dl_se->dl_throttled) dl_se->dl_throttled = 0; + + /* + * If this is the replenishment of a deferred reservation, + * clear the flag and return. + */ + if (dl_se->dl_defer_armed) { + dl_se->dl_defer_armed = 0; + return; + } + + /* + * A this point, if the deferred server is not armed, and the deadline + * is in the future, if it is not running already, throttle the server + * and arm the defer timer. + */ + if (dl_se->dl_defer && !dl_se->dl_defer_running && + dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { + if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + + /* + * Set dl_se->dl_defer_armed and dl_throttled variables to + * inform the start_dl_timer() that this is a deferred + * activation. + */ + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + if (!start_dl_timer(dl_se)) { + /* + * If for whatever reason (delays), a previous timer was + * queued but not serviced, cancel it and clean the + * deferrable server variables intended for start_dl_timer(). + */ + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } + } + } } /* @@ -773,8 +983,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * task with deadline equal to period this is the same of using * dl_period instead of dl_deadline in the equation above. */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) { u64 left, right; @@ -796,9 +1005,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * - (pi_se->dl_runtime >> DL_SCALE); + (pi_of(dl_se)->dl_runtime >> DL_SCALE); return dl_time_before(right, left); } @@ -867,7 +1076,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * is detected, the runtime and deadline need to be updated. * * If the task has an implicit deadline, i.e., deadline == period, the Original - * CBS is applied. the runtime is replenished and a new absolute deadline is + * CBS is applied. The runtime is replenished and a new absolute deadline is * set, as in the previous cases. * * However, the Original CBS does not work properly for tasks with @@ -883,24 +1092,30 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * Please refer to the comments update_dl_revised_wakeup() function to find * more about the Revised CBS rule. */ -static void update_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void update_dl_entity(struct sched_dl_entity *dl_se) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_entity_overflow(dl_se, rq_clock(rq))) { if (unlikely(!dl_is_implicit(dl_se) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && - !dl_se->dl_boosted)){ + !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); return; } - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); + } else if (dl_server(dl_se) && dl_se->dl_defer) { + /* + * The server can still use its previous deadline, so check if + * it left the dl_defer_running state. + */ + if (!dl_se->dl_defer_running) { + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + } } } @@ -919,22 +1134,35 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se) * actually started or not (i.e., the replenishment instant is in * the future or in the past). */ -static int start_dl_timer(struct task_struct *p) +static int start_dl_timer(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->dl_timer; - struct rq *rq = task_rq(p); + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); ktime_t now, act; s64 delta; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); /* * We want the timer to fire at the deadline, but considering * that it is actually coming from rq->clock and not from * hrtimer's time base reading. + * + * The deferred reservation will have its timer set to + * (deadline - runtime). At that point, the CBS rule will decide + * if the current deadline can be used, or if a replenishment is + * required to avoid add too much pressure on the system + * (current u > U). */ - act = ns_to_ktime(dl_next_period(dl_se)); + if (dl_se->dl_defer_armed) { + WARN_ON_ONCE(!dl_se->dl_throttled); + act = ns_to_ktime(dl_se->deadline - dl_se->runtime); + } else { + /* act = deadline - rel-deadline + period */ + act = ns_to_ktime(dl_next_period(dl_se)); + } + now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -957,13 +1185,89 @@ static int start_dl_timer(struct task_struct *p) * and observe our state. */ if (!hrtimer_is_queued(timer)) { - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD); } return 1; } +static void __push_dl_task(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, check if we need + * to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + rq_unpin_lock(rq, rf); + push_dl_task(rq); + rq_repin_lock(rq, rf); + } +#endif +} + +/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ +static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; + +static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) +{ + struct rq *rq = rq_of_dl_se(dl_se); + u64 fw; + + scoped_guard (rq_lock, rq) { + struct rq_flags *rf = &scope.rf; + + if (!dl_se->dl_throttled || !dl_se->dl_runtime) + return HRTIMER_NORESTART; + + sched_clock_tick(); + update_rq_clock(rq); + + if (!dl_se->dl_runtime) + return HRTIMER_NORESTART; + + if (!dl_se->server_has_tasks(dl_se)) { + replenish_dl_entity(dl_se); + return HRTIMER_NORESTART; + } + + if (dl_se->dl_defer_armed) { + /* + * First check if the server could consume runtime in background. + * If so, it is possible to push the defer timer for this amount + * of time. The dl_server_min_res serves as a limit to avoid + * forwarding the timer for a too small amount of time. + */ + if (dl_time_before(rq_clock(dl_se->rq), + (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { + + /* reset the defer timer */ + fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; + + hrtimer_forward_now(timer, ns_to_ktime(fw)); + return HRTIMER_RESTART; + } + + dl_se->dl_defer_running = 1; + } + + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) + resched_curr(rq); + + __push_dl_task(rq, rf); + } + + return HRTIMER_NORESTART; +} + /* * This is the bandwidth enforcement timer callback. If here, we know * a task is not on its dl_rq, since the fact that the timer was running @@ -982,10 +1286,14 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, dl_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p; struct rq_flags rf; struct rq *rq; + if (dl_server(dl_se)) + return dl_server_timer(timer, dl_se); + + p = dl_task_of(dl_se); rq = task_rq_lock(p, &rf); /* @@ -999,7 +1307,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ - if (dl_se->dl_boosted) + if (is_dl_boosted(dl_se)) goto unlock; /* @@ -1027,7 +1335,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * but do not enqueue -- wait for our wakeup to do that. */ if (!task_on_rq_queued(p)) { - replenish_dl_entity(dl_se, dl_se); + replenish_dl_entity(dl_se); goto unlock; } @@ -1037,9 +1345,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * If the runqueue is no longer available, migrate the * task elsewhere. This necessarily changes rq. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + lockdep_unpin_lock(__rq_lockp(rq), rf.cookie); rq = dl_task_offline_migration(rq, p); - rf.cookie = lockdep_pin_lock(&rq->lock); + rf.cookie = lockdep_pin_lock(__rq_lockp(rq)); update_rq_clock(rq); /* @@ -1051,26 +1359,12 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) #endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (dl_task(rq->donor)) + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); -#ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, check if we need - * to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) { - /* - * Nothing relies on rq->lock after this, so its safe to drop - * rq->lock. - */ - rq_unpin_lock(rq, &rf); - push_dl_task(rq); - rq_repin_lock(rq, &rf); - } -#endif + __push_dl_task(rq, &rf); unlock: task_rq_unlock(rq, p, &rf); @@ -1084,7 +1378,7 @@ unlock: return HRTIMER_NORESTART; } -void init_dl_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; @@ -1098,7 +1392,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) * cannot use the runtime, and so it replenishes the task. This rule * works fine for implicit deadline tasks (deadline == period), and the * CBS was designed for implicit deadline tasks. However, a task with - * constrained deadline (deadine < period) might be awakened after the + * constrained deadline (deadline < period) might be awakened after the * deadline, but before the next period. In this case, replenishing the * task would allow it to run for runtime / deadline. As in this case * deadline < period, CBS enables a task to run for more than the @@ -1112,12 +1406,11 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) */ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1131,91 +1424,46 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) return (dl_se->runtime <= 0); } -extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); - /* - * This function implements the GRUB accounting rule: - * according to the GRUB reclaiming algorithm, the runtime is - * not decreased as "dq = -dt", but as - * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt", + * This function implements the GRUB accounting rule. According to the + * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt", + * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt", * where u is the utilization of the task, Umax is the maximum reclaimable * utilization, Uinact is the (per-runqueue) inactive utilization, computed * as the difference between the "total runqueue utilization" and the - * runqueue active utilization, and Uextra is the (per runqueue) extra + * "runqueue active utilization", and Uextra is the (per runqueue) extra * reclaimable utilization. - * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations - * multiplied by 2^BW_SHIFT, the result has to be shifted right by - * BW_SHIFT. - * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT, - * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. - * Since delta is a 64 bit variable, to have an overflow its value - * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. - * So, overflow is not an issue here. + * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied + * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. + * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw + * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * Since delta is a 64 bit variable, to have an overflow its value should be + * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is + * not an issue here. */ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) { - u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ u64 u_act; - u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT; + u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ /* - * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)}, - * we compare u_inact + rq->dl.extra_bw with - * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because - * u_inact + rq->dl.extra_bw can be larger than - * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative - * leading to wrong results) + * Instead of computing max{u, (u_max - u_inact - u_extra)}, we + * compare u_inact + u_extra with u_max - u, because u_inact + u_extra + * can be larger than u_max. So, u_max - u_inact - u_extra would be + * negative leading to wrong results. */ - if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min) - u_act = u_act_min; + if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw) + u_act = dl_se->dl_bw; else - u_act = BW_UNIT - u_inact - rq->dl.extra_bw; + u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw; + u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT; return (delta * u_act) >> BW_SHIFT; } -/* - * Update the current task's runtime statistics (provided it is still - * a -deadline task and has not been removed from the dl_rq). - */ -static void update_curr_dl(struct rq *rq) +s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; - u64 delta_exec, scaled_delta_exec; - int cpu = cpu_of(rq); - u64 now; - - if (!dl_task(curr) || !on_dl_rq(dl_se)) - return; - - /* - * Consumed budget is computed considering the time as - * observed by schedulable tasks (excluding time spent - * in hardirq context, etc.). Deadlines are instead - * computed using hard walltime. This seems to be the more - * natural solution, but the full ramifications of this - * approach need further study. - */ - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) { - if (unlikely(dl_se->dl_yielded)) - goto throttle; - return; - } - - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); - - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); - - if (dl_entity_is_special(dl_se)) - return; + s64 scaled_delta_exec; /* * For tasks that participate in GRUB, we implement GRUB-PA: the @@ -1225,10 +1473,9 @@ static void update_curr_dl(struct rq *rq) * according to current frequency and CPU maximum capacity. */ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { - scaled_delta_exec = grub_reclaim(delta_exec, - rq, - &curr->dl); + scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se); } else { + int cpu = cpu_of(rq); unsigned long scale_freq = arch_scale_freq_capacity(cpu); unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); @@ -1236,8 +1483,64 @@ static void update_curr_dl(struct rq *rq) scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); } + return scaled_delta_exec; +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags); +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) +{ + s64 scaled_delta_exec; + + if (unlikely(delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; + return; + } + + if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) + return; + + if (dl_entity_is_special(dl_se)) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + dl_se->runtime -= scaled_delta_exec; + /* + * The fair server can consume its runtime while throttled (not queued/ + * running as regular CFS). + * + * If the server consumes its entire runtime in this state. The server + * is not required for the current period. Thus, reset the server by + * starting a new period, pushing the activation. + */ + if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { + /* + * If the server was previously activated - the starving condition + * took place, it this point it went away because the fair scheduler + * was able to get runtime in background. So return to the initial + * state. + */ + dl_se->dl_defer_running = 0; + + hrtimer_try_to_cancel(&dl_se->dl_timer); + + replenish_dl_new_period(dl_se, dl_se->rq); + + /* + * Not being able to start the timer seems problematic. If it could not + * be started for whatever reason, we need to "unthrottle" the DL server + * and queue right away. Otherwise nothing might queue it. That's similar + * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. + */ + WARN_ON_ONCE(!start_dl_timer(dl_se)); + + return; + } + throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; @@ -1247,15 +1550,32 @@ throttle: (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) dl_se->dl_overrun = 1; - __dequeue_task_dl(rq, curr, 0); - if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) - enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + dequeue_dl_entity(dl_se, 0); + if (!dl_server(dl_se)) { + update_stats_dequeue_dl(&rq->dl, dl_se, 0); + dequeue_pushable_dl_task(rq, dl_task_of(dl_se)); + } + + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { + if (dl_server(dl_se)) + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + else + enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH); + } - if (!is_leftmost(curr, &rq->dl)) + if (!is_leftmost(dl_se, &rq->dl)) resched_curr(rq); } /* + * The fair server (sole dl_server) does not account for real-time + * workload because it is running fair work. + */ + if (dl_se == &rq->fair_server) + return; + +#ifdef CONFIG_RT_GROUP_SCHED + /* * Because -- for now -- we share the rt bandwidth, we need to * account our runtime there too, otherwise actual rt tasks * would be able to exceed the shared quota. @@ -1279,6 +1599,182 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } +#endif +} + +/* + * In the non-defer mode, the idle time is not accounted, as the + * server provides a guarantee. + * + * If the dl_server is in defer mode, the idle time is also considered + * as time available for the fair server, avoiding a penalty for the + * rt scheduler that did not consumed that time. + */ +void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) +{ + s64 delta_exec, scaled_delta_exec; + + if (!rq->fair_server.dl_defer) + return; + + /* no need to discount more */ + if (rq->fair_server.runtime < 0) + return; + + delta_exec = rq_clock_task(rq) - p->se.exec_start; + if (delta_exec < 0) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); + + rq->fair_server.runtime -= scaled_delta_exec; + + if (rq->fair_server.runtime < 0) { + rq->fair_server.dl_defer_running = 0; + rq->fair_server.runtime = 0; + } + + p->se.exec_start = rq_clock_task(rq); +} + +void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) +{ + /* 0 runtime = fair server disabled */ + if (dl_se->dl_runtime) + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); +} + +void dl_server_start(struct sched_dl_entity *dl_se) +{ + struct rq *rq = dl_se->rq; + + /* + * XXX: the apply do not work fine at the init phase for the + * fair server because things are not yet set. We need to improve + * this before getting generic. + */ + if (!dl_server(dl_se)) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + dl_server_apply_params(dl_se, runtime, period, 1); + + dl_se->dl_server = 1; + dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); + } + + if (!dl_se->dl_runtime) + return; + + dl_se->dl_server_active = 1; + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) + resched_curr(dl_se->rq); +} + +void dl_server_stop(struct sched_dl_entity *dl_se) +{ + if (!dl_se->dl_runtime) + return; + + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + dl_se->dl_server_active = 0; +} + +void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick_task) +{ + dl_se->rq = rq; + dl_se->server_has_tasks = has_tasks; + dl_se->server_pick_task = pick_task; +} + +void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 new_bw = dl_se->dl_bw; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + + dl_b = dl_bw_of(cpu_of(rq)); + guard(raw_spinlock)(&dl_b->lock); + + if (!dl_bw_cpus(cpu)) + return; + + __dl_add(dl_b, new_bw, dl_bw_cpus(cpu)); +} + +int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) +{ + u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); + u64 new_bw = to_ratio(period, runtime); + struct rq *rq = dl_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + unsigned long cap; + int retval = 0; + int cpus; + + dl_b = dl_bw_of(cpu); + guard(raw_spinlock)(&dl_b->lock); + + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + + if (__dl_overflow(dl_b, cap, old_bw, new_bw)) + return -EBUSY; + + if (init) { + __add_rq_bw(new_bw, &rq->dl); + __dl_add(dl_b, new_bw, cpus); + } else { + __dl_sub(dl_b, dl_se->dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + + dl_rq_change_utilization(rq, dl_se, new_bw); + } + + dl_se->dl_runtime = runtime; + dl_se->dl_deadline = period; + dl_se->dl_period = period; + + dl_se->runtime = 0; + dl_se->deadline = 0; + + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); + + return retval; +} + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *donor = rq->donor; + struct sched_dl_entity *dl_se = &donor->dl; + s64 delta_exec; + + if (!dl_task(donor) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. + */ + delta_exec = update_curr_common(rq); + update_curr_dl_se(rq, dl_se, delta_exec); } static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) @@ -1286,19 +1782,28 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, inactive_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p = NULL; struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(p, &rf); + if (!dl_server(dl_se)) { + p = dl_task_of(dl_se); + rq = task_rq_lock(p, &rf); + } else { + rq = dl_se->rq; + rq_lock(rq, &rf); + } sched_clock_tick(); update_rq_clock(rq); - if (!dl_task(p) || p->state == TASK_DEAD) { + if (dl_server(dl_se)) + goto no_task; + + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - if (p->state == TASK_DEAD && dl_se->dl_non_contending) { + if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) { sub_running_bw(&p->dl, dl_rq_of_se(&p->dl)); sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; @@ -1307,23 +1812,30 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); - __dl_clear_params(p); + __dl_clear_params(dl_se); goto unlock; } + +no_task: if (dl_se->dl_non_contending == 0) goto unlock; sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: - task_rq_unlock(rq, p, &rf); - put_task_struct(p); + + if (!dl_server(dl_se)) { + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + } else { + rq_unlock(rq, &rf); + } return HRTIMER_NORESTART; } -void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; @@ -1331,6 +1843,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) timer->function = inactive_task_timer; } +#define __node_2_dle(node) \ + rb_entry((node), struct sched_dl_entity, rb_node) + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -1339,6 +1854,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + if (dl_rq->earliest_dl.curr == 0) + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER); dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } @@ -1356,11 +1873,11 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { - struct rb_node *leftmost = dl_rq->root.rb_leftmost; - struct sched_dl_entity *entry; + struct rb_node *leftmost = rb_first_cached(&dl_rq->root); + struct sched_dl_entity *entry = __node_2_dle(leftmost); - entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); } @@ -1376,54 +1893,106 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; u64 deadline = dl_se->deadline; - WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); - inc_dl_migration(dl_se, dl_rq); } static inline void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; - - WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); - dec_dl_migration(dl_se, dl_rq); +} + +static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); +} + +static __always_inline struct sched_statistics * +__schedstats_from_dl_se(struct sched_dl_entity *dl_se) +{ + if (!schedstat_enabled()) + return NULL; + + if (dl_server(dl_se)) + return NULL; + + return &dl_task_of(dl_se)->stats; +} + +static inline void +update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags) +{ + if (!schedstat_enabled()) + return; + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper_dl(dl_rq, dl_se); +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags) +{ + struct task_struct *p = dl_task_of(dl_se); + + if (!schedstat_enabled()) + return; + + if ((flags & DEQUEUE_SLEEP)) { + unsigned int state; + + state = READ_ONCE(p->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(p->stats.sleep_start, + rq_clock(rq_of_dl_rq(dl_rq))); + + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(p->stats.block_start, + rq_clock(rq_of_dl_rq(dl_rq))); + } } static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_dl_entity *entry; - int leftmost = 1; - - BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_dl_entity, rb_node); - if (dl_time_before(dl_se->deadline, entry->deadline)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = 0; - } - } - rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); + WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node)); + + rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less); inc_dl_tasks(dl_se, dl_rq); } @@ -1436,67 +2005,18 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) return; rb_erase_cached(&dl_se->rb_node, &dl_rq->root); + RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); } static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { - BUG_ON(on_dl_rq(dl_se)); + WARN_ON_ONCE(on_dl_rq(dl_se)); - /* - * If this is a wakeup or a new instance, the scheduling - * parameters of the task might need updating. Otherwise, - * we want a replenishment of its runtime. - */ - if (flags & ENQUEUE_WAKEUP) { - task_contending(dl_se, flags); - update_dl_entity(dl_se, pi_se); - } else if (flags & ENQUEUE_REPLENISH) { - replenish_dl_entity(dl_se, pi_se); - } else if ((flags & ENQUEUE_RESTORE) && - dl_time_before(dl_se->deadline, - rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { - setup_new_dl_entity(dl_se); - } - - __enqueue_dl_entity(dl_se); -} - -static void dequeue_dl_entity(struct sched_dl_entity *dl_se) -{ - __dequeue_dl_entity(dl_se); -} - -static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) -{ - struct task_struct *pi_task = rt_mutex_get_top_task(p); - struct sched_dl_entity *pi_se = &p->dl; - - /* - * Use the scheduling parameters of the top pi-waiter task if: - * - we have a top pi-waiter which is a SCHED_DEADLINE task AND - * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is - * smaller than our deadline OR we are a !SCHED_DEADLINE task getting - * boosted due to a SCHED_DEADLINE pi-waiter). - * Otherwise we keep our runtime and deadline. - */ - if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { - pi_se = &pi_task->dl; - } else if (!dl_prio(p->normal_prio)) { - /* - * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceeds its - * runtime while doing so. No point in replenishing - * it, as it's going to return back to its original - * scheduling class after this. - */ - BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); - return; - } + update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); /* * Check if a constrained deadline task was activated @@ -1504,12 +2024,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * If that is the case, the task will be throttled and * the replenishment timer will be set to the next period. */ - if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) - dl_check_constrained_dl(&p->dl); + if (!dl_se->dl_throttled && !dl_is_implicit(dl_se)) + dl_check_constrained_dl(dl_se); - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(&p->dl, &rq->dl); - add_running_bw(&p->dl, &rq->dl); + if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + add_rq_bw(dl_se, dl_rq); + add_running_bw(dl_se, dl_rq); } /* @@ -1524,33 +2046,60 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * be counted in the active utilization; hence, we need to call * add_running_bw(). */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { if (flags & ENQUEUE_WAKEUP) - task_contending(&p->dl, flags); + task_contending(dl_se, flags); return; } - enqueue_dl_entity(&p->dl, pi_se, flags); + /* + * If this is a wakeup or a new instance, the scheduling + * parameters of the task might need updating. Otherwise, + * we want a replenishment of its runtime. + */ + if (flags & ENQUEUE_WAKEUP) { + task_contending(dl_se, flags); + update_dl_entity(dl_se); + } else if (flags & ENQUEUE_REPLENISH) { + replenish_dl_entity(dl_se); + } else if ((flags & ENQUEUE_RESTORE) && + !is_dl_boosted(dl_se) && + dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { + setup_new_dl_entity(dl_se); + } + + /* + * If the reservation is still throttled, e.g., it got replenished but is a + * deferred task and still got to wait, don't enqueue. + */ + if (dl_se->dl_throttled && start_dl_timer(dl_se)) + return; - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) - enqueue_pushable_dl_task(rq, p); -} + /* + * We're about to enqueue, make sure we're not ->dl_throttled! + * In case the timer was not started, say because the defer time + * has passed, mark as not throttled and mark unarmed. + * Also cancel earlier timers, since letting those run is pointless. + */ + if (dl_se->dl_throttled) { + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) -{ - dequeue_dl_entity(&p->dl); - dequeue_pushable_dl_task(rq, p); + __enqueue_dl_entity(dl_se); } -static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) { - update_curr_dl(rq); - __dequeue_task_dl(rq, p, flags); + __dequeue_dl_entity(dl_se); - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(&p->dl, &rq->dl); - sub_rq_bw(&p->dl, &rq->dl); + if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + sub_running_bw(dl_se, dl_rq); + sub_rq_bw(dl_se, dl_rq); } /* @@ -1563,7 +2112,78 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) * or "inactive") */ if (flags & DEQUEUE_SLEEP) - task_non_contending(p); + task_non_contending(dl_se); +} + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + if (is_dl_boosted(&p->dl)) { + /* + * Because of delays in the detection of the overrun of a + * thread's runtime, it might be the case that a thread + * goes to sleep in a rt mutex with negative runtime. As + * a consequence, the thread will be throttled. + * + * While waiting for the mutex, this thread can also be + * boosted via PI, resulting in a thread that is throttled + * and boosted at the same time. + * + * In this case, the boost overrides the throttle. + */ + if (p->dl.dl_throttled) { + /* + * The replenish timer needs to be canceled. No + * problem if it fires concurrently: boosted threads + * are ignored in dl_task_timer(). + */ + cancel_replenish_timer(&p->dl); + p->dl.dl_throttled = 0; + } + } else if (!dl_prio(p->normal_prio)) { + /* + * Special case in which we have a !SCHED_DEADLINE task that is going + * to be deboosted, but exceeds its runtime while doing so. No point in + * replenishing it, as it's going to return back to its original + * scheduling class after this. If it has been throttled, we need to + * clear the flag, otherwise the task may wake up as throttled after + * being boosted again with no means to replenish the runtime and clear + * the throttle. + */ + p->dl.dl_throttled = 0; + if (!(flags & ENQUEUE_REPLENISH)) + printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n", + task_pid_nr(p)); + + return; + } + + check_schedstat_required(); + update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl); + + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= ENQUEUE_MIGRATING; + + enqueue_dl_entity(&p->dl, flags); + + if (dl_server(&p->dl)) + return; + + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); +} + +static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + update_curr_dl(rq); + + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= DEQUEUE_MIGRATING; + + dequeue_dl_entity(&p->dl, flags); + if (!p->dl.dl_throttled && !dl_server(&p->dl)) + dequeue_pushable_dl_task(rq, p); + + return true; } /* @@ -1598,21 +2218,31 @@ static void yield_task_dl(struct rq *rq) #ifdef CONFIG_SMP +static inline bool dl_task_is_earliest_deadline(struct task_struct *p, + struct rq *rq) +{ + return (!rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + rq->dl.earliest_dl.curr)); +} + static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; + bool select_rq; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE) + if (!(flags & WF_TTWU)) goto out; rq = cpu_rq(cpu); rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If we are dealing with a -deadline task, we must @@ -1623,16 +2253,23 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + select_rq = unlikely(dl_task(donor)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &donor->dl)) && + p->nr_cpus_allowed > 1; + + /* + * Take the capacity of the CPU into account to + * ensure it fits the requirement of the task. + */ + if (sched_asym_cpucap_active()) + select_rq |= !dl_task_fits_capacity(p, cpu); + + if (select_rq) { int target = find_later_rq(p); if (target != -1 && - (dl_time_before(p->dl.deadline, - cpu_rq(target)->dl.earliest_dl.curr) || - (cpu_rq(target)->dl.dl_nr_running == 0))) + dl_task_is_earliest_deadline(p, cpu_rq(target))) cpu = target; } rcu_read_unlock(); @@ -1643,9 +2280,10 @@ out: static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { + struct rq_flags rf; struct rq *rq; - if (p->state != TASK_WAKING) + if (READ_ONCE(p->__state) != TASK_WAKING) return; rq = task_rq(p); @@ -1654,22 +2292,22 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * from try_to_wake_up(). Hence, p->pi_lock is locked, but * rq->lock is not... So, lock it */ - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); if (p->dl.dl_non_contending) { + update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); } sub_rq_bw(&p->dl, &rq->dl); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) @@ -1679,7 +2317,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) + !cpudl_find(&rq->rd->cpudl, rq->donor, NULL)) return; /* @@ -1715,10 +2353,10 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * Only called when both the current and waking task are -deadline * tasks. */ -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { - if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; } @@ -1728,26 +2366,31 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... */ - if ((p->dl.deadline == rq->curr->dl.deadline) && + if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); #endif /* CONFIG_SMP */ } #ifdef CONFIG_SCHED_HRTICK -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, dl_se->runtime); } #else /* !CONFIG_SCHED_HRTICK */ -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } #endif static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + p->se.exec_start = rq_clock_task(rq); + if (on_dl_rq(&p->dl)) + update_stats_wait_end_dl(dl_rq, dl_se); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); @@ -1755,44 +2398,72 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled(rq)) - start_hrtick_dl(rq, p); - - if (rq->curr->sched_class != &dl_sched_class) + if (rq->donor->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); deadline_queue_push_tasks(rq); + + if (hrtick_enabled_dl(rq)) + start_hrtick_dl(rq, &p->dl); } -static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, - struct dl_rq *dl_rq) +static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) { struct rb_node *left = rb_first_cached(&dl_rq->root); if (!left) return NULL; - return rb_entry(left, struct sched_dl_entity, rb_node); + return __node_2_dle(left); } -static struct task_struct *pick_next_task_dl(struct rq *rq) +/* + * __pick_next_task_dl - Helper to pick the next -deadline task to run. + * @rq: The runqueue to pick the next task from. + */ +static struct task_struct *__pick_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; +again: if (!sched_dl_runnable(rq)) return NULL; - dl_se = pick_next_dl_entity(rq, dl_rq); - BUG_ON(!dl_se); - p = dl_task_of(dl_se); - set_next_task_dl(rq, p, true); + dl_se = pick_next_dl_entity(dl_rq); + WARN_ON_ONCE(!dl_se); + + if (dl_server(dl_se)) { + p = dl_se->server_pick_task(dl_se); + if (!p) { + if (dl_server_active(dl_se)) { + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + } + goto again; + } + rq->dl_server = dl_se; + } else { + p = dl_task_of(dl_se); + } + return p; } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +static struct task_struct *pick_task_dl(struct rq *rq) +{ + return __pick_task_dl(rq); +} + +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + + if (on_dl_rq(&p->dl)) + update_stats_wait_start_dl(dl_rq, dl_se); + update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); @@ -1818,9 +2489,9 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * not being the leftmost task anymore. In that case NEED_RESCHED will * be set and schedule() will start a new hrtick for the next task. */ - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && - is_leftmost(p, &rq->dl)) - start_hrtick_dl(rq, p); + if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && + is_leftmost(&p->dl, &rq->dl)) + start_hrtick_dl(rq, &p->dl); } static void task_fork_dl(struct task_struct *p) @@ -1836,35 +2507,26 @@ static void task_fork_dl(struct task_struct *p) /* Only try algorithms three times */ #define DL_MAX_TRIES 3 -static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) - return 1; - return 0; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: */ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) { - struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost; struct task_struct *p = NULL; + struct rb_node *next_node; if (!has_pushable_dl_tasks(rq)) return NULL; -next_node: - if (next_node) { - p = rb_entry(next_node, struct task_struct, pushable_dl_tasks); + next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); + while (next_node) { + p = __node_2_pdl(next_node); - if (pick_dl_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); - goto next_node; } return NULL; @@ -1929,8 +2591,8 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(later_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(later_mask, + sched_domain_span(sd)); /* * Last chance: if a CPU being in both later_mask * and current sd span is valid, that becomes our @@ -1952,7 +2614,7 @@ static int find_later_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(later_mask); + cpu = cpumask_any_distribute(later_mask); if (cpu < nr_cpu_ids) return cpu; @@ -1974,9 +2636,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); - if (later_rq->dl.dl_nr_running && - !dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) { + if (!dl_task_is_earliest_deadline(task, later_rq)) { /* * Target rq has tasks of equal or earlier deadline, * retrying does not release any lock and is unlikely @@ -1989,9 +2649,10 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || - task_running(rq, task) || + !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || + task_on_cpu(rq, task) || !dl_task(task) || + is_migration_disabled(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); later_rq = NULL; @@ -2004,9 +2665,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) * its earliest one has a later deadline than our * task, the rq is a good one. */ - if (!later_rq->dl.dl_nr_running || - dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) + if (dl_task_is_earliest_deadline(task, later_rq)) break; /* Otherwise we try again. */ @@ -2024,15 +2683,14 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) if (!has_pushable_dl_tasks(rq)) return NULL; - p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, - struct task_struct, pushable_dl_tasks); + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); return p; } @@ -2048,29 +2706,29 @@ static int push_dl_task(struct rq *rq) struct rq *later_rq; int ret = 0; - if (!rq->dl.overloaded) - return 0; - next_task = pick_next_pushable_dl_task(rq); if (!next_task) return 0; retry: - if (WARN_ON(next_task == rq->curr)) - return 0; - /* * If next_task preempts rq->curr, and rq->curr * can move away, it makes sense to just reschedule * without going further in pushing next_task. */ - if (dl_task(rq->curr) && - dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + if (dl_task(rq->donor) && + dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { resched_curr(rq); return 0; } + if (is_migration_disabled(next_task)) + return 0; + + if (WARN_ON(next_task == rq->curr)) + return 0; + /* We might release rq lock */ get_task_struct(next_task); @@ -2102,15 +2760,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, later_rq->cpu); - - /* - * Update the later_rq clock here, because the clock is used - * by the cpufreq_update_util() inside __add_running_bw(). - */ - update_rq_clock(later_rq); - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); + move_queued_task_locked(rq, later_rq, next_task); ret = 1; resched_curr(later_rq); @@ -2133,7 +2783,7 @@ static void push_dl_tasks(struct rq *rq) static void pull_dl_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; - struct task_struct *p; + struct task_struct *p, *push_task; bool resched = false; struct rq *src_rq; u64 dmin = LONG_MAX; @@ -2154,7 +2804,7 @@ static void pull_dl_task(struct rq *this_rq) src_rq = cpu_rq(cpu); /* - * It looks racy, abd it is! However, as in sched_rt.c, + * It looks racy, and it is! However, as in sched_rt.c, * we are fine with this. */ if (this_rq->dl.dl_nr_running && @@ -2163,6 +2813,7 @@ static void pull_dl_task(struct rq *this_rq) continue; /* Might drop this_rq->lock */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2180,9 +2831,7 @@ static void pull_dl_task(struct rq *this_rq) * - it will preempt the last one we pulled (if any). */ if (p && dl_time_before(p->dl.deadline, dmin) && - (!this_rq->dl.dl_nr_running || - dl_time_before(p->dl.deadline, - this_rq->dl.earliest_dl.curr))) { + dl_task_is_earliest_deadline(p, this_rq)) { WARN_ON(p == src_rq->curr); WARN_ON(!task_on_rq_queued(p)); @@ -2191,20 +2840,30 @@ static void pull_dl_task(struct rq *this_rq) * deadline than the current task of its runqueue. */ if (dl_time_before(p->dl.deadline, - src_rq->curr->dl.deadline)) + src_rq->donor->dl.deadline)) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - dmin = p->dl.deadline; + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + move_queued_task_locked(src_rq, this_rq, p); + dmin = p->dl.deadline; + resched = true; + } /* Is there any other task even earlier? */ } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + preempt_disable(); + raw_spin_rq_unlock(this_rq); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + preempt_enable(); + raw_spin_rq_lock(this_rq); + } } if (resched) @@ -2217,23 +2876,23 @@ skip: */ static void task_woken_dl(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - dl_task(rq->curr) && + dl_task(rq->donor) && (rq->curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &rq->curr->dl))) { + !dl_entity_preempt(&p->dl, &rq->donor->dl))) { push_dl_tasks(rq); } } static void set_cpus_allowed_dl(struct task_struct *p, - const struct cpumask *new_mask) + struct affinity_context *ctx) { struct root_domain *src_rd; struct rq *rq; - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); src_rd = rq->rd; @@ -2243,7 +2902,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * update. We already made space for us in the destination * domain (see cpuset_can_attach()). */ - if (!cpumask_intersects(src_rd->span, new_mask)) { + if (!cpumask_intersects(src_rd->span, ctx->new_mask)) { struct dl_bw *src_dl_b; src_dl_b = dl_bw_of(cpu_of(rq)); @@ -2257,7 +2916,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - set_cpus_allowed_common(p, new_mask); + set_cpus_allowed_common(p, ctx); } /* Assumes rq->lock is held */ @@ -2296,9 +2955,13 @@ void dl_add_task_root_domain(struct task_struct *p) struct rq *rq; struct dl_bw *dl_b; - rq = task_rq_lock(p, &rf); - if (!dl_task(p)) - goto unlock; + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + if (!dl_task(p)) { + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); + return; + } + + rq = __task_rq_lock(p, &rf); dl_b = &rq->rd->dl_bw; raw_spin_lock(&dl_b->lock); @@ -2307,17 +2970,27 @@ void dl_add_task_root_domain(struct task_struct *p) raw_spin_unlock(&dl_b->lock); -unlock: task_rq_unlock(rq, p, &rf); } void dl_clear_root_domain(struct root_domain *rd) { - unsigned long flags; + int i; - raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); rd->dl_bw.total_bw = 0; - raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); + + /* + * dl_server bandwidth is only restored when CPUs are attached to root + * domains (after domains are created or CPUs moved back to the + * default root doamin). + */ + for_each_cpu(i, rd->span) { + struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; + + if (dl_server(dl_se) && cpu_active(i)) + rd->dl_bw.total_bw += dl_se->dl_bw; + } } #endif /* CONFIG_SMP */ @@ -2333,7 +3006,13 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(p); + task_non_contending(&p->dl); + + /* + * In case a task is setscheduled out from SCHED_DEADLINE we need to + * keep track of that on its cpuset (for correct bandwidth tracking). + */ + dec_dl_tasks_cs(p); if (!task_on_rq_queued(p)) { /* @@ -2372,8 +3051,13 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); + + /* + * In case a task is setscheduled to SCHED_DEADLINE we need to keep + * track of that on its cpuset (for correct bandwidth tracking). + */ + inc_dl_tasks_cs(p); /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) { @@ -2382,15 +3066,17 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - if (rq->curr != p) { + if (rq->donor != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); #endif - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (dl_task(rq->donor)) + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); + } else { + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); } } @@ -2401,17 +3087,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (task_on_rq_queued(p) || rq->curr == p) { + if (!task_on_rq_queued(p)) + return; + #ifdef CONFIG_SMP - /* - * This might be too much, but unfortunately - * we don't have the old deadline value, and - * we can't argue if the task is increasing - * or lowering its prio, so... - */ - if (!rq->dl.overloaded) - deadline_queue_pull_task(rq); + /* + * This might be too much, but unfortunately + * we don't have the old deadline value, and + * we can't argue if the task is increasing + * or lowering its prio, so... + */ + if (!rq->dl.overloaded) + deadline_queue_pull_task(rq); + if (task_current_donor(rq, p)) { /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this @@ -2419,26 +3108,42 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline)) resched_curr(rq); -#else + } else { /* - * Again, we don't know if p has a earlier - * or later deadline, so let's blindly set a - * (maybe not needed) rescheduling point. + * Current may not be deadline in case p was throttled but we + * have just replenished it (e.g. rt_mutex_setprio()). + * + * Otherwise, if p was given an earlier deadline, reschedule. */ - resched_curr(rq); -#endif /* CONFIG_SMP */ + if (!dl_task(rq->curr) || + dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) + resched_curr(rq); } +#else + /* + * We don't know if p has a earlier or later deadline, so let's blindly + * set a (maybe not needed) rescheduling point. + */ + resched_curr(rq); +#endif +} + +#ifdef CONFIG_SCHED_CORE +static int task_is_throttled_dl(struct task_struct *p, int cpu) +{ + return p->dl.dl_throttled; } +#endif + +DEFINE_SCHED_CLASS(dl) = { -const struct sched_class dl_sched_class = { - .next = &rt_sched_class, .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, - .check_preempt_curr = check_preempt_curr_dl, + .wakeup_preempt = wakeup_preempt_dl, - .pick_next_task = pick_next_task_dl, + .pick_task = pick_task_dl, .put_prev_task = put_prev_task_dl, .set_next_task = set_next_task_dl, @@ -2450,6 +3155,7 @@ const struct sched_class dl_sched_class = { .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, + .find_lock_rq = find_lock_later_rq, #endif .task_tick = task_tick_dl, @@ -2460,35 +3166,44 @@ const struct sched_class dl_sched_class = { .switched_to = switched_to_dl, .update_curr = update_curr_dl, +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_dl, +#endif }; +/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ +static u64 dl_generation; + int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + u64 gen = ++dl_generation; struct dl_bw *dl_b; - int cpu, ret = 0; + int cpu, cpus, ret = 0; unsigned long flags; /* * Here we want to check the bandwidth not being set to some * value smaller than the currently allocated bandwidth in * any of the root_domains. - * - * FIXME: Cycling on all the CPUs is overdoing, but simpler than - * cycling on root_domains... Discussion on different/better - * solutions is welcome! */ - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) + goto next; + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (new_bw < dl_b->total_bw) + if (new_bw * cpus < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); +next: rcu_read_unlock_sched(); if (ret) @@ -2502,33 +3217,34 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) { if (global_rt_runtime() == RUNTIME_INF) { dl_rq->bw_ratio = 1 << RATIO_SHIFT; - dl_rq->extra_bw = 1 << BW_SHIFT; + dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT; } else { dl_rq->bw_ratio = to_ratio(global_rt_runtime(), global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); - dl_rq->extra_bw = to_ratio(global_rt_period(), - global_rt_runtime()); + dl_rq->max_bw = dl_rq->extra_bw = + to_ratio(global_rt_period(), global_rt_runtime()); } } void sched_dl_do_global(void) { u64 new_bw = -1; + u64 gen = ++dl_generation; struct dl_bw *dl_b; int cpu; unsigned long flags; - def_dl_bandwidth.dl_period = global_rt_period(); - def_dl_bandwidth.dl_runtime = global_rt_runtime(); - if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - /* - * FIXME: As above... - */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) { + rcu_read_unlock_sched(); + continue; + } + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); @@ -2551,11 +3267,12 @@ void sched_dl_do_global(void) int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus, err = -1; + int cpus, err = -1, cpu = task_cpu(p); + struct dl_bw *dl_b = dl_bw_of(cpu); + unsigned long cap; if (attr->sched_flags & SCHED_FLAG_SUGOV) return 0; @@ -2570,15 +3287,17 @@ int sched_dl_overflow(struct task_struct *p, int policy, * allocated bandwidth of the container. */ raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { + !__dl_overflow(dl_b, cap, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) { /* * XXX this is slightly incorrect: when the task * utilization decreases, we should delay the total @@ -2618,7 +3337,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; + dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } @@ -2631,7 +3350,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_runtime = dl_se->dl_runtime; attr->sched_deadline = dl_se->dl_deadline; attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; + attr->sched_flags &= ~SCHED_DL_FLAGS; + attr->sched_flags |= dl_se->flags; } /* @@ -2646,6 +3366,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + u64 period, max, min; + /* special dl tasks don't actually use any parameter */ if (attr->sched_flags & SCHED_FLAG_SUGOV) return true; @@ -2669,22 +3391,29 @@ bool __checkparam_dl(const struct sched_attr *attr) attr->sched_period & (1ULL << 63)) return false; + period = attr->sched_period; + if (!period) + period = attr->sched_deadline; + /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || + if (period < attr->sched_deadline || attr->sched_deadline < attr->sched_runtime) return false; + max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC; + min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC; + + if (period < min || period > max) + return false; + return true; } /* * This function clears the sched_dl_entity static params. */ -void __dl_clear_params(struct task_struct *p) +static void __dl_clear_params(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; - dl_se->dl_runtime = 0; dl_se->dl_deadline = 0; dl_se->dl_period = 0; @@ -2692,11 +3421,23 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; - dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + dl_se->dl_server = 0; + +#ifdef CONFIG_RT_MUTEXES + dl_se->pi_se = dl_se; +#endif +} + +void init_dl_entity(struct sched_dl_entity *dl_se) +{ + RB_CLEAR_NODE(&dl_se->rb_node); + init_dl_task_timer(dl_se); + init_dl_inactive_task_timer(dl_se); + __dl_clear_params(dl_se); } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) @@ -2706,60 +3447,25 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) if (dl_se->dl_runtime != attr->sched_runtime || dl_se->dl_deadline != attr->sched_deadline || dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) + dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS)) return true; return false; } #ifdef CONFIG_SMP -int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) -{ - unsigned int dest_cpu; - struct dl_bw *dl_b; - bool overflow; - int cpus, ret; - unsigned long flags; - - dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); - - rcu_read_lock_sched(); - dl_b = dl_bw_of(dest_cpu); - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); - if (overflow) { - ret = -EBUSY; - } else { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). - */ - __dl_add(dl_b, p->dl.dl_bw, cpus); - ret = 0; - } - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - rcu_read_unlock_sched(); - - return ret; -} - int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { - int ret = 1, trial_cpus; + unsigned long flags, cap; struct dl_bw *cur_dl_b; - unsigned long flags; + int ret = 1; rcu_read_lock_sched(); cur_dl_b = dl_bw_of(cpumask_any(cur)); - trial_cpus = cpumask_weight(trial); - + cap = __dl_bw_capacity(trial); raw_spin_lock_irqsave(&cur_dl_b->lock, flags); - if (cur_dl_b->bw != -1 && - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + if (__dl_overflow(cur_dl_b, cap, 0, 0)) ret = 0; raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); rcu_read_unlock_sched(); @@ -2767,22 +3473,97 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -bool dl_cpu_busy(unsigned int cpu) +enum dl_bw_request { + dl_bw_req_deactivate = 0, + dl_bw_req_alloc, + dl_bw_req_free +}; + +static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; - bool overflow; - int cpus; + bool overflow = 0; + u64 fair_server_bw = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); + + cap = dl_bw_capacity(cpu); + switch (req) { + case dl_bw_req_free: + __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); + break; + case dl_bw_req_alloc: + overflow = __dl_overflow(dl_b, cap, 0, dl_bw); + + if (!overflow) { + /* + * We reserve space in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); + } + break; + case dl_bw_req_deactivate: + /* + * cpu is not off yet, but we need to do the math by + * considering it off already (i.e., what would happen if we + * turn cpu off?). + */ + cap -= arch_scale_cpu_capacity(cpu); + + /* + * cpu is going offline and NORMAL tasks will be moved away + * from it. We can thus discount dl_server bandwidth + * contribution as it won't need to be servicing tasks after + * the cpu is off. + */ + if (cpu_rq(cpu)->fair_server.dl_server) + fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw; + + /* + * Not much to check if no DEADLINE bandwidth is present. + * dl_servers we can discount, as tasks will be moved out the + * offlined CPUs anyway. + */ + if (dl_b->total_bw - fair_server_bw > 0) { + /* + * Leaving at least one CPU for DEADLINE tasks seems a + * wise thing to do. As said above, cpu is not offline + * yet, so account for that. + */ + if (dl_bw_cpus(cpu) - 1) + overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); + else + overflow = 1; + } + + break; + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - return overflow; + return overflow ? -EBUSY : 0; +} + +int dl_bw_deactivate(int cpu) +{ + return dl_bw_manage(dl_bw_req_deactivate, cpu, 0); +} + +int dl_bw_alloc(int cpu, u64 dl_bw) +{ + return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw); +} + +void dl_bw_free(int cpu, u64 dl_bw) +{ + dl_bw_manage(dl_bw_req_free, cpu, dl_bw); } #endif |