diff options
Diffstat (limited to 'kernel/sched/deadline.c')
-rw-r--r-- | kernel/sched/deadline.c | 745 |
1 files changed, 554 insertions, 191 deletions
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a04a436af8cc..ad45a8fea245 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -26,7 +26,7 @@ static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ #ifdef CONFIG_SYSCTL -static struct ctl_table sched_dl_sysctls[] = { +static const struct ctl_table sched_dl_sysctls[] = { { .procname = "sched_deadline_period_max_us", .data = &sysctl_sched_dl_period_max, @@ -43,7 +43,6 @@ static struct ctl_table sched_dl_sysctls[] = { .proc_handler = proc_douintvec_minmax, .extra2 = (void *)&sysctl_sched_dl_period_max, }, - {} }; static int __init sched_dl_sysctl_init(void) @@ -167,14 +166,14 @@ static inline unsigned long dl_bw_capacity(int i) } } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { struct root_domain *rd = cpu_rq(cpu)->rd; - if (rd->visit_gen == gen) + if (rd->visit_cookie == cookie) return true; - rd->visit_gen = gen; + rd->visit_cookie = cookie; return false; } @@ -208,7 +207,7 @@ static inline unsigned long dl_bw_capacity(int i) return SCHED_CAPACITY_SCALE; } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { return false; } @@ -250,8 +249,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; - SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); /* kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } @@ -263,7 +262,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) dl_rq->running_bw = 0; /* kick cpufreq (see the comment in kernel/sched/sched.h). */ @@ -277,7 +276,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; - SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->this_bw < old); /* overflow */ } static inline @@ -287,10 +286,10 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) dl_rq->this_bw = 0; - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); } static inline @@ -321,19 +320,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) __sub_running_bw(dl_se->dl_bw, dl_rq); } -static void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) { - struct rq *rq; - - WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); - - if (task_on_rq_queued(p)) - return; + if (dl_se->dl_non_contending) { + sub_running_bw(dl_se, &rq->dl); + dl_se->dl_non_contending = 0; - rq = task_rq(p); - if (p->dl.dl_non_contending) { - sub_running_bw(&p->dl, &rq->dl); - p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the * timer cannot be canceled, inactive_task_timer() @@ -341,13 +333,48 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } - __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __sub_rq_bw(dl_se->dl_bw, &rq->dl); __add_rq_bw(new_bw, &rq->dl); } +static __always_inline +void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer) +{ + /* + * If the timer callback was running (hrtimer_try_to_cancel == -1), + * it will eventually call put_task_struct(). + */ + if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); +} + +static __always_inline +void cancel_replenish_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->dl_timer); +} + +static __always_inline +void cancel_inactive_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->inactive_timer); +} + +static void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); + + if (task_on_rq_queued(p)) + return; + + dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); +} + static void __dl_clear_params(struct sched_dl_entity *dl_se); /* @@ -491,10 +518,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { - if (!dl_server(dl_se)) - put_task_struct(dl_task_of(dl_se)); - } + cancel_inactive_timer(dl_se); } else { /* * Since "dl_non_contending" is not set, the @@ -709,7 +733,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p } /* - * And we finally need to fixup root_domain(s) bandwidth accounting, + * And we finally need to fix up root_domain(s) bandwidth accounting, * since p is still hanging out in the old (now moved to default) root * domain. */ @@ -772,6 +796,15 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, /* for non-boosted task, pi_of(dl_se) == dl_se */ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; dl_se->runtime = pi_of(dl_se)->dl_runtime; + + /* + * If it is a deferred reservation, and the server + * is not handling an starvation case, defer it. + */ + if (dl_se->dl_defer && !dl_se->dl_defer_running) { + dl_se->dl_throttled = 1; + dl_se->dl_defer_armed = 1; + } } /* @@ -810,6 +843,9 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) replenish_dl_new_period(dl_se, rq); } +static int start_dl_timer(struct sched_dl_entity *dl_se); +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); + /* * Pure Earliest Deadline First (EDF) scheduling does not deal with the * possibility of a entity lasting more than what it declared, and thus @@ -838,9 +874,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. + * + * Or, it could be the case of a deferred reservation that + * was not able to consume its runtime in background and + * reached this point with current u > U. + * + * In both cases, set a new period. */ - if (dl_se->dl_deadline == 0) - replenish_dl_new_period(dl_se, rq); + if (dl_se->dl_deadline == 0 || + (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; + } if (dl_se->dl_yielded && dl_se->runtime > 0) dl_se->runtime = 0; @@ -874,6 +919,44 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) dl_se->dl_yielded = 0; if (dl_se->dl_throttled) dl_se->dl_throttled = 0; + + /* + * If this is the replenishment of a deferred reservation, + * clear the flag and return. + */ + if (dl_se->dl_defer_armed) { + dl_se->dl_defer_armed = 0; + return; + } + + /* + * A this point, if the deferred server is not armed, and the deadline + * is in the future, if it is not running already, throttle the server + * and arm the defer timer. + */ + if (dl_se->dl_defer && !dl_se->dl_defer_running && + dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { + if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + + /* + * Set dl_se->dl_defer_armed and dl_throttled variables to + * inform the start_dl_timer() that this is a deferred + * activation. + */ + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + if (!start_dl_timer(dl_se)) { + /* + * If for whatever reason (delays), a previous timer was + * queued but not serviced, cancel it and clean the + * deferrable server variables intended for start_dl_timer(). + */ + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } + } + } } /* @@ -993,7 +1076,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * is detected, the runtime and deadline need to be updated. * * If the task has an implicit deadline, i.e., deadline == period, the Original - * CBS is applied. the runtime is replenished and a new absolute deadline is + * CBS is applied. The runtime is replenished and a new absolute deadline is * set, as in the previous cases. * * However, the Original CBS does not work properly for tasks with @@ -1024,6 +1107,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) } replenish_dl_new_period(dl_se, rq); + } else if (dl_server(dl_se) && dl_se->dl_defer) { + /* + * The server can still use its previous deadline, so check if + * it left the dl_defer_running state. + */ + if (!dl_se->dl_defer_running) { + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + } } } @@ -1056,8 +1148,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) * We want the timer to fire at the deadline, but considering * that it is actually coming from rq->clock and not from * hrtimer's time base reading. + * + * The deferred reservation will have its timer set to + * (deadline - runtime). At that point, the CBS rule will decide + * if the current deadline can be used, or if a replenishment is + * required to avoid add too much pressure on the system + * (current u > U). */ - act = ns_to_ktime(dl_next_period(dl_se)); + if (dl_se->dl_defer_armed) { + WARN_ON_ONCE(!dl_se->dl_throttled); + act = ns_to_ktime(dl_se->deadline - dl_se->runtime); + } else { + /* act = deadline - rel-deadline + period */ + act = ns_to_ktime(dl_next_period(dl_se)); + } + now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -1107,6 +1212,62 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) #endif } +/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ +static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; + +static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) +{ + struct rq *rq = rq_of_dl_se(dl_se); + u64 fw; + + scoped_guard (rq_lock, rq) { + struct rq_flags *rf = &scope.rf; + + if (!dl_se->dl_throttled || !dl_se->dl_runtime) + return HRTIMER_NORESTART; + + sched_clock_tick(); + update_rq_clock(rq); + + if (!dl_se->dl_runtime) + return HRTIMER_NORESTART; + + if (!dl_se->server_has_tasks(dl_se)) { + replenish_dl_entity(dl_se); + return HRTIMER_NORESTART; + } + + if (dl_se->dl_defer_armed) { + /* + * First check if the server could consume runtime in background. + * If so, it is possible to push the defer timer for this amount + * of time. The dl_server_min_res serves as a limit to avoid + * forwarding the timer for a too small amount of time. + */ + if (dl_time_before(rq_clock(dl_se->rq), + (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { + + /* reset the defer timer */ + fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; + + hrtimer_forward_now(timer, ns_to_ktime(fw)); + return HRTIMER_RESTART; + } + + dl_se->dl_defer_running = 1; + } + + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) + resched_curr(rq); + + __push_dl_task(rq, rf); + } + + return HRTIMER_NORESTART; +} + /* * This is the bandwidth enforcement timer callback. If here, we know * a task is not on its dl_rq, since the fact that the timer was running @@ -1129,28 +1290,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct rq_flags rf; struct rq *rq; - if (dl_server(dl_se)) { - struct rq *rq = rq_of_dl_se(dl_se); - struct rq_flags rf; - - rq_lock(rq, &rf); - if (dl_se->dl_throttled) { - sched_clock_tick(); - update_rq_clock(rq); - - if (dl_se->server_has_tasks(dl_se)) { - enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); - resched_curr(rq); - __push_dl_task(rq, &rf); - } else { - replenish_dl_entity(dl_se); - } - - } - rq_unlock(rq, &rf); - - return HRTIMER_NORESTART; - } + if (dl_server(dl_se)) + return dl_server_timer(timer, dl_se); p = dl_task_of(dl_se); rq = task_rq_lock(p, &rf); @@ -1218,7 +1359,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) #endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -1241,8 +1382,7 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = dl_task_timer; + hrtimer_setup(timer, dl_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } /* @@ -1295,7 +1435,7 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw - * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. * Since delta is a 64 bit variable, to have an overflow its value should be * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is * not an issue here. @@ -1320,22 +1460,10 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) return (delta * u_act) >> BW_SHIFT; } -static inline void -update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, - int flags); -static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) +s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { s64 scaled_delta_exec; - if (unlikely(delta_exec <= 0)) { - if (unlikely(dl_se->dl_yielded)) - goto throttle; - return; - } - - if (dl_entity_is_special(dl_se)) - return; - /* * For tasks that participate in GRUB, we implement GRUB-PA: the * spare reclaimed bandwidth is used to clock down frequency. @@ -1354,8 +1482,64 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); } + return scaled_delta_exec; +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags); +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) +{ + s64 scaled_delta_exec; + + if (unlikely(delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; + return; + } + + if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) + return; + + if (dl_entity_is_special(dl_se)) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + dl_se->runtime -= scaled_delta_exec; + /* + * The fair server can consume its runtime while throttled (not queued/ + * running as regular CFS). + * + * If the server consumes its entire runtime in this state. The server + * is not required for the current period. Thus, reset the server by + * starting a new period, pushing the activation. + */ + if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { + /* + * If the server was previously activated - the starving condition + * took place, it this point it went away because the fair scheduler + * was able to get runtime in background. So return to the initial + * state. + */ + dl_se->dl_defer_running = 0; + + hrtimer_try_to_cancel(&dl_se->dl_timer); + + replenish_dl_new_period(dl_se, dl_se->rq); + + /* + * Not being able to start the timer seems problematic. If it could not + * be started for whatever reason, we need to "unthrottle" the DL server + * and queue right away. Otherwise nothing might queue it. That's similar + * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. + */ + WARN_ON_ONCE(!start_dl_timer(dl_se)); + + return; + } + throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; @@ -1383,6 +1567,14 @@ throttle: } /* + * The fair server (sole dl_server) does not account for real-time + * workload because it is running fair work. + */ + if (dl_se == &rq->fair_server) + return; + +#ifdef CONFIG_RT_GROUP_SCHED + /* * Because -- for now -- we share the rt bandwidth, we need to * account our runtime there too, otherwise actual rt tasks * would be able to exceed the shared quota. @@ -1406,34 +1598,157 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } +#endif +} + +/* + * In the non-defer mode, the idle time is not accounted, as the + * server provides a guarantee. + * + * If the dl_server is in defer mode, the idle time is also considered + * as time available for the fair server, avoiding a penalty for the + * rt scheduler that did not consumed that time. + */ +void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) +{ + s64 delta_exec, scaled_delta_exec; + + if (!rq->fair_server.dl_defer) + return; + + /* no need to discount more */ + if (rq->fair_server.runtime < 0) + return; + + delta_exec = rq_clock_task(rq) - p->se.exec_start; + if (delta_exec < 0) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); + + rq->fair_server.runtime -= scaled_delta_exec; + + if (rq->fair_server.runtime < 0) { + rq->fair_server.dl_defer_running = 0; + rq->fair_server.runtime = 0; + } + + p->se.exec_start = rq_clock_task(rq); } void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { - update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + /* 0 runtime = fair server disabled */ + if (dl_se->dl_runtime) + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } void dl_server_start(struct sched_dl_entity *dl_se) { + struct rq *rq = dl_se->rq; + + /* + * XXX: the apply do not work fine at the init phase for the + * fair server because things are not yet set. We need to improve + * this before getting generic. + */ if (!dl_server(dl_se)) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + dl_server_apply_params(dl_se, runtime, period, 1); + dl_se->dl_server = 1; + dl_se->dl_defer = 1; setup_new_dl_entity(dl_se); } + + if (!dl_se->dl_runtime) + return; + + dl_se->dl_server_active = 1; enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) + resched_curr(dl_se->rq); } void dl_server_stop(struct sched_dl_entity *dl_se) { + if (!dl_se->dl_runtime) + return; + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + dl_se->dl_server_active = 0; } void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, - dl_server_pick_f pick) + dl_server_pick_f pick_task) { dl_se->rq = rq; dl_se->server_has_tasks = has_tasks; - dl_se->server_pick = pick; + dl_se->server_pick_task = pick_task; +} + +void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 new_bw = dl_se->dl_bw; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + + dl_b = dl_bw_of(cpu_of(rq)); + guard(raw_spinlock)(&dl_b->lock); + + if (!dl_bw_cpus(cpu)) + return; + + __dl_add(dl_b, new_bw, dl_bw_cpus(cpu)); +} + +int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) +{ + u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); + u64 new_bw = to_ratio(period, runtime); + struct rq *rq = dl_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + unsigned long cap; + int retval = 0; + int cpus; + + dl_b = dl_bw_of(cpu); + guard(raw_spinlock)(&dl_b->lock); + + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + + if (__dl_overflow(dl_b, cap, old_bw, new_bw)) + return -EBUSY; + + if (init) { + __add_rq_bw(new_bw, &rq->dl); + __dl_add(dl_b, new_bw, cpus); + } else { + __dl_sub(dl_b, dl_se->dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + + dl_rq_change_utilization(rq, dl_se, new_bw); + } + + dl_se->dl_runtime = runtime; + dl_se->dl_deadline = period; + dl_se->dl_period = period; + + dl_se->runtime = 0; + dl_se->deadline = 0; + + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); + + return retval; } /* @@ -1442,11 +1757,11 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, */ static void update_curr_dl(struct rq *rq) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; + struct task_struct *donor = rq->donor; + struct sched_dl_entity *dl_se = &donor->dl; s64 delta_exec; - if (!dl_task(curr) || !on_dl_rq(dl_se)) + if (!dl_task(donor) || !on_dl_rq(dl_se)) return; /* @@ -1523,8 +1838,7 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = inactive_task_timer; + hrtimer_setup(timer, inactive_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #define __node_2_dle(node) \ @@ -1600,46 +1914,40 @@ static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); } -static inline struct sched_statistics * +static __always_inline struct sched_statistics * __schedstats_from_dl_se(struct sched_dl_entity *dl_se) { + if (!schedstat_enabled()) + return NULL; + + if (dl_server(dl_se)) + return NULL; + return &dl_task_of(dl_se)->stats; } static inline void update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) { - struct sched_statistics *stats; - - if (!schedstat_enabled()) - return; - - stats = __schedstats_from_dl_se(dl_se); - __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); } static inline void update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) { - struct sched_statistics *stats; - - if (!schedstat_enabled()) - return; - - stats = __schedstats_from_dl_se(dl_se); - __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); } static inline void update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) { - struct sched_statistics *stats; - - if (!schedstat_enabled()) - return; - - stats = __schedstats_from_dl_se(dl_se); - __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); } static inline void @@ -1736,7 +2044,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) * be counted in the active utilization; hence, we need to call * add_running_bw(). */ - if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { if (flags & ENQUEUE_WAKEUP) task_contending(dl_se, flags); @@ -1754,10 +2062,30 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && + !is_dl_boosted(dl_se) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { setup_new_dl_entity(dl_se); } + /* + * If the reservation is still throttled, e.g., it got replenished but is a + * deferred task and still got to wait, don't enqueue. + */ + if (dl_se->dl_throttled && start_dl_timer(dl_se)) + return; + + /* + * We're about to enqueue, make sure we're not ->dl_throttled! + * In case the timer was not started, say because the defer time + * has passed, mark as not throttled and mark unarmed. + * Also cancel earlier timers, since letting those run is pointless. + */ + if (dl_se->dl_throttled) { + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } + __enqueue_dl_entity(dl_se); } @@ -1806,7 +2134,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * problem if it fires concurrently: boosted threads * are ignored in dl_task_timer(). */ - hrtimer_try_to_cancel(&p->dl.dl_timer); + cancel_replenish_timer(&p->dl); p->dl.dl_throttled = 0; } } else if (!dl_prio(p->normal_prio)) { @@ -1842,7 +2170,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_pushable_dl_task(rq, p); } -static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); @@ -1852,6 +2180,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) dequeue_dl_entity(&p->dl, flags); if (!p->dl.dl_throttled && !dl_server(&p->dl)) dequeue_pushable_dl_task(rq, p); + + return true; } /* @@ -1899,7 +2229,7 @@ static int find_later_rq(struct task_struct *task); static int select_task_rq_dl(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; bool select_rq; struct rq *rq; @@ -1910,6 +2240,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If we are dealing with a -deadline task, we must @@ -1920,9 +2251,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - select_rq = unlikely(dl_task(curr)) && + select_rq = unlikely(dl_task(donor)) && (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && + !dl_entity_preempt(&p->dl, &donor->dl)) && p->nr_cpus_allowed > 1; /* @@ -1971,8 +2302,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); } sub_rq_bw(&p->dl, &rq->dl); rq_unlock(rq, &rf); @@ -1985,7 +2315,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) + !cpudl_find(&rq->rd->cpudl, rq->donor, NULL)) return; /* @@ -2024,7 +2354,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { - if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; } @@ -2034,7 +2364,7 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... */ - if ((p->dl.deadline == rq->curr->dl.deadline) && + if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); #endif /* CONFIG_SMP */ @@ -2066,10 +2396,13 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (rq->curr->sched_class != &dl_sched_class) + if (rq->donor->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); deadline_queue_push_tasks(rq); + + if (hrtick_enabled_dl(rq)) + start_hrtick_dl(rq, &p->dl); } static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) @@ -2082,7 +2415,11 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) return __node_2_dle(left); } -static struct task_struct *pick_task_dl(struct rq *rq) +/* + * __pick_next_task_dl - Helper to pick the next -deadline task to run. + * @rq: The runqueue to pick the next task from. + */ +static struct task_struct *__pick_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -2096,14 +2433,15 @@ again: WARN_ON_ONCE(!dl_se); if (dl_server(dl_se)) { - p = dl_se->server_pick(dl_se); + p = dl_se->server_pick_task(dl_se); if (!p) { - WARN_ON_ONCE(1); - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); + if (dl_server_active(dl_se)) { + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + } goto again; } - p->dl_server = dl_se; + rq->dl_server = dl_se; } else { p = dl_task_of(dl_se); } @@ -2111,24 +2449,12 @@ again: return p; } -static struct task_struct *pick_next_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq) { - struct task_struct *p; - - p = pick_task_dl(rq); - if (!p) - return p; - - if (!p->dl_server) - set_next_task_dl(rq, p, true); - - if (hrtick_enabled(rq)) - start_hrtick_dl(rq, &p->dl); - - return p; + return __pick_task_dl(rq); } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) { struct sched_dl_entity *dl_se = &p->dl; struct dl_rq *dl_rq = &rq->dl; @@ -2179,14 +2505,6 @@ static void task_fork_dl(struct task_struct *p) /* Only try algorithms three times */ #define DL_MAX_TRIES 3 -static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_on_cpu(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_mask)) - return 1; - return 0; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: @@ -2200,16 +2518,13 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu return NULL; next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); - -next_node: - if (next_node) { + while (next_node) { p = __node_2_pdl(next_node); - if (pick_dl_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); - goto next_node; } return NULL; @@ -2399,8 +2714,8 @@ retry: * can move away, it makes sense to just reschedule * without going further in pushing next_task. */ - if (dl_task(rq->curr) && - dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + if (dl_task(rq->donor) && + dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { resched_curr(rq); return 0; @@ -2443,9 +2758,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, later_rq->cpu); - activate_task(later_rq, next_task, 0); + move_queued_task_locked(rq, later_rq, next_task); ret = 1; resched_curr(later_rq); @@ -2489,7 +2802,7 @@ static void pull_dl_task(struct rq *this_rq) src_rq = cpu_rq(cpu); /* - * It looks racy, abd it is! However, as in sched_rt.c, + * It looks racy, and it is! However, as in sched_rt.c, * we are fine with this. */ if (this_rq->dl.dl_nr_running && @@ -2525,15 +2838,13 @@ static void pull_dl_task(struct rq *this_rq) * deadline than the current task of its runqueue. */ if (dl_time_before(p->dl.deadline, - src_rq->curr->dl.deadline)) + src_rq->donor->dl.deadline)) goto skip; if (is_migration_disabled(p)) { push_task = get_push_task(src_rq); } else { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + move_queued_task_locked(src_rq, this_rq, p); dmin = p->dl.deadline; resched = true; } @@ -2566,9 +2877,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - dl_task(rq->curr) && + dl_task(rq->donor) && (rq->curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &rq->curr->dl))) { + !dl_entity_preempt(&p->dl, &rq->donor->dl))) { push_dl_tasks(rq); } } @@ -2643,7 +2954,7 @@ void dl_add_task_root_domain(struct task_struct *p) struct dl_bw *dl_b; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - if (!dl_task(p)) { + if (!dl_task(p) || dl_entity_is_special(&p->dl)) { raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return; } @@ -2662,11 +2973,26 @@ void dl_add_task_root_domain(struct task_struct *p) void dl_clear_root_domain(struct root_domain *rd) { - unsigned long flags; + int i; - raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); rd->dl_bw.total_bw = 0; - raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); + + /* + * dl_servers are not tasks. Since dl_add_task_root_domain ignores + * them, we need to account for them here explicitly. + */ + for_each_cpu(i, rd->span) { + struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; + + if (dl_server(dl_se) && cpu_active(i)) + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i)); + } +} + +void dl_clear_root_domain_cpu(int cpu) +{ + dl_clear_root_domain(cpu_rq(cpu)->rd); } #endif /* CONFIG_SMP */ @@ -2727,8 +3053,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); /* * In case a task is setscheduled to SCHED_DEADLINE we need to keep @@ -2743,12 +3068,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - if (rq->curr != p) { + if (rq->donor != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); #endif - if (dl_task(rq->curr)) + if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -2777,7 +3102,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, if (!rq->dl.overloaded) deadline_queue_pull_task(rq); - if (task_current(rq, p)) { + if (task_current_donor(rq, p)) { /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this @@ -2820,13 +3145,12 @@ DEFINE_SCHED_CLASS(dl) = { .wakeup_preempt = wakeup_preempt_dl, - .pick_next_task = pick_next_task_dl, + .pick_task = pick_task_dl, .put_prev_task = put_prev_task_dl, .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP .balance = balance_dl, - .pick_task = pick_task_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, @@ -2849,15 +3173,18 @@ DEFINE_SCHED_CLASS(dl) = { #endif }; -/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ -static u64 dl_generation; +/* + * Used for dl_bw check and update, used under sched_rt_handler()::mutex and + * sched_domains_mutex. + */ +u64 dl_cookie; int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu, cpus, ret = 0; unsigned long flags; @@ -2867,10 +3194,10 @@ int sched_dl_global_validate(void) * value smaller than the currently allocated bandwidth in * any of the root_domains. */ - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) + if (dl_bw_visited(cpu, cookie)) goto next; dl_b = dl_bw_of(cpu); @@ -2907,7 +3234,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) void sched_dl_do_global(void) { u64 new_bw = -1; - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -2918,7 +3245,7 @@ void sched_dl_do_global(void) for_each_possible_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) { + if (dl_bw_visited(cpu, cookie)) { rcu_read_unlock_sched(); continue; } @@ -3152,29 +3479,31 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, } enum dl_bw_request { - dl_bw_req_check_overflow = 0, + dl_bw_req_deactivate = 0, dl_bw_req_alloc, dl_bw_req_free }; static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow = 0; + u64 fair_server_bw = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (req == dl_bw_req_free) { + cap = dl_bw_capacity(cpu); + switch (req) { + case dl_bw_req_free: __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); - } else { - unsigned long cap = dl_bw_capacity(cpu); - + break; + case dl_bw_req_alloc: overflow = __dl_overflow(dl_b, cap, 0, dl_bw); - if (req == dl_bw_req_alloc && !overflow) { + if (!overflow) { /* * We reserve space in the destination * root_domain, as we can't fail after this point. @@ -3183,6 +3512,42 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) */ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); } + break; + case dl_bw_req_deactivate: + /* + * cpu is not off yet, but we need to do the math by + * considering it off already (i.e., what would happen if we + * turn cpu off?). + */ + cap -= arch_scale_cpu_capacity(cpu); + + /* + * cpu is going offline and NORMAL tasks will be moved away + * from it. We can thus discount dl_server bandwidth + * contribution as it won't need to be servicing tasks after + * the cpu is off. + */ + if (cpu_rq(cpu)->fair_server.dl_server) + fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw; + + /* + * Not much to check if no DEADLINE bandwidth is present. + * dl_servers we can discount, as tasks will be moved out the + * offlined CPUs anyway. + */ + if (dl_b->total_bw - fair_server_bw > 0) { + /* + * Leaving at least one CPU for DEADLINE tasks seems a + * wise thing to do. As said above, cpu is not offline + * yet, so account for that. + */ + if (dl_bw_cpus(cpu) - 1) + overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); + else + overflow = 1; + } + + break; } raw_spin_unlock_irqrestore(&dl_b->lock, flags); @@ -3191,9 +3556,9 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) return overflow ? -EBUSY : 0; } -int dl_bw_check_overflow(int cpu) +int dl_bw_deactivate(int cpu) { - return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); + return dl_bw_manage(dl_bw_req_deactivate, cpu, 0); } int dl_bw_alloc(int cpu, u64 dl_bw) @@ -3207,9 +3572,7 @@ void dl_bw_free(int cpu, u64 dl_bw) } #endif -#ifdef CONFIG_SCHED_DEBUG void print_dl_stats(struct seq_file *m, int cpu) { print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); } -#endif /* CONFIG_SCHED_DEBUG */ |