From 1ea6c46a23f1213d1972bfae220db5c165e27bba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 May 2017 15:59:54 +0200 Subject: sched/fair: Propagate an effective runnable_load_avg The load balancer uses runnable_load_avg as load indicator. For !cgroup this is: runnable_load_avg = \Sum se->avg.load_avg ; where se->on_rq That is, a direct sum of all runnable tasks on that runqueue. As opposed to load_avg, which is a sum of all tasks on the runqueue, which includes a blocked component. However, in the cgroup case, this comes apart since the group entities are always runnable, even if most of their constituent entities are blocked. Therefore introduce a runnable_weight which for task entities is the same as the regular weight, but for group entities is a fraction of the entity weight and represents the runnable part of the group runqueue. Then propagate this load through the PELT hierarchy to arrive at an effective runnable load avgerage -- which we should not confuse with the canonical runnable load average. Suggested-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 8 ++- kernel/sched/fair.c | 172 +++++++++++++++++++++++++++++++++------------------ kernel/sched/sched.h | 3 +- 3 files changed, 121 insertions(+), 62 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2e039a81864c..1ca0130ed4f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P_SCHEDSTAT(se->statistics.wait_count); } P(se->load.weight); + P(se->runnable_weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); + P(se->avg.runnable_load_avg); #endif #undef PN_SCHEDSTAT @@ -558,10 +560,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", - cfs_rq->runnable_load_avg); + cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", @@ -1006,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); + P(se.runnable_weight); #ifdef CONFIG_SMP P(se.avg.load_sum); + P(se.avg.runnable_load_sum); P(se.avg.util_sum); P(se.avg.load_avg); + P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 086a5d979720..10d2000fca2d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -730,8 +730,9 @@ void init_entity_runnable_average(struct sched_entity *se) * nothing has been attached to the task group yet. */ if (entity_is_task(se)) - sa->load_avg = scale_load_down(se->load.weight); - sa->load_sum = LOAD_AVG_MAX; + sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); + sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX; + /* * At this point, util_avg won't be used in select_task_rq_fair anyway */ @@ -2731,25 +2732,35 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_SMP /* - * XXX we want to get rid of this helper and use the full load resolution. + * XXX we want to get rid of these helpers and use the full load resolution. */ static inline long se_weight(struct sched_entity *se) { return scale_load_down(se->load.weight); } +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} + static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->runnable_load_avg += se->avg.load_avg; - cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum; + cfs_rq->runnable_weight += se->runnable_weight; + + cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; + cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; } static inline void dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum); + cfs_rq->runnable_weight -= se->runnable_weight; + + sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); + sub_positive(&cfs_rq->avg.runnable_load_sum, + se_runnable(se) * se->avg.runnable_load_sum); } static inline void @@ -2777,7 +2788,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - unsigned long weight) + unsigned long weight, unsigned long runnable) { if (se->on_rq) { /* commit outstanding execution time */ @@ -2788,11 +2799,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } dequeue_load_avg(cfs_rq, se); + se->runnable_weight = runnable; update_load_set(&se->load, weight); #ifdef CONFIG_SMP - se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, - LOAD_AVG_MAX - 1024 + se->avg.period_contrib); + do { + u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); + se->avg.runnable_load_avg = + div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); + } while (0); #endif enqueue_load_avg(cfs_rq, se); @@ -2809,7 +2826,7 @@ void reweight_task(struct task_struct *p, int prio) struct load_weight *load = &se->load; unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight); + reweight_entity(cfs_rq, se, weight, weight); load->inv_weight = sched_prio_to_wmult[prio]; } @@ -2917,31 +2934,45 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq) static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct sched_entity *se) +/* + * Recomputes the group entity based on the current state of its group + * runqueue. + */ +static void update_cfs_group(struct sched_entity *se) { - struct cfs_rq *cfs_rq = group_cfs_rq(se); - long shares; + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long shares, runnable; - if (!cfs_rq) + if (!gcfs_rq) return; - if (throttled_hierarchy(cfs_rq)) + if (throttled_hierarchy(gcfs_rq)) return; #ifndef CONFIG_SMP - shares = READ_ONCE(cfs_rq->tg->shares); + runnable = shares = READ_ONCE(gcfs_rq->tg->shares); if (likely(se->load.weight == shares)) return; #else - shares = calc_cfs_shares(cfs_rq); + shares = calc_cfs_shares(gcfs_rq); + /* + * The hierarchical runnable load metric is the proportional part + * of this group's runnable_load_avg / load_avg. + * + * Note: we need to deal with very sporadic 'runnable > load' cases + * due to numerical instability. + */ + runnable = shares * gcfs_rq->avg.runnable_load_avg; + if (runnable) + runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg); #endif - reweight_entity(cfs_rq_of(se), se, shares); + reweight_entity(cfs_rq_of(se), se, shares, runnable); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct sched_entity *se) +static inline void update_cfs_group(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3050,7 +3081,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) */ static __always_inline u32 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long load, unsigned long runnable, int running) { unsigned long scale_freq, scale_cpu; u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ @@ -3067,10 +3098,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ if (periods) { sa->load_sum = decay_load(sa->load_sum, periods); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods); - } + sa->runnable_load_sum = + decay_load(sa->runnable_load_sum, periods); sa->util_sum = decay_load((u64)(sa->util_sum), periods); /* @@ -3083,11 +3112,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, sa->period_contrib = delta; contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } + if (load) + sa->load_sum += load * contrib; + if (runnable) + sa->runnable_load_sum += runnable * contrib; if (running) sa->util_sum += contrib * scale_cpu; @@ -3124,7 +3152,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, */ static __always_inline int ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, - unsigned long weight, int running, struct cfs_rq *cfs_rq) + unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -3157,8 +3185,8 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * this happens during idle_balance() which calls * update_blocked_averages() */ - if (!weight) - running = 0; + if (!load) + runnable = running = 0; /* * Now we know we crossed measurement unit boundaries. The *_avg @@ -3167,45 +3195,60 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. */ - if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) return 0; return 1; } static __always_inline void -___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq) +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) { u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; /* * Step 2: update *_avg. */ - sa->load_avg = div_u64(weight * sa->load_sum, divider); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, divider); - } + sa->load_avg = div_u64(load * sa->load_sum, divider); + sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); sa->util_avg = sa->util_sum / divider; } /* * sched_entity: * + * task: + * se_runnable() == se_weight() + * + * group: [ see update_cfs_group() ] + * se_weight() = tg->weight * grq->load_avg / tg->load_avg + * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg + * * load_sum := runnable_sum * load_avg = se_weight(se) * runnable_avg * + * runnable_load_sum := runnable_sum + * runnable_load_avg = se_runnable(se) * runnable_avg + * + * XXX collapse load_sum and runnable_load_sum + * * cfq_rs: * * load_sum = \Sum se_weight(se) * se->avg.load_sum * load_avg = \Sum se->avg.load_avg + * + * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum + * runnable_load_avg = \Sum se->avg.runable_load_avg */ static int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) { - ___update_load_avg(&se->avg, se_weight(se), NULL); + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -3215,10 +3258,13 @@ __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) static int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, - cfs_rq->curr == se, NULL)) { + if (entity_is_task(se)) + se->runnable_weight = se->load.weight; + + if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + cfs_rq->curr == se)) { - ___update_load_avg(&se->avg, se_weight(se), NULL); + ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -3230,8 +3276,10 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, cpu, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->curr != NULL, cfs_rq)) { - ___update_load_avg(&cfs_rq->avg, 1, cfs_rq); + scale_load_down(cfs_rq->runnable_weight), + cfs_rq->curr != NULL)) { + + ___update_load_avg(&cfs_rq->avg, 1, 1); return 1; } @@ -3409,8 +3457,8 @@ static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long runnable_sum = gcfs_rq->prop_runnable_sum; - long load_avg; - s64 load_sum; + long runnable_load_avg, load_avg; + s64 runnable_load_sum, load_sum; if (!runnable_sum) return; @@ -3426,9 +3474,15 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf add_positive(&cfs_rq->avg.load_avg, load_avg); add_positive(&cfs_rq->avg.load_sum, load_sum); + runnable_load_sum = (s64)se_runnable(se) * runnable_sum; + runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); + + add_positive(&se->avg.runnable_load_sum, runnable_sum); + add_positive(&se->avg.runnable_load_avg, runnable_load_avg); + if (se->on_rq) { - add_positive(&cfs_rq->runnable_load_avg, load_avg); - add_positive(&cfs_rq->runnable_load_sum, load_sum); + add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); + add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); } } @@ -3710,7 +3764,7 @@ void remove_entity_load_avg(struct sched_entity *se) static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) { - return cfs_rq->runnable_load_avg; + return cfs_rq->avg.runnable_load_avg; } static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) @@ -3882,8 +3936,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + update_cfs_group(se); enqueue_runnable_load_avg(cfs_rq, se); - update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) @@ -3989,7 +4043,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(se); + update_cfs_group(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -4172,7 +4226,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. */ update_load_avg(cfs_rq, curr, UPDATE_TG); - update_cfs_shares(curr); + update_cfs_group(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -5090,7 +5144,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } if (!se) @@ -5149,7 +5203,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } if (!se) @@ -7174,7 +7228,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) if (cfs_rq->avg.util_sum) return false; - if (cfs_rq->runnable_load_sum) + if (cfs_rq->avg.runnable_load_sum) return false; return true; @@ -9692,7 +9746,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) update_rq_clock(rq); for_each_sched_entity(se) { update_load_avg(cfs_rq_of(se), se, UPDATE_TG); - update_cfs_shares(se); + update_cfs_group(se); } rq_unlock_irqrestore(rq, &rf); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5bcb86eb026b..e83d1b8be611 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -418,6 +418,7 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; + unsigned long runnable_weight; unsigned int nr_running, h_nr_running; u64 exec_clock; @@ -443,8 +444,6 @@ struct cfs_rq { * CFS load tracking */ struct sched_avg avg; - u64 runnable_load_sum; - unsigned long runnable_load_avg; #ifndef CONFIG_64BIT u64 load_last_update_time_copy; #endif -- cgit