summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c186
-rw-r--r--kernel/sched/sched.h9
3 files changed, 112 insertions, 85 deletions
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2f22342c48ff..2e039a81864c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -568,6 +568,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
cfs_rq->removed.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
+ cfs_rq->removed.runnable_sum);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe4a66b10480..086a5d979720 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3319,11 +3319,77 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
-/* Take into account change of utilization of a child task group */
+
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ * ge->avg == grq->avg (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straight forward approach:
+ *
+ * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3)
+ *
+ * And per (1) we have:
+ *
+ * ge->avg.running_avg == grq->avg.running_avg
+ *
+ * Which gives:
+ *
+ * ge->load.weight * grq->avg.load_avg
+ * ge->avg.load_avg = ----------------------------------- (4)
+ * grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ * grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be, per
+ * (2) the already tracked se->load_avg divided by the corresponding
+ * se->weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ * d(runnable_avg) += se->avg.load_avg / se->load.weight
+ * (5)
+ * ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
+
static inline void
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
/* Nothing to update */
@@ -3339,102 +3405,59 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
}
-/* Take into account change of load of a child task group */
static inline void
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
- long delta, load = gcfs_rq->avg.load_avg;
+ long runnable_sum = gcfs_rq->prop_runnable_sum;
+ long load_avg;
+ s64 load_sum;
- /*
- * If the load of group cfs_rq is null, the load of the
- * sched_entity will also be null so we can skip the formula
- */
- if (load) {
- long tg_load;
-
- /* Get tg's load and ensure tg_load > 0 */
- tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
- /* Ensure tg_load >= load and updated with current load*/
- tg_load -= gcfs_rq->tg_load_avg_contrib;
- tg_load += load;
-
- /*
- * We need to compute a correction term in the case that the
- * task group is consuming more CPU than a task of equal
- * weight. A task with a weight equals to tg->shares will have
- * a load less or equal to scale_load_down(tg->shares).
- * Similarly, the sched_entities that represent the task group
- * at parent level, can't have a load higher than
- * scale_load_down(tg->shares). And the Sum of sched_entities'
- * load must be <= scale_load_down(tg->shares).
- */
- if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
- /* scale gcfs_rq's load into tg's shares*/
- load *= scale_load_down(gcfs_rq->tg->shares);
- load /= tg_load;
- }
- }
+ if (!runnable_sum)
+ return;
- delta = load - se->avg.load_avg;
+ gcfs_rq->prop_runnable_sum = 0;
- /* Nothing to update */
- if (!delta)
- return;
+ load_sum = (s64)se_weight(se) * runnable_sum;
+ load_avg = div_s64(load_sum, LOAD_AVG_MAX);
- /* Set new sched_entity's load */
- se->avg.load_avg = load;
- se->avg.load_sum = LOAD_AVG_MAX;
+ add_positive(&se->avg.load_sum, runnable_sum);
+ add_positive(&se->avg.load_avg, load_avg);
- /* Update parent cfs_rq load */
- add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+ add_positive(&cfs_rq->avg.load_avg, load_avg);
+ add_positive(&cfs_rq->avg.load_sum, load_sum);
- /*
- * If the sched_entity is already enqueued, we also have to update the
- * runnable load avg.
- */
if (se->on_rq) {
- /* Update parent cfs_rq runnable_load_avg */
- add_positive(&cfs_rq->runnable_load_avg, delta);
- cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ add_positive(&cfs_rq->runnable_load_avg, load_avg);
+ add_positive(&cfs_rq->runnable_load_sum, load_sum);
}
}
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
{
- cfs_rq->propagate_avg = 1;
-}
-
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
-
- if (!cfs_rq->propagate_avg)
- return 0;
-
- cfs_rq->propagate_avg = 0;
- return 1;
+ cfs_rq->propagate = 1;
+ cfs_rq->prop_runnable_sum += runnable_sum;
}
/* Update task and its cfs_rq load average */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *gcfs_rq;
if (entity_is_task(se))
return 0;
- if (!test_and_clear_tg_cfs_propagate(se))
+ gcfs_rq = group_cfs_rq(se);
+ if (!gcfs_rq->propagate)
return 0;
+ gcfs_rq->propagate = 0;
+
cfs_rq = cfs_rq_of(se);
- set_tg_cfs_propagate(cfs_rq);
+ add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
- update_tg_cfs_util(cfs_rq, se);
- update_tg_cfs_load(cfs_rq, se);
+ update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+ update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
return 1;
}
@@ -3458,7 +3481,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
* If there is a pending propagation, we have to update the load and
* the utilization of the sched_entity:
*/
- if (gcfs_rq->propagate_avg)
+ if (gcfs_rq->propagate)
return false;
/*
@@ -3478,7 +3501,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
return 0;
}
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3501,7 +3524,7 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
- unsigned long removed_load = 0, removed_util = 0;
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
struct sched_avg *sa = &cfs_rq->avg;
int decayed = 0;
@@ -3511,6 +3534,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
swap(cfs_rq->removed.load_avg, removed_load);
+ swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
cfs_rq->removed.nr = 0;
raw_spin_unlock(&cfs_rq->removed.lock);
@@ -3526,7 +3550,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
- set_tg_cfs_propagate(cfs_rq);
+ add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
decayed = 1;
}
@@ -3558,7 +3582,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
- set_tg_cfs_propagate(cfs_rq);
+
+ add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
}
@@ -3576,7 +3601,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- set_tg_cfs_propagate(cfs_rq);
+
+ add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq);
}
@@ -3678,6 +3704,7 @@ void remove_entity_load_avg(struct sched_entity *se)
++cfs_rq->removed.nr;
cfs_rq->removed.util_avg += se->avg.util_avg;
cfs_rq->removed.load_avg += se->avg.load_avg;
+ cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}
@@ -9466,9 +9493,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
-#ifdef CONFIG_FAIR_GROUP_SCHED
- cfs_rq->propagate_avg = 0;
-#endif
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2fd350a12bb7..5bcb86eb026b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -448,18 +448,19 @@ struct cfs_rq {
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
- unsigned long tg_load_avg_contrib;
- unsigned long propagate_avg;
-#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
int nr;
unsigned long load_avg;
unsigned long util_avg;
+ unsigned long runnable_sum;
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ unsigned long tg_load_avg_contrib;
+ long propagate;
+ long prop_runnable_sum;
+
/*
* h_load = weight * f(tg)
*