summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c10
-rw-r--r--kernel/sched/deadline.c6
-rw-r--r--kernel/sched/debug.c4
-rw-r--r--kernel/sched/fair.c315
-rw-r--r--kernel/sched/pelt.c45
-rw-r--r--kernel/sched/pelt.h114
-rw-r--r--kernel/sched/rt.c6
-rw-r--r--kernel/sched/sched.h52
-rw-r--r--kernel/sched/topology.c31
9 files changed, 405 insertions, 178 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8d76a65cfdd..e86e2b8f6922 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -107,11 +107,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* [L] ->on_rq
* RELEASE (rq->lock)
*
- * If we observe the old CPU in task_rq_lock, the acquire of
+ * If we observe the old CPU in task_rq_lock(), the acquire of
* the old rq->lock will fully serialize against the stores.
*
- * If we observe the new CPU in task_rq_lock, the acquire will
- * pair with the WMB to ensure we must then also see migrating.
+ * If we observe the new CPU in task_rq_lock(), the address
+ * dependency headed by '[L] rq = task_rq()' and the acquire
+ * will pair with the WMB to ensure we then also see migrating.
*/
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
@@ -180,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
update_irq_load_avg(rq, irq_delta + steal);
#endif
+ update_rq_clock_pelt(rq, delta);
}
void update_rq_clock(struct rq *rq)
@@ -928,7 +930,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
{
lockdep_assert_held(&rq->lock);
- p->on_rq = TASK_ON_RQ_MIGRATING;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
dequeue_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb8b7b5d745d..6a73e41a2016 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1767,7 +1767,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
deadline_queue_push_tasks(rq);
if (rq->curr->sched_class != &dl_sched_class)
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
return p;
}
@@ -1776,7 +1776,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1793,7 +1793,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
/*
* Even when we have runtime, update_curr_dl() might have resulted in us
* not being the leftmost task anymore. In that case NEED_RESCHED will
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index de3de997e245..8039d62ae36e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -315,6 +315,7 @@ void register_sched_domain_sysctl(void)
{
static struct ctl_table *cpu_entries;
static struct ctl_table **cpu_idx;
+ static bool init_done = false;
char buf[32];
int i;
@@ -344,7 +345,10 @@ void register_sched_domain_sysctl(void)
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
+ }
+ if (!init_done) {
+ init_done = true;
/* init to possible to not have holes in @cpu_entries */
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 310d0637fe4b..38d4669aa2ef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -248,13 +248,6 @@ const struct sched_class fair_sched_class;
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* cpu runqueue to which this cfs_rq is attached */
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->rq;
-}
-
static inline struct task_struct *task_of(struct sched_entity *se)
{
SCHED_WARN_ON(!entity_is_task(se));
@@ -282,66 +275,72 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
- if (!cfs_rq->on_list) {
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+
+ if (cfs_rq->on_list)
+ return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
+
+ cfs_rq->on_list = 1;
+
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the top of the tree
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
/*
- * Ensure we either appear before our parent (if already
- * enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases and a special case for the root
- * cfs_rq. Furthermore, it also means that we will always reset
- * tmp_alone_branch either when the branch is connected
- * to a tree or when we reach the beg of the tree
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
*/
- if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
- /*
- * If parent is already on the list, we add the child
- * just before. Thanks to circular linked property of
- * the list, this means to put the child at the tail
- * of the list that starts by parent.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
- /*
- * The branch is now connected to its tree so we can
- * reset tmp_alone_branch to the beginning of the
- * list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else if (!cfs_rq->tg->parent) {
- /*
- * cfs rq without parent should be put
- * at the tail of the list.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq->leaf_cfs_rq_list);
- /*
- * We have reach the beg of a tree so we can reset
- * tmp_alone_branch to the beginning of the list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else {
- /*
- * The parent has not already been added so we want to
- * make sure that it will be put after us.
- * tmp_alone_branch points to the beg of the branch
- * where we will add parent.
- */
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- rq->tmp_alone_branch);
- /*
- * update tmp_alone_branch to points to the new beg
- * of the branch
- */
- rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
- }
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
+ }
- cfs_rq->on_list = 1;
+ if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reached the top of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
}
+
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the beginning of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to point to the new beginning
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
+ return false;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
@@ -352,7 +351,12 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
}
-/* Iterate through all leaf cfs_rq's on a runqueue: */
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+ SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+}
+
+/* Iterate through all cfs_rq's on a runqueue in bottom-up order */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -410,12 +414,6 @@ static inline struct task_struct *task_of(struct sched_entity *se)
return container_of(se, struct task_struct, se);
}
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return container_of(cfs_rq, struct rq, cfs);
-}
-
-
#define for_each_sched_entity(se) \
for (; se; se = NULL)
@@ -438,14 +436,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return NULL;
}
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
+ return true;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+}
+
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
@@ -686,9 +689,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-#ifdef CONFIG_SMP
#include "pelt.h"
-#include "sched-pelt.h"
+#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
@@ -776,7 +778,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
* such that the next switched_to_fair() has the
* expected state.
*/
- se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
return;
}
}
@@ -1035,7 +1037,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
unsigned int sysctl_numa_balancing_scan_delay = 1000;
struct numa_group {
- atomic_t refcount;
+ refcount_t refcount;
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
@@ -1104,7 +1106,7 @@ static unsigned int task_scan_start(struct task_struct *p)
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
- period *= atomic_read(&ng->refcount);
+ period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
}
@@ -1127,7 +1129,7 @@ static unsigned int task_scan_max(struct task_struct *p)
unsigned long private = group_faults_priv(ng);
unsigned long period = smax;
- period *= atomic_read(&ng->refcount);
+ period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
@@ -2203,12 +2205,12 @@ static void task_numa_placement(struct task_struct *p)
static inline int get_numa_group(struct numa_group *grp)
{
- return atomic_inc_not_zero(&grp->refcount);
+ return refcount_inc_not_zero(&grp->refcount);
}
static inline void put_numa_group(struct numa_group *grp)
{
- if (atomic_dec_and_test(&grp->refcount))
+ if (refcount_dec_and_test(&grp->refcount))
kfree_rcu(grp, rcu);
}
@@ -2229,7 +2231,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!grp)
return;
- atomic_set(&grp->refcount, 1);
+ refcount_set(&grp->refcount, 1);
grp->active_nodes = 1;
grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
@@ -3122,7 +3124,7 @@ void set_task_rq_fair(struct sched_entity *se,
p_last_update_time = prev->avg.last_update_time;
n_last_update_time = next->avg.last_update_time;
#endif
- __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+ __update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
@@ -3257,11 +3259,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/*
* runnable_sum can't be lower than running_sum
- * As running sum is scale with CPU capacity wehreas the runnable sum
- * is not we rescale running_sum 1st
+ * Rescale running sum to be in the same range as runnable sum
+ * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
+ * runnable_sum is in [0 : LOAD_AVG_MAX]
*/
- running_sum = se->avg.util_sum /
- arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
load_sum = (s64)se_weight(se) * runnable_sum;
@@ -3364,7 +3366,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
- * @now: current time, as per cfs_rq_clock_task()
+ * @now: current time, as per cfs_rq_clock_pelt()
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
@@ -3409,7 +3411,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
decayed = 1;
}
- decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+ decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -3499,9 +3501,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 now = cfs_rq_clock_task(cfs_rq);
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ u64 now = cfs_rq_clock_pelt(cfs_rq);
int decayed;
/*
@@ -3509,7 +3509,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* track group sched_entity load average for task_h_load calc in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
- __update_load_avg_se(now, cpu, cfs_rq, se);
+ __update_load_avg_se(now, cfs_rq, se);
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
@@ -3561,7 +3561,7 @@ void sync_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
+ __update_load_avg_blocked_se(last_update_time, se);
}
/*
@@ -3654,6 +3654,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
{
long last_ewma_diff;
struct util_est ue;
+ int cpu;
if (!sched_feat(UTIL_EST))
return;
@@ -3688,6 +3689,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
return;
/*
+ * To avoid overestimation of actual task utilization, skip updates if
+ * we cannot guarantee there is idle time on this CPU.
+ */
+ cpu = cpu_of(rq_of(cfs_rq));
+ if (task_util(p) > capacity_orig_of(cpu))
+ return;
+
+ /*
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
@@ -4565,7 +4574,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;
- rq_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
@@ -4582,7 +4591,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
unthrottle_cfs_rq(cfs_rq);
next:
- rq_unlock(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
if (!remaining)
break;
@@ -4598,7 +4607,7 @@ next:
* period the timer is deactivated until scheduling resumes; cfs_b->idle is
* used to track this state.
*/
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
u64 runtime, runtime_expires;
int throttled;
@@ -4640,11 +4649,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
@@ -4753,17 +4762,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+ unsigned long flags;
u64 expires;
/* confirm we're still not at a refresh boundary */
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
if (cfs_b->distribute_running) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
@@ -4774,18 +4784,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (runtime)
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
return;
runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
if (expires == cfs_b->runtime_expires)
lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
/*
@@ -4863,20 +4873,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
+ unsigned long flags;
int overrun;
int idle = 0;
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;
- idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -4986,6 +4997,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return false;
+}
+
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
return rq_clock_task(rq_of(cfs_rq));
@@ -5177,6 +5194,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
+ if (cfs_bandwidth_used()) {
+ /*
+ * When bandwidth control is enabled, the cfs_rq_throttled()
+ * breaks in the above iteration can result in incomplete
+ * leaf list maintenance, resulting in triggering the assertion
+ * below.
+ */
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
+ }
+ }
+
+ assert_list_leaf_cfs_rq(rq);
+
hrtick_update(rq);
}
@@ -5556,11 +5590,6 @@ static unsigned long capacity_of(int cpu)
return cpu_rq(cpu)->cpu_capacity;
}
-static unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -6608,7 +6637,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- if (static_branch_unlikely(&sched_energy_present)) {
+ if (sched_energy_enabled()) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
return new_cpu;
@@ -7027,6 +7056,12 @@ idle:
if (new_tasks > 0)
goto again;
+ /*
+ * rq is about to be idle, check if we need to update the
+ * lost_idle_time of clock_pelt
+ */
+ update_idle_rq_clock_pelt(rq);
+
return NULL;
}
@@ -7669,7 +7704,7 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
@@ -7683,8 +7718,8 @@ static void update_blocked_averages(int cpu)
}
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
@@ -7754,11 +7789,11 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
@@ -8452,9 +8487,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
return 0;
- env->imbalance = DIV_ROUND_CLOSEST(
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
- SCHED_CAPACITY_SCALE);
+ env->imbalance = sds->busiest_stat.group_load;
return 1;
}
@@ -8636,7 +8669,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);
- if (static_branch_unlikely(&sched_energy_present)) {
+ if (sched_energy_enabled()) {
struct root_domain *rd = env->dst_rq->rd;
if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
@@ -8827,21 +8860,25 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
#define MAX_PINNED_INTERVAL 512
-static int need_active_balance(struct lb_env *env)
+static inline bool
+asym_active_balance(struct lb_env *env)
{
- struct sched_domain *sd = env->sd;
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but
+ * lower priority CPUs in order to pack all tasks in the
+ * highest priority CPUs.
+ */
+ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
+ sched_asym_prefer(env->dst_cpu, env->src_cpu);
+}
- if (env->idle == CPU_NEWLY_IDLE) {
+static inline bool
+voluntary_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
- /*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
- */
- if ((sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu))
- return 1;
- }
+ if (asym_active_balance(env))
+ return 1;
/*
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
@@ -8859,6 +8896,16 @@ static int need_active_balance(struct lb_env *env)
if (env->src_grp_type == group_misfit_task)
return 1;
+ return 0;
+}
+
+static int need_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ if (voluntary_active_balance(env))
+ return 1;
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -9120,7 +9167,7 @@ more_balance:
} else
sd->nr_balance_failed = 0;
- if (likely(!active_balance)) {
+ if (likely(!active_balance) || voluntary_active_balance(&env)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
} else {
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 90fb5bc12ad4..befce29bd882 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -26,7 +26,6 @@
#include <linux/sched.h>
#include "sched.h"
-#include "sched-pelt.h"
#include "pelt.h"
/*
@@ -106,16 +105,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
* n=1
*/
static __always_inline u32
-accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+accumulate_sum(u64 delta, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
- unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
- scale_freq = arch_scale_freq_capacity(cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
delta += sa->period_contrib;
periods = delta / 1024; /* A period is 1024us (~1ms) */
@@ -137,13 +132,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
}
sa->period_contrib = delta;
- contrib = cap_scale(contrib, scale_freq);
if (load)
sa->load_sum += load * contrib;
if (runnable)
sa->runnable_load_sum += runnable * contrib;
if (running)
- sa->util_sum += contrib * scale_cpu;
+ sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
return periods;
}
@@ -177,7 +171,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+___update_load_sum(u64 now, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
u64 delta;
@@ -221,7 +215,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
+ if (!accumulate_sum(delta, sa, load, runnable, running))
return 0;
return 1;
@@ -267,9 +261,9 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
* runnable_load_avg = \Sum se->avg.runable_load_avg
*/
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
- if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+ if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
return 1;
}
@@ -277,9 +271,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
return 0;
}
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+ if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
cfs_rq->curr == se)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
@@ -290,9 +284,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e
return 0;
}
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
- if (___update_load_sum(now, cpu, &cfs_rq->avg,
+ if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
scale_load_down(cfs_rq->runnable_weight),
cfs_rq->curr != NULL)) {
@@ -317,7 +311,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
- if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
+ if (___update_load_sum(now, &rq->avg_rt,
running,
running,
running)) {
@@ -340,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
- if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
+ if (___update_load_sum(now, &rq->avg_dl,
running,
running,
running)) {
@@ -365,22 +359,31 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
int update_irq_load_avg(struct rq *rq, u64 running)
{
int ret = 0;
+
+ /*
+ * We can't use clock_pelt because irq time is not accounted in
+ * clock_task. Instead we directly scale the running time to
+ * reflect the real amount of computation
+ */
+ running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
+ running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+
/*
* We know the time that has been used by interrupt since last update
* but we don't when. Let be pessimistic and assume that interrupt has
* happened just before the update. This is not so far from reality
* because interrupt will most probably wake up task and trig an update
- * of rq clock during which the metric si updated.
+ * of rq clock during which the metric is updated.
* We start to decay with normal context time and then we add the
* interrupt context time.
* We can safely remove running from rq->clock because
* rq->clock += delta with delta >= running
*/
- ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
+ ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
0,
0,
0);
- ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
+ ret += ___update_load_sum(rq->clock, &rq->avg_irq,
1,
1,
1);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7e56b489ff32..7489d5f56960 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -1,8 +1,9 @@
#ifdef CONFIG_SMP
+#include "sched-pelt.h"
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
@@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
+/*
+ * The clock_pelt scales the time to reflect the effective amount of
+ * computation done during the running delta time but then syncs back to
+ * clock_task when rq is idle.
+ *
+ *
+ * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
+ * @ max capacity ------******---------------******---------------
+ * @ half capacity ------************---------************---------
+ * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16
+ *
+ */
+static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
+{
+ if (unlikely(is_idle_task(rq->curr))) {
+ /* The rq is idle, we can sync to clock_task */
+ rq->clock_pelt = rq_clock_task(rq);
+ return;
+ }
+
+ /*
+ * When a rq runs at a lower compute capacity, it will need
+ * more time to do the same amount of work than at max
+ * capacity. In order to be invariant, we scale the delta to
+ * reflect how much work has been really done.
+ * Running longer results in stealing idle time that will
+ * disturb the load signal compared to max capacity. This
+ * stolen idle time will be automatically reflected when the
+ * rq will be idle and the clock will be synced with
+ * rq_clock_task.
+ */
+
+ /*
+ * Scale the elapsed time to reflect the real amount of
+ * computation
+ */
+ delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+ delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
+
+ rq->clock_pelt += delta;
+}
+
+/*
+ * When rq becomes idle, we have to check if it has lost idle time
+ * because it was fully busy. A rq is fully used when the \Sum util_sum
+ * is greater or equal to:
+ * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
+ * For optimization and computing rounding purpose, we don't take into account
+ * the position in the current window (period_contrib) and we use the higher
+ * bound of util_sum to decide.
+ */
+static inline void update_idle_rq_clock_pelt(struct rq *rq)
+{
+ u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
+ u32 util_sum = rq->cfs.avg.util_sum;
+ util_sum += rq->avg_rt.util_sum;
+ util_sum += rq->avg_dl.util_sum;
+
+ /*
+ * Reflecting stolen time makes sense only if the idle
+ * phase would be present at max capacity. As soon as the
+ * utilization of a rq has reached the maximum value, it is
+ * considered as an always running rq without idle time to
+ * steal. This potential idle time is considered as lost in
+ * this case. We keep track of this lost idle time compared to
+ * rq's clock_task.
+ */
+ if (util_sum >= divider)
+ rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ assert_clock_updated(rq);
+
+ return rq->clock_pelt - rq->lost_idle_time;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/* rq->clock_pelt normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+
+ return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+#else
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ return rq_clock_pelt(rq_of(cfs_rq));
+}
+#endif
+
#else
static inline int
@@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ return rq_clock_task(rq);
+}
+
+static inline void
+update_rq_clock_pelt(struct rq *rq, s64 delta) { }
+
+static inline void
+update_idle_rq_clock_pelt(struct rq *rq) { }
+
#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e4f398ad9e73..90fa23d36565 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* rt task
*/
if (rq->curr->sched_class != &rt_sched_class)
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
return p;
}
@@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
update_curr_rt(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
/*
* The previous task needs to be made eligible for pushing
@@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
watchdog(rq, p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d04530bf251f..c688ef5012e5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -861,7 +861,10 @@ struct rq {
unsigned int clock_update_flags;
u64 clock;
- u64 clock_task;
+ /* Ensure that all clocks are in the same cache line */
+ u64 clock_task ____cacheline_aligned;
+ u64 clock_pelt;
+ unsigned long lost_idle_time;
atomic_t nr_iowait;
@@ -951,6 +954,22 @@ struct rq {
#endif
};
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* CPU runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->rq;
+}
+
+#else
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return container_of(cfs_rq, struct rq, cfs);
+}
+#endif
+
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
@@ -1460,9 +1479,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
*/
smp_wmb();
#ifdef CONFIG_THREAD_INFO_IN_TASK
- p->cpu = cpu;
+ WRITE_ONCE(p->cpu, cpu);
#else
- task_thread_info(p)->cpu = cpu;
+ WRITE_ONCE(task_thread_info(p)->cpu, cpu);
#endif
p->wake_cpu = cpu;
#endif
@@ -1563,7 +1582,7 @@ static inline int task_on_rq_queued(struct task_struct *p)
static inline int task_on_rq_migrating(struct task_struct *p)
{
- return p->on_rq == TASK_ON_RQ_MIGRATING;
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
}
/*
@@ -2211,6 +2230,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
# define arch_scale_freq_invariant() false
#endif
+#ifdef CONFIG_SMP
+static inline unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+#endif
+
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
/**
* enum schedutil_type - CPU utilization type
@@ -2299,11 +2325,19 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
#endif
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+
#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
-#else
+
+DECLARE_STATIC_KEY_FALSE(sched_energy_present);
+
+static inline bool sched_energy_enabled(void)
+{
+ return static_branch_unlikely(&sched_energy_present);
+}
+
+#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
+
#define perf_domain_span(pd) NULL
-#endif
+static inline bool sched_energy_enabled(void) { return false; }
-#ifdef CONFIG_SMP
-extern struct static_key_false sched_energy_present;
-#endif
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3f35ba1d8fde..4ae9403420ed 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -201,11 +201,37 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 1;
}
-DEFINE_STATIC_KEY_FALSE(sched_energy_present);
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
+unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;
+#ifdef CONFIG_PROC_SYSCTL
+int sched_energy_aware_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, state;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ state = static_branch_unlikely(&sched_energy_present);
+ if (state != sysctl_sched_energy_aware) {
+ mutex_lock(&sched_energy_mutex);
+ sched_energy_update = 1;
+ rebuild_sched_domains();
+ sched_energy_update = 0;
+ mutex_unlock(&sched_energy_mutex);
+ }
+ }
+
+ return ret;
+}
+#endif
+
static void free_pd(struct perf_domain *pd)
{
struct perf_domain *tmp;
@@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
struct cpufreq_policy *policy;
struct cpufreq_governor *gov;
+ if (!sysctl_sched_energy_aware)
+ goto free;
+
/* EAS is enabled for asymmetric CPU capacity topologies. */
if (!per_cpu(sd_asym_cpucapacity, cpu)) {
if (sched_debug()) {