Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c         36
-rw-r--r--  kernel/sched/cputime.c     288
-rw-r--r--  kernel/sched/deadline.c     12
-rw-r--r--  kernel/sched/fair.c       1427
-rw-r--r--  kernel/sched/features.h      1
-rw-r--r--  kernel/sched/idle.c         10
-rw-r--r--  kernel/sched/rt.c           12
-rw-r--r--  kernel/sched/sched.h        25
-rw-r--r--  kernel/sched/stop_task.c     9
-rw-r--r--  kernel/sched/topology.c      9
10 files changed, 1185 insertions, 644 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44123b4d14e8..90e4b00ace89 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -16,6 +16,7 @@
#include <asm/tlb.h>
#include "../workqueue_internal.h"
+#include "../../fs/io-wq.h"
#include "../smpboot.h"
#include "pelt.h"
@@ -810,7 +811,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
-static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
+static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@@ -853,7 +854,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
}
static inline
-enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
@@ -918,7 +919,7 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
return uc_req;
}
-enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
@@ -3105,7 +3106,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
* do an early lockdep release here:
*/
rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ spin_release(&rq->lock.dep_map, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = next;
@@ -3917,13 +3918,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
prev->sched_class == &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev, rf);
+ p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
- if (unlikely(!p))
- p = idle_sched_class.pick_next_task(rq, prev, rf);
+ if (!p) {
+ put_prev_task(rq, prev);
+ p = pick_next_task_idle(rq);
+ }
return p;
}
@@ -3947,7 +3950,7 @@ restart:
put_prev_task(rq, prev);
for_each_class(class) {
- p = class->pick_next_task(rq, NULL, NULL);
+ p = class->pick_next_task(rq);
if (p)
return p;
}
@@ -4112,9 +4115,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
* we disable preemption to avoid it calling schedule() again
* in the possible wakeup of a kworker.
*/
- if (tsk->flags & PF_WQ_WORKER) {
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
preempt_disable();
- wq_worker_sleeping(tsk);
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+ else
+ io_wq_worker_sleeping(tsk);
preempt_enable_no_resched();
}
@@ -4131,8 +4137,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
static void sched_update_worker(struct task_struct *tsk)
{
- if (tsk->flags & PF_WQ_WORKER)
- wq_worker_running(tsk);
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_running(tsk);
+ else
+ io_wq_worker_running(tsk);
+ }
}
asmlinkage __visible void __sched schedule(void)
@@ -6209,7 +6219,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq)
struct task_struct *next;
for_each_class(class) {
- next = class->pick_next_task(rq, NULL, NULL);
+ next = class->pick_next_task(rq);
if (next) {
next->sched_class->put_prev_task(rq, next);
return next;
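
[Editor's sketch] The core.c hunks above replace the indirect fair_sched_class.pick_next_task() call with a direct pick_next_task_fair() call on the fast path, taken when every runnable task on the runqueue belongs to the fair class (rq->nr_running == rq->cfs.h_nr_running), falling back to the idle picker if fair returns nothing. Below is a minimal user-space model of that fast-path/slow-path split; all names (toy_rq, pick_fair, ...) are made up for illustration and are not kernel APIs.

#include <stdio.h>
#include <stddef.h>

struct toy_rq {
	int nr_running;		/* all runnable tasks */
	int cfs_h_nr_running;	/* tasks belonging to the fair class */
};

typedef const char *(*pick_fn)(struct toy_rq *rq);

static const char *pick_dl(struct toy_rq *rq)   { (void)rq; return NULL; }
static const char *pick_rt(struct toy_rq *rq)   { (void)rq; return NULL; }
static const char *pick_fair(struct toy_rq *rq) { return rq->cfs_h_nr_running ? "fair task" : NULL; }
static const char *pick_idle(struct toy_rq *rq) { (void)rq; return "idle task"; }

/* Classes in picking priority order, highest first. */
static const pick_fn classes[] = { pick_dl, pick_rt, pick_fair, pick_idle };

static const char *pick_next(struct toy_rq *rq)
{
	size_t i;

	/* Fast path: only fair (or no) tasks are runnable. */
	if (rq->nr_running == rq->cfs_h_nr_running) {
		const char *p = pick_fair(rq);
		return p ? p : pick_idle(rq);
	}

	/* Slow path: walk every class in priority order. */
	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		const char *p = classes[i](rq);
		if (p)
			return p;
	}
	return NULL;	/* idle always returns a task, so not reached */
}

int main(void)
{
	struct toy_rq rq = { .nr_running = 3, .cfs_h_nr_running = 3 };

	printf("%s\n", pick_next(&rq));	/* fast path -> "fair task" */
	rq.cfs_h_nr_running = 2;	/* one non-fair task runnable */
	printf("%s\n", pick_next(&rq));	/* slow path */
	return 0;
}
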
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 46ed4e1383e2..d43318a489f2 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -405,27 +405,25 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
/*
* Use precise platform statistics if available:
*/
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+
# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_common_task_switch(struct task_struct *prev)
+void vtime_task_switch(struct task_struct *prev)
{
if (is_idle_task(prev))
vtime_account_idle(prev);
else
- vtime_account_system(prev);
+ vtime_account_kernel(prev);
vtime_flush(prev);
arch_vtime_task_switch(prev);
}
# endif
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
+ * vtime_account_kernel() and vtime_account_idle(). Archs that
* have other meaning of the idle time (s390 only includes the
* time spent by the CPU when it's in low power mode) must override
* vtime_account().
@@ -436,7 +434,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
if (!in_interrupt() && is_idle_task(tsk))
vtime_account_idle(tsk);
else
- vtime_account_system(tsk);
+ vtime_account_kernel(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -477,7 +475,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
u64 cputime, steal;
struct rq *rq = this_rq();
- if (vtime_accounting_cpu_enabled())
+ if (vtime_accounting_enabled_this_cpu())
return;
if (sched_clock_irqtime) {
@@ -711,8 +709,8 @@ static u64 get_vtime_delta(struct vtime *vtime)
return delta - other;
}
-static void __vtime_account_system(struct task_struct *tsk,
- struct vtime *vtime)
+static void vtime_account_system(struct task_struct *tsk,
+ struct vtime *vtime)
{
vtime->stime += get_vtime_delta(vtime);
if (vtime->stime >= TICK_NSEC) {
@@ -731,7 +729,17 @@ static void vtime_account_guest(struct task_struct *tsk,
}
}
-void vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_kernel(struct task_struct *tsk,
+ struct vtime *vtime)
+{
+ /* We might have scheduled out from guest path */
+ if (vtime->state == VTIME_GUEST)
+ vtime_account_guest(tsk, vtime);
+ else
+ vtime_account_system(tsk, vtime);
+}
+
+void vtime_account_kernel(struct task_struct *tsk)
{
struct vtime *vtime = &tsk->vtime;
@@ -739,11 +747,7 @@ void vtime_account_system(struct task_struct *tsk)
return;
write_seqcount_begin(&vtime->seqcount);
- /* We might have scheduled out from guest path */
- if (tsk->flags & PF_VCPU)
- vtime_account_guest(tsk, vtime);
- else
- __vtime_account_system(tsk, vtime);
+ __vtime_account_kernel(tsk, vtime);
write_seqcount_end(&vtime->seqcount);
}
@@ -752,7 +756,7 @@ void vtime_user_enter(struct task_struct *tsk)
struct vtime *vtime = &tsk->vtime;
write_seqcount_begin(&vtime->seqcount);
- __vtime_account_system(tsk, vtime);
+ vtime_account_system(tsk, vtime);
vtime->state = VTIME_USER;
write_seqcount_end(&vtime->seqcount);
}
@@ -782,8 +786,9 @@ void vtime_guest_enter(struct task_struct *tsk)
* that can thus safely catch up with a tickless delta.
*/
write_seqcount_begin(&vtime->seqcount);
- __vtime_account_system(tsk, vtime);
+ vtime_account_system(tsk, vtime);
tsk->flags |= PF_VCPU;
+ vtime->state = VTIME_GUEST;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -795,6 +800,7 @@ void vtime_guest_exit(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
vtime_account_guest(tsk, vtime);
tsk->flags &= ~PF_VCPU;
+ vtime->state = VTIME_SYS;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -804,19 +810,30 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(get_vtime_delta(&tsk->vtime));
}
-void arch_vtime_task_switch(struct task_struct *prev)
+void vtime_task_switch_generic(struct task_struct *prev)
{
struct vtime *vtime = &prev->vtime;
write_seqcount_begin(&vtime->seqcount);
+ if (vtime->state == VTIME_IDLE)
+ vtime_account_idle(prev);
+ else
+ __vtime_account_kernel(prev, vtime);
vtime->state = VTIME_INACTIVE;
+ vtime->cpu = -1;
write_seqcount_end(&vtime->seqcount);
vtime = &current->vtime;
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ if (is_idle_task(current))
+ vtime->state = VTIME_IDLE;
+ else if (current->flags & PF_VCPU)
+ vtime->state = VTIME_GUEST;
+ else
+ vtime->state = VTIME_SYS;
vtime->starttime = sched_clock();
+ vtime->cpu = smp_processor_id();
write_seqcount_end(&vtime->seqcount);
}
@@ -827,8 +844,9 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags);
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ vtime->state = VTIME_IDLE;
vtime->starttime = sched_clock();
+ vtime->cpu = cpu;
write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
@@ -846,7 +864,7 @@ u64 task_gtime(struct task_struct *t)
seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
- if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+ if (vtime->state == VTIME_GUEST)
gtime += vtime->gtime + vtime_delta(vtime);
} while (read_seqcount_retry(&vtime->seqcount, seq));
@@ -877,20 +895,230 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
*utime = t->utime;
*stime = t->stime;
- /* Task is sleeping, nothing to add */
- if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
+ /* Task is sleeping or idle, nothing to add */
+ if (vtime->state < VTIME_SYS)
continue;
delta = vtime_delta(vtime);
/*
- * Task runs either in user or kernel space, add pending nohz time to
- * the right place.
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
*/
- if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
- *utime += vtime->utime + delta;
- else if (vtime->state == VTIME_SYS)
+ if (vtime->state == VTIME_SYS)
*stime += vtime->stime + delta;
+ else
+ *utime += vtime->utime + delta;
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+}
+
+static int vtime_state_check(struct vtime *vtime, int cpu)
+{
+ /*
+ * We raced against a context switch, fetch the
+ * kcpustat task again.
+ */
+ if (vtime->cpu != cpu && vtime->cpu != -1)
+ return -EAGAIN;
+
+ /*
+ * Two possible things here:
+ * 1) We are seeing the scheduling out task (prev) or any past one.
+ * 2) We are seeing the scheduling in task (next) but it hasn't
+ * passed through vtime_task_switch() yet so the pending
+ * cputime of the prev task may not be flushed yet.
+ *
+ * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
+ */
+ if (vtime->state == VTIME_INACTIVE)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static u64 kcpustat_user_vtime(struct vtime *vtime)
+{
+ if (vtime->state == VTIME_USER)
+ return vtime->utime + vtime_delta(vtime);
+ else if (vtime->state == VTIME_GUEST)
+ return vtime->gtime + vtime_delta(vtime);
+ return 0;
+}
+
+static int kcpustat_field_vtime(u64 *cpustat,
+ struct task_struct *tsk,
+ enum cpu_usage_stat usage,
+ int cpu, u64 *val)
+{
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+ int err;
+
+ do {
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ err = vtime_state_check(vtime, cpu);
+ if (err < 0)
+ return err;
+
+ *val = cpustat[usage];
+
+ /*
+ * Nice VS unnice cputime accounting may be inaccurate if
+ * the nice value has changed since the last vtime update.
+ * But a proper fix would involve interrupting the target on nice
+ * updates which is a no go on nohz_full (although the scheduler
+ * may still interrupt the target if rescheduling is needed...)
+ */
+ switch (usage) {
+ case CPUTIME_SYSTEM:
+ if (vtime->state == VTIME_SYS)
+ *val += vtime->stime + vtime_delta(vtime);
+ break;
+ case CPUTIME_USER:
+ if (task_nice(tsk) <= 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_NICE:
+ if (task_nice(tsk) > 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_GUEST:
+ if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ case CPUTIME_GUEST_NICE:
+ if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ default:
+ break;
+ }
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return 0;
+}
+
+u64 kcpustat_field(struct kernel_cpustat *kcpustat,
+ enum cpu_usage_stat usage, int cpu)
+{
+ u64 *cpustat = kcpustat->cpustat;
+ struct rq *rq;
+ u64 val;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu))
+ return cpustat[usage];
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ return cpustat[usage];
+ }
+
+ err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
+ rcu_read_unlock();
+
+ if (!err)
+ return val;
+
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL_GPL(kcpustat_field);
+
+static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
+ const struct kernel_cpustat *src,
+ struct task_struct *tsk, int cpu)
+{
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+ int err;
+
+ do {
+ u64 *cpustat;
+ u64 delta;
+
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ err = vtime_state_check(vtime, cpu);
+ if (err < 0)
+ return err;
+
+ *dst = *src;
+ cpustat = dst->cpustat;
+
+ /* Task is sleeping, dead or idle, nothing to add */
+ if (vtime->state < VTIME_SYS)
+ continue;
+
+ delta = vtime_delta(vtime);
+
+ /*
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
+ */
+ if (vtime->state == VTIME_SYS) {
+ cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
+ } else if (vtime->state == VTIME_USER) {
+ if (task_nice(tsk) > 0)
+ cpustat[CPUTIME_NICE] += vtime->utime + delta;
+ else
+ cpustat[CPUTIME_USER] += vtime->utime + delta;
+ } else {
+ WARN_ON_ONCE(vtime->state != VTIME_GUEST);
+ if (task_nice(tsk) > 0) {
+ cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
+ cpustat[CPUTIME_NICE] += vtime->gtime + delta;
+ } else {
+ cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
+ cpustat[CPUTIME_USER] += vtime->gtime + delta;
+ }
+ }
} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return err;
+}
+
+void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
+{
+ const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
+ struct rq *rq;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu)) {
+ *dst = *src;
+ return;
+ }
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ *dst = *src;
+ return;
+ }
+
+ err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
+ rcu_read_unlock();
+
+ if (!err)
+ return;
+
+ cpu_relax();
+ }
}
+EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
+
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
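
[Editor's sketch] The new kcpustat_field() and kcpustat_cpu_fetch() readers above sample a remote CPU's vtime locklessly: they read under a seqcount, retry if the writer ran concurrently, and refetch rq->curr on -EAGAIN when they raced against a context switch. The following is a minimal user-space sketch of the core seqcount read-retry protocol only, with assumed names and C11 atomics; it is a simplified model, not the kernel implementation.

#include <stdatomic.h>
#include <stdio.h>

struct sample {
	atomic_uint seq;		/* even: stable, odd: write in progress */
	unsigned long long stime;
	unsigned long long utime;
};

static void writer_update(struct sample *s, unsigned long long delta)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);	/* begin */
	s->stime += delta;
	s->utime += delta / 2;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);	/* end */
}

static void reader_fetch(struct sample *s,
			 unsigned long long *stime, unsigned long long *utime)
{
	unsigned int start;

	do {
		/* Wait for an even (stable) sequence value. */
		do {
			start = atomic_load_explicit(&s->seq, memory_order_acquire);
		} while (start & 1);

		*stime = s->stime;
		*utime = s->utime;

		/* Retry if a writer ran while we were reading. */
	} while (atomic_load_explicit(&s->seq, memory_order_acquire) != start);
}

int main(void)
{
	struct sample s = { .seq = 0, .stime = 100, .utime = 40 };
	unsigned long long st, ut;

	writer_update(&s, 10);
	reader_fetch(&s, &st, &ut);
	printf("stime=%llu utime=%llu\n", st, ut);
	return 0;
}
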
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a8a08030a8f7..43323f875cb9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1743,13 +1743,16 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
-static void set_next_task_dl(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
+ if (!first)
+ return;
+
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
@@ -1770,22 +1773,19 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-static struct task_struct *
-pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
- WARN_ON_ONCE(prev || rf);
-
if (!sched_dl_runnable(rq))
return NULL;
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
- set_next_task_dl(rq, p);
+ set_next_task_dl(rq, p, true);
return p;
}
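
[Editor's sketch] set_next_task_dl() gains a "first" argument above so that pick-time-only housekeeping (arming the hrtick, deadline push/pull setup) runs only when the task has genuinely just been picked, and is skipped when a caller merely re-sets the currently running task. A toy C sketch of that calling convention, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

static void start_hrtick(const char *who)
{
	printf("hrtick armed for %s\n", who);
}

/* Models set_next_task_dl(rq, p, first): 'first' marks a genuine pick. */
static void set_next_task(const char *p, bool first)
{
	printf("%s is now the running task\n", p);

	if (!first)
		return;			/* re-setting the current task */

	start_hrtick(p);		/* pick-time-only housekeeping */
}

int main(void)
{
	set_next_task("deadline task", true);	/* freshly picked */
	set_next_task("deadline task", false);	/* e.g. priority/class change */
	return 0;
}
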
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69a81a5709ff..08a233e97a01 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -229,8 +229,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
}
}
- /* hint to use a 32x32->64 mul */
- fact = (u64)(u32)fact * lw->inv_weight;
+ fact = mul_u32_u32(fact, lw->inv_weight);
while (fact >> 32) {
fact >>= 1;
@@ -1474,7 +1473,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long cpu_runnable_load(struct rq *rq);
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+
+static unsigned long cpu_runnable_load(struct rq *rq)
+{
+ return cfs_rq_runnable_load_avg(&rq->cfs);
+}
/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -3504,9 +3508,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (decayed)
- cfs_rq_util_change(cfs_rq, 0);
-
return decayed;
}
@@ -3616,8 +3617,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
update_tg_load_avg(cfs_rq, 0);
- } else if (decayed && (flags & UPDATE_TG))
- update_tg_load_avg(cfs_rq, 0);
+ } else if (decayed) {
+ cfs_rq_util_change(cfs_rq, 0);
+
+ if (flags & UPDATE_TG)
+ update_tg_load_avg(cfs_rq, 0);
+ }
}
#ifndef CONFIG_64BIT
@@ -3764,10 +3769,21 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
return;
/*
+ * Reset EWMA on utilization increases, the moving average is used only
+ * to smooth utilization decreases.
+ */
+ ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+ if (sched_feat(UTIL_EST_FASTUP)) {
+ if (ue.ewma < ue.enqueued) {
+ ue.ewma = ue.enqueued;
+ goto done;
+ }
+ }
+
+ /*
* Skip update of task's estimated utilization when its EWMA is
* already ~1% close to its last activation value.
*/
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
last_ewma_diff = ue.enqueued - ue.ewma;
if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
return;
@@ -3800,6 +3816,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
ue.ewma += last_ewma_diff;
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+done:
WRITE_ONCE(p->se.avg.util_est, ue);
}
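
[Editor's sketch] The hunk above adds the UTIL_EST_FASTUP behaviour: on a utilization increase the EWMA is reset straight to the enqueued value, so the moving average only smooths decreases; otherwise the shift-based update ewma += (enqueued - ewma) / 2^shift is kept. A worked stand-alone sketch follows; the shift value of 2 is an assumption taken from UTIL_EST_WEIGHT_SHIFT, and the UTIL_AVG_UNCHANGED flag bit is ignored here.

#include <stdio.h>

#define WEIGHT_SHIFT	2	/* assumed UTIL_EST_WEIGHT_SHIFT */

static long ewma_update(long ewma, long enqueued, int fastup)
{
	long diff;

	/* Ramp up immediately: the moving average only smooths decreases. */
	if (fastup && ewma < enqueued)
		return enqueued;

	diff = enqueued - ewma;		/* may be negative */
	ewma <<= WEIGHT_SHIFT;
	ewma += diff;			/* ewma*4 + (enqueued - ewma) */
	return ewma >> WEIGHT_SHIFT;	/* == ewma + diff/4 */
}

int main(void)
{
	/* Task wakes with higher utilization: jump straight to it. */
	printf("%ld\n", ewma_update(100, 300, 1));	/* 300 */
	/* Utilization dropped: decay a quarter of the way per update. */
	printf("%ld\n", ewma_update(300, 100, 1));	/* 300 + (100-300)/4 = 250 */
	return 0;
}
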
@@ -5370,26 +5387,45 @@ static int sched_idle_cpu(int cpu)
rq->nr_running);
}
-static unsigned long cpu_runnable_load(struct rq *rq)
+static unsigned long cpu_load(struct rq *rq)
{
- return cfs_rq_runnable_load_avg(&rq->cfs);
+ return cfs_rq_load_avg(&rq->cfs);
}
-static unsigned long capacity_of(int cpu)
+/*
+ * cpu_load_without - compute CPU load without any contributions from *p
+ * @cpu: the CPU which load is requested
+ * @p: the task which load should be discounted
+ *
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
+ * CPU as well as tasks which are currently sleeping after an execution on that
+ * CPU.
+ *
+ * This method returns the load of the specified CPU by discounting the load of
+ * the specified task, whenever the task is currently contributing to the CPU
+ * load.
+ */
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
{
- return cpu_rq(cpu)->cpu_capacity;
-}
+ struct cfs_rq *cfs_rq;
+ unsigned int load;
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = cpu_runnable_load(rq);
+ /* Task has no contribution or is new */
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_load(rq);
- if (nr_running)
- return load_avg / nr_running;
+ cfs_rq = &rq->cfs;
+ load = READ_ONCE(cfs_rq->avg.load_avg);
- return 0;
+ /* Discount task's util from CPU's util */
+ lsub_positive(&load, task_h_load(p));
+
+ return load;
+}
+
+static unsigned long capacity_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity;
}
static void record_wakee(struct task_struct *p)
@@ -5482,7 +5518,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
- this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
if (sync) {
unsigned long current_load = task_h_load(current);
@@ -5500,7 +5536,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
- prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -5538,149 +5574,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return target;
}
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
-
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
-{
- return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
-}
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- *
- * Assumes p is allowed on at least one CPU in sd.
- */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
-{
- struct sched_group *idlest = NULL, *group = sd->groups;
- struct sched_group *most_spare_sg = NULL;
- unsigned long min_runnable_load = ULONG_MAX;
- unsigned long this_runnable_load = ULONG_MAX;
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
- unsigned long most_spare = 0, this_spare = 0;
- int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
- unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
- (sd->imbalance_pct-100) / 100;
-
- do {
- unsigned long load, avg_load, runnable_load;
- unsigned long spare_cap, max_spare_cap;
- int local_group;
- int i;
-
- /* Skip over this group if it has no CPUs allowed */
- if (!cpumask_intersects(sched_group_span(group),
- p->cpus_ptr))
- continue;
-
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_span(group));
-
- /*
- * Tally up the load of all CPUs in the group and find
- * the group containing the CPU with most spare capacity.
- */
- avg_load = 0;
- runnable_load = 0;
- max_spare_cap = 0;
-
- for_each_cpu(i, sched_group_span(group)) {
- load = cpu_runnable_load(cpu_rq(i));
- runnable_load += load;
-
- avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
-
- spare_cap = capacity_spare_without(i, p);
-
- if (spare_cap > max_spare_cap)
- max_spare_cap = spare_cap;
- }
-
- /* Adjust by relative CPU capacity of the group */
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
- group->sgc->capacity;
- runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
- group->sgc->capacity;
-
- if (local_group) {
- this_runnable_load = runnable_load;
- this_avg_load = avg_load;
- this_spare = max_spare_cap;
- } else {
- if (min_runnable_load > (runnable_load + imbalance)) {
- /*
- * The runnable load is significantly smaller
- * so we can pick this new CPU:
- */
- min_runnable_load = runnable_load;
- min_avg_load = avg_load;
- idlest = group;
- } else if ((runnable_load < (min_runnable_load + imbalance)) &&
- (100*min_avg_load > imbalance_scale*avg_load)) {
- /*
- * The runnable loads are close so take the
- * blocked load into account through avg_load:
- */
- min_avg_load = avg_load;
- idlest = group;
- }
-
- if (most_spare < max_spare_cap) {
- most_spare = max_spare_cap;
- most_spare_sg = group;
- }
- }
- } while (group = group->next, group != sd->groups);
-
- /*
- * The cross-over point between using spare capacity or least load
- * is too conservative for high utilization tasks on partially
- * utilized systems if we require spare_capacity > task_util(p),
- * so we allow for some task stuffing by using
- * spare_capacity > task_util(p)/2.
- *
- * Spare capacity can't be used for fork because the utilization has
- * not been set yet, we must first select a rq to compute the initial
- * utilization.
- */
- if (sd_flag & SD_BALANCE_FORK)
- goto skip_spare;
-
- if (this_spare > task_util(p) / 2 &&
- imbalance_scale*this_spare > 100*most_spare)
- return NULL;
-
- if (most_spare > task_util(p) / 2)
- return most_spare_sg;
-
-skip_spare:
- if (!idlest)
- return NULL;
-
- /*
- * When comparing groups across NUMA domains, it's possible for the
- * local domain to be very lightly loaded relative to the remote
- * domains but "imbalance" skews the comparison making remote CPUs
- * look much more favourable. When considering cross-domain, add
- * imbalance to the runnable load on the remote node and consider
- * staying local.
- */
- if ((sd->flags & SD_NUMA) &&
- min_runnable_load + imbalance >= this_runnable_load)
- return NULL;
-
- if (min_runnable_load > (this_runnable_load + imbalance))
- return NULL;
-
- if ((this_runnable_load < (min_runnable_load + imbalance)) &&
- (100*this_avg_load < imbalance_scale*min_avg_load))
- return NULL;
-
- return idlest;
-}
+ int this_cpu, int sd_flag);
/*
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5729,7 +5625,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
continue;
}
- load = cpu_runnable_load(cpu_rq(i));
+ load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
@@ -5753,7 +5649,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return prev_cpu;
/*
- * We need task's util for capacity_spare_without, sync it up to
+ * We need task's util for cpu_util_without, sync it up to
* prev_cpu's last_update_time.
*/
if (!(sd_flag & SD_BALANCE_FORK))
@@ -6746,7 +6642,7 @@ preempt:
set_last_buddy(se);
}
-static struct task_struct *
+struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
@@ -6890,6 +6786,11 @@ idle:
return NULL;
}
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
+{
+ return pick_next_task_fair(rq, NULL, NULL);
+}
+
/*
* Account for a descheduled task:
*/
@@ -7079,11 +6980,49 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+/*
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
+ *
+ * The enum is ordered by pulling priority, with the group with lowest priority
+ * first so the group_type can simply be compared when selecting the busiest
+ * group. See update_sd_pick_busiest().
+ */
enum group_type {
- group_other = 0,
+ /* The group has spare capacity that can be used to run more tasks. */
+ group_has_spare = 0,
+ /*
+ * The group is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ group_fully_busy,
+ /*
+ * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
+ * and must be migrated to a more powerful CPU.
+ */
group_misfit_task,
+ /*
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
+ * and the task should be migrated to it instead of running on the
+ * current CPU.
+ */
+ group_asym_packing,
+ /*
+ * The tasks' affinity constraints previously prevented the scheduler
+ * from balancing the load across the system.
+ */
group_imbalanced,
- group_overloaded,
+ /*
+ * The CPU is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ group_overloaded
+};
+
+enum migration_type {
+ migrate_load = 0,
+ migrate_util,
+ migrate_task,
+ migrate_misfit
};
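
[Editor's sketch] Because group_type is ordered by pulling priority, group_classify() and update_sd_pick_busiest() can compare two groups with a plain integer comparison and only fall back to a type-specific tie-break when the types are equal. A small stand-alone illustration of that pattern; struct and field names are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

enum group_type {
	group_has_spare = 0,
	group_fully_busy,
	group_misfit_task,
	group_asym_packing,
	group_imbalanced,
	group_overloaded
};

struct group_stats {
	enum group_type type;
	unsigned int idle_cpus;
};

static bool pick_busiest(const struct group_stats *candidate,
			 const struct group_stats *busiest)
{
	if (candidate->type > busiest->type)
		return true;
	if (candidate->type < busiest->type)
		return false;

	/* Same type: tie-break, shown here only for the has-spare case. */
	if (candidate->type == group_has_spare)
		return candidate->idle_cpus < busiest->idle_cpus;
	return false;
}

int main(void)
{
	struct group_stats a = { group_has_spare, 4 };
	struct group_stats b = { group_fully_busy, 0 };

	printf("%s\n", pick_busiest(&b, &a) ? "b is busier" : "a is busier");
	return 0;
}
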
#define LBF_ALL_PINNED 0x01
@@ -7116,7 +7055,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
- enum group_type src_grp_type;
+ enum migration_type migration_type;
struct list_head tasks;
};
@@ -7339,7 +7278,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
static const unsigned int sched_nr_migrate_break = 32;
/*
- * detach_tasks() -- tries to detach up to imbalance runnable load from
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Returns number of detached tasks if successful and 0 otherwise.
@@ -7347,8 +7286,8 @@ static const unsigned int sched_nr_migrate_break = 32;
static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
+ unsigned long util, load;
struct task_struct *p;
- unsigned long load;
int detached = 0;
lockdep_assert_held(&env->src_rq->lock);
@@ -7381,19 +7320,46 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
- load = task_h_load(p);
+ switch (env->migration_type) {
+ case migrate_load:
+ load = task_h_load(p);
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
- goto next;
+ if (sched_feat(LB_MIN) &&
+ load < 16 && !env->sd->nr_balance_failed)
+ goto next;
- if ((load / 2) > env->imbalance)
- goto next;
+ if (load/2 > env->imbalance)
+ goto next;
+
+ env->imbalance -= load;
+ break;
+
+ case migrate_util:
+ util = task_util_est(p);
+
+ if (util > env->imbalance)
+ goto next;
+
+ env->imbalance -= util;
+ break;
+
+ case migrate_task:
+ env->imbalance--;
+ break;
+
+ case migrate_misfit:
+ /* This is not a misfit task */
+ if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ goto next;
+
+ env->imbalance = 0;
+ break;
+ }
detach_task(p, env);
list_add(&p->se.group_node, &env->tasks);
detached++;
- env->imbalance -= load;
#ifdef CONFIG_PREEMPTION
/*
@@ -7407,7 +7373,7 @@ static int detach_tasks(struct lb_env *env)
/*
* We only want to steal up to the prescribed amount of
- * runnable load.
+ * load/util/tasks.
*/
if (env->imbalance <= 0)
break;
@@ -7517,6 +7483,28 @@ static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
+static bool __update_blocked_others(struct rq *rq, bool *done)
+{
+ const struct sched_class *curr_class;
+ u64 now = rq_clock_pelt(rq);
+ bool decayed;
+
+ /*
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
+ * DL and IRQ signals have been updated before updating CFS.
+ */
+ curr_class = rq->curr->sched_class;
+
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+ update_irq_load_avg(rq, 0);
+
+ if (others_have_blocked(rq))
+ *done = false;
+
+ return decayed;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7536,29 +7524,11 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
return true;
}
-static void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
- const struct sched_class *curr_class;
- struct rq_flags rf;
- bool done = true;
-
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
-
- /*
- * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
- * that RT, DL and IRQ signals have been updated before updating CFS.
- */
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
-
- /* Don't need periodic decay once load/util_avg are null */
- if (others_have_blocked(rq))
- done = false;
+ bool decayed = false;
+ int cpu = cpu_of(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
@@ -7567,9 +7537,13 @@ static void update_blocked_averages(int cpu)
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq, 0);
+ if (cfs_rq == &rq->cfs)
+ decayed = true;
+ }
+
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
@@ -7584,11 +7558,10 @@ static void update_blocked_averages(int cpu)
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
- done = false;
+ *done = false;
}
- update_blocked_load_status(rq, !done);
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
/*
@@ -7638,29 +7611,16 @@ static unsigned long task_h_load(struct task_struct *p)
cfs_rq_load_avg(cfs_rq) + 1);
}
#else
-static inline void update_blocked_averages(int cpu)
+static bool __update_blocked_fair(struct rq *rq, bool *done)
{
- struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
- const struct sched_class *curr_class;
- struct rq_flags rf;
-
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
-
- /*
- * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
- * that RT, DL and IRQ signals have been updated before updating CFS.
- */
- curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
+ bool decayed;
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ if (cfs_rq_has_blocked(cfs_rq))
+ *done = false;
- update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
- rq_unlock_irqrestore(rq, &rf);
+ return decayed;
}
static unsigned long task_h_load(struct task_struct *p)
@@ -7669,6 +7629,24 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif
+static void update_blocked_averages(int cpu)
+{
+ bool decayed = false, done = true;
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+
+ decayed |= __update_blocked_others(rq, &done);
+ decayed |= __update_blocked_fair(rq, &done);
+
+ update_blocked_load_status(rq, !done);
+ if (decayed)
+ cpufreq_update_util(rq, 0);
+ rq_unlock_irqrestore(rq, &rf);
+}
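
[Editor's sketch] update_blocked_averages() now collects the "decayed" result of the RT/DL/IRQ and CFS updates and calls cpufreq_update_util() once at the end, instead of triggering it from inside update_cfs_rq_load_avg(). A toy sketch of that accumulate-then-notify-once structure; every helper name below is a placeholder, not a kernel API.

#include <stdbool.h>
#include <stdio.h>

static bool update_others(bool *done)  { *done = false; return true;  }
static bool update_fair(bool *done)    { (void)done;    return false; }

static void cpufreq_notify(void)       { puts("cpufreq_update_util()"); }

static void update_blocked(void)
{
	bool decayed = false, done = true;

	decayed |= update_others(&done);
	decayed |= update_fair(&done);

	if (decayed)
		cpufreq_notify();	/* single notification per pass */

	if (!done)
		puts("blocked load left, keep decaying periodically");
}

int main(void)
{
	update_blocked();
	return 0;
}
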
+
/********** Helpers for find_busiest_group ************************/
/*
@@ -7677,14 +7655,14 @@ static unsigned long task_h_load(struct task_struct *p)
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long load_per_task;
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
- unsigned int sum_nr_running; /* Nr tasks running in the group */
+ unsigned int sum_nr_running; /* Nr of tasks running in the group */
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
- int group_no_capacity;
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -7699,10 +7677,10 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
- unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
+ unsigned int prefer_sibling; /* tasks should go to sibling first */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
struct sg_lb_stats local_stat; /* Statistics of the local group */
@@ -7713,19 +7691,18 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
/*
* Skimp on the clearing to avoid duplicate work. We can avoid clearing
* local_stat because update_sg_lb_stats() does a full clear/assignment.
- * We must however clear busiest_stat::avg_load because
- * update_sd_pick_busiest() reads this before assignment.
+ * We must however set busiest_stat::group_type and
+ * busiest_stat::idle_cpus to the worst busiest group because
+ * update_sd_pick_busiest() reads these before assignment.
*/
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
- .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
- .avg_load = 0UL,
- .sum_nr_running = 0,
- .group_type = group_other,
+ .idle_cpus = UINT_MAX,
+ .group_type = group_has_spare,
},
};
}
@@ -7913,13 +7890,13 @@ static inline int sg_imbalanced(struct sched_group *group)
* any benefit for the load balance.
*/
static inline bool
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running < sgs->group_weight)
return true;
if ((sgs->group_capacity * 100) >
- (sgs->group_util * env->sd->imbalance_pct))
+ (sgs->group_util * imbalance_pct))
return true;
return false;
@@ -7934,13 +7911,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
* false.
*/
static inline bool
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running <= sgs->group_weight)
return false;
if ((sgs->group_capacity * 100) <
- (sgs->group_util * env->sd->imbalance_pct))
+ (sgs->group_util * imbalance_pct))
return true;
return false;
@@ -7967,19 +7944,26 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
}
static inline enum
-group_type group_classify(struct sched_group *group,
+group_type group_classify(unsigned int imbalance_pct,
+ struct sched_group *group,
struct sg_lb_stats *sgs)
{
- if (sgs->group_no_capacity)
+ if (group_is_overloaded(imbalance_pct, sgs))
return group_overloaded;
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_asym_packing)
+ return group_asym_packing;
+
if (sgs->group_misfit_task_load)
return group_misfit_task;
- return group_other;
+ if (!group_has_capacity(imbalance_pct, sgs))
+ return group_fully_busy;
+
+ return group_has_spare;
}
static bool update_nohz_stats(struct rq *rq, bool force)
@@ -8016,21 +8000,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sg_lb_stats *sgs,
int *sg_status)
{
- int i, nr_running;
+ int i, nr_running, local_group;
memset(sgs, 0, sizeof(*sgs));
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
env->flags |= LBF_NOHZ_AGAIN;
- sgs->group_load += cpu_runnable_load(rq);
+ sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
- sgs->sum_nr_running += rq->cfs.h_nr_running;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running;
nr_running = rq->nr_running;
+ sgs->sum_nr_running += nr_running;
+
if (nr_running > 1)
*sg_status |= SG_OVERLOAD;
@@ -8044,9 +8032,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/*
* No need to call idle_cpu() if nr_running is not 0
*/
- if (!nr_running && idle_cpu(i))
+ if (!nr_running && idle_cpu(i)) {
sgs->idle_cpus++;
+ /* Idle cpu can't have misfit task */
+ continue;
+ }
+ if (local_group)
+ continue;
+
+ /* Check for a misfit task on the cpu */
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
@@ -8054,17 +8049,24 @@ static inline void update_sg_lb_stats(struct lb_env *env,
}
}
- /* Adjust by relative CPU capacity of the group */
- sgs->group_capacity = group->sgc->capacity;
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+ /* Check if dst CPU is idle and preferred to this group */
+ if (env->sd->flags & SD_ASYM_PACKING &&
+ env->idle != CPU_NOT_IDLE &&
+ sgs->sum_h_nr_running &&
+ sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
+ sgs->group_asym_packing = 1;
+ }
- if (sgs->sum_nr_running)
- sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
+ sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(group, sgs);
+ sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+
+ /* Computing avg_load makes sense only when group is overloaded */
+ if (sgs->group_type == group_overloaded)
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
}
/**
@@ -8087,6 +8089,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ /* Make sure that there is at least one task to pull */
+ if (!sgs->sum_h_nr_running)
+ return false;
+
/*
* Don't try to pull misfit tasks we can't help.
* We can use max_capacity here as reduction in capacity on some
@@ -8095,7 +8101,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
*/
if (sgs->group_type == group_misfit_task &&
(!group_smaller_max_cpu_capacity(sg, sds->local) ||
- !group_has_capacity(env, &sds->local_stat)))
+ sds->local_stat.group_type != group_has_spare))
return false;
if (sgs->group_type > busiest->group_type)
@@ -8104,62 +8110,88 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->group_type < busiest->group_type)
return false;
- if (sgs->avg_load <= busiest->avg_load)
- return false;
-
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
- goto asym_packing;
-
/*
- * Candidate sg has no more than one task per CPU and
- * has higher per-CPU capacity. Migrating tasks to less
- * capable CPUs may harm throughput. Maximize throughput,
- * power/energy consequences are not considered.
+ * The candidate and the current busiest group are the same type of
+ * group. Let's check which one is the busiest according to the type.
*/
- if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_min_cpu_capacity(sds->local, sg))
- return false;
- /*
- * If we have more than one misfit sg go with the biggest misfit.
- */
- if (sgs->group_type == group_misfit_task &&
- sgs->group_misfit_task_load < busiest->group_misfit_task_load)
+ switch (sgs->group_type) {
+ case group_overloaded:
+ /* Select the overloaded group with highest avg_load. */
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+ break;
+
+ case group_imbalanced:
+ /*
+ * Select the 1st imbalanced group as we don't have any way to
+ * choose one more than another.
+ */
return false;
-asym_packing:
- /* This is the busiest node in its class. */
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return true;
+ case group_asym_packing:
+ /* Prefer to move from lowest priority CPU's work */
+ if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
+ return false;
+ break;
- /* No ASYM_PACKING if target CPU is already busy */
- if (env->idle == CPU_NOT_IDLE)
- return true;
- /*
- * ASYM_PACKING needs to move all the work to the highest
- * prority CPUs in the group, therefore mark all groups
- * of lower priority than ourself as busy.
- */
- if (sgs->sum_nr_running &&
- sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
- if (!sds->busiest)
- return true;
+ case group_misfit_task:
+ /*
+ * If we have more than one misfit sg go with the biggest
+ * misfit.
+ */
+ if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
+ return false;
+ break;
- /* Prefer to move from lowest priority CPU's work */
- if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
- sg->asym_prefer_cpu))
- return true;
+ case group_fully_busy:
+ /*
+ * Select the fully busy group with highest avg_load. In
+ * theory, there is no need to pull task from such kind of
+ * group because tasks have all compute capacity that they need
+ * but we can still improve the overall throughput by reducing
+ * contention when accessing shared HW resources.
+ *
+ * XXX for now avg_load is not computed and always 0 so we
+ * select the 1st one.
+ */
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+ break;
+
+ case group_has_spare:
+ /*
+ * Select not overloaded group with lowest number of
+ * idle cpus. We could also compare the spare capacity
+ * which is more stable but it can end up that the
+ * group has less spare capacity but finally more idle
+ * CPUs which means less opportunity to pull tasks.
+ */
+ if (sgs->idle_cpus >= busiest->idle_cpus)
+ return false;
+ break;
}
- return false;
+ /*
+ * Candidate sg has no more than one task per CPU and has higher
+ * per-CPU capacity. Migrating tasks to less capable CPUs may harm
+ * throughput. Maximize throughput, power/energy consequences are not
+ * considered.
+ */
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type <= group_fully_busy) &&
+ (group_smaller_min_cpu_capacity(sds->local, sg)))
+ return false;
+
+ return true;
}
#ifdef CONFIG_NUMA_BALANCING
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running > sgs->nr_numa_running)
+ if (sgs->sum_h_nr_running > sgs->nr_numa_running)
return regular;
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
return remote;
return all;
}
@@ -8184,18 +8216,310 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
}
#endif /* CONFIG_NUMA_BALANCING */
+
+struct sg_lb_stats;
+
+/*
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
+ */
+
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
+{
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return 0;
+
+ if (task_on_rq_queued(p))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * idle_cpu_without - would a given CPU be idle without p ?
+ * @cpu: the processor on which idleness is tested.
+ * @p: task which should be ignored.
+ *
+ * Return: 1 if the CPU would be idle. 0 otherwise.
+ */
+static int idle_cpu_without(int cpu, struct task_struct *p)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr != rq->idle && rq->curr != p)
+ return 0;
+
+ /*
+ * rq->nr_running can't be used but an updated version without the
+ * impact of p on cpu must be used instead. The updated nr_running
+ * must be computed and tested before calling idle_cpu_without().
+ */
+
+#ifdef CONFIG_SMP
+ if (!llist_empty(&rq->wake_list))
+ return 0;
+#endif
+
+ return 1;
+}
+
+/*
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
+ * @sd: The sched_domain level to look for idlest group.
+ * @group: sched_group whose statistics are to be updated.
+ * @sgs: variable to hold the statistics for this group.
+ * @p: The task for which we look for the idlest group/CPU.
+ */
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs,
+ struct task_struct *p)
+{
+ int i, nr_running;
+
+ memset(sgs, 0, sizeof(*sgs));
+
+ for_each_cpu(i, sched_group_span(group)) {
+ struct rq *rq = cpu_rq(i);
+ unsigned int local;
+
+ sgs->group_load += cpu_load_without(rq, p);
+ sgs->group_util += cpu_util_without(i, p);
+ local = task_running_on_cpu(i, p);
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
+
+ nr_running = rq->nr_running - local;
+ sgs->sum_nr_running += nr_running;
+
+ /*
+ * No need to call idle_cpu_without() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu_without(i, p))
+ sgs->idle_cpus++;
+
+ }
+
+ /* Check if task fits in the group */
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
+ !task_fits_capacity(p, group->sgc->max_capacity)) {
+ sgs->group_misfit_task_load = 1;
+ }
+
+ sgs->group_capacity = group->sgc->capacity;
+
+ sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
+
+ /*
+ * Computing avg_load makes sense only when group is fully busy or
+ * overloaded
+ */
+ if (sgs->group_type < group_fully_busy)
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+ sgs->group_capacity;
+}
+
+static bool update_pick_idlest(struct sched_group *idlest,
+ struct sg_lb_stats *idlest_sgs,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
+{
+ if (sgs->group_type < idlest_sgs->group_type)
+ return true;
+
+ if (sgs->group_type > idlest_sgs->group_type)
+ return false;
+
+ /*
+ * The candidate and the current idlest group are the same type of
+ * group. Let's check which one is the idlest according to the type.
+ */
+
+ switch (sgs->group_type) {
+ case group_overloaded:
+ case group_fully_busy:
+ /* Select the group with lowest avg_load. */
+ if (idlest_sgs->avg_load <= sgs->avg_load)
+ return false;
+ break;
+
+ case group_imbalanced:
+ case group_asym_packing:
+ /* Those types are not used in the slow wakeup path */
+ return false;
+
+ case group_misfit_task:
+ /* Select group with the highest max capacity */
+ if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
+ return false;
+ break;
+
+ case group_has_spare:
+ /* Select group with most idle CPUs */
+ if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
+ return false;
+ break;
+ }
+
+ return true;
+}
+
+/*
+ * find_idlest_group() finds and returns the least busy CPU group within the
+ * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int sd_flag)
+{
+ struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
+ struct sg_lb_stats local_sgs, tmp_sgs;
+ struct sg_lb_stats *sgs;
+ unsigned long imbalance;
+ struct sg_lb_stats idlest_sgs = {
+ .avg_load = UINT_MAX,
+ .group_type = group_overloaded,
+ };
+
+ imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
+
+ do {
+ int local_group;
+
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpumask_intersects(sched_group_span(group),
+ p->cpus_ptr))
+ continue;
+
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_span(group));
+
+ if (local_group) {
+ sgs = &local_sgs;
+ local = group;
+ } else {
+ sgs = &tmp_sgs;
+ }
+
+ update_sg_wakeup_stats(sd, group, sgs, p);
+
+ if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
+ idlest = group;
+ idlest_sgs = *sgs;
+ }
+
+ } while (group = group->next, group != sd->groups);
+
+
+ /* There is no idlest group to push tasks to */
+ if (!idlest)
+ return NULL;
+
+ /*
+ * If the local group is idler than the selected idlest group
+ * don't try and push the task.
+ */
+ if (local_sgs.group_type < idlest_sgs.group_type)
+ return NULL;
+
+ /*
+ * If the local group is busier than the selected idlest group
+ * try and push the task.
+ */
+ if (local_sgs.group_type > idlest_sgs.group_type)
+ return idlest;
+
+ switch (local_sgs.group_type) {
+ case group_overloaded:
+ case group_fully_busy:
+ /*
+ * When comparing groups across NUMA domains, it's possible for
+ * the local domain to be very lightly loaded relative to the
+ * remote domains but "imbalance" skews the comparison making
+ * remote CPUs look much more favourable. When considering
+ * cross-domain, add imbalance to the load on the remote node
+ * and consider staying local.
+ */
+
+ if ((sd->flags & SD_NUMA) &&
+ ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
+ return NULL;
+
+ /*
+ * If the local group is less loaded than the selected
+ * idlest group don't try and push any tasks.
+ */
+ if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
+ return NULL;
+
+ if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
+ return NULL;
+ break;
+
+ case group_imbalanced:
+ case group_asym_packing:
+ /* Those types are not used in the slow wakeup path */
+ return NULL;
+
+ case group_misfit_task:
+ /* Select group with the highest max capacity */
+ if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
+ return NULL;
+ break;
+
+ case group_has_spare:
+ if (sd->flags & SD_NUMA) {
+#ifdef CONFIG_NUMA_BALANCING
+ int idlest_cpu;
+ /*
+ * If there is spare capacity at NUMA, try to select
+ * the preferred node
+ */
+ if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
+ return NULL;
+
+ idlest_cpu = cpumask_first(sched_group_span(idlest));
+ if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
+ return idlest;
+#endif
+ /*
+ * Otherwise, keep the task on this node to stay close
+ * to its wakeup source and improve locality. If there is
+ * a real need of migration, periodic load balance will
+ * take care of it.
+ */
+ if (local_sgs.idle_cpus)
+ return NULL;
+ }
+
+ /*
+ * Select group with highest number of idle CPUs. We could also
+ * compare the utilization which is more stable but it can end
+ * up that the group has less spare capacity but finally more
+ * idle CPUs which means more opportunity to run tasks.
+ */
+ if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
+ return NULL;
+ break;
+ }
+
+ return idlest;
+}
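
[Editor's sketch] In the group_overloaded/group_fully_busy case above, find_idlest_group() only pushes the task to a remote NUMA node when the remote group's average load beats the local one by more than an imbalance margin derived from imbalance_pct. A worked stand-alone sketch with made-up numbers; the constant 1024 stands in for scale_load_down(NICE_0_LOAD).

#include <stdbool.h>
#include <stdio.h>

static bool push_to_remote(unsigned long local_load, unsigned long idlest_load,
			   unsigned int imbalance_pct)
{
	unsigned long imbalance = 1024UL * (imbalance_pct - 100) / 100;

	/* Keep the task local unless the remote node is clearly idler. */
	if (idlest_load + imbalance >= local_load)
		return false;
	return true;
}

int main(void)
{
	/* imbalance_pct = 125 -> margin of 256 in the comparison */
	printf("%d\n", push_to_remote(1300, 1100, 125));	/* 0: stay local */
	printf("%d\n", push_to_remote(1500, 1100, 125));	/* 1: push remote */
	return 0;
}
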
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
+
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
int sg_status = 0;
#ifdef CONFIG_NO_HZ_COMMON
@@ -8222,22 +8546,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (local_group)
goto next_group;
- /*
- * In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity so that we'll try
- * and move all the excess tasks away. We lower the capacity
- * of a group only if the local group has the capacity to fit
- * these excess tasks. The extra check prevents the case where
- * you always pull from the heaviest group when it is already
- * under-utilized (possible with a large weight task outweighs
- * the tasks on the system).
- */
- if (prefer_sibling && sds->local &&
- group_has_capacity(env, local) &&
- (sgs->sum_nr_running > local->sum_nr_running + 1)) {
- sgs->group_no_capacity = 1;
- sgs->group_type = group_classify(sg, sgs);
- }
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -8246,13 +8554,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
- sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
+ /* Tag domain that child domain prefers tasks go to siblings first */
+ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+
#ifdef CONFIG_NO_HZ_COMMON
if ((env->flags & LBF_NOHZ_AGAIN) &&
cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
@@ -8283,203 +8593,160 @@ next_group:
}
/**
- * check_asym_packing - Check to see if the group is packed into the
- * sched domain.
- *
- * This is primarily intended to used at the sibling level. Some
- * cores like POWER7 prefer to use lower numbered SMT threads. In the
- * case of POWER7, it can move to lower SMT modes only when higher
- * threads are idle. When in lower SMT modes, the threads will
- * perform better since they share less core resources. Hence when we
- * have idle threads, we want them to be the higher ones.
- *
- * This packing function is run on idle threads. It checks to see if
- * the busiest CPU in this domain (core in the P7 case) has a higher
- * CPU number than the packing function is being run on. Here we are
- * assuming lower CPU number will be equivalent to lower a SMT thread
- * number.
- *
- * Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in env->imbalance.
- *
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain which is to be packed
- */
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
-{
- int busiest_cpu;
-
- if (!(env->sd->flags & SD_ASYM_PACKING))
- return 0;
-
- if (env->idle == CPU_NOT_IDLE)
- return 0;
-
- if (!sds->busiest)
- return 0;
-
- busiest_cpu = sds->busiest->asym_prefer_cpu;
- if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
- return 0;
-
- env->imbalance = sds->busiest_stat.group_load;
-
- return 1;
-}
-
-/**
- * fix_small_imbalance - Calculate the minor imbalance that exists
- * amongst the groups of a sched_domain, during
- * load balancing.
- * @env: The load balancing environment.
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ * groups of a given sched_domain during load balance.
+ * @env: load balance environment
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
*/
-static inline
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
+static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
- unsigned long tmp, capa_now = 0, capa_move = 0;
- unsigned int imbn = 2;
- unsigned long scaled_busy_load_per_task;
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
- if (!local->sum_nr_running)
- local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
- else if (busiest->load_per_task > local->load_per_task)
- imbn = 1;
+ if (busiest->group_type == group_misfit_task) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ return;
+ }
- scaled_busy_load_per_task =
- (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
- busiest->group_capacity;
+ if (busiest->group_type == group_asym_packing) {
+ /*
+ * In the asym_packing case, we try to migrate all the load to
+ * the preferred CPU.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = busiest->sum_h_nr_running;
+ return;
+ }
- if (busiest->avg_load + scaled_busy_load_per_task >=
- local->avg_load + (scaled_busy_load_per_task * imbn)) {
- env->imbalance = busiest->load_per_task;
+ if (busiest->group_type == group_imbalanced) {
+ /*
+ * In the group_imb case we cannot rely on group-wide averages
+ * to ensure CPU-load equilibrium; try to move any task to fix
+ * the imbalance. The next load balance will take care of
+ * balancing back the system.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
return;
}
/*
- * OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU capacity used by
- * moving them.
+ * Try to use spare capacity of local group without overloading it or
+ * emptying busiest.
+ * XXX Spreading tasks across NUMA nodes is not always the best policy
+ * and special care should be taken for SD_NUMA domain level before
+ * spreading the tasks. For now, load_balance() fully relies on
+ * NUMA_BALANCING and fbq_classify_group/rq to override the decision.
*/
+ if (local->group_type == group_has_spare) {
+ if (busiest->group_type > group_fully_busy) {
+ /*
+ * If busiest is overloaded, try to fill spare
+ * capacity. This might end up creating spare capacity
+ * in busiest or busiest still being overloaded but
+ * there is no simple way to directly compute the
+ * amount of load to migrate in order to balance the
+ * system.
+ */
+ env->migration_type = migrate_util;
+ env->imbalance = max(local->group_capacity, local->group_util) -
+ local->group_util;
- capa_now += busiest->group_capacity *
- min(busiest->load_per_task, busiest->avg_load);
- capa_now += local->group_capacity *
- min(local->load_per_task, local->avg_load);
- capa_now /= SCHED_CAPACITY_SCALE;
-
- /* Amount of load we'd subtract */
- if (busiest->avg_load > scaled_busy_load_per_task) {
- capa_move += busiest->group_capacity *
- min(busiest->load_per_task,
- busiest->avg_load - scaled_busy_load_per_task);
- }
-
- /* Amount of load we'd add */
- if (busiest->avg_load * busiest->group_capacity <
- busiest->load_per_task * SCHED_CAPACITY_SCALE) {
- tmp = (busiest->avg_load * busiest->group_capacity) /
- local->group_capacity;
- } else {
- tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
- local->group_capacity;
- }
- capa_move += local->group_capacity *
- min(local->load_per_task, local->avg_load + tmp);
- capa_move /= SCHED_CAPACITY_SCALE;
-
- /* Move if we gain throughput */
- if (capa_move > capa_now)
- env->imbalance = busiest->load_per_task;
-}
+ /*
+ * In some cases, the group's utilization is at or even
+ * above its capacity because of migrations, but the
+ * local CPU is (newly) idle. There is at least one
+ * waiting task in this overloaded busiest group. Let's
+ * try to pull it.
+ */
+ if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ }
-/**
- * calculate_imbalance - Calculate the amount of imbalance present within the
- * groups of a given sched_domain during load balance.
- * @env: load balance environment
- * @sds: statistics of the sched_domain whose imbalance is to be calculated.
- */
-static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
-{
- unsigned long max_pull, load_above_capacity = ~0UL;
- struct sg_lb_stats *local, *busiest;
+ return;
+ }
- local = &sds->local_stat;
- busiest = &sds->busiest_stat;
+ if (busiest->group_weight == 1 || sds->prefer_sibling) {
+ unsigned int nr_diff = busiest->sum_nr_running;
+ /*
+ * When prefer_sibling is set, evenly spread the running
+ * tasks across the groups.
+ */
+ env->migration_type = migrate_task;
+ lsub_positive(&nr_diff, local->sum_nr_running);
+ env->imbalance = nr_diff >> 1;
+ return;
+ }
- if (busiest->group_type == group_imbalanced) {
/*
- * In the group_imb case we cannot rely on group-wide averages
- * to ensure CPU-load equilibrium, look at wider averages. XXX
+ * If there is no overload, we just want to even out the
+ * number of idle CPUs.
*/
- busiest->load_per_task =
- min(busiest->load_per_task, sds->avg_load);
+ env->migration_type = migrate_task;
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
+ busiest->idle_cpus) >> 1);
+ return;
}
/*
- * Avg load of busiest sg can be less and avg load of local sg can
- * be greater than avg load across all sgs of sd because avg load
- * factors in sg capacity and sgs with smaller group_type are
- * skipped when updating the busiest sg:
+ * Local is fully busy but has to take more load to relieve the
+ * busiest group
*/
- if (busiest->group_type != group_misfit_task &&
- (busiest->avg_load <= sds->avg_load ||
- local->avg_load >= sds->avg_load)) {
- env->imbalance = 0;
- return fix_small_imbalance(env, sds);
- }
+ if (local->group_type < group_overloaded) {
+ /*
+ * Local will become overloaded so the avg_load metrics are
+ * finally needed.
+ */
- /*
- * If there aren't any idle CPUs, avoid creating some.
- */
- if (busiest->group_type == group_overloaded &&
- local->group_type == group_overloaded) {
- load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
- if (load_above_capacity > busiest->group_capacity) {
- load_above_capacity -= busiest->group_capacity;
- load_above_capacity *= scale_load_down(NICE_0_LOAD);
- load_above_capacity /= busiest->group_capacity;
- } else
- load_above_capacity = ~0UL;
+ local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
+
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ sds->total_capacity;
}
/*
- * We're trying to get all the CPUs to the average_load, so we don't
- * want to push ourselves above the average load, nor do we wish to
- * reduce the max loaded CPU below the average load. At the same time,
- * we also don't want to reduce the group load below the group
- * capacity. Thus we look for the minimum possible imbalance.
+ * Both groups are or will become overloaded and we're trying to get all
+ * the CPUs to the average_load, so we don't want to push ourselves
+ * above the average load, nor do we wish to reduce the max loaded CPU
+ * below the average load. At the same time, we also don't want to
+ * reduce the group load below the group capacity. Thus we look for
+ * the minimum possible imbalance.
*/
- max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
-
- /* How much load to actually move to equalise the imbalance */
+ env->migration_type = migrate_load;
env->imbalance = min(
- max_pull * busiest->group_capacity,
+ (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
-
- /* Boost imbalance to allow misfit task to be balanced. */
- if (busiest->group_type == group_misfit_task) {
- env->imbalance = max_t(long, env->imbalance,
- busiest->group_misfit_task_load);
- }
-
- /*
- * if *imbalance is less than the average load per runnable task
- * there is no guarantee that any tasks will be moved so we'll have
- * a think about bumping its value to force at least one task to be
- * moved
- */
- if (env->imbalance < busiest->load_per_task)
- return fix_small_imbalance(env, sds);
}
/******* find_busiest_group() helpers end here *********************/
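The rework above replaces the old avg_load-only imbalance with a per-case migration_type. As a rough, stand-alone illustration of that branch order (plain C with made-up struct fields and values; env->idle handling and the avg_load arithmetic of the overloaded path are deliberately left out), something like the following captures the shape of the new calculate_imbalance():

#include <stdio.h>

/* Stand-in types: the names follow the patch, the values are made up. */
enum group_type { group_has_spare, group_fully_busy, group_misfit_task,
		  group_asym_packing, group_imbalanced, group_overloaded };
enum migration_type { migrate_load, migrate_util, migrate_task, migrate_misfit };

struct grp_stats {
	enum group_type type;
	unsigned long capacity, util;	/* capacity and utilization, same scale */
	unsigned int nr_running, idle_cpus, weight;
};

/*
 * Same branch order as the reworked calculate_imbalance(); the env->idle
 * special case and the min(avg_load deltas) step are elided.
 */
static void calc_imbalance(const struct grp_stats *local,
			   const struct grp_stats *busiest,
			   int prefer_sibling,
			   enum migration_type *mt, long *imbalance)
{
	if (busiest->type == group_misfit_task) {
		*mt = migrate_misfit;
		*imbalance = 1;
		return;
	}
	if (busiest->type == group_asym_packing) {
		*mt = migrate_task;
		*imbalance = busiest->nr_running;
		return;
	}
	if (busiest->type == group_imbalanced) {
		*mt = migrate_task;
		*imbalance = 1;
		return;
	}
	if (local->type == group_has_spare) {
		if (busiest->type > group_fully_busy) {
			/* Fill local's spare capacity with utilization. */
			*mt = migrate_util;
			*imbalance = local->capacity > local->util ?
				     (long)(local->capacity - local->util) : 0;
			return;
		}
		if (busiest->weight == 1 || prefer_sibling) {
			/* Evenly spread the running tasks (lsub_positive()). */
			unsigned int diff = busiest->nr_running > local->nr_running ?
					    busiest->nr_running - local->nr_running : 0;
			*mt = migrate_task;
			*imbalance = diff >> 1;
			return;
		}
		/* No overload: even out the number of idle CPUs instead. */
		*mt = migrate_task;
		*imbalance = local->idle_cpus > busiest->idle_cpus ?
			     (long)(local->idle_cpus - busiest->idle_cpus) / 2 : 0;
		return;
	}
	/* Both groups are (or will become) overloaded: fall back to avg_load. */
	*mt = migrate_load;
	*imbalance = 0;			/* avg_load arithmetic elided here */
}

int main(void)
{
	struct grp_stats local   = { group_has_spare,  1024,  300, 2, 2, 4 };
	struct grp_stats busiest = { group_overloaded, 1024, 1024, 6, 0, 4 };
	enum migration_type mt;
	long imbalance;

	calc_imbalance(&local, &busiest, 0, &mt, &imbalance);
	printf("migration_type=%d imbalance=%ld\n", mt, imbalance); /* 1, 724 */
	return 0;
}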
+/*
+ * Decision matrix according to the local and busiest group type:
+ *
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
+ * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
+ * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
+ * misfit_task      force     N/A        N/A    N/A  force      force
+ * asym_packing     force     force      N/A    N/A  force      force
+ * imbalanced       force     force      N/A    N/A  force      force
+ * overloaded       force     force      N/A    N/A  force      avg_load
+ *
+ * N/A : Not Applicable because already filtered while updating
+ * statistics.
+ * balanced : The system is balanced for these 2 groups.
+ * force : Calculate the imbalance as load migration is probably needed.
+ * avg_load : Only if imbalance is significant enough.
+ * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
+ * different between the groups.
+ */
+
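One way to sanity-check the matrix is to transcribe it into a lookup table; the stand-alone sketch below (invented enum ordering and helper names, not kernel code) does exactly that and prints the expected outcome for one busiest/local pairing.

#include <stdio.h>

/* Same six group types as the patch; the order only matters for indexing. */
enum group_type { has_spare, fully_busy, misfit_task, asym_packing,
		  imbalanced, overloaded, NR_GROUP_TYPES };

/* Possible outcomes, named after the legend of the decision matrix. */
enum lb_decision { NA, BALANCED, FORCE, AVG_LOAD, NR_IDLE };

static const char * const decision_name[] = {
	"N/A", "balanced", "force", "avg_load", "nr_idle"
};

/* decision[busiest][local], transcribed row by row from the comment above. */
static const enum lb_decision decision[NR_GROUP_TYPES][NR_GROUP_TYPES] = {
	[has_spare]    = { NR_IDLE, BALANCED, NA, NA, BALANCED, BALANCED },
	[fully_busy]   = { NR_IDLE, NR_IDLE,  NA, NA, BALANCED, BALANCED },
	[misfit_task]  = { FORCE,   NA,       NA, NA, FORCE,    FORCE    },
	[asym_packing] = { FORCE,   FORCE,    NA, NA, FORCE,    FORCE    },
	[imbalanced]   = { FORCE,   FORCE,    NA, NA, FORCE,    FORCE    },
	[overloaded]   = { FORCE,   FORCE,    NA, NA, FORCE,    AVG_LOAD },
};

int main(void)
{
	/* Example: busiest is overloaded while local still has spare CPUs. */
	printf("busiest=overloaded, local=has_spare -> %s\n",
	       decision_name[decision[overloaded][has_spare]]);
	return 0;
}

Running it prints "force" for busiest=overloaded, local=has_spare, matching the last row of the table.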
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
@@ -8499,7 +8766,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
init_sd_lb_stats(&sds);
/*
- * Compute the various statistics relavent for load balancing at
+ * Compute the various statistics relevant for load balancing at
* this level.
*/
update_sd_lb_stats(env, &sds);
@@ -8514,17 +8781,17 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
local = &sds.local_stat;
busiest = &sds.busiest_stat;
- /* ASYM feature bypasses nice load balance check */
- if (check_asym_packing(env, &sds))
- return sds.busiest;
-
/* There is no busy sibling group to pull tasks from */
- if (!sds.busiest || busiest->sum_nr_running == 0)
+ if (!sds.busiest)
goto out_balanced;
- /* XXX broken for overlapping NUMA groups */
- sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
- / sds.total_capacity;
+ /* Misfit tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task)
+ goto force_balance;
+
+ /* ASYM feature bypasses nice load balance check */
+ if (busiest->group_type == group_asym_packing)
+ goto force_balance;
/*
* If the busiest group is imbalanced the below checks don't
@@ -8535,55 +8802,80 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/*
- * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
- * capacities from resulting in underutilization due to avg_load.
- */
- if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
- busiest->group_no_capacity)
- goto force_balance;
-
- /* Misfit tasks should be dealt with regardless of the avg load */
- if (busiest->group_type == group_misfit_task)
- goto force_balance;
-
- /*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
- if (local->avg_load >= busiest->avg_load)
+ if (local->group_type > busiest->group_type)
goto out_balanced;
/*
- * Don't pull any tasks if this group is already above the domain
- * average load.
+ * When groups are overloaded, use the avg_load to ensure fairness
+ * between tasks.
*/
- if (local->avg_load >= sds.avg_load)
- goto out_balanced;
+ if (local->group_type == group_overloaded) {
+ /*
+ * If the local group is more loaded than the selected
+ * busiest group don't try to pull any tasks.
+ */
+ if (local->avg_load >= busiest->avg_load)
+ goto out_balanced;
+
+ /* XXX broken for overlapping NUMA groups */
+ sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
+ sds.total_capacity;
- if (env->idle == CPU_IDLE) {
/*
- * This CPU is idle. If the busiest group is not overloaded
- * and there is no imbalance between this and busiest group
- * wrt idle CPUs, it is balanced. The imbalance becomes
- * significant if the diff is greater than 1 otherwise we
- * might end up to just move the imbalance on another group
+ * Don't pull any tasks if this group is already above the
+ * domain average load.
*/
- if ((busiest->group_type != group_overloaded) &&
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ if (local->avg_load >= sds.avg_load)
goto out_balanced;
- } else {
+
/*
- * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
- * imbalance_pct to be conservative.
+ * If the busiest group is more loaded, use imbalance_pct to be
+ * conservative.
*/
if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
+ /* Try to move all excess tasks to child's sibling domain */
+ if (sds.prefer_sibling && local->group_type == group_has_spare &&
+ busiest->sum_nr_running > local->sum_nr_running + 1)
+ goto force_balance;
+
+ if (busiest->group_type != group_overloaded) {
+ if (env->idle == CPU_NOT_IDLE)
+ /*
+ * If the busiest group is not overloaded (and, as a
+ * result, neither is the local one) but this CPU is
+ * already busy, let another idle CPU try to pull tasks.
+ */
+ goto out_balanced;
+
+ if (busiest->group_weight > 1 &&
+ local->idle_cpus <= (busiest->idle_cpus + 1))
+ /*
+ * If the busiest group is not overloaded
+ * and there is no imbalance between this and the busiest
+ * group wrt idle CPUs, it is balanced. The imbalance
+ * becomes significant if the diff is greater than 1;
+ * otherwise we might end up just moving the imbalance
+ * to another group. Of course this applies only if
+ * there is more than 1 CPU per group.
+ */
+ goto out_balanced;
+
+ if (busiest->sum_h_nr_running == 1)
+ /*
+ * busiest doesn't have any tasks waiting to run
+ */
+ goto out_balanced;
+ }
+
force_balance:
/* Looks like there is an imbalance. Compute it */
- env->src_grp_type = busiest->group_type;
calculate_imbalance(env, &sds);
return env->imbalance ? sds.busiest : NULL;
@@ -8599,11 +8891,13 @@ static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
- unsigned long busiest_load = 0, busiest_capacity = 1;
+ unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
+ unsigned int busiest_nr = 0;
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
- unsigned long capacity, load;
+ unsigned long capacity, load, util;
+ unsigned int nr_running;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -8631,20 +8925,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- /*
- * For ASYM_CPUCAPACITY domains with misfit tasks we simply
- * seek the "biggest" misfit task.
- */
- if (env->src_grp_type == group_misfit_task) {
- if (rq->misfit_task_load > busiest_load) {
- busiest_load = rq->misfit_task_load;
- busiest = rq;
- }
-
- continue;
- }
-
capacity = capacity_of(i);
+ nr_running = rq->cfs.h_nr_running;
/*
* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
@@ -8654,35 +8936,69 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
capacity_of(env->dst_cpu) < capacity &&
- rq->nr_running == 1)
+ nr_running == 1)
continue;
- load = cpu_runnable_load(rq);
+ switch (env->migration_type) {
+ case migrate_load:
+ /*
+ * When comparing with load imbalance, use cpu_load()
+ * which is not scaled with the CPU capacity.
+ */
+ load = cpu_load(rq);
- /*
- * When comparing with imbalance, use cpu_runnable_load()
- * which is not scaled with the CPU capacity.
- */
+ if (nr_running == 1 && load > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
+ break;
- if (rq->nr_running == 1 && load > env->imbalance &&
- !check_cpu_capacity(rq, env->sd))
- continue;
+ /*
+ * For the load comparisons with the other CPUs,
+ * consider the cpu_load() scaled with the CPU
+ * capacity, so that the load can be moved away
+ * from the CPU that is potentially running at a
+ * lower capacity.
+ *
+ * Thus we're looking for max(load_i / capacity_i);
+ * crosswise multiplication to rid ourselves of the
+ * division works out to:
+ * load_i * capacity_j > load_j * capacity_i;
+ * where j is our previous maximum.
+ */
+ if (load * busiest_capacity > busiest_load * capacity) {
+ busiest_load = load;
+ busiest_capacity = capacity;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_util:
+ util = cpu_util(cpu_of(rq));
+
+ if (busiest_util < util) {
+ busiest_util = util;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_task:
+ if (busiest_nr < nr_running) {
+ busiest_nr = nr_running;
+ busiest = rq;
+ }
+ break;
+
+ case migrate_misfit:
+ /*
+ * For ASYM_CPUCAPACITY domains with misfit tasks we
+ * simply seek the "biggest" misfit task.
+ */
+ if (rq->misfit_task_load > busiest_load) {
+ busiest_load = rq->misfit_task_load;
+ busiest = rq;
+ }
+
+ break;
- /*
- * For the load comparisons with the other CPU's, consider
- * the cpu_runnable_load() scaled with the CPU capacity, so
- * that the load can be moved away from the CPU that is
- * potentially running at a lower capacity.
- *
- * Thus we're looking for max(load_i / capacity_i), crosswise
- * multiplication to rid ourselves of the division works out
- * to: load_i * capacity_j > load_j * capacity_i; where j is
- * our previous maximum.
- */
- if (load * busiest_capacity > busiest_load * capacity) {
- busiest_load = load;
- busiest_capacity = capacity;
- busiest = rq;
}
}
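The migrate_load case above keeps the running maximum of load/capacity without dividing: it stores the (load, capacity) pair of the current busiest candidate and compares load * busiest_capacity against busiest_load * capacity. A tiny stand-alone check of that identity, with made-up numbers:

#include <assert.h>
#include <stdio.h>

/* Cross-multiplied form of load_i / capacity_i > load_j / capacity_j. */
static int heavier(unsigned long load_i, unsigned long cap_i,
		   unsigned long load_j, unsigned long cap_j)
{
	return load_i * cap_j > load_j * cap_i;
}

int main(void)
{
	/* CPU i: load 600 at capacity 512 (~1.17); CPU j: load 900 at 1024 (~0.88). */
	unsigned long load_i = 600, cap_i = 512;
	unsigned long load_j = 900, cap_j = 1024;

	assert(heavier(load_i, cap_i, load_j, cap_j));
	printf("i is relatively busier: %lu/%lu > %lu/%lu\n",
	       load_i, cap_i, load_j, cap_j);
	return 0;
}

With load_i/capacity_i at about 1.17 and load_j/capacity_j at about 0.88, the cross-multiplied comparison (614400 > 460800) picks CPU i, exactly as the division would, while staying in integer arithmetic.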
@@ -8728,7 +9044,7 @@ voluntary_active_balance(struct lb_env *env)
return 1;
}
- if (env->src_grp_type == group_misfit_task)
+ if (env->migration_type == migrate_misfit)
return 1;
return 0;
@@ -9757,6 +10073,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
+ *
+ * Returns:
+ * < 0 - we released the lock and there are !fair tasks present
+ * 0 - failed, no new tasks
+ * > 0 - success, new (fair) tasks present
*/
int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
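The tri-state return value lets the caller tell "the lock was dropped and higher-class work may now exist" apart from "no new tasks" and "new fair tasks". A hedged sketch of a caller acting on it (stub function and sentinel, not the actual fair.c code, which retries its own pick when new fair tasks appear):

#include <stdio.h>

struct task;				/* opaque stand-in for task_struct */
#define RETRY_TASK ((struct task *)-1)	/* same idea as the core's RETRY_TASK sentinel */

/* Stub standing in for newidle_balance(): returns <0, 0 or >0 as documented above. */
static int newidle_balance_stub(void)
{
	return -1;	/* pretend the lock was dropped and !fair tasks appeared */
}

/* Sketch of how a pick path can react to the tri-state result. */
static struct task *pick_or_retry(void)
{
	int new_tasks = newidle_balance_stub();

	if (new_tasks < 0)
		return RETRY_TASK;	/* restart: a higher class may now have work */
	if (new_tasks > 0)
		return NULL;		/* placeholder: would retry the fair pick */
	return NULL;			/* nothing new to run in this class */
}

int main(void)
{
	printf("%s\n", pick_or_retry() == RETRY_TASK ? "RETRY_TASK" : "no task");
	return 0;
}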
@@ -10151,7 +10472,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
-static void set_next_task_fair(struct rq *rq, struct task_struct *p)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
@@ -10433,7 +10754,7 @@ const struct sched_class fair_sched_class = {
.check_preempt_curr = check_preempt_wakeup,
- .pick_next_task = pick_next_task_fair,
+ .pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 2410db5e9a35..7481cd96f391 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -89,3 +89,4 @@ SCHED_FEAT(WA_BIAS, true)
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
+SCHED_FEAT(UTIL_EST_FASTUP, true)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80167b1bd98a..ffa959e91227 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -391,21 +391,17 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
}
-static void set_next_task_idle(struct rq *rq, struct task_struct *next)
+static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
}
-static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
- if (prev)
- put_prev_task(rq, prev);
-
- set_next_task_idle(rq, next);
+ set_next_task_idle(rq, next, true);
return next;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9b8adc01be3d..e591d40fd645 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1515,13 +1515,16 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
-static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
+ if (!first)
+ return;
+
/*
* If prev task was rt, put_prev_task() has already updated the
* utilization. We only care of the case where we start to schedule a
@@ -1564,18 +1567,15 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}
-static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_rt(struct rq *rq)
{
struct task_struct *p;
- WARN_ON_ONCE(prev || rf);
-
if (!sched_rt_runnable(rq))
return NULL;
p = _pick_next_task_rt(rq);
- set_next_task_rt(rq, p);
+ set_next_task_rt(rq, p, true);
return p;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c8870c5bd7df..280a3c735935 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1713,22 +1713,10 @@ struct sched_class {
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
- /*
- * Both @prev and @rf are optional and may be NULL, in which case the
- * caller must already have invoked put_prev_task(rq, prev, rf).
- *
- * Otherwise it is the responsibility of the pick_next_task() to call
- * put_prev_task() on the @prev task or something equivalent, IFF it
- * returns a next task.
- *
- * In that case (@rf != NULL) it may return RETRY_TASK when it finds a
- * higher prio class has runnable tasks.
- */
- struct task_struct * (*pick_next_task)(struct rq *rq,
- struct task_struct *prev,
- struct rq_flags *rf);
+ struct task_struct *(*pick_next_task)(struct rq *rq);
+
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
- void (*set_next_task)(struct rq *rq, struct task_struct *p);
+ void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
@@ -1780,7 +1768,7 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
WARN_ON_ONCE(rq->curr != next);
- next->sched_class->set_next_task(rq, next);
+ next->sched_class->set_next_task(rq, next, false);
}
#ifdef CONFIG_SMP
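With @prev and @rf gone from ->pick_next_task() and a @first flag added to ->set_next_task(), a class implementation follows the shape visible in idle.c, rt.c and stop_task.c elsewhere in this diff. A toy, user-space sketch of just those two hooks (stub rq/task types; put_prev_task() and the SMP balance() hook are omitted):

#include <stdbool.h>
#include <stdio.h>
#include <stddef.h>

/* Toy stand-ins for the kernel types; only what the hook shapes need. */
struct task_struct { const char *comm; };
struct rq { struct task_struct *one_task; };

struct sched_class {
	struct task_struct *(*pick_next_task)(struct rq *rq);
	void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
};

/* "first" tells the class this is a real pick, not just a bookkeeping update. */
static void set_next_task_toy(struct rq *rq, struct task_struct *p, bool first)
{
	(void)rq;
	printf("set_next_task(%s, first=%d)\n", p->comm, first);
}

/* No more @prev/@rf: the core has already done put_prev_task() by now. */
static struct task_struct *pick_next_task_toy(struct rq *rq)
{
	struct task_struct *p = rq->one_task;

	if (!p)
		return NULL;
	set_next_task_toy(rq, p, true);
	return p;
}

static const struct sched_class toy_sched_class = {
	.pick_next_task	= pick_next_task_toy,
	.set_next_task	= set_next_task_toy,
};

int main(void)
{
	struct task_struct t = { "toy" };
	struct rq rq = { &t };
	struct task_struct *p = toy_sched_class.pick_next_task(&rq);

	printf("picked %s\n", p ? p->comm : "nothing");
	return 0;
}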
@@ -1821,6 +1809,9 @@ static inline bool sched_fair_runnable(struct rq *rq)
return rq->cfs.nr_running > 0;
}
+extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+extern struct task_struct *pick_next_task_idle(struct rq *rq);
+
#ifdef CONFIG_SMP
extern void update_group_capacity(struct sched_domain *sd, int cpu);
@@ -2309,7 +2300,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
-enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static __always_inline
unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index c0640739e05e..4c9e9975684f 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -29,20 +29,17 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
-static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
+static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool first)
{
stop->se.exec_start = rq_clock_task(rq);
}
-static struct task_struct *
-pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_stop(struct rq *rq)
{
- WARN_ON_ONCE(prev || rf);
-
if (!sched_stop_runnable(rq))
return NULL;
- set_next_task_stop(rq, rq->stop);
+ set_next_task_stop(rq, rq->stop, true);
return rq->stop;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 49b835f1305f..6ec1e595b1d4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1201,16 +1201,13 @@ static void set_domain_attribute(struct sched_domain *sd,
if (!attr || attr->relax_domain_level < 0) {
if (default_relax_domain_level < 0)
return;
- else
- request = default_relax_domain_level;
+ request = default_relax_domain_level;
} else
request = attr->relax_domain_level;
- if (request < sd->level) {
+
+ if (sd->level > request) {
/* Turn off idle balance on this domain: */
sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
- } else {
- /* Turn on idle balance on this domain: */
- sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
}
}
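After the simplification, set_domain_attribute() only ever clears SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE on domains above the requested relax level; it no longer turns them back on for the others. A stand-alone sketch of that clearing behaviour (made-up flag values and levels; the attr/default_relax_domain_level handling is left out):

#include <stdio.h>

/* Made-up flag bits and domain levels, purely for illustration. */
#define SD_BALANCE_NEWIDLE	0x01
#define SD_BALANCE_WAKE		0x02

struct sched_domain { int level; unsigned int flags; };

/* Mirror of the simplified logic: only ever clear, never set. */
static void set_domain_attribute(struct sched_domain *sd, int request)
{
	if (sd->level > request)
		sd->flags &= ~(SD_BALANCE_WAKE | SD_BALANCE_NEWIDLE);
}

int main(void)
{
	struct sched_domain smt  = { 0, SD_BALANCE_NEWIDLE };
	struct sched_domain numa = { 2, SD_BALANCE_NEWIDLE };
	int relax_level = 1;	/* e.g. coming from a cpuset attribute */

	set_domain_attribute(&smt, relax_level);
	set_domain_attribute(&numa, relax_level);
	printf("smt flags=%#x numa flags=%#x\n", smt.flags, numa.flags);
	return 0;
}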