summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c341
1 files changed, 268 insertions, 73 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b39fb596f6c1..ee271bb661cc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
+static unsigned long capacity_of(int cpu);
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
@@ -1392,6 +1393,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int last_cpupid, this_cpupid;
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+
+ /*
+ * Allow first faults or private faults to migrate immediately early in
+ * the lifetime of a task. The magic number 4 is based on waiting for
+ * two full passes of the "multi-stage node selection" test that is
+ * executed below.
+ */
+ if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
+ return true;
/*
* Multi-stage node selection is used in conjunction with a periodic
@@ -1410,7 +1422,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* This quadric squishes small probabilities, making it less likely we
* act on an unlikely task<->page relation.
*/
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
if (!cpupid_pid_unset(last_cpupid) &&
cpupid_to_nid(last_cpupid) != dst_nid)
return false;
@@ -1446,7 +1457,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
-static unsigned long capacity_of(int cpu);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -1454,8 +1464,6 @@ struct numa_stats {
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
-
- unsigned int nr_running;
};
/*
@@ -1463,36 +1471,16 @@ struct numa_stats {
*/
static void update_numa_stats(struct numa_stats *ns, int nid)
{
- int smt, cpu, cpus = 0;
- unsigned long capacity;
+ int cpu;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
- ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu);
-
- cpus++;
}
- /*
- * If we raced with hotplug and there are no CPUs left in our mask
- * the @ns structure is NULL'ed and task_numa_compare() will
- * not find this node attractive.
- *
- * We'll detect a huge imbalance and bail there.
- */
- if (!cpus)
- return;
-
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
- capacity = cpus / smt; /* cores */
-
- capacity = min_t(unsigned, capacity,
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
}
struct task_numa_env {
@@ -1514,6 +1502,21 @@ struct task_numa_env {
static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
+ struct rq *rq = cpu_rq(env->dst_cpu);
+
+ /* Bail out if run-queue part of active NUMA balance. */
+ if (xchg(&rq->numa_migrate_on, 1))
+ return;
+
+ /*
+ * Clear previous best_cpu/rq numa-migrate flag, since task now
+ * found a better CPU to move/swap.
+ */
+ if (env->best_cpu != -1) {
+ rq = cpu_rq(env->best_cpu);
+ WRITE_ONCE(rq->numa_migrate_on, 0);
+ }
+
if (env->best_task)
put_task_struct(env->best_task);
if (p)
@@ -1553,6 +1556,13 @@ static bool load_too_imbalanced(long src_load, long dst_load,
}
/*
+ * Maximum NUMA importance can be 1998 (2*999);
+ * SMALLIMP @ 30 would be close to 1998/64.
+ * Used to deter task migration.
+ */
+#define SMALLIMP 30
+
+/*
* This checks if the overall compute and NUMA accesses of the system would
* be improved if the source tasks was migrated to the target dst_cpu taking
* into account that it might be best if task running on the dst_cpu should
@@ -1569,6 +1579,9 @@ static void task_numa_compare(struct task_numa_env *env,
long moveimp = imp;
int dist = env->dist;
+ if (READ_ONCE(dst_rq->numa_migrate_on))
+ return;
+
rcu_read_lock();
cur = task_rcu_dereference(&dst_rq->curr);
if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
@@ -1582,7 +1595,7 @@ static void task_numa_compare(struct task_numa_env *env,
goto unlock;
if (!cur) {
- if (maymove || imp > env->best_imp)
+ if (maymove && moveimp >= env->best_imp)
goto assign;
else
goto unlock;
@@ -1625,16 +1638,22 @@ static void task_numa_compare(struct task_numa_env *env,
task_weight(cur, env->dst_nid, dist);
}
- if (imp <= env->best_imp)
- goto unlock;
-
if (maymove && moveimp > imp && moveimp > env->best_imp) {
- imp = moveimp - 1;
+ imp = moveimp;
cur = NULL;
goto assign;
}
/*
+ * If the NUMA importance is less than SMALLIMP,
+ * task migration might only result in ping pong
+ * of tasks and also hurt performance due to cache
+ * misses.
+ */
+ if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
+ goto unlock;
+
+ /*
* In the overloaded case, try and keep the load balanced.
*/
load = task_h_load(env->p) - task_h_load(cur);
@@ -1710,6 +1729,7 @@ static int task_numa_migrate(struct task_struct *p)
.best_cpu = -1,
};
struct sched_domain *sd;
+ struct rq *best_rq;
unsigned long taskweight, groupweight;
int nid, ret, dist;
long taskimp, groupimp;
@@ -1805,20 +1825,17 @@ static int task_numa_migrate(struct task_struct *p)
if (env.best_cpu == -1)
return -EAGAIN;
- /*
- * Reset the scan period if the task is being rescheduled on an
- * alternative node to recheck if the tasks is now properly placed.
- */
- p->numa_scan_period = task_scan_start(p);
-
+ best_rq = cpu_rq(env.best_cpu);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
+ WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
return ret;
}
ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+ WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
@@ -2596,6 +2613,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
+static void update_scan_period(struct task_struct *p, int new_cpu)
+{
+ int src_nid = cpu_to_node(task_cpu(p));
+ int dst_nid = cpu_to_node(new_cpu);
+
+ if (!static_branch_likely(&sched_numa_balancing))
+ return;
+
+ if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
+ return;
+
+ if (src_nid == dst_nid)
+ return;
+
+ /*
+ * Allow resets if faults have been trapped before one scan
+ * has completed. This is most likely due to a new task that
+ * is pulled cross-node due to wakeups or load balancing.
+ */
+ if (p->numa_scan_seq) {
+ /*
+ * Avoid scan adjustments if moving to the preferred
+ * node or if the task was not previously running on
+ * the preferred node.
+ */
+ if (dst_nid == p->numa_preferred_nid ||
+ (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ return;
+ }
+
+ p->numa_scan_period = task_scan_start(p);
+}
+
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2609,6 +2659,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
+static inline void update_scan_period(struct task_struct *p, int new_cpu)
+{
+}
+
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -3362,6 +3416,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
+ * @flags: migration hints
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
@@ -3646,6 +3701,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
WRITE_ONCE(p->se.avg.util_est, ue);
}
+static inline int task_fits_capacity(struct task_struct *p, long capacity)
+{
+ return capacity * 1024 > task_util_est(p) * capacity_margin;
+}
+
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
+{
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return;
+
+ if (!p) {
+ rq->misfit_task_load = 0;
+ return;
+ }
+
+ if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ rq->misfit_task_load = 0;
+ return;
+ }
+
+ rq->misfit_task_load = task_h_load(p);
+}
+
#else /* CONFIG_SMP */
#define UPDATE_TG 0x0
@@ -3675,6 +3753,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
bool task_sleep) {}
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
@@ -3924,7 +4003,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* put back on, and if we advance min_vruntime, we'll be placed back
* further than we started -- ie. we'll be penalized.
*/
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
}
@@ -4399,9 +4478,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
/*
* Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us
+ * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
+ * not running add to the tail so that later runqueues don't get starved.
*/
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ if (cfs_b->distribute_running)
+ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ else
+ list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
/*
* If we're the first throttled task, make sure the bandwidth
@@ -4545,14 +4628,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
* in us over-using our runtime if it is all used during this loop, but
* only by limited amounts in that extreme case.
*/
- while (throttled && cfs_b->runtime > 0) {
+ while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
+ cfs_b->distribute_running = 1;
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
raw_spin_lock(&cfs_b->lock);
+ cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4663,6 +4748,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
/* confirm we're still not at a refresh boundary */
raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->distribute_running) {
+ raw_spin_unlock(&cfs_b->lock);
+ return;
+ }
+
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
raw_spin_unlock(&cfs_b->lock);
return;
@@ -4672,6 +4762,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
runtime = cfs_b->runtime;
expires = cfs_b->runtime_expires;
+ if (runtime)
+ cfs_b->distribute_running = 1;
+
raw_spin_unlock(&cfs_b->lock);
if (!runtime)
@@ -4682,6 +4775,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ cfs_b->distribute_running = 0;
raw_spin_unlock(&cfs_b->lock);
}
@@ -4790,6 +4884,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
+ cfs_b->distribute_running = 0;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -6187,6 +6282,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
{
long min_cap, max_cap;
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return 0;
+
min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
@@ -6197,7 +6295,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
/* Bring task utilization in sync with prev_cpu */
sync_entity_load_avg(&p->se);
- return min_cap * 1024 < task_util(p) * capacity_margin;
+ return !task_fits_capacity(p, min_cap);
}
/*
@@ -6274,7 +6372,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
-static void migrate_task_rq_fair(struct task_struct *p)
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
/*
* As blocked tasks retain absolute vruntime the migration needs to
@@ -6327,6 +6425,8 @@ static void migrate_task_rq_fair(struct task_struct *p)
/* We have migrated, no longer consider this task hot */
p->se.exec_start = 0;
+
+ update_scan_period(p, new_cpu);
}
static void task_dead_fair(struct task_struct *p)
@@ -6614,9 +6714,12 @@ done: __maybe_unused;
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ update_misfit_status(p, rq);
+
return p;
idle:
+ update_misfit_status(NULL, rq);
new_tasks = idle_balance(rq, rf);
/*
@@ -6822,6 +6925,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+enum group_type {
+ group_other = 0,
+ group_misfit_task,
+ group_imbalanced,
+ group_overloaded,
+};
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
@@ -6852,6 +6962,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ enum group_type src_grp_type;
struct list_head tasks;
};
@@ -7232,7 +7343,7 @@ static inline bool others_have_blocked(struct rq *rq)
if (READ_ONCE(rq->avg_dl.util_avg))
return true;
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
if (READ_ONCE(rq->avg_irq.util_avg))
return true;
#endif
@@ -7263,6 +7374,7 @@ static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
+ const struct sched_class *curr_class;
struct rq_flags rf;
bool done = true;
@@ -7299,8 +7411,10 @@ static void update_blocked_averages(int cpu)
if (cfs_rq_has_blocked(cfs_rq))
done = false;
}
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+
+ curr_class = rq->curr->sched_class;
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
@@ -7365,13 +7479,16 @@ static inline void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
+ const struct sched_class *curr_class;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+
+ curr_class = rq->curr->sched_class;
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
@@ -7389,12 +7506,6 @@ static unsigned long task_h_load(struct task_struct *p)
/********** Helpers for find_busiest_group ************************/
-enum group_type {
- group_other = 0,
- group_imbalanced,
- group_overloaded,
-};
-
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -7410,6 +7521,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
int group_no_capacity;
+ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -7482,10 +7594,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long scale_rt_capacity(int cpu)
+static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+ unsigned long max = arch_scale_cpu_capacity(sd, cpu);
unsigned long used, free;
unsigned long irq;
@@ -7507,7 +7619,7 @@ static unsigned long scale_rt_capacity(int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = scale_rt_capacity(cpu);
+ unsigned long capacity = scale_rt_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
@@ -7518,13 +7630,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
+ sdg->sgc->max_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, min_capacity;
+ unsigned long capacity, min_capacity, max_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -7538,6 +7651,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
capacity = 0;
min_capacity = ULONG_MAX;
+ max_capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -7568,6 +7682,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
min_capacity = min(capacity, min_capacity);
+ max_capacity = max(capacity, max_capacity);
}
} else {
/*
@@ -7581,12 +7696,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
capacity += sgc->capacity;
min_capacity = min(sgc->min_capacity, min_capacity);
+ max_capacity = max(sgc->max_capacity, max_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = min_capacity;
+ sdg->sgc->max_capacity = max_capacity;
}
/*
@@ -7682,16 +7799,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
}
/*
- * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
* per-CPU capacity than sched_group ref.
*/
static inline bool
-group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
return sg->sgc->min_capacity * capacity_margin <
ref->sgc->min_capacity * 1024;
}
+/*
+ * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity_orig than sched_group ref.
+ */
+static inline bool
+group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->max_capacity * capacity_margin <
+ ref->sgc->max_capacity * 1024;
+}
+
static inline enum
group_type group_classify(struct sched_group *group,
struct sg_lb_stats *sgs)
@@ -7702,6 +7830,9 @@ group_type group_classify(struct sched_group *group,
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_misfit_task_load)
+ return group_misfit_task;
+
return group_other;
}
@@ -7734,7 +7865,7 @@ static bool update_nohz_stats(struct rq *rq, bool force)
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
- * @overload: Indicate more than one runnable task for any CPU.
+ * @overload: Indicate pullable load (e.g. >1 runnable task).
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
@@ -7776,6 +7907,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
*/
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
+
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+ sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *overload = 1;
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -7811,6 +7948,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ /*
+ * Don't try to pull misfit tasks we can't help.
+ * We can use max_capacity here as reduction in capacity on some
+ * CPUs in the group should either be possible to resolve
+ * internally or be covered by avg_load imbalance (eventually).
+ */
+ if (sgs->group_type == group_misfit_task &&
+ (!group_smaller_max_cpu_capacity(sg, sds->local) ||
+ !group_has_capacity(env, &sds->local_stat)))
+ return false;
+
if (sgs->group_type > busiest->group_type)
return true;
@@ -7830,7 +7978,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* power/energy consequences are not considered.
*/
if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_cpu_capacity(sds->local, sg))
+ group_smaller_min_cpu_capacity(sds->local, sg))
+ return false;
+
+ /*
+ * If we have more than one misfit sg go with the biggest misfit.
+ */
+ if (sgs->group_type == group_misfit_task &&
+ sgs->group_misfit_task_load < busiest->group_misfit_task_load)
return false;
asym_packing:
@@ -7901,11 +8056,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- int load_idx, prefer_sibling = 0;
+ int load_idx;
bool overload = false;
-
- if (child && child->flags & SD_PREFER_SIBLING)
- prefer_sibling = 1;
+ bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
#ifdef CONFIG_NO_HZ_COMMON
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
@@ -7979,8 +8132,8 @@ next_group:
if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
- if (env->dst_rq->rd->overload != overload)
- env->dst_rq->rd->overload = overload;
+ if (READ_ONCE(env->dst_rq->rd->overload) != overload)
+ WRITE_ONCE(env->dst_rq->rd->overload, overload);
}
}
@@ -8130,8 +8283,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* factors in sg capacity and sgs with smaller group_type are
* skipped when updating the busiest sg:
*/
- if (busiest->avg_load <= sds->avg_load ||
- local->avg_load >= sds->avg_load) {
+ if (busiest->group_type != group_misfit_task &&
+ (busiest->avg_load <= sds->avg_load ||
+ local->avg_load >= sds->avg_load)) {
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
@@ -8165,6 +8319,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
+ /* Boost imbalance to allow misfit task to be balanced. */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = max_t(long, env->imbalance,
+ busiest->group_misfit_task_load);
+ }
+
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
@@ -8231,6 +8391,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
busiest->group_no_capacity)
goto force_balance;
+ /* Misfit tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task)
+ goto force_balance;
+
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -8268,8 +8432,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
force_balance:
/* Looks like there is an imbalance. Compute it */
+ env->src_grp_type = busiest->group_type;
calculate_imbalance(env, &sds);
- return sds.busiest;
+ return env->imbalance ? sds.busiest : NULL;
out_balanced:
env->imbalance = 0;
@@ -8315,8 +8480,32 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
+ /*
+ * For ASYM_CPUCAPACITY domains with misfit tasks we simply
+ * seek the "biggest" misfit task.
+ */
+ if (env->src_grp_type == group_misfit_task) {
+ if (rq->misfit_task_load > busiest_load) {
+ busiest_load = rq->misfit_task_load;
+ busiest = rq;
+ }
+
+ continue;
+ }
+
capacity = capacity_of(i);
+ /*
+ * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
+ * eventually lead to active_balancing high->low capacity.
+ * Higher per-CPU capacity is considered better than balancing
+ * average load.
+ */
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+ capacity_of(env->dst_cpu) < capacity &&
+ rq->nr_running == 1)
+ continue;
+
wl = weighted_cpuload(rq);
/*
@@ -8384,6 +8573,9 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ if (env->src_grp_type == group_misfit_task)
+ return 1;
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -9026,7 +9218,7 @@ static void nohz_balancer_kick(struct rq *rq)
if (time_before(now, nohz.next_balance))
goto out;
- if (rq->nr_running >= 2) {
+ if (rq->nr_running >= 2 || rq->misfit_task_load) {
flags = NOHZ_KICK_MASK;
goto out;
}
@@ -9395,7 +9587,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
rq_unpin_lock(this_rq, rf);
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
+ !READ_ONCE(this_rq->rd->overload)) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
@@ -9557,6 +9749,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+
+ update_misfit_status(curr, rq);
}
/*
@@ -9638,7 +9832,8 @@ static inline bool vruntime_normalized(struct task_struct *p)
* - A task which has been woken up by try_to_wake_up() and
* waiting for actually being woken up by sched_ttwu_pending().
*/
- if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+ if (!se->sum_exec_runtime ||
+ (p->state == TASK_WAKING && p->sched_remote_wakeup))
return true;
return false;