summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c329
1 files changed, 203 insertions, 126 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 373ff5f55884..a80a73909dc2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1064,6 +1064,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ int sibling;
+
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
+
+ if (!idle_cpu(sibling))
+ return false;
+ }
+#endif
+
+ return true;
+}
+
#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2
@@ -1700,23 +1717,6 @@ struct numa_stats {
int idle_cpu;
};
-static inline bool is_core_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- int sibling;
-
- for_each_cpu(sibling, cpu_smt_mask(cpu)) {
- if (cpu == sibling)
- continue;
-
- if (!idle_cpu(sibling))
- return false;
- }
-#endif
-
- return true;
-}
-
struct task_numa_env {
struct task_struct *p;
@@ -5577,6 +5577,14 @@ static void __cfsb_csd_unthrottle(void *arg)
rq_lock(rq, &rf);
/*
+ * Iterating over the list can trigger several call to
+ * update_rq_clock() in unthrottle_cfs_rq().
+ * Do it once and skip the potential next ones.
+ */
+ update_rq_clock(rq);
+ rq_clock_start_loop_update(rq);
+
+ /*
* Since we hold rq lock we're safe from concurrent manipulation of
* the CSD list. However, this RCU critical section annotates the
* fact that we pair with sched_free_group_rcu(), so that we cannot
@@ -5595,6 +5603,7 @@ static void __cfsb_csd_unthrottle(void *arg)
rcu_read_unlock();
+ rq_clock_stop_loop_update(rq);
rq_unlock(rq, &rf);
}
@@ -6115,6 +6124,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_rq_held(rq);
+ /*
+ * The rq clock has already been updated in the
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
@@ -6137,6 +6153,8 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
}
#else /* CONFIG_CFS_BANDWIDTH */
@@ -7202,14 +7220,58 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
-/*
- * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
- * (@dst_cpu = -1) or migrated to @dst_cpu.
- */
-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+/**
+ * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for
+ * @p: task for which the CPU utilization should be predicted or NULL
+ * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
+ * @boost: 1 to enable boosting, otherwise 0
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
+ * CPU contention for CFS tasks can be detected by CPU runnable > CPU
+ * utilization. Boosting is implemented in cpu_util() so that internal
+ * users (e.g. EAS) can use it next to external users (e.g. schedutil),
+ * latter via cpu_util_cfs_boost().
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Boosted) (estimated) utilization for the specified CPU.
+ */
+static unsigned long
+cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+ unsigned long runnable;
+
+ if (boost) {
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+ util = max(util, runnable);
+ }
/*
* If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
@@ -7217,9 +7279,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* contribution. In all the other cases @cpu is not impacted by the
* migration so its util_avg is already correct.
*/
- if (task_cpu(p) == cpu && dst_cpu != cpu)
+ if (p && task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
- else if (task_cpu(p) != cpu && dst_cpu == cpu)
+ else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
@@ -7227,6 +7289,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+ if (boost)
+ util_est = max(util_est, runnable);
+
/*
* During wake-up @p isn't enqueued yet and doesn't contribute
* to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
@@ -7255,7 +7320,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
- else if (unlikely(task_on_rq_queued(p) || current == p))
+ else if (p && unlikely(task_on_rq_queued(p) || current == p))
lsub_positive(&util_est, _task_util_est(p));
util = max(util, util_est);
@@ -7264,6 +7329,16 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
return min(util, capacity_orig_of(cpu));
}
+unsigned long cpu_util_cfs(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 0);
+}
+
+unsigned long cpu_util_cfs_boost(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 1);
+}
+
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
@@ -7281,9 +7356,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util_cfs(cpu);
+ p = NULL;
- return cpu_util_next(cpu, p, -1);
+ return cpu_util(cpu, p, -1, 0);
}
/*
@@ -7330,7 +7405,7 @@ static inline void eenv_task_busy_time(struct energy_env *eenv,
* cpu_capacity.
*
* The contribution of the task @p for which we want to estimate the
- * energy cost is removed (by cpu_util_next()) and must be calculated
+ * energy cost is removed (by cpu_util()) and must be calculated
* separately (see eenv_task_busy_time). This ensures:
*
* - A stable PD utilization, no matter which CPU of that PD we want to place
@@ -7351,7 +7426,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
int cpu;
for_each_cpu(cpu, pd_cpus) {
- unsigned long util = cpu_util_next(cpu, p, -1);
+ unsigned long util = cpu_util(cpu, p, -1, 0);
busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
}
@@ -7375,8 +7450,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
for_each_cpu(cpu, pd_cpus) {
struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
- unsigned long util = cpu_util_next(cpu, p, dst_cpu);
- unsigned long cpu_util;
+ unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+ unsigned long eff_util;
/*
* Performance domain frequency: utilization clamping
@@ -7385,8 +7460,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
+ eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, eff_util);
}
return min(max_util, eenv->cpu_cap);
@@ -7521,7 +7596,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- util = cpu_util_next(cpu, p, cpu);
+ util = cpu_util(cpu, p, cpu, 0);
cpu_cap = capacity_of(cpu);
/*
@@ -9331,96 +9406,61 @@ group_type group_classify(unsigned int imbalance_pct,
}
/**
- * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
- * @dst_cpu: Destination CPU of the load balancing
+ * sched_use_asym_prio - Check whether asym_packing priority must be used
+ * @sd: The scheduling domain of the load balancing
+ * @cpu: A CPU
+ *
+ * Always use CPU priority when balancing load between SMT siblings. When
+ * balancing load between cores, it is not sufficient that @cpu is idle. Only
+ * use CPU priority if the whole core is idle.
+ *
+ * Returns: True if the priority of @cpu must be followed. False otherwise.
+ */
+static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
+{
+ if (!sched_smt_active())
+ return true;
+
+ return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
+}
+
+/**
+ * sched_asym - Check if the destination CPU can do asym_packing load balance
+ * @env: The load balancing environment
* @sds: Load-balancing data with statistics of the local group
* @sgs: Load-balancing statistics of the candidate busiest group
- * @sg: The candidate busiest group
+ * @group: The candidate busiest group
*
- * Check the state of the SMT siblings of both @sds::local and @sg and decide
- * if @dst_cpu can pull tasks.
+ * @env::dst_cpu can do asym_packing if it has higher priority than the
+ * preferred CPU of @group.
*
- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
- * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
- * only if @dst_cpu has higher priority.
+ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
+ * can do asym_packing balance only if all its SMT siblings are idle. Also, it
+ * can only do it if @group is an SMT group and has exactly on busy CPU. Larger
+ * imbalances in the number of CPUS are dealt with in find_busiest_group().
*
- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
- * Bigger imbalances in the number of busy CPUs will be dealt with in
- * update_sd_pick_busiest().
+ * If we are balancing load within an SMT core, or at DIE domain level, always
+ * proceed.
*
- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
- * of @dst_cpu are idle and @sg has lower priority.
- *
- * Return: true if @dst_cpu can pull tasks, false otherwise.
+ * Return: true if @env::dst_cpu can do with asym_packing load balance. False
+ * otherwise.
*/
-static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
- struct sg_lb_stats *sgs,
- struct sched_group *sg)
+static inline bool
+sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
+ struct sched_group *group)
{
-#ifdef CONFIG_SCHED_SMT
- bool local_is_smt, sg_is_smt;
- int sg_busy_cpus;
-
- local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
- sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
-
- sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
-
- if (!local_is_smt) {
- /*
- * If we are here, @dst_cpu is idle and does not have SMT
- * siblings. Pull tasks if candidate group has two or more
- * busy CPUs.
- */
- if (sg_busy_cpus >= 2) /* implies sg_is_smt */
- return true;
-
- /*
- * @dst_cpu does not have SMT siblings. @sg may have SMT
- * siblings and only one is busy. In such case, @dst_cpu
- * can help if it has higher priority and is idle (i.e.,
- * it has no running tasks).
- */
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
- }
-
- /* @dst_cpu has SMT siblings. */
-
- if (sg_is_smt) {
- int local_busy_cpus = sds->local->group_weight -
- sds->local_stat.idle_cpus;
- int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
-
- if (busy_cpus_delta == 1)
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
+ /* Ensure that the whole local core is idle, if applicable. */
+ if (!sched_use_asym_prio(env->sd, env->dst_cpu))
return false;
- }
/*
- * @sg does not have SMT siblings. Ensure that @sds::local does not end
- * up with more than one busy SMT sibling and only pull tasks if there
- * are not busy CPUs (i.e., no CPU has running tasks).
+ * CPU priorities does not make sense for SMT cores with more than one
+ * busy sibling.
*/
- if (!sds->local_stat.sum_nr_running)
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
- return false;
-#else
- /* Always return false so that callers deal with non-SMT cases. */
- return false;
-#endif
-}
-
-static inline bool
-sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
- struct sched_group *group)
-{
- /* Only do SMT checks if either local or candidate have SMT siblings */
- if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
- (group->flags & SD_SHARE_CPUCAPACITY))
- return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
+ if (group->flags & SD_SHARE_CPUCAPACITY) {
+ if (sgs->group_weight - sgs->idle_cpus != 1)
+ return false;
+ }
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
@@ -9610,10 +9650,22 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* contention when accessing shared HW resources.
*
* XXX for now avg_load is not computed and always 0 so we
- * select the 1st one.
+ * select the 1st one, except if @sg is composed of SMT
+ * siblings.
*/
- if (sgs->avg_load <= busiest->avg_load)
+
+ if (sgs->avg_load < busiest->avg_load)
return false;
+
+ if (sgs->avg_load == busiest->avg_load) {
+ /*
+ * SMT sched groups need more help than non-SMT groups.
+ * If @sg happens to also be SMT, either choice is good.
+ */
+ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+ return false;
+ }
+
break;
case group_has_spare:
@@ -10088,7 +10140,6 @@ static void update_idle_cpu_scan(struct lb_env *env,
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
@@ -10129,8 +10180,13 @@ next_group:
sg = sg->next;
} while (sg != env->sd->groups);
- /* Tag domain that child domain prefers tasks go to siblings first */
- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+ /*
+ * Indicate that the child domain of the busiest group prefers tasks
+ * go to a child's sibling domains first. NB the flags of a sched group
+ * are those of the child domain.
+ */
+ if (sds->busiest)
+ sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
if (env->sd->flags & SD_NUMA)
@@ -10440,7 +10496,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto out_balanced;
}
- /* Try to move all excess tasks to child's sibling domain */
+ /*
+ * Try to move all excess tasks to a sibling domain of the busiest
+ * group's child domain.
+ */
if (sds.prefer_sibling && local->group_type == group_has_spare &&
busiest->sum_nr_running > local->sum_nr_running + 1)
goto force_balance;
@@ -10542,8 +10601,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
nr_running == 1)
continue;
- /* Make sure we only pull tasks from a CPU of lower priority */
+ /*
+ * Make sure we only pull tasks from a CPU of lower priority
+ * when balancing between SMT siblings.
+ *
+ * If balancing between cores, let lower priority CPUs help
+ * SMT cores with more than one busy sibling.
+ */
if ((env->sd->flags & SD_ASYM_PACKING) &&
+ sched_use_asym_prio(env->sd, i) &&
sched_asym_prefer(i, env->dst_cpu) &&
nr_running == 1)
continue;
@@ -10581,7 +10647,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
- util = cpu_util_cfs(i);
+ util = cpu_util_cfs_boost(i);
/*
* Don't try to pull utilization from a CPU with one
@@ -10632,12 +10698,19 @@ static inline bool
asym_active_balance(struct lb_env *env)
{
/*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
+ * ASYM_PACKING needs to force migrate tasks from busy but lower
+ * priority CPUs in order to pack all tasks in the highest priority
+ * CPUs. When done between cores, do it only if the whole core if the
+ * whole core is idle.
+ *
+ * If @env::src_cpu is an SMT core with busy siblings, let
+ * the lower priority @env::dst_cpu help it. Do not follow
+ * CPU priority.
*/
return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu);
+ sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+ !sched_use_asym_prio(env->sd, env->src_cpu));
}
static inline bool
@@ -10744,7 +10817,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_span(sd->groups),
+ .dst_grpmask = group_balance_mask(sd->groups),
.idle = idle,
.loop_break = SCHED_NR_MIGRATE_BREAK,
.cpus = cpus,
@@ -11371,9 +11444,13 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_PACKING; see if there's a more preferred CPU
* currently idle; in which case, kick the ILB to move tasks
* around.
+ *
+ * When balancing betwen cores, all the SMT siblings of the
+ * preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
- if (sched_asym_prefer(i, cpu)) {
+ if (sched_use_asym_prio(sd, i) &&
+ sched_asym_prefer(i, cpu)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}