summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c60
-rw-r--r--kernel/sched/deadline.c3
-rw-r--r--kernel/sched/fair.c119
4 files changed, 123 insertions, 61 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ead464a0f2e5..4778c48a7fda 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6998,7 +6998,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
{
char tok[21]; /* U64_MAX */
- if (!sscanf(buf, "%s %llu", tok, periodp))
+ if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
return -EINVAL;
*periodp *= NSEC_PER_USEC;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2efe629425be..3638d2377e3c 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -48,10 +48,10 @@ struct sugov_cpu {
bool iowait_boost_pending;
unsigned int iowait_boost;
- unsigned int iowait_boost_max;
u64 last_update;
unsigned long bw_dl;
+ unsigned long min;
unsigned long max;
/* The field below is for single-CPU policies only: */
@@ -303,8 +303,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
if (delta_ns <= TICK_NSEC)
return false;
- sg_cpu->iowait_boost = set_iowait_boost
- ? sg_cpu->sg_policy->policy->min : 0;
+ sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0;
sg_cpu->iowait_boost_pending = set_iowait_boost;
return true;
@@ -344,14 +343,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
/* Double the boost at each request */
if (sg_cpu->iowait_boost) {
- sg_cpu->iowait_boost <<= 1;
- if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ sg_cpu->iowait_boost =
+ min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
return;
}
/* First wakeup after IO: start with minimum boost */
- sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ sg_cpu->iowait_boost = sg_cpu->min;
}
/**
@@ -373,47 +371,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* This mechanism is designed to boost high frequently IO waiting tasks, while
* being more conservative on tasks which does sporadic IO operations.
*/
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
- unsigned long *util, unsigned long *max)
+static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long util, unsigned long max)
{
- unsigned int boost_util, boost_max;
+ unsigned long boost;
/* No boost currently required */
if (!sg_cpu->iowait_boost)
- return;
+ return util;
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
- return;
+ return util;
- /*
- * An IO waiting task has just woken up:
- * allow to further double the boost value
- */
- if (sg_cpu->iowait_boost_pending) {
- sg_cpu->iowait_boost_pending = false;
- } else {
+ if (!sg_cpu->iowait_boost_pending) {
/*
- * Otherwise: reduce the boost value and disable it when we
- * reach the minimum.
+ * No boost pending; reduce the boost value.
*/
sg_cpu->iowait_boost >>= 1;
- if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+ if (sg_cpu->iowait_boost < sg_cpu->min) {
sg_cpu->iowait_boost = 0;
- return;
+ return util;
}
}
+ sg_cpu->iowait_boost_pending = false;
+
/*
- * Apply the current boost value: a CPU is boosted only if its current
- * utilization is smaller then the current IO boost level.
+ * @util is already in capacity scale; convert iowait_boost
+ * into the same scale so we can compare.
*/
- boost_util = sg_cpu->iowait_boost;
- boost_max = sg_cpu->iowait_boost_max;
- if (*util * boost_max < *max * boost_util) {
- *util = boost_util;
- *max = boost_max;
- }
+ boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
+ return max(boost, util);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -460,7 +449,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
- sugov_iowait_apply(sg_cpu, time, &util, &max);
+ util = sugov_iowait_apply(sg_cpu, time, util, max);
next_f = get_next_freq(sg_policy, util, max);
/*
* Do not reduce the frequency if the CPU has not been idle
@@ -500,7 +489,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
j_util = sugov_get_util(j_sg_cpu);
j_max = j_sg_cpu->max;
- sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
+ j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
if (j_util * max > j_max * util) {
util = j_util;
@@ -782,6 +771,7 @@ out:
return 0;
fail:
+ kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
sugov_tunables_free(tunables);
@@ -837,7 +827,9 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
- sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ sg_cpu->min =
+ (SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) /
+ policy->cpuinfo.max_freq;
}
for_each_cpu(cpu, policy->cpus) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a73e41a2016..43901fa3f269 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p)
if (dl_entity_is_special(dl_se))
return;
- WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
zerolag_time = dl_se->deadline -
@@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p)
* If the "0-lag time" already passed, decrease the active
* utilization now, instead of starting a timer
*/
- if (zerolag_time < 0) {
+ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
if (dl_task(p))
sub_running_bw(dl_se, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ea74d43924b2..35f3ea375084 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
+
+ /* Avoid time going backwards, prevent potential divide error: */
+ if (unlikely((s64)*period < 0))
+ *period = 0;
} else {
delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
@@ -4885,6 +4889,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+extern const u64 max_cfs_quota_period;
+
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -4892,6 +4898,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
unsigned long flags;
int overrun;
int idle = 0;
+ int count = 0;
raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
@@ -4899,6 +4906,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ if (++count > 3) {
+ u64 new, old = ktime_to_ns(cfs_b->period);
+
+ new = (old * 147) / 128; /* ~115% */
+ new = min(new, max_cfs_quota_period);
+
+ cfs_b->period = ns_to_ktime(new);
+
+ /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
+ cfs_b->quota *= new;
+ cfs_b->quota = div64_u64(cfs_b->quota, old);
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
+
idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
@@ -7784,10 +7813,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
if (cfs_rq->last_h_load_update == now)
return;
- cfs_rq->h_load_next = NULL;
+ WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_load_next = se;
+ WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
}
@@ -7797,7 +7826,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
cfs_rq->last_h_load_update = now;
}
- while ((se = cfs_rq->h_load_next) != NULL) {
+ while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
load = cfs_rq->h_load;
load = div64_ul(load * se->avg.load_avg,
cfs_rq_load_avg(cfs_rq) + 1);
@@ -8060,6 +8089,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
}
/*
+ * Check whether a rq has a misfit task and if it looks like we can actually
+ * help that task: we can migrate the task to a CPU of higher capacity, or
+ * the task's current CPU is heavily pressured.
+ */
+static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+{
+ return rq->misfit_task_load &&
+ (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+ check_cpu_capacity(rq, sd));
+}
+
+/*
* Group imbalance indicates (and tries to solve) the problem where balancing
* groups is inadequate due to ->cpus_allowed constraints.
*
@@ -9586,35 +9627,21 @@ static void nohz_balancer_kick(struct rq *rq)
if (time_before(now, nohz.next_balance))
goto out;
- if (rq->nr_running >= 2 || rq->misfit_task_load) {
+ if (rq->nr_running >= 2) {
flags = NOHZ_KICK_MASK;
goto out;
}
rcu_read_lock();
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- if (sds) {
- /*
- * If there is an imbalance between LLC domains (IOW we could
- * increase the overall cache use), we need some less-loaded LLC
- * domain to pull some load. Likewise, we may need to spread
- * load within the current LLC domain (e.g. packed SMT cores but
- * other CPUs are idle). We can't really know from here how busy
- * the others are - so just get a nohz balance going if it looks
- * like this LLC domain has tasks we could move.
- */
- nr_busy = atomic_read(&sds->nr_busy_cpus);
- if (nr_busy > 1) {
- flags = NOHZ_KICK_MASK;
- goto unlock;
- }
-
- }
sd = rcu_dereference(rq->sd);
if (sd) {
- if ((rq->cfs.h_nr_running >= 1) &&
- check_cpu_capacity(rq, sd)) {
+ /*
+ * If there's a CFS task and the current CPU has reduced
+ * capacity; kick the ILB to see if there's a better CPU to run
+ * on.
+ */
+ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
@@ -9622,6 +9649,11 @@ static void nohz_balancer_kick(struct rq *rq)
sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
if (sd) {
+ /*
+ * When ASYM_PACKING; see if there's a more preferred CPU
+ * currently idle; in which case, kick the ILB to move tasks
+ * around.
+ */
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
flags = NOHZ_KICK_MASK;
@@ -9629,6 +9661,45 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
}
+
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
+ if (sd) {
+ /*
+ * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
+ * to run the misfit task on.
+ */
+ if (check_misfit_status(rq, sd)) {
+ flags = NOHZ_KICK_MASK;
+ goto unlock;
+ }
+
+ /*
+ * For asymmetric systems, we do not want to nicely balance
+ * cache use, instead we want to embrace asymmetry and only
+ * ensure tasks have enough CPU capacity.
+ *
+ * Skip the LLC logic because it's not relevant in that case.
+ */
+ goto unlock;
+ }
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds) {
+ /*
+ * If there is an imbalance between LLC domains (IOW we could
+ * increase the overall cache use), we need some less-loaded LLC
+ * domain to pull some load. Likewise, we may need to spread
+ * load within the current LLC domain (e.g. packed SMT cores but
+ * other CPUs are idle). We can't really know from here how busy
+ * the others are - so just get a nohz balance going if it looks
+ * like this LLC domain has tasks we could move.
+ */
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
+ if (nr_busy > 1) {
+ flags = NOHZ_KICK_MASK;
+ goto unlock;
+ }
+ }
unlock:
rcu_read_unlock();
out: