Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c     |  496
-rw-r--r--  kernel/sched/debug.c    |   49
-rw-r--r--  kernel/sched/fair.c     | 1339
-rw-r--r--  kernel/sched/features.h |   24
-rw-r--r--  kernel/sched/psi.c      |    2
-rw-r--r--  kernel/sched/rt.c       |    5
-rw-r--r--  kernel/sched/sched.h    |   57
-rw-r--r--  kernel/sched/topology.c |   15
8 files changed, 1092 insertions(+), 895 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4d63e063608a..2299a5cfbfb9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1097,25 +1097,22 @@ int get_nohz_timer_target(void)
hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
- rcu_read_lock();
+ guard(rcu)();
+
for_each_domain(cpu, sd) {
for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
- if (!idle_cpu(i)) {
- cpu = i;
- goto unlock;
- }
+ if (!idle_cpu(i))
+ return i;
}
}
if (default_cpu == -1)
default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
- cpu = default_cpu;
-unlock:
- rcu_read_unlock();
- return cpu;
+
+ return default_cpu;
}
/*
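The hunk above swaps an explicit rcu_read_lock()/rcu_read_unlock() pair for guard(rcu)(), the scope-based cleanup helper from <linux/cleanup.h>: the unlock runs automatically when the enclosing scope ends, so the early "return i" needs no unlock label. As a rough illustration of the mechanism, here is a userspace sketch built on the compiler's cleanup attribute; the GUARD macro and names below are invented for the example and are not the kernel's actual macros.

#include <stdio.h>
#include <pthread.h>

/* Illustrative only: a scope guard shaped like the kernel's guard() helpers. */
static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

struct mutex_guard { pthread_mutex_t *lock; };

static void mutex_guard_release(struct mutex_guard *g)
{
	pthread_mutex_unlock(g->lock);
}

#define GUARD(m) \
	pthread_mutex_lock((m)); \
	struct mutex_guard __scope_guard \
		__attribute__((cleanup(mutex_guard_release))) = { .lock = (m) }

static int find_positive(const int *v, int n)
{
	GUARD(&demo_lock);

	for (int i = 0; i < n; i++) {
		if (v[i] > 0)
			return v[i];	/* unlock runs automatically here */
	}
	return -1;			/* ... and here */
}

int main(void)
{
	int v[] = { -3, 0, 7 };
	printf("%d\n", find_positive(v, 3));
	return 0;
}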
@@ -1194,6 +1191,20 @@ static void nohz_csd_func(void *info)
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
+static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
+{
+ if (rq->nr_running != 1)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ if (!task_on_rq_queued(p))
+ return false;
+
+ return true;
+}
+
bool sched_can_stop_tick(struct rq *rq)
{
int fifo_nr_running;
@@ -1229,6 +1240,18 @@ bool sched_can_stop_tick(struct rq *rq)
if (rq->nr_running > 1)
return false;
+ /*
+ * If there is one task and it has CFS runtime bandwidth constraints
+ * and it is running on the CPU now, we don't want to stop the tick.
+ * This check prevents clearing the bit if a newly enqueued task here is
+ * dequeued by migrating while the constrained task continues to run.
+ * E.g. going from 2->1 without going through pick_next_task().
+ */
+ if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
+ if (cfs_task_bw_constrained(rq->curr))
+ return false;
+ }
+
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
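Note that cfs_task_bw_constrained() is defined later in fair.c and is not part of this hunk. The standalone model below only illustrates the combined decision (__need_bw_check() plus the HZ_BW gate) with invented types and a made-up QUOTA_INF; it should not be read as the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

#define QUOTA_INF (~0ULL)	/* invented stand-in for RUNTIME_INF */

struct task_model {
	bool is_cfs;			/* p->sched_class == &fair_sched_class */
	bool queued;			/* task_on_rq_queued(p) */
	unsigned long long quota;	/* effective CFS quota up the hierarchy */
};

static bool bw_constrained(const struct task_model *t)
{
	return t->quota != QUOTA_INF;
}

static bool can_stop_tick(int nr_running, bool hz_bw_feature,
			  const struct task_model *curr)
{
	if (nr_running > 1)
		return false;

	/* One task left: keep the tick if it is a queued CFS task with a
	 * bandwidth limit, so throttling still happens on time. */
	if (hz_bw_feature && nr_running == 1 &&
	    curr->is_cfs && curr->queued && bw_constrained(curr))
		return false;

	return true;
}

int main(void)
{
	struct task_model limited  = { .is_cfs = true, .queued = true, .quota = 100000 };
	struct task_model free_run = { .is_cfs = true, .queued = true, .quota = QUOTA_INF };

	printf("limited task : %s\n", can_stop_tick(1, true, &limited)  ? "stop tick" : "keep tick");
	printf("unlimited    : %s\n", can_stop_tick(1, true, &free_run) ? "stop tick" : "keep tick");
	return 0;
}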
@@ -1804,7 +1827,8 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
int old_min, old_max, old_min_rt;
int result;
- mutex_lock(&uclamp_mutex);
+ guard(mutex)(&uclamp_mutex);
+
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
@@ -1813,7 +1837,7 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
if (result)
goto undo;
if (!write)
- goto done;
+ return 0;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
@@ -1849,16 +1873,12 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
* Otherwise, keep it simple and do just a lazy update at each next
* task enqueue time.
*/
-
- goto done;
+ return 0;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
-done:
- mutex_unlock(&uclamp_mutex);
-
return result;
}
#endif
@@ -3413,7 +3433,6 @@ static int migrate_swap_stop(void *data)
{
struct migration_swap_arg *arg = data;
struct rq *src_rq, *dst_rq;
- int ret = -EAGAIN;
if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
return -EAGAIN;
@@ -3421,33 +3440,25 @@ static int migrate_swap_stop(void *data)
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
- double_raw_lock(&arg->src_task->pi_lock,
- &arg->dst_task->pi_lock);
- double_rq_lock(src_rq, dst_rq);
+ guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
+ guard(double_rq_lock)(src_rq, dst_rq);
if (task_cpu(arg->dst_task) != arg->dst_cpu)
- goto unlock;
+ return -EAGAIN;
if (task_cpu(arg->src_task) != arg->src_cpu)
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
__migrate_swap_task(arg->dst_task, arg->src_cpu);
- ret = 0;
-
-unlock:
- double_rq_unlock(src_rq, dst_rq);
- raw_spin_unlock(&arg->dst_task->pi_lock);
- raw_spin_unlock(&arg->src_task->pi_lock);
-
- return ret;
+ return 0;
}
/*
@@ -3722,14 +3733,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
struct sched_domain *sd;
__schedstat_inc(p->stats.nr_wakeups_remote);
- rcu_read_lock();
+
+ guard(rcu)();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
__schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
- rcu_read_unlock();
}
if (wake_flags & WF_MIGRATED)
@@ -3928,21 +3939,13 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
-
- rcu_read_lock();
- if (!is_idle_task(rcu_dereference(rq->curr)))
- goto out;
-
- rq_lock_irqsave(rq, &rf);
- if (is_idle_task(rq->curr))
- resched_curr(rq);
- /* Else CPU is not idle, do nothing here: */
- rq_unlock_irqrestore(rq, &rf);
-
-out:
- rcu_read_unlock();
+ guard(rcu)();
+ if (is_idle_task(rcu_dereference(rq->curr))) {
+ guard(rq_lock_irqsave)(rq);
+ if (is_idle_task(rq->curr))
+ resched_curr(rq);
+ }
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -4195,10 +4198,9 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
*/
int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- unsigned long flags;
+ guard(preempt)();
int cpu, success = 0;
- preempt_disable();
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -4225,129 +4227,127 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* reordered with p->state check below. This pairs with smp_store_mb()
* in set_current_state() that the waiting thread does.
*/
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- smp_mb__after_spinlock();
- if (!ttwu_state_match(p, state, &success))
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ smp_mb__after_spinlock();
+ if (!ttwu_state_match(p, state, &success))
+ break;
- trace_sched_waking(p);
+ trace_sched_waking(p);
- /*
- * Ensure we load p->on_rq _after_ p->state, otherwise it would
- * be possible to, falsely, observe p->on_rq == 0 and get stuck
- * in smp_cond_load_acquire() below.
- *
- * sched_ttwu_pending() try_to_wake_up()
- * STORE p->on_rq = 1 LOAD p->state
- * UNLOCK rq->lock
- *
- * __schedule() (switch to task 'p')
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * UNLOCK rq->lock
- *
- * [task p]
- * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
- */
- smp_rmb();
- if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
- goto unlock;
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * STORE p->on_rq = 1 LOAD p->state
+ * UNLOCK rq->lock
+ *
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
+ */
+ smp_rmb();
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
+ break;
#ifdef CONFIG_SMP
- /*
- * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
- * possible to, falsely, observe p->on_cpu == 0.
- *
- * One must be running (->on_cpu == 1) in order to remove oneself
- * from the runqueue.
- *
- * __schedule() (switch to task 'p') try_to_wake_up()
- * STORE p->on_cpu = 1 LOAD p->on_rq
- * UNLOCK rq->lock
- *
- * __schedule() (put 'p' to sleep)
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * STORE p->on_rq = 0 LOAD p->on_cpu
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
- * care about it's own p->state. See the comment in __schedule().
- */
- smp_acquire__after_ctrl_dep();
+ /*
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * __schedule() (switch to task 'p') try_to_wake_up()
+ * STORE p->on_cpu = 1 LOAD p->on_rq
+ * UNLOCK rq->lock
+ *
+ * __schedule() (put 'p' to sleep)
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * STORE p->on_rq = 0 LOAD p->on_cpu
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * care about its own p->state. See the comment in __schedule().
+ */
+ smp_acquire__after_ctrl_dep();
- /*
- * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
- * == 0), which means we need to do an enqueue, change p->state to
- * TASK_WAKING such that we can unlock p->pi_lock before doing the
- * enqueue, such as ttwu_queue_wakelist().
- */
- WRITE_ONCE(p->__state, TASK_WAKING);
+ /*
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+ * == 0), which means we need to do an enqueue, change p->state to
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
+ * enqueue, such as ttwu_queue_wakelist().
+ */
+ WRITE_ONCE(p->__state, TASK_WAKING);
- /*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, considering queueing p on the remote CPUs wake_list
- * which potentially sends an IPI instead of spinning on p->on_cpu to
- * let the waker make forward progress. This is safe because IRQs are
- * disabled and the IPI will deliver after on_cpu is cleared.
- *
- * Ensure we load task_cpu(p) after p->on_cpu:
- *
- * set_task_cpu(p, cpu);
- * STORE p->cpu = @cpu
- * __schedule() (switch to task 'p')
- * LOCK rq->lock
- * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
- * STORE p->on_cpu = 1 LOAD p->cpu
- *
- * to ensure we observe the correct CPU on which the task is currently
- * scheduling.
- */
- if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
- goto unlock;
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, considering queueing p on the remote CPUs wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ *
+ * Ensure we load task_cpu(p) after p->on_cpu:
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
+ * STORE p->on_cpu = 1 LOAD p->cpu
+ *
+ * to ensure we observe the correct CPU on which the task is currently
+ * scheduling.
+ */
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
+ break;
- /*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, wait until it's done referencing the task.
- *
- * Pairs with the smp_store_release() in finish_task().
- *
- * This ensures that tasks getting woken will be fully ordered against
- * their previous state and preserve Program Order.
- */
- smp_cond_load_acquire(&p->on_cpu, !VAL);
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, wait until it's done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_task().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
+ */
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
- cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
- if (task_cpu(p) != cpu) {
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
+ cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
+ if (task_cpu(p) != cpu) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
- wake_flags |= WF_MIGRATED;
- psi_ttwu_dequeue(p);
- set_task_cpu(p, cpu);
- }
+ wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
+ set_task_cpu(p, cpu);
+ }
#else
- cpu = task_cpu(p);
+ cpu = task_cpu(p);
#endif /* CONFIG_SMP */
- ttwu_queue(p, cpu, wake_flags);
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ ttwu_queue(p, cpu, wake_flags);
+ }
out:
if (success)
ttwu_stat(p, task_cpu(p), wake_flags);
- preempt_enable();
return success;
}
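The old "goto unlock" sites inside try_to_wake_up() become "break" because scoped_guard() is built around a for-loop that executes its body exactly once; breaking out of that loop ends the guarded scope and drops p->pi_lock. A simplified userspace model of that shape, with a pthread mutex standing in for the raw spinlock and all names invented:

#include <pthread.h>
#include <stdio.h>

/* Simplified scoped_guard(): a one-iteration for-loop whose loop variable
 * carries a cleanup attribute, so `break` still releases the lock.
 * This mirrors the shape of <linux/cleanup.h>, it is not the kernel macro. */
struct guard_state { pthread_mutex_t *lock; int done; };

static void guard_drop(struct guard_state *g)
{
	pthread_mutex_unlock(g->lock);
}

#define SCOPED_GUARD(m)							\
	for (struct guard_state __g __attribute__((cleanup(guard_drop))) =	\
		{ .lock = (m), .done = (pthread_mutex_lock(m), 0) };	\
	     !__g.done; __g.done = 1)

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int state = 3;

static int wake(int want)
{
	int success = 0;

	SCOPED_GUARD(&lock) {
		if (state != want)
			break;		/* early exit: unlock still runs */
		state = 0;
		success = 1;
	}
	/* lock is released here whether we broke out or fell through */
	return success;
}

int main(void)
{
	printf("first  wake: %d\n", wake(3));
	printf("second wake: %d\n", wake(3));
	return 0;
}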
@@ -4500,6 +4500,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+ p->se.vlag = 0;
+ p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -5495,23 +5497,20 @@ unsigned int nr_iowait(void)
void sched_exec(void)
{
struct task_struct *p = current;
- unsigned long flags;
+ struct migration_arg arg;
int dest_cpu;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
- if (dest_cpu == smp_processor_id())
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
+ if (dest_cpu == smp_processor_id())
+ return;
- if (likely(cpu_active(dest_cpu))) {
- struct migration_arg arg = { p, dest_cpu };
+ if (unlikely(!cpu_active(dest_cpu)))
+ return;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
- return;
+ arg = (struct migration_arg){ p, dest_cpu };
}
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
#endif
@@ -5721,9 +5720,6 @@ static void sched_tick_remote(struct work_struct *work)
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr;
- struct rq_flags rf;
- u64 delta;
int os;
/*
@@ -5733,30 +5729,26 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (!tick_nohz_tick_stopped_cpu(cpu))
- goto out_requeue;
+ if (tick_nohz_tick_stopped_cpu(cpu)) {
+ guard(rq_lock_irq)(rq);
+ struct task_struct *curr = rq->curr;
- rq_lock_irq(rq, &rf);
- curr = rq->curr;
- if (cpu_is_offline(cpu))
- goto out_unlock;
+ if (cpu_online(cpu)) {
+ update_rq_clock(rq);
- update_rq_clock(rq);
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a
+ * reasonable amount of time.
+ */
+ u64 delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
+ curr->sched_class->task_tick(rq, curr, 0);
- if (!is_idle_task(curr)) {
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- delta = rq_clock_task(rq) - curr->se.exec_start;
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ calc_load_nohz_remote(rq);
+ }
}
- curr->sched_class->task_tick(rq, curr, 0);
-
- calc_load_nohz_remote(rq);
-out_unlock:
- rq_unlock_irq(rq, &rf);
-out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
@@ -6305,19 +6297,19 @@ static bool try_steal_cookie(int this, int that)
unsigned long cookie;
bool success = false;
- local_irq_disable();
- double_rq_lock(dst, src);
+ guard(irq)();
+ guard(double_rq_lock)(dst, src);
cookie = dst->core->core_cookie;
if (!cookie)
- goto unlock;
+ return false;
if (dst->curr != dst->idle)
- goto unlock;
+ return false;
p = sched_core_find(src, cookie);
if (!p)
- goto unlock;
+ return false;
do {
if (p == src->core_pick || p == src->curr)
@@ -6329,9 +6321,10 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
/*
- * sched_core_find() and sched_core_next() will ensure that task @p
- * is not throttled now, we also need to check whether the runqueue
- * of the destination CPU is being throttled.
+ * sched_core_find() and sched_core_next() will ensure
+ * that task @p is not throttled now, we also need to
+ * check whether the runqueue of the destination CPU is
+ * being throttled.
*/
if (sched_task_is_throttled(p, this))
goto next;
@@ -6349,10 +6342,6 @@ next:
p = sched_core_next(p, cookie);
} while (p);
-unlock:
- double_rq_unlock(dst, src);
- local_irq_enable();
-
return success;
}
@@ -6410,20 +6399,24 @@ static void queue_core_balance(struct rq *rq)
queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
}
+DEFINE_LOCK_GUARD_1(core_lock, int,
+ sched_core_lock(*_T->lock, &_T->flags),
+ sched_core_unlock(*_T->lock, &_T->flags),
+ unsigned long flags)
+
static void sched_core_cpu_starting(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
- unsigned long flags;
int t;
- sched_core_lock(cpu, &flags);
+ guard(core_lock)(&cpu);
WARN_ON_ONCE(rq->core != rq);
/* if we're the first, we'll be our own leader */
if (cpumask_weight(smt_mask) == 1)
- goto unlock;
+ return;
/* find the leader */
for_each_cpu(t, smt_mask) {
@@ -6437,7 +6430,7 @@ static void sched_core_cpu_starting(unsigned int cpu)
}
if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
- goto unlock;
+ return;
/* install and validate core_rq */
for_each_cpu(t, smt_mask) {
@@ -6448,29 +6441,25 @@ static void sched_core_cpu_starting(unsigned int cpu)
WARN_ON_ONCE(rq->core != core_rq);
}
-
-unlock:
- sched_core_unlock(cpu, &flags);
}
static void sched_core_cpu_deactivate(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
- unsigned long flags;
int t;
- sched_core_lock(cpu, &flags);
+ guard(core_lock)(&cpu);
/* if we're the last man standing, nothing to do */
if (cpumask_weight(smt_mask) == 1) {
WARN_ON_ONCE(rq->core != rq);
- goto unlock;
+ return;
}
/* if we're not the leader, nothing to do */
if (rq->core != rq)
- goto unlock;
+ return;
/* find a new leader */
for_each_cpu(t, smt_mask) {
@@ -6481,7 +6470,7 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
}
if (WARN_ON_ONCE(!core_rq)) /* impossible */
- goto unlock;
+ return;
/* copy the shared state to the new leader */
core_rq->core_task_seq = rq->core_task_seq;
@@ -6503,9 +6492,6 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
rq = cpu_rq(t);
rq->core = core_rq;
}
-
-unlock:
- sched_core_unlock(cpu, &flags);
}
static inline void sched_core_cpu_dying(unsigned int cpu)
@@ -7382,6 +7368,19 @@ struct task_struct *idle_task(int cpu)
return cpu_rq(cpu)->idle;
}
+#ifdef CONFIG_SCHED_CORE
+int sched_core_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (sched_core_enabled(rq) && rq->curr == rq->idle)
+ return 1;
+
+ return idle_cpu(cpu);
+}
+
+#endif
+
#ifdef CONFIG_SMP
/*
* This function computes an effective utilization for the given CPU, to be
@@ -9939,7 +9938,7 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -11073,11 +11072,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
/*
* Ensure max(child_quota) <= parent_quota. On cgroup2,
- * always take the min. On cgroup1, only inherit when no
- * limit is set:
+ * always take the non-RUNTIME_INF min. On cgroup1, only
+ * inherit when no limit is set. In both cases this is used
+ * by the scheduler to determine if a given CFS task has a
+ * bandwidth constraint at some higher level.
*/
if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
- quota = min(quota, parent_quota);
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF)
+ quota = min(quota, parent_quota);
} else {
if (quota == RUNTIME_INF)
quota = parent_quota;
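In effect, on cgroup2 an unset child quota (RUNTIME_INF) now inherits the parent's limit and a set child quota is capped by a set parent, so RUNTIME_INF no longer wins the min(). A small standalone check of that rule, with RUNTIME_INF modeled locally and invented values:

#include <stdio.h>

#define RUNTIME_INF (~0ULL)	/* modeled locally for the example */

/* Effective quota at one level on cgroup2 after the change above. */
static unsigned long long effective_quota(unsigned long long quota,
					  unsigned long long parent_quota)
{
	if (quota == RUNTIME_INF)
		return parent_quota;
	if (parent_quota != RUNTIME_INF && parent_quota < quota)
		return parent_quota;
	return quota;
}

int main(void)
{
	/* parent limited to 100ms/period, child unset -> child is constrained */
	printf("%llu\n", effective_quota(RUNTIME_INF, 100000000ULL));
	/* child asks for 200ms but parent allows 100ms -> capped */
	printf("%llu\n", effective_quota(200000000ULL, 100000000ULL));
	/* nobody sets a limit -> RUNTIME_INF propagates */
	printf("unlimited stays unlimited: %d\n",
	       effective_quota(RUNTIME_INF, RUNTIME_INF) == RUNTIME_INF);
	return 0;
}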
@@ -11138,6 +11142,27 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
return 0;
}
+
+static u64 throttled_time_self(struct task_group *tg)
+{
+ int i;
+ u64 total = 0;
+
+ for_each_possible_cpu(i) {
+ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+ }
+
+ return total;
+}
+
+static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg));
+
+ return 0;
+}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -11214,6 +11239,10 @@ static struct cftype cpu_legacy_files[] = {
.name = "stat",
.seq_show = cpu_cfs_stat_show,
},
+ {
+ .name = "stat.local",
+ .seq_show = cpu_cfs_local_stat_show,
+ },
#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
@@ -11270,6 +11299,24 @@ static int cpu_extra_stat_show(struct seq_file *sf,
return 0;
}
+static int cpu_local_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ u64 throttled_self_usec;
+
+ throttled_self_usec = throttled_time_self(tg);
+ do_div(throttled_self_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "throttled_usec %llu\n",
+ throttled_self_usec);
+ }
+#endif
+ return 0;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
@@ -11448,6 +11495,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
+ .css_local_stat_show = cpu_local_stat_show,
#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 066ff1c8ae4e..4c3d0d9f3db6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -347,10 +347,7 @@ static __init int sched_init_debug(void)
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
#endif
- debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
- debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
- debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
- debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
@@ -427,6 +424,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent)
#undef SDM
debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
+ debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
}
void update_sched_domain_debugfs(void)
@@ -581,9 +579,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+ SPLIT_NS(p->se.deadline),
+ SPLIT_NS(p->se.slice),
+ SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
@@ -626,10 +628,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
- spread, rq0_min_vruntime, spread0;
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
+ struct sched_entity *last, *first;
struct rq *rq = cpu_rq(cpu);
- struct sched_entity *last;
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -643,26 +644,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_rq_lock_irqsave(rq, flags);
- if (rb_first_cached(&cfs_rq->tasks_timeline))
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+ first = __pick_first_entity(cfs_rq);
+ if (first)
+ left_vruntime = first->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
- max_vruntime = last->vruntime;
+ right_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
raw_spin_rq_unlock_irqrestore(rq, flags);
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
- SPLIT_NS(MIN_vruntime));
+
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
+ SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
SPLIT_NS(min_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
- SPLIT_NS(max_vruntime));
- spread = max_vruntime - MIN_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
- SPLIT_NS(spread));
- spread0 = min_vruntime - rq0_min_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
- SPLIT_NS(spread0));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
+ SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
+ SPLIT_NS(right_vruntime));
+ spread = right_vruntime - left_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
@@ -863,10 +863,7 @@ static void sched_debug_header(struct seq_file *m)
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
- PN(sysctl_sched_latency);
- PN(sysctl_sched_min_granularity);
- PN(sysctl_sched_idle_min_granularity);
- PN(sysctl_sched_wakeup_granularity);
+ PN(sysctl_sched_base_slice);
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#undef PN
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ceb5d4c4738e..a317cb0cf69c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>
#include <asm/switch_to.h>
@@ -57,22 +58,6 @@
#include "autogroup.h"
/*
- * Targeted preemption latency for CPU-bound tasks:
- *
- * NOTE: this latency value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS are of variable length
- * and have no persistent notion like in traditional, time-slice
- * based scheduling concepts.
- *
- * (to see the precise effective timeslice length of your workload,
- * run vmstat and monitor the context-switches (cs) field)
- *
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_latency = 6000000ULL;
-static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
-
-/*
* The initial- and re-scaling of tunables is configurable
*
* Options are:
@@ -90,21 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
-
-/*
- * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
- * Applies only when SCHED_IDLE tasks compete with normal tasks.
- *
- * (default: 0.75 msec)
- */
-unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
-
-/*
- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
- */
-static unsigned int sched_nr_latency = 8;
+unsigned int sysctl_sched_base_slice = 750000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
/*
* After fork, child runs first. If set to 0 (default) then
@@ -112,18 +84,6 @@ static unsigned int sched_nr_latency = 8;
*/
unsigned int sysctl_sched_child_runs_first __read_mostly;
-/*
- * SCHED_OTHER wake-up granularity.
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- *
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
-
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
int sched_thermal_decay_shift;
@@ -277,9 +237,7 @@ static void update_sysctl(void)
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
- SET_SYSCTL(sched_min_granularity);
- SET_SYSCTL(sched_latency);
- SET_SYSCTL(sched_wakeup_granularity);
+ SET_SYSCTL(sched_base_slice);
#undef SET_SYSCTL
}
@@ -347,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}
const struct sched_class fair_sched_class;
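calc_delta_fair() above scales real time by NICE_0_LOAD/weight to get virtual time, so heavier entities accrue vruntime more slowly. A back-of-the-envelope version with plain division in place of the fixed-point __calc_delta():

#include <stdio.h>

#define NICE_0_LOAD 1024ULL	/* scheduler weight of a nice-0 task */

/* Simplified calc_delta_fair(): vdelta = delta * NICE_0_LOAD / weight. */
static unsigned long long vdelta(unsigned long long delta_ns,
				 unsigned long long weight)
{
	return delta_ns * NICE_0_LOAD / weight;
}

int main(void)
{
	/* 1ms of CPU time ... */
	printf("nice  0 (w=1024): %llu ns of vruntime\n", vdelta(1000000, 1024));
	/* ... advances a nice -5 task (w=3121) about 3x slower ... */
	printf("nice -5 (w=3121): %llu ns of vruntime\n", vdelta(1000000, 3121));
	/* ... and a nice 5 task (w=335) about 3x faster. */
	printf("nice  5 (w= 335): %llu ns of vruntime\n", vdelta(1000000, 335));
	return 0;
}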
@@ -601,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a,
return (s64)(a->vruntime - b->vruntime) < 0;
}
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return (s64)(se->vruntime - cfs_rq->min_vruntime);
+}
+
#define __node_2_se(node) \
rb_entry((node), struct sched_entity, run_node)
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag:
+ *
+ * \Sum lag_i = 0
+ *
+ * Where lag_i is given by:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * Where S is the ideal service time and V is its virtual time counterpart.
+ * Therefore:
+ *
+ * \Sum lag_i = 0
+ * \Sum w_i * (V - v_i) = 0
+ * \Sum w_i * V - w_i * v_i = 0
+ *
+ * From which we can solve an expression for V in v_i (which we have in
+ * se->vruntime):
+ *
+ * \Sum v_i * w_i \Sum v_i * w_i
+ * V = -------------- = --------------
+ * \Sum w_i W
+ *
+ * Specifically, this is the weighted average of all entity virtual runtimes.
+ *
+ * [[ NOTE: this is only equal to the ideal scheduler under the condition
+ * that join/leave operations happen at lag_i = 0, otherwise the
+ * virtual time has non-contiguous motion equivalent to:
+ *
+ * V +-= lag_i / W
+ *
+ * Also see the comment in place_entity() that deals with this. ]]
+ *
+ * However, since v_i is u64, and the multiplication could easily overflow,
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v0) + v0
+ *
+ * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
+ * V = ---------------------------- = --------------------- + v0
+ * W W
+ *
+ * Which we track using:
+ *
+ * v0 := cfs_rq->min_vruntime
+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
+ * \Sum w_i := cfs_rq->avg_load
+ *
+ * Since min_vruntime is a monotonic increasing variable that closely tracks
+ * the per-task service, these deltas: (v_i - v), will be in the order of the
+ * maximal (virtual) lag induced in the system due to quantisation.
+ *
+ * Also, we use scale_load_down() to reduce the size.
+ *
+ * As measured, the max (key * weight) value was ~44 bits for a kernel build.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime += key * weight;
+ cfs_rq->avg_load += weight;
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight);
+ s64 key = entity_key(cfs_rq, se);
+
+ cfs_rq->avg_vruntime -= key * weight;
+ cfs_rq->avg_load -= weight;
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ /*
+ * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
+ */
+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ if (load)
+ avg = div_s64(avg, load);
+
+ return cfs_rq->min_vruntime + avg;
+}
+
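A quick numeric check of the bookkeeping described in the comment block: keys are kept relative to v0 = min_vruntime and summed as key*weight, and dividing by the summed weight recovers the weighted average V. All values below are invented.

#include <stdio.h>

/* Two queued entities, keys kept relative to v0 = min_vruntime, exactly as
 * avg_vruntime_add()/avg_vruntime() do above. */
int main(void)
{
	long long v0 = 1000000;			/* cfs_rq->min_vruntime */
	long long v[] = { 1002000, 1005000 };	/* per-entity vruntime */
	long long w[] = { 1024, 2048 };		/* scaled-down weights */

	long long avg_vruntime = 0, avg_load = 0;
	for (int i = 0; i < 2; i++) {
		avg_vruntime += (v[i] - v0) * w[i];	/* key * weight */
		avg_load     += w[i];
	}

	/* V = v0 + (\Sum key*w) / (\Sum w) = 1000000 + 12288000/3072 = 1004000 */
	printf("V = %lld\n", v0 + avg_vruntime / avg_load);

	/* check against the direct weighted average \Sum v_i*w_i / \Sum w_i */
	printf("direct = %lld\n", (v[0]*w[0] + v[1]*w[1]) / (w[0] + w[1]));
	return 0;
}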
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to double the slice length, with a minimum of TICK_NSEC,
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ * -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
+ */
+void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 lag, limit;
+
+ SCHED_WARN_ON(!se->on_rq);
+ lag = avg_vruntime(cfs_rq) - se->vruntime;
+
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ se->vlag = clamp(lag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it has received less service than it ought to have,
+ * i.e. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ * \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ * \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
+ * to the loss in precision caused by the division.
+ */
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ return avg >= entity_key(cfs_rq, se) * load;
+}
+
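entity_eligible() avoids the division (and its rounding) by comparing \Sum key_j*w_j against key_i * \Sum w_j instead of comparing V with v_i. Reusing the numbers from the previous sketch:

#include <stdio.h>
#include <stdbool.h>

/* Same two-entity queue as before: avg_vruntime = 12288000, avg_load = 3072,
 * min_vruntime = 1000000, so V = 1004000. */
static bool eligible(long long avg_vruntime, long long avg_load, long long key)
{
	/* avg >= key * load  <=>  V >= v_i, without dividing */
	return avg_vruntime >= key * avg_load;
}

int main(void)
{
	long long avg_vruntime = 12288000, avg_load = 3072;

	/* vruntime 1002000 (key 2000): 12288000 >= 6144000  -> eligible */
	printf("key 2000: %d\n", eligible(avg_vruntime, avg_load, 2000));
	/* vruntime 1005000 (key 5000): 12288000 <  15360000 -> not yet */
	printf("key 5000: %d\n", eligible(avg_vruntime, avg_load, 5000));
	return 0;
}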
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ u64 min_vruntime = cfs_rq->min_vruntime;
+ /*
+ * open coded max_vruntime() to allow updating avg_vruntime
+ */
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta > 0) {
+ avg_vruntime_update(cfs_rq, delta);
+ min_vruntime = vruntime;
+ }
+ return min_vruntime;
+}
+
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
@@ -618,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (leftmost) { /* non-empty tree */
- struct sched_entity *se = __node_2_se(leftmost);
-
+ if (se) {
if (!curr)
vruntime = se->vruntime;
else
@@ -629,7 +780,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
/* ensure we never gain time by being placed backwards. */
u64_u32_store(cfs_rq->min_vruntime,
- max_vruntime(cfs_rq->min_vruntime, vruntime));
+ __update_min_vruntime(cfs_rq, vruntime));
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -637,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (deadline_gt(min_deadline, se, rse))
+ se->min_deadline = rse->min_deadline;
+ }
+}
+
+/*
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ */
+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+{
+ u64 old_min_deadline = se->min_deadline;
+ struct rb_node *node = &se->run_node;
+
+ se->min_deadline = se->deadline;
+ __update_min_deadline(se, node->rb_right);
+ __update_min_deadline(se, node->rb_left);
+
+ return se->min_deadline == old_min_deadline;
+}
+
+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
+ run_node, min_deadline, min_deadline_update);
+
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
+ avg_vruntime_add(cfs_rq, se);
+ se->min_deadline = se->deadline;
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ __entity_less, &min_deadline_cb);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &min_deadline_cb);
+ avg_vruntime_sub(cfs_rq, se);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -660,14 +845,88 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
return __node_2_se(left);
}
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ * 1) the task must be eligible (must be owed service)
+ *
+ * 2) from those tasks that meet 1), we select the one
+ * with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on service, but also functions as a
+ * heap based on the deadline by keeping:
+ *
+ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *
+ * Which allows an EDF like search on (sub)trees.
+ */
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
- struct rb_node *next = rb_next(&se->run_node);
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *curr = cfs_rq->curr;
+ struct sched_entity *best = NULL;
- if (!next)
- return NULL;
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+ curr = NULL;
+
+ /*
+ * Once selected, run a task until it either becomes non-eligible or
+ * until it gets a new slice. See the HACK in set_next_entity().
+ */
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ return curr;
- return __node_2_se(next);
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ /*
+ * If this entity is not eligible, try the left subtree.
+ */
+ if (!entity_eligible(cfs_rq, se)) {
+ node = node->rb_left;
+ continue;
+ }
+
+ /*
+ * If this entity has an earlier deadline than the previous
+ * best, take this one. If it also has the earliest deadline
+ * of its subtree, we're done.
+ */
+ if (!best || deadline_gt(deadline, best, se)) {
+ best = se;
+ if (best->deadline == best->min_deadline)
+ break;
+ }
+
+ /*
+ * If the earliest deadline in this subtree is in the fully
+ * eligible left half of our space, go there.
+ */
+ if (node->rb_left &&
+ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
+ node = node->rb_left;
+ continue;
+ }
+
+ node = node->rb_right;
+ }
+
+ if (!best || (curr && deadline_gt(deadline, best, curr)))
+ best = curr;
+
+ if (unlikely(!best)) {
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ if (left) {
+ pr_err("EEVDF scheduling fail, picking leftmost\n");
+ return left;
+ }
+ }
+
+ return best;
}
#ifdef CONFIG_SCHED_DEBUG
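What the augmented-tree walk in pick_eevdf() computes can be restated without the tree: among entities that are eligible (owed service), take the earliest virtual deadline. The O(n) scan below is only a flat rehearsal of that rule with invented values; the tree above gets the same answer in O(log n).

#include <stdio.h>

struct ent { const char *name; long long vruntime, deadline; };

int main(void)
{
	long long V = 1004000;				/* avg_vruntime() */
	struct ent e[] = {
		{ "A", 1002000, 1010000 },		/* eligible */
		{ "B", 1003000, 1006000 },		/* eligible, earliest deadline */
		{ "C", 1005000, 1005500 },		/* earliest deadline, not eligible */
	};

	const struct ent *best = NULL;
	for (int i = 0; i < 3; i++) {
		if (e[i].vruntime > V)			/* not owed service yet */
			continue;
		if (!best || e[i].deadline < best->deadline)
			best = &e[i];
	}
	printf("pick: %s\n", best ? best->name : "none");	/* -> B */
	return 0;
}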
@@ -689,104 +948,45 @@ int sched_update_scaling(void)
{
unsigned int factor = get_update_sysctl_factor();
- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
- sysctl_sched_min_granularity);
-
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
- WRT_SYSCTL(sched_min_granularity);
- WRT_SYSCTL(sched_latency);
- WRT_SYSCTL(sched_wakeup_granularity);
+ WRT_SYSCTL(sched_base_slice);
#undef WRT_SYSCTL
return 0;
}
#endif
-/*
- * delta /= w
- */
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-{
- if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-
- return delta;
-}
-
-/*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
- */
-static u64 __sched_period(unsigned long nr_running)
-{
- if (unlikely(nr_running > sched_nr_latency))
- return nr_running * sysctl_sched_min_granularity;
- else
- return sysctl_sched_latency;
-}
-
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
- * We calculate the wall-time slice from the period by taking a part
- * proportional to the weight.
- *
- * s = p*P[w/rw]
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
+ * this is probably good enough.
*/
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- unsigned int nr_running = cfs_rq->nr_running;
- struct sched_entity *init_se = se;
- unsigned int min_gran;
- u64 slice;
-
- if (sched_feat(ALT_PERIOD))
- nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
-
- slice = __sched_period(nr_running + !se->on_rq);
-
- for_each_sched_entity(se) {
- struct load_weight *load;
- struct load_weight lw;
- struct cfs_rq *qcfs_rq;
-
- qcfs_rq = cfs_rq_of(se);
- load = &qcfs_rq->load;
-
- if (unlikely(!se->on_rq)) {
- lw = qcfs_rq->load;
+ if ((s64)(se->vruntime - se->deadline) < 0)
+ return;
- update_load_add(&lw, se->load.weight);
- load = &lw;
- }
- slice = __calc_delta(slice, se->load.weight, load);
- }
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * sysctl_sched_base_slice.
+ */
+ se->slice = sysctl_sched_base_slice;
- if (sched_feat(BASE_SLICE)) {
- if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
- min_gran = sysctl_sched_idle_min_granularity;
- else
- min_gran = sysctl_sched_min_granularity;
+ /*
+ * EEVDF: vd_i = ve_i + r_i / w_i
+ */
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
- slice = max_t(u64, slice, min_gran);
+ /*
+ * The task has consumed its request, reschedule.
+ */
+ if (cfs_rq->nr_running > 1) {
+ resched_curr(rq_of(cfs_rq));
+ clear_buddies(cfs_rq, se);
}
-
- return slice;
-}
-
-/*
- * We calculate the vruntime slice of a to-be-inserted task.
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
#include "pelt.h"
@@ -921,6 +1121,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
@@ -3375,16 +3576,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
+ unsigned long old_weight = se->load.weight;
+
if (se->on_rq) {
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
+ else
+ avg_vruntime_sub(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
update_load_set(&se->load, weight);
+ if (!se->on_rq) {
+ /*
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+ * we need to scale se->vlag when w_i changes.
+ */
+ se->vlag = div_s64(se->vlag * old_weight, weight);
+ } else {
+ s64 deadline = se->deadline - se->vruntime;
+ /*
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ */
+ deadline = div_s64(deadline * old_weight, weight);
+ se->deadline = se->vruntime + deadline;
+ }
+
#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
@@ -3394,9 +3615,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
#endif
enqueue_load_avg(cfs_rq, se);
- if (se->on_rq)
+ if (se->on_rq) {
update_load_add(&cfs_rq->load, se->load.weight);
-
+ if (cfs_rq->curr != se)
+ avg_vruntime_add(cfs_rq, se);
+ }
}
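Since se->vlag stores V - v_i while the conserved quantity is w_i*(V - v_i), a weight change rescales vlag by old_weight/new_weight, and the relative deadline scales the same way. A numeric check with invented values:

#include <stdio.h>

int main(void)
{
	long long old_w = 1024, new_w = 2048;	/* task got "heavier"      */
	long long vlag = 4000;			/* V - v_i before reweight */
	long long rel_deadline = 750000;	/* deadline - vruntime     */

	/* w_old * vlag_old == w_new * vlag_new -> vlag_new = vlag_old * w_old / w_new */
	long long new_vlag = vlag * old_w / new_w;
	long long new_rel_deadline = rel_deadline * old_w / new_w;

	printf("vlag:     %lld -> %lld\n", vlag, new_vlag);
	printf("deadline: +%lld -> +%lld\n", rel_deadline, new_rel_deadline);

	/* the weighted lag is unchanged: 1024*4000 == 2048*2000 */
	printf("lag check: %lld == %lld\n", old_w * vlag, new_w * new_vlag);
	return 0;
}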
void reweight_task(struct task_struct *p, int prio)
@@ -4692,159 +4915,125 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHED_DEBUG
- s64 d = se->vruntime - cfs_rq->min_vruntime;
-
- if (d < 0)
- d = -d;
-
- if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq->nr_spread_over);
-#endif
-}
-
-static inline bool entity_is_long_sleeper(struct sched_entity *se)
+static void
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq;
- u64 sleep_time;
-
- if (se->exec_start == 0)
- return false;
+ u64 vslice = calc_delta_fair(se->slice, se);
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 lag = 0;
- cfs_rq = cfs_rq_of(se);
-
- sleep_time = rq_clock_task(rq_of(cfs_rq));
+ /*
+ * Due to how V is constructed as the weighted average of entities,
+ * adding tasks with positive lag, or removing tasks with negative lag
+ * will move 'time' backwards; this can screw around with the lag of
+ * other tasks.
+ *
+ * EEVDF: placement strategy #1 / #2
+ */
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
+ struct sched_entity *curr = cfs_rq->curr;
+ unsigned long load;
- /* Happen while migrating because of clock task divergence */
- if (sleep_time <= se->exec_start)
- return false;
+ lag = se->vlag;
- sleep_time -= se->exec_start;
- if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
- return true;
+ /*
+ * If we want to place a task and preserve lag, we have to
+ * consider the effect of the new entity on the weighted
+ * average and compensate for this; otherwise lag can quickly
+ * evaporate.
+ *
+ * Lag is defined as:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * To avoid the 'w_i' term all over the place, we only track
+ * the virtual lag:
+ *
+ * vl_i = V - v_i <=> v_i = V - vl_i
+ *
+ * And we take V to be the weighted average of all v:
+ *
+ * V = (\Sum w_j*v_j) / W
+ *
+ * Where W is: \Sum w_j
+ *
+ * Then, the weighted average after adding an entity with lag
+ * vl_i is given by:
+ *
+ * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+ * = (W*V + w_i*(V - vl_i)) / (W + w_i)
+ * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+ * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
+ * = V - w_i*vl_i / (W + w_i)
+ *
+ * And the actual lag after adding an entity with vl_i is:
+ *
+ * vl'_i = V' - v_i
+ * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+ * = vl_i - w_i*vl_i / (W + w_i)
+ *
+ * Which is strictly less than vl_i. So in order to preserve lag
+ * we should inflate the lag before placement such that the
+ * effective lag after placement comes out right.
+ *
+ * As such, invert the above relation for vl'_i to get the vl_i
+ * we need to use such that the lag after placement is the lag
+ * we computed before dequeue.
+ *
+ * vl'_i = vl_i - w_i*vl_i / (W + w_i)
+ * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+ *
+ * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+ * = W*vl_i
+ *
+ * vl_i = (W + w_i)*vl'_i / W
+ */
+ load = cfs_rq->avg_load;
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
- return false;
-}
+ lag *= load + scale_load_down(se->load.weight);
+ if (WARN_ON_ONCE(!load))
+ load = 1;
+ lag = div_s64(lag, load);
+ }
-static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
-{
- u64 vruntime = cfs_rq->min_vruntime;
+ se->vruntime = vruntime - lag;
/*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
+ * When joining the competition, the existing tasks will be,
+ * on average, halfway through their slice, as such start tasks
+ * off with half a slice to ease into the competition.
*/
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
-
- /* sleeps up to a single latency don't count. */
- if (!initial) {
- unsigned long thresh;
-
- if (se_is_idle(se))
- thresh = sysctl_sched_min_granularity;
- else
- thresh = sysctl_sched_latency;
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
+ vslice /= 2;
- /*
- * Halve their sleep time's effect, to allow
- * for a gentler effect of sleepers:
- */
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
- thresh >>= 1;
-
- vruntime -= thresh;
- }
-
- /*
- * Pull vruntime of the entity being placed to the base level of
- * cfs_rq, to prevent boosting it if placed backwards.
- * However, min_vruntime can advance much faster than real time, with
- * the extreme being when an entity with the minimal weight always runs
- * on the cfs_rq. If the waking entity slept for a long time, its
- * vruntime difference from min_vruntime may overflow s64 and their
- * comparison may get inversed, so ignore the entity's original
- * vruntime in that case.
- * The maximal vruntime speedup is given by the ratio of normal to
- * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
- * When placing a migrated waking entity, its exec_start has been set
- * from a different rq. In order to take into account a possible
- * divergence between new and prev rq's clocks task because of irq and
- * stolen time, we take an additional margin.
- * So, cutting off on the sleep time of
- * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
- * should be safe.
- */
- if (entity_is_long_sleeper(se))
- se->vruntime = vruntime;
- else
- se->vruntime = max_vruntime(se->vruntime, vruntime);
+ /*
+ * EEVDF: vd_i = ve_i + r_i/w_i
+ */
+ se->deadline = se->vruntime + vslice;
}
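Plugging numbers into the derivation above shows why the stored lag is inflated by (W + w_i)/W before placement: a naive v_i = V - vl_i would see the realized lag shrink to vl_i * W/(W + w_i) once the entity's own weight pulls V toward it. All values below are invented.

#include <stdio.h>

int main(void)
{
	long long W = 3072;		/* weight already on the queue       */
	long long w_i = 1024;		/* weight of the entity being placed */
	long long vlag = 4000;		/* lag we want it to end up with     */

	/* naive placement at V - vlag: the new weighted average moves by
	 * w_i*vlag/(W + w_i), so the realized lag shrinks */
	long long naive = vlag - w_i * vlag / (W + w_i);	/* 3000 */

	/* inflate first: vl_i = (W + w_i) * vl'_i / W, then place */
	long long inflated = (W + w_i) * vlag / W;		/* 5333 */
	long long realized = inflated - w_i * inflated / (W + w_i);

	printf("naive placement keeps    %lld of %lld\n", naive, vlag);
	printf("inflated placement keeps %lld of %lld\n", realized, vlag);
	return 0;
}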
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
static inline bool cfs_bandwidth_used(void);
-/*
- * MIGRATION
- *
- * dequeue
- * update_curr()
- * update_min_vruntime()
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way the vruntime transition between RQs is done when both
- * min_vruntime are up-to-date.
- *
- * WAKEUP (remote)
- *
- * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way we don't have the most up-to-date min_vruntime on the originating
- * CPU and an up-to-date min_vruntime on the destination CPU.
- */
-
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;
/*
* If we're the current task, we must renormalise before calling
* update_curr().
*/
- if (renorm && curr)
- se->vruntime += cfs_rq->min_vruntime;
+ if (curr)
+ place_entity(cfs_rq, se, flags);
update_curr(cfs_rq);
/*
- * Otherwise, renormalise after, such that we're placed at the current
- * moment in time, instead of some random moment in the past. Being
- * placed in the past could significantly boost this task to the
- * fairness detriment of existing tasks.
- */
- if (renorm && !curr)
- se->vruntime += cfs_rq->min_vruntime;
-
- /*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
@@ -4855,37 +5044,46 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
se_update_runnable(se);
+ /*
+ * XXX update_load_avg() above will have attached us to the pelt sum;
+ * but update_cfs_group() here will re-adjust the weight and have to
+ * undo/redo all that. Seems wasteful.
+ */
update_cfs_group(se);
+
+ /*
+ * XXX now that the entity has been re-weighted, and its lag adjusted,
+ * we can place the entity.
+ */
+ if (!curr)
+ place_entity(cfs_rq, se, flags);
+
account_entity_enqueue(cfs_rq, se);
- if (flags & ENQUEUE_WAKEUP)
- place_entity(cfs_rq, se, 0);
/* Entity has migrated, no longer consider this task hot */
if (flags & ENQUEUE_MIGRATED)
se->exec_start = 0;
check_schedstat_required();
update_stats_enqueue_fair(cfs_rq, se, flags);
- check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
if (cfs_rq->nr_running == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq))
+ if (!throttled_hierarchy(cfs_rq)) {
list_add_leaf_cfs_rq(cfs_rq);
- }
-}
-
-static void __clear_buddies_last(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last != se)
- break;
+ } else {
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct rq *rq = rq_of(cfs_rq);
- cfs_rq->last = NULL;
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+#endif
+ }
}
}
@@ -4900,27 +5098,10 @@ static void __clear_buddies_next(struct sched_entity *se)
}
}
-static void __clear_buddies_skip(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip != se)
- break;
-
- cfs_rq->skip = NULL;
- }
-}
-
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->last == se)
- __clear_buddies_last(se);
-
if (cfs_rq->next == se)
__clear_buddies_next(se);
-
- if (cfs_rq->skip == se)
- __clear_buddies_skip(se);
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -4954,20 +5135,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
clear_buddies(cfs_rq, se);
+ update_entity_lag(cfs_rq, se);
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
- /*
- * Normalize after update_curr(); which will also have moved
- * min_vruntime if @se is the one holding it back. But before doing
- * update_min_vruntime() again, which will discount @se's position and
- * can move min_vruntime forward still more.
- */
- if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
-
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
@@ -4986,52 +5159,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_idle_cfs_rq_clock_pelt(cfs_rq);
}
-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- unsigned long ideal_runtime, delta_exec;
- struct sched_entity *se;
- s64 delta;
-
- /*
- * When many tasks blow up the sched_period; it is possible that
- * sched_slice() reports unusually large results (when many tasks are
- * very light for example). Therefore impose a maximum.
- */
- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
-
- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime) {
- resched_curr(rq_of(cfs_rq));
- /*
- * The current task ran long enough, ensure it doesn't get
- * re-elected due to buddy favours.
- */
- clear_buddies(cfs_rq, curr);
- return;
- }
-
- /*
- * Ensure that a task that missed wakeup preemption by a
- * narrow margin doesn't have to wait for a full slice.
- * This also mitigates buddy induced latencies under load.
- */
- if (delta_exec < sysctl_sched_min_granularity)
- return;
-
- se = __pick_first_entity(cfs_rq);
- delta = curr->vruntime - se->vruntime;
-
- if (delta < 0)
- return;
-
- if (delta > ideal_runtime)
- resched_curr(rq_of(cfs_rq));
-}
-
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -5047,6 +5174,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ /*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+ se->vlag = se->deadline;
}
update_stats_curr_start(cfs_rq, se);
@@ -5070,9 +5202,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
@@ -5083,50 +5212,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *left = __pick_first_entity(cfs_rq);
- struct sched_entity *se;
-
- /*
- * If curr is set we have to see if its left of the leftmost entity
- * still in the tree, provided there was anything in the tree at all.
- */
- if (!left || (curr && entity_before(curr, left)))
- left = curr;
-
- se = left; /* ideally we run the leftmost entity */
-
/*
- * Avoid running the skip buddy, if running something else can
- * be done without getting too unfair.
+ * Enabling NEXT_BUDDY will affect latency but not fairness.
*/
- if (cfs_rq->skip && cfs_rq->skip == se) {
- struct sched_entity *second;
+ if (sched_feat(NEXT_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ return cfs_rq->next;
- if (se == curr) {
- second = __pick_first_entity(cfs_rq);
- } else {
- second = __pick_next_entity(se);
- if (!second || (curr && entity_before(curr, second)))
- second = curr;
- }
-
- if (second && wakeup_preempt_entity(second, left) < 1)
- se = second;
- }
-
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
- /*
- * Someone really wants this to run. If it's not unfair, run it.
- */
- se = cfs_rq->next;
- } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
- /*
- * Prefer last buddy, try to return the CPU to a preempted task.
- */
- se = cfs_rq->last;
- }
-
- return se;
+ return pick_eevdf(cfs_rq);
}
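
Aside from the optional NEXT_BUDDY short-circuit, the rewritten pick_next_entity() above delegates selection to pick_eevdf(). The standalone sketch below illustrates the rule as I understand it: among entities that are still eligible (their vruntime has not run ahead of the queue's weighted average), pick the one with the earliest virtual deadline. The toy_* names, the linear scan, and the plain integer averaging are illustrative only; the kernel maintains these quantities incrementally and picks from its runqueue rbtree.

/*
 * Minimal userspace model of the EEVDF pick: eligibility first, then
 * earliest virtual deadline.  Not the kernel implementation.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_entity {
	int64_t vruntime;	/* virtual service received so far */
	int64_t deadline;	/* vruntime + requested slice, in virtual time */
	int64_t weight;
};

static int64_t toy_avg_vruntime(const struct toy_entity *e, int n)
{
	int64_t sum = 0, wsum = 0;

	for (int i = 0; i < n; i++) {
		sum  += e[i].vruntime * e[i].weight;
		wsum += e[i].weight;
	}
	return wsum ? sum / wsum : 0;
}

/* Eligible: has not yet received more than its fair share of service. */
static int toy_eligible(const struct toy_entity *e, int64_t avg)
{
	return e->vruntime <= avg;
}

static const struct toy_entity *toy_pick_eevdf(const struct toy_entity *e, int n)
{
	int64_t avg = toy_avg_vruntime(e, n);
	const struct toy_entity *best = NULL;

	for (int i = 0; i < n; i++) {
		if (!toy_eligible(&e[i], avg))
			continue;
		if (!best || e[i].deadline < best->deadline)
			best = &e[i];
	}
	return best;
}

int main(void)
{
	struct toy_entity q[] = {
		{ .vruntime = 100, .deadline = 160, .weight = 1024 },
		{ .vruntime =  90, .deadline = 200, .weight = 1024 },
		/* earliest deadline, but ineligible: ahead of the average */
		{ .vruntime = 140, .deadline = 150, .weight = 1024 },
	};
	const struct toy_entity *p = toy_pick_eevdf(q, 3);

	printf("picked deadline=%lld\n", (long long)p->deadline);	/* 160 */
	return 0;
}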
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -5143,8 +5236,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
-
if (prev->on_rq) {
update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
@@ -5185,9 +5276,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
-
- if (cfs_rq->nr_running > 1)
- check_preempt_tick(cfs_rq, curr);
}
@@ -5377,6 +5465,17 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
/* Add cfs_rq with load or one or more already running entities to the list */
if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
+
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+
+ cfs_rq->throttled_clock_self = 0;
+
+ if (SCHED_WARN_ON((s64)delta < 0))
+ delta = 0;
+
+ cfs_rq->throttled_clock_self_time += delta;
+ }
}
return 0;
@@ -5391,6 +5490,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
+
+ SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_running)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
}
cfs_rq->throttle_count++;
@@ -5480,7 +5583,9 @@ done:
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
+ SCHED_WARN_ON(cfs_rq->throttled_clock);
+ if (cfs_rq->nr_running)
+ cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -5498,7 +5603,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
update_rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ if (cfs_rq->throttled_clock) {
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ cfs_rq->throttled_clock = 0;
+ }
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
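
The tg_throttle_down()/tg_unthrottle_up() and throttle/unthrottle hunks above add a per-cfs_rq view of throttled time (throttled_clock_self / throttled_clock_self_time) alongside the existing per-bandwidth throttled_time, and only start either clock when something is actually runnable. Below is a minimal sketch of that start/accumulate pattern; the toy_* names and the raw 'now' parameter standing in for rq_clock() are mine.

/*
 * Start/accumulate pattern: record a timestamp when throttling begins
 * (only if there is queued work), fold the elapsed time into a running
 * total when throttling ends.  Illustrative only.
 */
#include <stdint.h>

struct toy_throttle_stats {
	uint64_t clock_self;		/* 0 == not currently throttled */
	uint64_t clock_self_time;	/* total time spent throttled */
};

static void toy_throttle_start(struct toy_throttle_stats *s, uint64_t now,
			       unsigned int nr_running)
{
	if (nr_running && !s->clock_self)
		s->clock_self = now;
}

static void toy_throttle_end(struct toy_throttle_stats *s, uint64_t now)
{
	if (s->clock_self) {
		uint64_t delta = now - s->clock_self;

		s->clock_self = 0;
		if ((int64_t)delta < 0)	/* clock went backwards: ignore */
			delta = 0;
		s->clock_self_time += delta;
	}
}

int main(void)
{
	struct toy_throttle_stats s = { 0, 0 };

	toy_throttle_start(&s, 1000, 2);	/* throttled at t=1000 with work queued */
	toy_throttle_end(&s, 1750);		/* unthrottled at t=1750 */
	return s.clock_self_time == 750 ? 0 : 1;
}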
@@ -6014,13 +6122,14 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
{
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
cfs_b->burst = 0;
+ cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
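
init_cfs_bandwidth() now takes the parent's bandwidth struct and seeds hierarchical_quota from it, so a newly created group starts out reflecting whatever effective limit already applies above it; cfs_task_bw_constrained() later consults that field. A small sketch of just the seeding step, with RUNTIME_INF modeled as -1 and all names hypothetical:

#include <stdio.h>

#define TOY_RUNTIME_INF (-1LL)

struct toy_bw {
	long long quota;		/* this group's own quota */
	long long hierarchical_quota;	/* effective limit inherited so far */
};

static void toy_init_bw(struct toy_bw *b, const struct toy_bw *parent)
{
	b->quota = TOY_RUNTIME_INF;
	b->hierarchical_quota = parent ? parent->hierarchical_quota
				       : TOY_RUNTIME_INF;
}

int main(void)
{
	struct toy_bw root, limited, child;

	toy_init_bw(&root, NULL);
	root.hierarchical_quota = 20000;	/* pretend an ancestor set a quota */
	toy_init_bw(&limited, &root);
	toy_init_bw(&child, &limited);

	printf("child hierarchical_quota = %lld\n", child.hierarchical_quota);
	return 0;
}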
@@ -6157,6 +6266,46 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
rq_clock_stop_loop_update(rq);
}
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ if (!cfs_bandwidth_used())
+ return false;
+
+ if (cfs_rq->runtime_enabled ||
+ tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/* called from pick_next_task_fair() */
+static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq);
+
+ if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
+ return;
+
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ if (rq->nr_running != 1)
+ return;
+
+ /*
+ * We know there is only one task runnable and we've just picked it. The
+	 * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we would
+	 * otherwise be able to stop the tick. We just need to check whether
+	 * bandwidth control is in use.
+ */
+ if (cfs_task_bw_constrained(p))
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#endif
+
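
sched_fair_update_stop_tick() above keeps the tick running on a nohz_full CPU when the single task just picked is subject to CFS bandwidth control, since quota enforcement depends on the periodic tick. The sketch below condenses that decision into one predicate; toy_rq and the explicit hz_bw flag are stand-ins for the real runqueue state and sched_feat(HZ_BW).

#include <stdbool.h>

struct toy_rq {
	unsigned int nr_running;
	bool curr_is_fair;
	bool curr_on_rq;
	bool curr_bw_constrained;	/* result of the bandwidth check */
};

static bool toy_must_keep_tick(const struct toy_rq *rq, bool hz_bw)
{
	if (!hz_bw)
		return false;
	if (rq->nr_running != 1)
		return false;		/* >1 runnable: tick stays on anyway */
	if (!rq->curr_is_fair || !rq->curr_on_rq)
		return false;
	return rq->curr_bw_constrained;
}

int main(void)
{
	struct toy_rq rq = {
		.nr_running = 1, .curr_is_fair = true,
		.curr_on_rq = true, .curr_bw_constrained = true,
	};

	return toy_must_keep_tick(&rq, true) ? 0 : 1;
}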
#else /* CONFIG_CFS_BANDWIDTH */
static inline bool cfs_bandwidth_used(void)
@@ -6186,7 +6335,7 @@ static inline int throttled_lb_pair(struct task_group *tg,
return 0;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -6199,9 +6348,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
-
+#ifdef CONFIG_CGROUP_SCHED
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ return false;
+}
+#endif
#endif /* CONFIG_CFS_BANDWIDTH */
+#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
+static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
+#endif
+
/**************************************************
* CFS operations on tasks:
*/
@@ -6210,13 +6368,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
SCHED_WARN_ON(task_rq(p) != rq);
if (rq->cfs.h_nr_running > 1) {
- u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
s64 delta = slice - ran;
if (delta < 0) {
@@ -6240,8 +6397,7 @@ static void hrtick_update(struct rq *rq)
if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
return;
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
@@ -6282,17 +6438,6 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-/*
- * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
- * of idle_nr_running, which does not consider idle descendants of normal
- * entities.
- */
-static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->nr_running &&
- cfs_rq->nr_running == cfs_rq->idle_nr_running;
-}
-
#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
@@ -7065,7 +7210,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);
- for_each_cpu_wrap(cpu, cpus, target + 1) {
+ for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
@@ -7289,9 +7434,6 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
- if (boost)
- util_est = max(util_est, runnable);
-
/*
* During wake-up @p isn't enqueued yet and doesn't contribute
* to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
@@ -7802,18 +7944,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
struct sched_entity *se = &p->se;
- /*
- * As blocked tasks retain absolute vruntime the migration needs to
- * deal with this by subtracting the old and adding the new
- * min_vruntime -- the latter is done by enqueue_entity() when placing
- * the task on the new runqueue.
- */
- if (READ_ONCE(p->__state) == TASK_WAKING) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
- }
-
if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);
@@ -7851,66 +7981,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
#endif /* CONFIG_SMP */
-static unsigned long wakeup_gran(struct sched_entity *se)
-{
- unsigned long gran = sysctl_sched_wakeup_granularity;
-
- /*
- * Since its curr running now, convert the gran from real-time
- * to virtual-time in his units.
- *
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- return calc_delta_fair(gran, se);
-}
-
-/*
- * Should 'se' preempt 'curr'.
- *
- * |s1
- * |s2
- * |s3
- * g
- * |<--->|c
- *
- * w(c, s1) = -1
- * w(c, s2) = 0
- * w(c, s3) = 1
- *
- */
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
- s64 gran, vdiff = curr->vruntime - se->vruntime;
-
- if (vdiff <= 0)
- return -1;
-
- gran = wakeup_gran(se);
- if (vdiff > gran)
- return 1;
-
- return 0;
-}
-
-static void set_last_buddy(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
- return;
- if (se_is_idle(se))
- return;
- cfs_rq_of(se)->last = se;
- }
-}
-
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
@@ -7922,12 +7992,6 @@ static void set_next_buddy(struct sched_entity *se)
}
}
-static void set_skip_buddy(struct sched_entity *se)
-{
- for_each_sched_entity(se)
- cfs_rq_of(se)->skip = se;
-}
-
/*
* Preempt the current task with a newly woken task if needed:
*/
@@ -7936,7 +8000,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
int cse_is_idle, pse_is_idle;
@@ -7952,7 +8015,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
return;
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
next_buddy_marked = 1;
}
@@ -7997,35 +8060,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (cse_is_idle != pse_is_idle)
return;
- update_curr(cfs_rq_of(se));
- if (wakeup_preempt_entity(se, pse) == 1) {
- /*
- * Bias pick_next to pick the sched entity that is
- * triggering this preemption.
- */
- if (!next_buddy_marked)
- set_next_buddy(pse);
+ cfs_rq = cfs_rq_of(se);
+ update_curr(cfs_rq);
+
+ /*
+ * XXX pick_eevdf(cfs_rq) != se ?
+ */
+ if (pick_eevdf(cfs_rq) == pse)
goto preempt;
- }
return;
preempt:
resched_curr(rq);
- /*
- * Only set the backward buddy when the current task is still
- * on the rq. This can happen when a wakeup gets interleaved
- * with schedule on the ->pre_schedule() or idle_balance()
- * point, either of which can * drop the rq lock.
- *
- * Also, during early boot the idle thread is in the fair class,
- * for obvious reasons its a bad idea to schedule back to it.
- */
- if (unlikely(!se->on_rq || curr == rq->idle))
- return;
-
- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
- set_last_buddy(se);
}
#ifdef CONFIG_SMP
@@ -8176,6 +8223,7 @@ done: __maybe_unused;
hrtick_start_fair(rq, p);
update_misfit_status(p, rq);
+ sched_fair_update_stop_tick(rq, p);
return p;
@@ -8226,8 +8274,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
/*
* sched_yield() is very simple
- *
- * The magic of dealing with the ->skip buddy is in pick_next_entity.
*/
static void yield_task_fair(struct rq *rq)
{
@@ -8243,21 +8289,19 @@ static void yield_task_fair(struct rq *rq)
clear_buddies(cfs_rq, se);
- if (curr->policy != SCHED_BATCH) {
- update_rq_clock(rq);
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
- /*
- * Tell update_rq_clock() that we've just updated,
- * so we don't do microscopic update in schedule()
- * and double the fastpath cost.
- */
- rq_clock_skip_update(rq);
- }
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq);
- set_skip_buddy(se);
+ se->deadline += calc_delta_fair(se->slice, se);
}
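
With the skip buddy removed, yield_task_fair() now simply pushes the yielding entity's virtual deadline one slice into the future, scaled into its virtual time by calc_delta_fair(). The sketch below shows the scaling I assume that amounts to (delta * NICE_0_LOAD / weight, ignoring the kernel's fixed-point inverse-weight details); the slice and weights are made-up values.

#include <stdint.h>
#include <stdio.h>

#define TOY_NICE_0_LOAD 1024ULL

/* Wall-clock delta scaled into an entity's virtual time. */
static uint64_t toy_calc_delta_fair(uint64_t delta, uint64_t weight)
{
	return delta * TOY_NICE_0_LOAD / weight;
}

int main(void)
{
	uint64_t slice = 3000000;	/* pretend a 3ms slice, in ns */

	/* weight 1024 (NICE_0_LOAD): deadline moves by a full virtual slice */
	printf("weight 1024: +%llu ns\n",
	       (unsigned long long)toy_calc_delta_fair(slice, 1024));
	/* twice the weight: the same wall-clock slice is half as much virtual time */
	printf("weight 2048: +%llu ns\n",
	       (unsigned long long)toy_calc_delta_fair(slice, 2048));
	return 0;
}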
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
@@ -8420,6 +8464,11 @@ enum group_type {
*/
group_misfit_task,
/*
+	 * Balance SMT group that's fully busy. Can benefit from migrating
+	 * a task on an SMT CPU with a busy sibling to another CPU on an idle core.
+ */
+ group_smt_balance,
+ /*
* SD_ASYM_PACKING only: One local CPU with higher capacity is available,
* and the task should be migrated to it instead of running on the
* current CPU.
@@ -8500,8 +8549,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
* Buddy candidates are cache hot:
*/
if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
- (&p->se == cfs_rq_of(&p->se)->next ||
- &p->se == cfs_rq_of(&p->se)->last))
+ (&p->se == cfs_rq_of(&p->se)->next))
return 1;
if (sysctl_sched_migration_cost == -1)
@@ -9127,6 +9175,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+	unsigned int group_smt_balance;  /* Task on busy SMT should be moved */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -9400,6 +9449,9 @@ group_type group_classify(unsigned int imbalance_pct,
if (sgs->group_asym_packing)
return group_asym_packing;
+ if (sgs->group_smt_balance)
+ return group_smt_balance;
+
if (sgs->group_misfit_task_load)
return group_misfit_task;
@@ -9469,6 +9521,71 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
+/* One group has more than one SMT CPU while the other group does not */
+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
+ struct sched_group *sg2)
+{
+ if (!sg1 || !sg2)
+ return false;
+
+ return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
+ (sg2->flags & SD_SHARE_CPUCAPACITY);
+}
+
+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ if (env->idle == CPU_NOT_IDLE)
+ return false;
+
+ /*
+ * For SMT source group, it is better to move a task
+ * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
+	 * Note that a group consisting of a single SMT sibling will not
+	 * have SD_SHARE_CPUCAPACITY set.
+ */
+ if (group->flags & SD_SHARE_CPUCAPACITY &&
+ sgs->sum_h_nr_running > 1)
+ return true;
+
+ return false;
+}
+
+static inline long sibling_imbalance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *local)
+{
+ int ncores_busiest, ncores_local;
+ long imbalance;
+
+ if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+ return 0;
+
+ ncores_busiest = sds->busiest->cores;
+ ncores_local = sds->local->cores;
+
+ if (ncores_busiest == ncores_local) {
+ imbalance = busiest->sum_nr_running;
+ lsub_positive(&imbalance, local->sum_nr_running);
+ return imbalance;
+ }
+
+	/* Balance such that the nr_running/ncores ratio is the same in both groups */
+ imbalance = ncores_local * busiest->sum_nr_running;
+ lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
+	/* Normalize the imbalance to the combined core count, with rounding */
+ imbalance = 2 * imbalance + ncores_local + ncores_busiest;
+ imbalance /= ncores_local + ncores_busiest;
+
+	/* Take advantage of the resources in an otherwise empty sched group */
+ if (imbalance == 0 && local->sum_nr_running == 0 &&
+ busiest->sum_nr_running > 1)
+ imbalance = 2;
+
+ return imbalance;
+}
+
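
sibling_imbalance() above generalizes the old "busiest->sum_nr_running > local->sum_nr_running + 1" prefer-sibling test (see the find_busiest_group() hunk further down) to groups with different core counts by comparing tasks per core. A standalone sketch of the arithmetic with made-up inputs; lsub_positive() is replaced by an explicit clamp and the idle-CPU check is omitted:

#include <stdio.h>

static long toy_sibling_imbalance(long ncores_busiest, long nr_busiest,
				  long ncores_local, long nr_local)
{
	long imbalance;

	if (ncores_busiest == ncores_local)
		return nr_busiest > nr_local ? nr_busiest - nr_local : 0;

	/* balance so that tasks per core match in both groups */
	imbalance = ncores_local * nr_busiest - ncores_busiest * nr_local;
	if (imbalance < 0)
		imbalance = 0;
	imbalance = (2 * imbalance + ncores_local + ncores_busiest) /
		    (ncores_local + ncores_busiest);

	/* still use an otherwise empty group when the ratio looks balanced */
	if (imbalance == 0 && nr_local == 0 && nr_busiest > 1)
		imbalance = 2;

	return imbalance;
}

int main(void)
{
	/* 4-core group with 6 tasks vs. 2-core group with 1 task */
	printf("%ld\n", toy_sibling_imbalance(4, 6, 2, 1));	/* 3 */
	/* equal core counts fall back to a plain difference */
	printf("%ld\n", toy_sibling_imbalance(2, 4, 2, 1));	/* 3 */
	return 0;
}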
static inline bool
sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
{
@@ -9561,6 +9678,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_asym_packing = 1;
}
+	/* Check for a loaded SMT group that should be balanced to the dst CPU */
+ if (!local_group && smt_balance(env, sgs, group))
+ sgs->group_smt_balance = 1;
+
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
/* Computing avg_load makes sense only when group is overloaded */
@@ -9645,6 +9766,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
return false;
break;
+ case group_smt_balance:
case group_fully_busy:
/*
* Select the fully busy group with highest avg_load. In
@@ -9674,6 +9796,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_has_spare:
/*
+	 * Do not pick a group with SMT CPUs over a group of non-SMT CPUs,
+	 * as we do not want to pull a task off an SMT core running a single
+	 * task and leave that core idle.
+ */
+ if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
+ if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
+ return false;
+ else
+ return true;
+ }
+
+ /*
* Select not overloaded group with lowest number of idle cpus
* and highest number of running tasks. We could also compare
* the spare capacity which is more stable but it can end up
@@ -9869,6 +10003,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those types are not used in the slow wakeup path */
return false;
@@ -10000,6 +10135,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those type are not used in the slow wakeup path */
return NULL;
@@ -10254,6 +10390,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return;
}
+ if (busiest->group_type == group_smt_balance) {
+ /* Reduce number of tasks sharing CPU capacity */
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ return;
+ }
+
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
@@ -10301,14 +10444,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
if (busiest->group_weight == 1 || sds->prefer_sibling) {
- unsigned int nr_diff = busiest->sum_nr_running;
/*
* When prefer sibling, evenly spread running tasks on
* groups.
*/
env->migration_type = migrate_task;
- lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff;
+ env->imbalance = sibling_imbalance(env, sds, busiest, local);
} else {
/*
@@ -10505,20 +10646,27 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* group's child domain.
*/
if (sds.prefer_sibling && local->group_type == group_has_spare &&
- busiest->sum_nr_running > local->sum_nr_running + 1)
+ sibling_imbalance(env, &sds, busiest, local) > 1)
goto force_balance;
if (busiest->group_type != group_overloaded) {
- if (env->idle == CPU_NOT_IDLE)
+ if (env->idle == CPU_NOT_IDLE) {
/*
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
* busy, let another idle CPU try to pull task.
*/
goto out_balanced;
+ }
+
+ if (busiest->group_type == group_smt_balance &&
+ smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
+			/* Let a non-SMT CPU pull from an SMT CPU sharing capacity with its sibling */
+ goto force_balance;
+ }
if (busiest->group_weight > 1 &&
- local->idle_cpus <= (busiest->idle_cpus + 1))
+ local->idle_cpus <= (busiest->idle_cpus + 1)) {
/*
* If the busiest group is not overloaded
* and there is no imbalance between this and busiest
@@ -10529,12 +10677,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* there is more than 1 CPU per group.
*/
goto out_balanced;
+ }
- if (busiest->sum_h_nr_running == 1)
+ if (busiest->sum_h_nr_running == 1) {
/*
* busiest doesn't have any tasks waiting to run
*/
goto out_balanced;
+ }
}
force_balance:
@@ -10768,7 +10918,7 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- int cpu;
+ int cpu, idle_smt = -1;
/*
* Ensure the balancing environment is consistent; can happen
@@ -10795,10 +10945,24 @@ static int should_we_balance(struct lb_env *env)
if (!idle_cpu(cpu))
continue;
+ /*
+		 * Don't balance to an idle SMT CPU in a busy core right away
+		 * when balancing cores, but remember the first idle SMT CPU
+		 * for later consideration. Find a CPU on an idle core first.
+ */
+ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
+ if (idle_smt == -1)
+ idle_smt = cpu;
+ continue;
+ }
+
/* Are we the first idle CPU? */
return cpu == env->dst_cpu;
}
+ if (idle_smt == env->dst_cpu)
+ return true;
+
/* Are we the first CPU of this group ? */
return group_balance_cpu(sg) == env->dst_cpu;
}
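
The idle_smt handling above changes which CPU volunteers to run the load balance: an idle CPU on a fully idle core is preferred, and the first idle SMT sibling of a busy core is only used when no such CPU exists (failing that, the group's designated balance CPU decides). The sketch below models that preference order with invented structures in place of the cpumask walk.

#include <stdbool.h>

struct toy_cpu {
	int id;
	bool idle;
	bool core_idle;	/* all SMT siblings idle */
};

static int toy_pick_balance_cpu(const struct toy_cpu *cpus, int n)
{
	int idle_smt = -1;

	for (int i = 0; i < n; i++) {
		if (!cpus[i].idle)
			continue;
		if (!cpus[i].core_idle) {
			if (idle_smt == -1)
				idle_smt = cpus[i].id;	/* remember for later */
			continue;
		}
		return cpus[i].id;	/* first idle CPU on an idle core */
	}
	return idle_smt;	/* may be -1: fall back to the group balance CPU */
}

int main(void)
{
	const struct toy_cpu cpus[] = {
		{ 0, true,  false },	/* idle thread on a busy core */
		{ 1, false, false },
		{ 2, true,  true  },	/* idle CPU on a fully idle core */
		{ 3, true,  true  },
	};

	return toy_pick_balance_cpu(cpus, 4) == 2 ? 0 : 1;
}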
@@ -12011,8 +12175,8 @@ static void rq_offline_fair(struct rq *rq)
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
{
- u64 slice = sched_slice(cfs_rq_of(se), se);
u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
return (rtime * min_nr_tasks > slice);
}
@@ -12168,8 +12332,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
+ struct cfs_rq *cfs_rq;
struct rq *rq = this_rq();
struct rq_flags rf;
@@ -12178,22 +12342,9 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (curr) {
+ if (curr)
update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
- }
- place_entity(cfs_rq, se, 1);
-
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- /*
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
- }
-
- se->vruntime -= cfs_rq->min_vruntime;
+ place_entity(cfs_rq, se, ENQUEUE_INITIAL);
rq_unlock(rq, &rf);
}
@@ -12222,34 +12373,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
check_preempt_curr(rq, p, 0);
}
-static inline bool vruntime_normalized(struct task_struct *p)
-{
- struct sched_entity *se = &p->se;
-
- /*
- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
- * the dequeue_entity(.flags=0) will already have normalized the
- * vruntime.
- */
- if (p->on_rq)
- return true;
-
- /*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - A forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - A task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- */
- if (!se->sum_exec_runtime ||
- (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
- return true;
-
- return false;
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Propagate the changes of the sched_entity across the tg tree to make it
@@ -12320,16 +12443,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- if (!vruntime_normalized(p)) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
- }
detach_entity_cfs_rq(se);
}
@@ -12337,12 +12450,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
attach_entity_cfs_rq(se);
-
- if (!vruntime_normalized(p))
- se->vruntime += cfs_rq->min_vruntime;
}
static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -12454,7 +12563,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
tg->shares = NICE_0_LOAD;
- init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
for_each_possible_cpu(i) {
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
@@ -12707,7 +12816,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+ rr_interval = NS_TO_JIFFIES(se->slice);
return rr_interval;
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd3..f770168230ae 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,16 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Only give sleepers 50% of their service deficit. This allows
- * them to run sooner, but does not allow tons of sleepers to
- * rip the spread apart.
- */
-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
/*
- * Place new tasks ahead so that they do not starve already running
- * tasks
+ * Using avg_vruntime, preserve lag across sleep+wake cycles
+ * (EEVDF placement strategy #1; strategy #2 when disabled).
*/
-SCHED_FEAT(START_DEBIT, true)
+SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+SCHED_FEAT(RUN_TO_PARITY, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
@@ -20,13 +16,6 @@ SCHED_FEAT(START_DEBIT, true)
SCHED_FEAT(NEXT_BUDDY, false)
/*
- * Prefer to schedule the task that ran last (when we did
- * wake-preempt) as that likely will touch the same data, increases
- * cache locality.
- */
-SCHED_FEAT(LAST_BUDDY, true)
-
-/*
* Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
*/
@@ -99,5 +88,4 @@ SCHED_FEAT(UTIL_EST_FASTUP, true)
SCHED_FEAT(LATENCY_WARN, false)
-SCHED_FEAT(ALT_PERIOD, true)
-SCHED_FEAT(BASE_SLICE, true)
+SCHED_FEAT(HZ_BW, true)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 9bb3f2b3ccfc..1d0f634725a6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -140,7 +140,7 @@
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
-DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
+static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 00e0e5074115..0597ba0f85ff 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -25,7 +25,7 @@ unsigned int sysctl_sched_rt_period = 1000000;
int sysctl_sched_rt_runtime = 950000;
#ifdef CONFIG_SYSCTL
-static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
@@ -3062,6 +3062,9 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
sched_rr_timeslice =
sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
msecs_to_jiffies(sysctl_sched_rr_timeslice);
+
+ if (sysctl_sched_rr_timeslice <= 0)
+ sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
}
mutex_unlock(&mutex);
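
The rt.c change reorders the sysctl_sched_rr_timeslice initializer so the multiplication happens before the division; with integer arithmetic, dividing MSEC_PER_SEC by HZ first truncates whenever HZ does not divide 1000 evenly. A small standalone check, assuming HZ=300 and the usual RR_TIMESLICE definition of (100 * HZ / 1000) jiffies purely for illustration:

#include <stdio.h>

#define MSEC_PER_SEC		1000
#define TOY_HZ			300
#define TOY_RR_TIMESLICE	(100 * TOY_HZ / 1000)	/* 30 jiffies, ~100ms */

int main(void)
{
	int old = (MSEC_PER_SEC / TOY_HZ) * TOY_RR_TIMESLICE;	/* 3 * 30   = 90  */
	int new = (MSEC_PER_SEC * TOY_RR_TIMESLICE) / TOY_HZ;	/* 30000/300 = 100 */

	printf("old initializer: %d ms\n", old);
	printf("new initializer: %d ms\n", new);
	return 0;
}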
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 48d0be005f08..11a88469b55e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -454,11 +454,12 @@ extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
-extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern bool cfs_task_bw_constrained(struct task_struct *p);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
@@ -494,6 +495,7 @@ static inline void set_task_rq_fair(struct sched_entity *se,
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
+static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
#endif /* CONFIG_CGROUP_SCHED */
@@ -548,6 +550,9 @@ struct cfs_rq {
unsigned int idle_nr_running; /* SCHED_IDLE */
unsigned int idle_h_nr_running; /* SCHED_IDLE */
+ s64 avg_vruntime;
+ u64 avg_load;
+
u64 exec_clock;
u64 min_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -567,8 +572,6 @@ struct cfs_rq {
*/
struct sched_entity *curr;
struct sched_entity *next;
- struct sched_entity *last;
- struct sched_entity *skip;
#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
@@ -636,6 +639,8 @@ struct cfs_rq {
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
+ u64 throttled_clock_self;
+ u64 throttled_clock_self_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
@@ -1700,6 +1705,21 @@ rq_unlock(struct rq *rq, struct rq_flags *rf)
raw_spin_rq_unlock(rq);
}
+DEFINE_LOCK_GUARD_1(rq_lock, struct rq,
+ rq_lock(_T->lock, &_T->rf),
+ rq_unlock(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq,
+ rq_lock_irq(_T->lock, &_T->rf),
+ rq_unlock_irq(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq,
+ rq_lock_irqsave(_T->lock, &_T->rf),
+ rq_unlock_irqrestore(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
static inline struct rq *
this_rq_lock_irq(struct rq_flags *rf)
__acquires(rq->lock)
@@ -1882,6 +1902,7 @@ struct sched_group {
atomic_t ref;
unsigned int group_weight;
+ unsigned int cores;
struct sched_group_capacity *sgc;
int asym_prefer_cpu; /* CPU of highest priority in group */
int flags;
@@ -2196,6 +2217,7 @@ extern const u32 sched_prio_to_wmult[40];
#else
#define ENQUEUE_MIGRATED 0x00
#endif
+#define ENQUEUE_INITIAL 0x80
#define RETRY_TASK ((void *)-1UL)
@@ -2500,11 +2522,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_base_slice;
+
#ifdef CONFIG_SCHED_DEBUG
-extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
-extern unsigned int sysctl_sched_idle_min_granularity;
-extern unsigned int sysctl_sched_wakeup_granularity;
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
@@ -2610,6 +2630,12 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
#endif
+#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
+__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
+static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \
+{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
+ _lock; return _t; }
+
#ifdef CONFIG_SMP
static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
@@ -2739,6 +2765,16 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
+static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2)
+{
+ raw_spin_unlock(l1);
+ raw_spin_unlock(l2);
+}
+
+DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t,
+ double_raw_lock(_T->lock, _T->lock2),
+ double_raw_unlock(_T->lock, _T->lock2))
+
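
DEFINE_LOCK_GUARD_2() and the double_raw_spinlock guard above provide scope-based management of a lock pair: the constructor takes both locks and both are released automatically when the guard variable goes out of scope. As a self-contained illustration of that idea only (not the macro's actual expansion), here is the same pattern built from pthread mutexes and the compiler's cleanup attribute:

#include <pthread.h>
#include <stdio.h>

struct double_guard {
	pthread_mutex_t *a, *b;
};

static void double_guard_release(struct double_guard *g)
{
	pthread_mutex_unlock(g->b);
	pthread_mutex_unlock(g->a);
}

static struct double_guard double_guard_acquire(pthread_mutex_t *a,
						pthread_mutex_t *b)
{
	/* lock in a fixed order (by address) to avoid ABBA deadlock */
	if (a > b) { pthread_mutex_t *t = a; a = b; b = t; }
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
	return (struct double_guard){ .a = a, .b = b };
}

#define DOUBLE_GUARD(x, y) \
	struct double_guard __attribute__((cleanup(double_guard_release))) \
		_g = double_guard_acquire(x, y)

static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	DOUBLE_GUARD(&m1, &m2);
	puts("both locks held");
	return 0;	/* cleanup handler unlocks both here */
}

In kernel context the equivalent would presumably be taken with guard(double_raw_spinlock)(l1, l2) inside a function, assuming the cleanup-based guard() helper from linux/cleanup.h.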
/*
* double_rq_unlock - safely unlock two runqueues
*
@@ -2796,6 +2832,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
#endif
+DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
+ double_rq_lock(_T->lock, _T->lock2),
+ double_rq_unlock(_T->lock, _T->lock2))
+
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
@@ -3483,4 +3523,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif
+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
#endif /* _KERNEL_SCHED_SCHED_H */
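
The avg_vruntime() and entity_eligible() declarations above are the EEVDF bookkeeping the fair-class changes rely on. In the notation I'd use for a sketch (the implementation tracks the weighted sums relative to min_vruntime to keep the numbers in range, which is omitted here):

\[
V \;=\; \frac{\sum_i w_i\, v_i}{\sum_i w_i},
\qquad
\mathrm{lag}_i \;=\; w_i\,(V - v_i),
\qquad
\text{eligible}(i) \;\Longleftrightarrow\; \mathrm{lag}_i \ge 0 .
\]

That is, an entity stays eligible until it has received more service than its weight entitles it to, which is the condition pick_next_entity() checks before honouring the NEXT_BUDDY hint.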
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d3a3b2646ec4..05a5bc678c08 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -722,8 +722,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (parent->parent) {
parent->parent->child = tmp;
- if (tmp->flags & SD_SHARE_CPUCAPACITY)
- parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY;
+ parent->parent->groups->flags = tmp->flags;
}
/*
@@ -1275,14 +1274,24 @@ build_sched_groups(struct sched_domain *sd, int cpu)
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
+ struct cpumask *mask = sched_domains_tmpmask2;
WARN_ON(!sg);
do {
- int cpu, max_cpu = -1;
+ int cpu, cores = 0, max_cpu = -1;
sg->group_weight = cpumask_weight(sched_group_span(sg));
+ cpumask_copy(mask, sched_group_span(sg));
+ for_each_cpu(cpu, mask) {
+ cores++;
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(mask, mask, cpu_smt_mask(cpu));
+#endif
+ }
+ sg->cores = cores;
+
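
The loop above counts the distinct cores in a sched group by striking out each visited CPU's SMT siblings as it goes. A userspace sketch of the same idea using 64-bit masks; the smt_mask table and helper names are invented for illustration.

#include <stdint.h>
#include <stdio.h>

static int toy_count_cores(uint64_t span, const uint64_t *smt_mask)
{
	int cores = 0;
	uint64_t mask = span;

	while (mask) {
		int cpu = __builtin_ctzll(mask);	/* lowest set CPU */

		cores++;
		mask &= ~smt_mask[cpu];	/* drop this CPU and its siblings */
	}
	return cores;
}

int main(void)
{
	/* 4 CPUs, two SMT siblings per core: {0,1} and {2,3} */
	uint64_t smt_mask[4] = { 0x3, 0x3, 0xc, 0xc };

	printf("cores=%d\n", toy_count_cores(0xf, smt_mask));	/* 2 */
	printf("cores=%d\n", toy_count_cores(0x7, smt_mask));	/* 2: CPUs 0,1,2 */
	return 0;
}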
if (!(sd->flags & SD_ASYM_PACKING))
goto next;