Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/auto_group.c |   2
-rw-r--r--  kernel/sched/clock.c      |   4
-rw-r--r--  kernel/sched/core.c       | 227
-rw-r--r--  kernel/sched/cputime.c    |  78
-rw-r--r--  kernel/sched/deadline.c   |  59
-rw-r--r--  kernel/sched/fair.c       | 316
-rw-r--r--  kernel/sched/idle_task.c  |   1
-rw-r--r--  kernel/sched/rt.c         |   2
-rw-r--r--  kernel/sched/sched.h      |  73
-rw-r--r--  kernel/sched/wait.c       |  28
10 files changed, 515 insertions, 275 deletions
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 750ed601ddf7..a5d966cb8891 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -212,7 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
ag = autogroup_task_get(p);
down_write(&ag->lock);
- err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
+ err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]);
if (!err)
ag->nice = nice;
up_write(&ag->lock);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c0a205101c23..bc54e84675da 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,7 +1,7 @@
/*
* sched_clock for unstable cpu clocks
*
- * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
*
* Updates and enhancements:
* Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
@@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
return;
sched_clock_tick();
- touch_softlockup_watchdog();
+ touch_softlockup_watchdog_sched();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4d568ac9319e..44253adb3c36 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -731,7 +731,7 @@ bool sched_can_stop_tick(void)
if (current->policy == SCHED_RR) {
struct sched_rt_entity *rt_se = &current->rt;
- return rt_se->run_list.prev == rt_se->run_list.next;
+ return list_is_singular(&rt_se->run_list);
}
/*
@@ -823,8 +823,8 @@ static void set_load_weight(struct task_struct *p)
return;
}
- load->weight = scale_load(prio_to_weight[prio]);
- load->inv_weight = prio_to_wmult[prio];
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1071,8 +1071,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
{
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ dequeue_task(rq, p, 0);
set_task_cpu(p, new_cpu);
raw_spin_unlock(&rq->lock);
@@ -1080,8 +1080,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
raw_spin_lock(&rq->lock);
BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
return rq;
@@ -1274,6 +1274,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!p->on_rq);
+ /*
+ * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
+ * because schedstat_wait_{start,end} rebase a migrating task's wait_start
+ * time based on p->on_rq.
+ */
+ WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ p->sched_class == &fair_sched_class &&
+ (p->on_rq && !task_on_rq_migrating(p)));
+
#ifdef CONFIG_LOCKDEP
/*
* The caller should hold either p->pi_lock or rq->lock, when changing
@@ -1310,9 +1319,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
} else {
/*
@@ -1905,6 +1916,97 @@ static void ttwu_queue(struct task_struct *p, int cpu)
raw_spin_unlock(&rq->lock);
}
+/*
+ * Notes on Program-Order guarantees on SMP systems.
+ *
+ * MIGRATION
+ *
+ * The basic program-order guarantee on SMP systems is that when a task [t]
+ * migrates, all its activity on its old cpu [c0] happens-before any subsequent
+ * execution on its new cpu [c1].
+ *
+ * For migration (of runnable tasks) this is provided by the following means:
+ *
+ * A) UNLOCK of the rq(c0)->lock scheduling out task t
+ * B) migration for t is required to synchronize *both* rq(c0)->lock and
+ * rq(c1)->lock (if not at the same time, then in that order).
+ * C) LOCK of the rq(c1)->lock scheduling in task
+ *
+ * Transitivity guarantees that B happens after A and C after B.
+ * Note: we only require RCpc transitivity.
+ * Note: the cpu doing B need not be c0 or c1
+ *
+ * Example:
+ *
+ * CPU0 CPU1 CPU2
+ *
+ * LOCK rq(0)->lock
+ * sched-out X
+ * sched-in Y
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(0)->lock // orders against CPU0
+ * dequeue X
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(1)->lock
+ * enqueue X
+ * UNLOCK rq(1)->lock
+ *
+ * LOCK rq(1)->lock // orders against CPU2
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(1)->lock
+ *
+ *
+ * BLOCKING -- aka. SLEEP + WAKEUP
+ *
+ * For blocking we (obviously) need to provide the same guarantee as for
+ * migration. However the means are completely different as there is no lock
+ * chain to provide order. Instead we do:
+ *
+ * 1) smp_store_release(X->on_cpu, 0)
+ * 2) smp_cond_acquire(!X->on_cpu)
+ *
+ * Example:
+ *
+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
+ *
+ * LOCK rq(0)->lock LOCK X->pi_lock
+ * dequeue X
+ * sched-out X
+ * smp_store_release(X->on_cpu, 0);
+ *
+ * smp_cond_acquire(!X->on_cpu);
+ * X->state = WAKING
+ * set_task_cpu(X,2)
+ *
+ * LOCK rq(2)->lock
+ * enqueue X
+ * X->state = RUNNING
+ * UNLOCK rq(2)->lock
+ *
+ * LOCK rq(2)->lock // orders against CPU1
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(2)->lock
+ *
+ * UNLOCK X->pi_lock
+ * UNLOCK rq(0)->lock
+ *
+ *
+ * However, for wakeups there is a second guarantee we must provide, namely we
+ * must observe the state that led to our wakeup. That is, not only must our
+ * task observe its own prior state, it must also observe the stores prior to
+ * its wakeup.
+ *
+ * This means that any means of doing remote wakeups must order the CPU doing
+ * the wakeup against the CPU the task is going to end up running on. This,
+ * however, is already required for the regular Program-Order guarantee above,
+ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire).
+ *
+ */
+
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
@@ -1947,15 +2049,34 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
#ifdef CONFIG_SMP
/*
- * If the owning (remote) cpu is still in the middle of schedule() with
- * this task as prev, wait until its done referencing the task.
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * [S] ->on_cpu = 1; [L] ->on_rq
+ * UNLOCK rq->lock
+ * RMB
+ * LOCK rq->lock
+ * [S] ->on_rq = 0; [L] ->on_cpu
+ *
+ * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
+ * from the consecutive calls to schedule(); the first switching to our
+ * task, the second putting it to sleep.
*/
- while (p->on_cpu)
- cpu_relax();
+ smp_rmb();
+
/*
- * Pairs with the smp_wmb() in finish_lock_switch().
+ * If the owning (remote) cpu is still in the middle of schedule() with
+ * this task as prev, wait until its done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_lock_switch().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
*/
- smp_rmb();
+ smp_cond_acquire(!p->on_cpu);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
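
The two comment blocks added above (the program-order notes and the try_to_wake_up() barrier comment) describe a release/acquire handoff on ->on_cpu: the CPU scheduling the task out publishes its state with smp_store_release(), and the waker spins in smp_cond_acquire() before it touches the task. A minimal standalone illustration of that pairing, using C11 atomics and made-up field names rather than the kernel API, might look like this:

/*
 * Standalone sketch (not kernel code): the waker must not read the
 * sleeper's saved state until the CPU that scheduled it out has
 * released ->on_cpu; the acquire on the waker side then makes every
 * store done before the release visible.  Names are illustrative only.
 */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static _Atomic int on_cpu = 1;	/* task still running on its old CPU      */
static int task_state;		/* plain data published before the release */

static void *old_cpu(void *arg)	/* models finish_lock_switch()            */
{
	(void)arg;
	task_state = 42;				/* sched-out bookkeeping */
	atomic_store_explicit(&on_cpu, 0,
			      memory_order_release);	/* smp_store_release()   */
	return NULL;
}

static void *waker(void *arg)	/* models try_to_wake_up()                */
{
	(void)arg;
	/* smp_cond_acquire(!p->on_cpu): spin until 0, then acquire */
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;
	/* every store made before the release is now visible here */
	printf("observed task_state=%d\n", task_state);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&b, NULL, waker, NULL);
	pthread_create(&a, NULL, old_cpu, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

The same shape is why the finish_lock_switch() comment in sched.h now points at smp_cond_acquire(): the acquire side of the pair is what makes the task's pre-sleep stores visible to the waking CPU.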
@@ -2039,7 +2160,6 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- WARN_ON(task_is_stopped_or_traced(p));
return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
@@ -2085,6 +2205,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -3085,7 +3209,6 @@ static void __sched notrace __schedule(bool preempt)
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch();
prev = rq->curr;
/*
@@ -3104,13 +3227,16 @@ static void __sched notrace __schedule(bool preempt)
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ local_irq_disable();
+ rcu_note_context_switch();
+
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock(&rq->lock);
lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -5847,13 +5973,13 @@ static int init_rootdomain(struct root_domain *rd)
{
memset(rd, 0, sizeof(*rd));
- if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
goto out;
- if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
goto free_span;
- if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
goto free_online;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
init_dl_bw(&rd->dl_bw);
@@ -7331,6 +7457,9 @@ int in_sched_functions(unsigned long addr)
*/
struct task_group root_task_group;
LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
@@ -7388,11 +7517,12 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
-
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
@@ -7673,7 +7803,7 @@ static void free_sched_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
- kfree(tg);
+ kmem_cache_free(task_group_cache, tg);
}
/* allocate runqueue etc for a new task group */
@@ -7681,7 +7811,7 @@ struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
if (!tg)
return ERR_PTR(-ENOMEM);
@@ -8212,17 +8342,17 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task, void *private)
+static void cpu_cgroup_fork(struct task_struct *task)
{
sched_move_task(task);
}
-static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
+ struct cgroup_subsys_state *css;
- cgroup_taskset_for_each(task, tset) {
+ cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
@@ -8235,12 +8365,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
return 0;
}
-static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
+ struct cgroup_subsys_state *css;
- cgroup_taskset_for_each(task, tset)
+ cgroup_taskset_for_each(task, css, tset)
sched_move_task(task);
}
@@ -8586,3 +8716,44 @@ void dump_cpu_task(int cpu)
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
+
+/*
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+const int sched_prio_to_weight[40] = {
+ /* -20 */ 88761, 71755, 56483, 46273, 36291,
+ /* -15 */ 29154, 23254, 18705, 14949, 11916,
+ /* -10 */ 9548, 7620, 6100, 4904, 3906,
+ /* -5 */ 3121, 2501, 1991, 1586, 1277,
+ /* 0 */ 1024, 820, 655, 526, 423,
+ /* 5 */ 335, 272, 215, 172, 137,
+ /* 10 */ 110, 87, 70, 56, 45,
+ /* 15 */ 36, 29, 23, 18, 15,
+};
+
+/*
+ * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetics by turning divisions
+ * into multiplications:
+ */
+const u32 sched_prio_to_wmult[40] = {
+ /* -20 */ 48388, 59856, 76040, 92818, 118348,
+ /* -15 */ 147320, 184698, 229616, 287308, 360437,
+ /* -10 */ 449829, 563644, 704093, 875809, 1099582,
+ /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
+ /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
+ /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
+ /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
+ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
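
For reference, the two tables exported above encode exactly what the comments say: adjacent nice levels differ in weight by a factor of roughly 1.25 (the "10% effect"), and sched_prio_to_wmult[] holds 2^32/weight. A small standalone check using a few values copied from the tables (not kernel code):

/* Standalone check of the two tables quoted above; not kernel code. */
#include <stdio.h>

int main(void)
{
	/* nice -1, 0, +1 entries of sched_prio_to_weight[] */
	const int w[3] = { 1277, 1024, 820 };
	/* matching sched_prio_to_wmult[] entries */
	const unsigned wm[3] = { 3363326, 4194304, 5237765 };

	for (int i = 0; i + 1 < 3; i++)
		printf("weight ratio %d/%d = %.3f  (~1.25 per nice level)\n",
		       w[i], w[i + 1], (double)w[i] / w[i + 1]);

	/*
	 * The wmult table holds 2^32/weight; the entries appear to be
	 * rounded to nearest, so plain integer division can land one below.
	 */
	for (int i = 0; i < 3; i++)
		printf("2^32 / %d = %llu, table has %u\n",
		       w[i], (1ULL << 32) / w[i], wm[i]);
	return 0;
}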
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 26a54461bf59..b2ab2ffb1adc 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -5,6 +5,9 @@
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include "sched.h"
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -466,7 +469,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
- if (vtime_accounting_enabled())
+ if (vtime_accounting_cpu_enabled())
return;
if (sched_clock_irqtime) {
@@ -680,7 +683,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
{
unsigned long long delta = vtime_delta(tsk);
- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+ WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap += delta;
/* CHECKME: always safe to convert nsecs to cputime? */
@@ -696,37 +699,37 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
- write_seqlock(&tsk->vtime_seqlock);
+ write_seqcount_begin(&tsk->vtime_seqcount);
delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS;
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_user_enter(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
tsk->vtime_snap_whence = VTIME_USER;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_guest_enter(struct task_struct *tsk)
@@ -738,19 +741,19 @@ void vtime_guest_enter(struct task_struct *tsk)
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
- write_seqlock(&tsk->vtime_seqlock);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
current->flags |= PF_VCPU;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
- write_seqlock(&tsk->vtime_seqlock);
+ write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
current->flags &= ~PF_VCPU;
- write_sequnlock(&tsk->vtime_seqlock);
+ write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -763,24 +766,26 @@ void vtime_account_idle(struct task_struct *tsk)
void arch_vtime_task_switch(struct task_struct *prev)
{
- write_seqlock(&prev->vtime_seqlock);
- prev->vtime_snap_whence = VTIME_SLEEPING;
- write_sequnlock(&prev->vtime_seqlock);
+ write_seqcount_begin(&prev->vtime_seqcount);
+ prev->vtime_snap_whence = VTIME_INACTIVE;
+ write_seqcount_end(&prev->vtime_seqcount);
- write_seqlock(&current->vtime_seqlock);
+ write_seqcount_begin(&current->vtime_seqcount);
current->vtime_snap_whence = VTIME_SYS;
current->vtime_snap = sched_clock_cpu(smp_processor_id());
- write_sequnlock(&current->vtime_seqlock);
+ write_seqcount_end(&current->vtime_seqcount);
}
void vtime_init_idle(struct task_struct *t, int cpu)
{
unsigned long flags;
- write_seqlock_irqsave(&t->vtime_seqlock, flags);
+ local_irq_save(flags);
+ write_seqcount_begin(&t->vtime_seqcount);
t->vtime_snap_whence = VTIME_SYS;
t->vtime_snap = sched_clock_cpu(cpu);
- write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+ write_seqcount_end(&t->vtime_seqcount);
+ local_irq_restore(flags);
}
cputime_t task_gtime(struct task_struct *t)
@@ -788,14 +793,17 @@ cputime_t task_gtime(struct task_struct *t)
unsigned int seq;
cputime_t gtime;
+ if (!vtime_accounting_enabled())
+ return t->gtime;
+
do {
- seq = read_seqbegin(&t->vtime_seqlock);
+ seq = read_seqcount_begin(&t->vtime_seqcount);
gtime = t->gtime;
- if (t->flags & PF_VCPU)
+ if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
gtime += vtime_delta(t);
- } while (read_seqretry(&t->vtime_seqlock, seq));
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
return gtime;
}
@@ -818,7 +826,7 @@ fetch_task_cputime(struct task_struct *t,
*udelta = 0;
*sdelta = 0;
- seq = read_seqbegin(&t->vtime_seqlock);
+ seq = read_seqcount_begin(&t->vtime_seqcount);
if (u_dst)
*u_dst = *u_src;
@@ -826,7 +834,7 @@ fetch_task_cputime(struct task_struct *t,
*s_dst = *s_src;
/* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_SLEEPING ||
+ if (t->vtime_snap_whence == VTIME_INACTIVE ||
is_idle_task(t))
continue;
@@ -842,7 +850,7 @@ fetch_task_cputime(struct task_struct *t,
if (t->vtime_snap_whence == VTIME_SYS)
*sdelta = delta;
}
- } while (read_seqretry(&t->vtime_seqlock, seq));
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
}
@@ -850,6 +858,14 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
{
cputime_t udelta, sdelta;
+ if (!vtime_accounting_enabled()) {
+ if (utime)
+ *utime = t->utime;
+ if (stime)
+ *stime = t->stime;
+ return;
+ }
+
fetch_task_cputime(t, utime, stime, &t->utime,
&t->stime, &udelta, &sdelta);
if (utime)
@@ -863,6 +879,14 @@ void task_cputime_scaled(struct task_struct *t,
{
cputime_t udelta, sdelta;
+ if (!vtime_accounting_enabled()) {
+ if (utimescaled)
+ *utimescaled = t->utimescaled;
+ if (stimescaled)
+ *stimescaled = t->stimescaled;
+ return;
+ }
+
fetch_task_cputime(t, utimescaled, stimescaled,
&t->utimescaled, &t->stimescaled, &udelta, &sdelta);
if (utimescaled)
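
The conversion above from vtime_seqlock to vtime_seqcount means readers such as task_gtime() and fetch_task_cputime() never block a writer: they sample the counter, read the snapshot, and retry if a write ran in between, while writers like vtime_init_idle() now disable IRQs explicitly where needed. A standalone sketch of that read-retry shape, using C11 atomics as a stand-in for the kernel's seqcount helpers (illustrative names, not kernel code):

/*
 * Standalone sketch of the seqcount read-retry pattern used by
 * task_gtime().  Illustrative only; the kernel's seqcount API supplies
 * the real barriers, and a truly concurrent version would also need
 * READ_ONCE/WRITE_ONCE-style accesses on the data fields.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic unsigned int vtime_seqcount;	/* even: idle, odd: write in flight */
static uint64_t gtime, vtime_snap;		/* the protected snapshot           */

static void write_update(uint64_t now)		/* models write_seqcount_begin/end  */
{
	atomic_fetch_add_explicit(&vtime_seqcount, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	gtime += now - vtime_snap;
	vtime_snap = now;
	atomic_thread_fence(memory_order_seq_cst);
	atomic_fetch_add_explicit(&vtime_seqcount, 1, memory_order_relaxed);
}

static uint64_t read_gtime(void)		/* models the task_gtime() loop     */
{
	unsigned int seq;
	uint64_t val;

	do {
		/* read_seqcount_begin(): wait for an even (stable) count */
		while ((seq = atomic_load_explicit(&vtime_seqcount,
						   memory_order_relaxed)) & 1)
			;
		atomic_thread_fence(memory_order_seq_cst);
		val = gtime;
		atomic_thread_fence(memory_order_seq_cst);
		/* read_seqcount_retry(): a writer ran, try again */
	} while (atomic_load_explicit(&vtime_seqcount,
				      memory_order_relaxed) != seq);
	return val;
}

int main(void)
{
	write_update(100);
	write_update(250);
	printf("gtime = %llu\n", (unsigned long long)read_gtime());	/* 250 */
	return 0;
}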
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 8b0a15e285f9..cd64c979d0e1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -176,8 +176,10 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
}
}
- if (leftmost)
+ if (leftmost) {
dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
+ dl_rq->earliest_dl.next = p->dl.deadline;
+ }
rb_link_node(&p->pushable_dl_tasks, parent, link);
rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
@@ -195,6 +197,10 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
next_node = rb_next(&p->pushable_dl_tasks);
dl_rq->pushable_dl_tasks_leftmost = next_node;
+ if (next_node) {
+ dl_rq->earliest_dl.next = rb_entry(next_node,
+ struct task_struct, pushable_dl_tasks)->dl.deadline;
+ }
}
rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
@@ -782,42 +788,14 @@ static void update_curr_dl(struct rq *rq)
#ifdef CONFIG_SMP
-static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
-
-static inline u64 next_deadline(struct rq *rq)
-{
- struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
-
- if (next && dl_prio(next->prio))
- return next->dl.deadline;
- else
- return 0;
-}
-
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
{
struct rq *rq = rq_of_dl_rq(dl_rq);
if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
- /*
- * If the dl_rq had no -deadline tasks, or if the new task
- * has shorter deadline than the current one on dl_rq, we
- * know that the previous earliest becomes our next earliest,
- * as the new task becomes the earliest itself.
- */
- dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
dl_rq->earliest_dl.curr = deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
- } else if (dl_rq->earliest_dl.next == 0 ||
- dl_time_before(deadline, dl_rq->earliest_dl.next)) {
- /*
- * On the other hand, if the new -deadline task has a
- * a later deadline than the earliest one on dl_rq, but
- * it is earlier than the next (if any), we must
- * recompute the next-earliest.
- */
- dl_rq->earliest_dl.next = next_deadline(rq);
}
}
@@ -839,7 +817,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
- dl_rq->earliest_dl.next = next_deadline(rq);
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
}
}
@@ -1274,28 +1251,6 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
return 0;
}
-/* Returns the second earliest -deadline task, NULL otherwise */
-static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
-{
- struct rb_node *next_node = rq->dl.rb_leftmost;
- struct sched_dl_entity *dl_se;
- struct task_struct *p = NULL;
-
-next_node:
- next_node = rb_next(next_node);
- if (next_node) {
- dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
- p = dl_task_of(dl_se);
-
- if (pick_dl_task(rq, p, cpu))
- return p;
-
- goto next_node;
- }
-
- return NULL;
-}
-
/*
* Return the earliest pushable rq's task, which is suitable to be executed
* on the CPU, NULL otherwise:
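
The removal of pick_next_earliest_dl_task() above works because the pushable-tasks rbtree is already ordered by deadline, so earliest_dl.next can be kept up to date as a side effect of enqueue/dequeue instead of being recomputed by an O(n) scan. A toy standalone sketch of that idea, with a sorted array standing in for the rbtree and invented names:

/*
 * Toy sketch: keep "next earliest deadline" as a by-product of a sorted
 * pushable set instead of rescanning all tasks.  A sorted array stands
 * in for dl_rq->pushable_dl_tasks_root; everything here is illustrative.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_PUSHABLE 8

static uint64_t pushable[MAX_PUSHABLE];	/* deadlines, kept sorted ascending */
static int nr_pushable;
static uint64_t earliest_next;		/* models dl_rq->earliest_dl.next   */

static void enqueue_pushable(uint64_t deadline)
{
	int i = nr_pushable++;

	while (i > 0 && pushable[i - 1] > deadline) {	/* insertion step */
		pushable[i] = pushable[i - 1];
		i--;
	}
	pushable[i] = deadline;
	earliest_next = pushable[0];	/* leftmost == next-earliest pushable */
}

static void dequeue_pushable_head(void)
{
	for (int i = 1; i < nr_pushable; i++)
		pushable[i - 1] = pushable[i];
	nr_pushable--;
	earliest_next = nr_pushable ? pushable[0] : 0;
}

int main(void)
{
	enqueue_pushable(300);
	enqueue_pushable(100);
	enqueue_pushable(200);
	printf("earliest_dl.next analogue: %llu\n",
	       (unsigned long long)earliest_next);	/* 100 */
	dequeue_pushable_head();
	printf("after dequeue: %llu\n",
	       (unsigned long long)earliest_next);	/* 200 */
	return 0;
}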
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f04fda8f669c..1926606ece80 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -17,7 +17,7 @@
* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
*
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
- * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
#include <linux/latencytop.h>
@@ -738,12 +738,56 @@ static void update_curr_fair(struct rq *rq)
update_curr(cfs_rq_of(&rq->curr->se));
}
+#ifdef CONFIG_SCHEDSTATS
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 wait_start = rq_clock(rq_of(cfs_rq));
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
+ likely(wait_start > se->statistics.wait_start))
+ wait_start -= se->statistics.wait_start;
+
+ se->statistics.wait_start = wait_start;
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct task_struct *p;
+ u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ se->statistics.wait_start = delta;
+ return;
+ }
+ trace_sched_stat_wait(p, delta);
+ }
+
+ se->statistics.wait_max = max(se->statistics.wait_max, delta);
+ se->statistics.wait_count++;
+ se->statistics.wait_sum += delta;
+ se->statistics.wait_start = 0;
+}
+#else
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+#endif
+
/*
* Task is being enqueued - update stats:
*/
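
The migration handling in update_stats_wait_{start,end}() above preserves a migrating task's accumulated wait by stashing it as a delta in wait_start and then subtracting that delta from the destination rq's clock, so the eventual wait_end sees the total. A standalone arithmetic sketch with made-up clock values (not kernel code):

/*
 * Standalone sketch of the wait_start rebase done by
 * update_stats_wait_{start,end}() for a migrating task.  Clock values
 * are invented; the point is that wait accrued on the source rq carries
 * over to the destination rq.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t src_clock_at_enqueue = 1000;	/* rq_clock() when waiting started   */
	uint64_t src_clock_at_migrate = 1600;	/* task picked for migration         */
	uint64_t dst_clock_at_arrival = 9000;	/* destination rq runs its own clock */
	uint64_t dst_clock_at_pick    = 9400;	/* finally selected to run           */

	/* wait_end on the source, task_on_rq_migrating(): keep the delta */
	uint64_t wait_start = src_clock_at_migrate - src_clock_at_enqueue;	/* 600 */

	/* wait_start on the destination: rebase against the new clock */
	wait_start = dst_clock_at_arrival - wait_start;		/* 9000 - 600 = 8400  */

	/* wait_end when the task is finally picked */
	uint64_t total_wait = dst_clock_at_pick - wait_start;	/* 9400 - 8400 = 1000 */

	printf("accumulated wait = %llu (600 on src + 400 on dst)\n",
	       (unsigned long long)total_wait);
	return 0;
}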
@@ -757,23 +801,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_start(cfs_rq, se);
}
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
- schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
- schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
- if (entity_is_task(se)) {
- trace_sched_stat_wait(task_of(se),
- rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
- }
-#endif
- schedstat_set(se->statistics.wait_start, 0);
-}
-
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -2155,6 +2182,7 @@ void task_numa_work(struct callback_head *work)
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ u64 runtime = p->se.sum_exec_runtime;
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
@@ -2277,6 +2305,17 @@ out:
else
reset_ptenuma_scan(p);
up_read(&mm->mmap_sem);
+
+ /*
+ * Make sure tasks use at least 32x as much time to run other code
+ * than they used here, to limit NUMA PTE scanning overhead to 3% max.
+ * Usually update_task_scan_period slows down scanning enough; on an
+ * overloaded system we need to limit overhead on a per task basis.
+ */
+ if (unlikely(p->se.sum_exec_runtime != runtime)) {
+ u64 diff = p->se.sum_exec_runtime - runtime;
+ p->node_stamp += 32 * diff;
+ }
}
/*
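
The 3% figure in the comment above follows directly from the 32x back-off: if one pass of task_numa_work() consumed diff of runtime, pushing node_stamp forward by 32 * diff forces at least 32 * diff of other execution before the next scan, so scanning is bounded by diff / (diff + 32 * diff) = 1/33, roughly 3% of the task's CPU time.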
@@ -2670,12 +2709,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ /*
+ * No need to update load_avg for root_task_group as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next)
+{
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
+
+ /*
+ * We are supposed to update the task to "current" time, so that it is
+ * up to date and ready to go to its new CPU/cfs_rq. But we have
+ * difficulty in getting what the current time is, so simply throw away
+ * the out-of-date time. This will result in the wakee task being less
+ * decayed, but giving the wakee more load does not sound bad.
+ */
+ if (se->avg.last_update_time && prev) {
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
+
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
+
+ smp_rmb();
+
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
+ &se->avg, 0, 0, NULL);
+ se->avg.last_update_time = n_last_update_time;
+ }
+}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2689,7 +2780,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
int decayed, removed = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sa->load_avg = max_t(long, sa->load_avg - r, 0);
sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
removed = 1;
@@ -2809,48 +2900,48 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}
-/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
-void remove_entity_load_avg(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
-
#ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
u64 last_update_time_copy;
+ u64 last_update_time;
do {
last_update_time_copy = cfs_rq->load_last_update_time_copy;
smp_rmb();
last_update_time = cfs_rq->avg.last_update_time;
} while (last_update_time != last_update_time_copy);
-#else
- last_update_time = cfs_rq->avg.last_update_time;
-#endif
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
- atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+ return last_update_time;
}
-
-/*
- * Update the rq's load with the elapsed running time before entering
- * idle. if the last scheduled task is not a CFS task, idle_enter will
- * be the only way to update the runnable statistic.
- */
-void idle_enter_fair(struct rq *this_rq)
+#else
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
+ return cfs_rq->avg.last_update_time;
}
+#endif
/*
- * Update the rq's load with the elapsed idle time before a task is
- * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
- * be the only way to update the runnable statistic.
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
*/
-void idle_exit_fair(struct rq *this_rq)
+void remove_entity_load_avg(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ /*
+ * Newly created task or never used group entity should not be removed
+ * from its (source) cfs_rq
+ */
+ if (se->avg.last_update_time == 0)
+ return;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
+ atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -4240,42 +4331,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ * The exact cpuload calculated at every tick would be:
+ *
+ * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
*
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ * If a cpu misses updates for n ticks (as it was idle) and update gets
+ * called on the n+1-th tick when cpu may be busy, then we have:
+ *
+ * load_n = (1 - 1/2^i)^n * load_0
+ * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
*
* decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * load' = (1 - 1/2^i)^n * load
+ *
+ * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
+ * This allows us to precompute the above in said factors, thereby allowing the
+ * reduction of an arbitrary n in O(log_2 n) steps. (See also
+ * fixed_power_int())
*
* The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
*/
#define DEGRADE_SHIFT 7
-static const unsigned char
- degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
- degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- {0, 0, 0, 0, 0, 0, 0, 0},
- {64, 32, 8, 0, 0, 0, 0, 0},
- {96, 72, 40, 12, 1, 0, 0},
- {112, 98, 75, 43, 15, 1, 0},
- {120, 112, 98, 76, 45, 16, 2} };
+
+static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 64, 32, 8, 0, 0, 0, 0, 0 },
+ { 96, 72, 40, 12, 1, 0, 0, 0 },
+ { 112, 98, 75, 43, 15, 1, 0, 0 },
+ { 120, 112, 98, 76, 45, 16, 2, 0 }
+};
/*
* Update cpu_load for any missed ticks, due to tickless idle. The backlog
@@ -4306,14 +4392,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
return load;
}
-/*
+/**
+ * __update_cpu_load - update the rq->cpu_load[] statistics
+ * @this_rq: The rq to update statistics for
+ * @this_load: The current load
+ * @pending_updates: The number of missed updates
+ * @active: !0 for NOHZ_FULL
+ *
* Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
+ * scheduler tick (TICK_NSEC).
+ *
+ * This function computes a decaying average:
+ *
+ * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
+ *
+ * Because of NOHZ it might not get called on every tick, hence the need for
+ * the @pending_updates argument.
+ *
+ * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
+ * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
+ * = A * (A * load[i]_n-2 + B) + B
+ * = A * (A * (A * load[i]_n-3 + B) + B) + B
+ * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
+ * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
+ * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
+ * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
+ *
+ * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
+ * any change in load would have resulted in the tick being turned back on.
+ *
+ * For regular NOHZ, this reduces to:
+ *
+ * load[i]_n = (1 - 1/2^i)^n * load[i]_0
+ *
+ * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
+ * term. See the @active parameter.
*/
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
+ unsigned long pending_updates, int active)
{
+ unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
int i, scale;
this_rq->nr_load_updates++;
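
decay_load_missed(), whose documentation is rewritten above, applies n ticks of (1 - 1/2^i) decay in O(log2 n) multiplies by decomposing n into powers of two and multiplying by the matching degrade_factor[] entries (degrade_factor[i][j] approximates (1 - 1/2^i)^(2^j) on a 128-point scale). A standalone sketch of that decomposition, with the table copied from the hunk above and a brute-force per-tick reference for comparison; this is an illustration, not the kernel function, which also handles the degrade_zero_ticks cutoff:

/*
 * Standalone sketch of the power-of-two decomposition described above.
 * The table is copied from the patch; decay() assumes missed < 256 so
 * the column index stays in range.  Because the table is a 128-point
 * approximation, the two results match only roughly.
 */
#include <stdio.h>

#define DEGRADE_SHIFT 7		/* 128-point scale */

static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
	{   0,   0,  0,  0,  0,  0, 0, 0 },
	{  64,  32,  8,  0,  0,  0, 0, 0 },
	{  96,  72, 40, 12,  1,  0, 0, 0 },
	{ 112,  98, 75, 43, 15,  1, 0, 0 },
	{ 120, 112, 98, 76, 45, 16, 2, 0 },
};

/* decay 'load' as if 'missed' ticks of (1 - 1/2^idx) decay were applied */
static unsigned long decay(unsigned long load, unsigned long missed, int idx)
{
	for (int j = 0; missed; missed >>= 1, j++)
		if (missed & 1)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
	return load;
}

int main(void)
{
	unsigned long load = 1024, missed = 11, idx = 2;	/* 11 = 8 + 2 + 1 */

	/* brute force: one (1 - 1/2^idx) step per missed tick */
	unsigned long brute = load;
	for (unsigned long t = 0; t < missed; t++)
		brute = brute - (brute >> idx);

	printf("O(log n) decomposition: %lu, per-tick reference: %lu\n",
	       decay(load, missed, idx), brute);
	return 0;
}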
@@ -4325,8 +4443,9 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
/* scale is effectively 1 << i now, and >> i divides by scale */
- old_load = this_rq->cpu_load[i];
+ old_load = this_rq->cpu_load[i] - tickless_load;
old_load = decay_load_missed(old_load, pending_updates - 1, i);
+ old_load += tickless_load;
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -4381,16 +4500,17 @@ static void update_idle_cpu_load(struct rq *this_rq)
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
this_rq->last_load_update_tick = curr_jiffies;
- __update_cpu_load(this_rq, load, pending_updates);
+ __update_cpu_load(this_rq, load, pending_updates, 0);
}
/*
* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
*/
-void update_cpu_load_nohz(void)
+void update_cpu_load_nohz(int active)
{
struct rq *this_rq = this_rq();
unsigned long curr_jiffies = READ_ONCE(jiffies);
+ unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
unsigned long pending_updates;
if (curr_jiffies == this_rq->last_load_update_tick)
@@ -4401,10 +4521,11 @@ void update_cpu_load_nohz(void)
if (pending_updates) {
this_rq->last_load_update_tick = curr_jiffies;
/*
- * We were idle, this means load 0, the current load might be
- * !0 due to remote wakeups and the sort.
+ * In the regular NOHZ case, we were idle, this means load 0.
+ * In the NOHZ_FULL case, we were non-idle, we should consider
+ * its weighted load.
*/
- __update_cpu_load(this_rq, 0, pending_updates);
+ __update_cpu_load(this_rq, load, pending_updates, active);
}
raw_spin_unlock(&this_rq->lock);
}
@@ -4420,7 +4541,7 @@ void update_cpu_load_active(struct rq *this_rq)
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
*/
this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, load, 1);
+ __update_cpu_load(this_rq, load, 1, 1);
}
/*
@@ -5007,8 +5128,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. However, the caller only guarantees p->pi_lock is held; no
- * other assumptions, including the state of rq->lock, should be made.
+ * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
static void migrate_task_rq_fair(struct task_struct *p)
{
@@ -5721,8 +5841,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);
- deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(env->src_rq, p, 0);
set_task_cpu(p, env->dst_cpu);
}
@@ -5855,8 +5975,8 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
@@ -6302,7 +6422,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
bool *overload)
{
unsigned long load;
- int i;
+ int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
@@ -6319,7 +6439,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
- if (rq->nr_running > 1)
+ nr_running = rq->nr_running;
+ if (nr_running > 1)
*overload = true;
#ifdef CONFIG_NUMA_BALANCING
@@ -6327,7 +6448,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(i);
- if (idle_cpu(i))
+ /*
+ * No need to call idle_cpu() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
}
@@ -7248,8 +7372,6 @@ static int idle_balance(struct rq *this_rq)
int pulled_task = 0;
u64 curr_cost = 0;
- idle_enter_fair(this_rq);
-
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -7330,10 +7452,8 @@ out:
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
- if (pulled_task) {
- idle_exit_fair(this_rq);
+ if (pulled_task)
this_rq->idle_stamp = 0;
- }
return pulled_task;
}
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c4ae0f1fdf9b..47ce94931f1b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
- idle_exit_fair(rq);
rq_last_tick_reset(rq);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e3cc16312046..8ec86abe0ea1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -64,7 +64,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
static void push_irq_work_func(struct irq_work *work);
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index efd3bfc7e347..10f16374df7f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -248,7 +248,12 @@ struct task_group {
unsigned long shares;
#ifdef CONFIG_SMP
- atomic_long_t load_avg;
+ /*
+ * load_avg can be heavily contended at clock tick time, so put
+ * it in its own cacheline separated from the fields above which
+ * will also be accessed at each tick.
+ */
+ atomic_long_t load_avg ____cacheline_aligned;
#endif
#endif
@@ -335,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
-#endif
+
+#ifdef CONFIG_SMP
+extern void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next);
+#else /* !CONFIG_SMP */
+static inline void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next) { }
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#else /* CONFIG_CGROUP_SCHED */
@@ -933,6 +946,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
+ set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
#endif
@@ -1073,7 +1087,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* We must ensure this doesn't happen until the switch is completely
* finished.
*
- * Pairs with the control dependency and rmb in try_to_wake_up().
+ * In particular, the load of prev->state in finish_task_switch() must
+ * happen before this.
+ *
+ * Pairs with the smp_cond_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
@@ -1110,46 +1127,8 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#define WEIGHT_IDLEPRIO 3
#define WMULT_IDLEPRIO 1431655765
-/*
- * Nice levels are multiplicative, with a gentle 10% change for every
- * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- * nice 1, it will get ~10% less CPU time than another CPU-bound task
- * that remained on nice 0.
- *
- * The "10% effect" is relative and cumulative: from _any_ nice level,
- * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- * If a task goes up by ~10% and another task goes down by ~10% then
- * the relative distance between them is ~25%.)
- */
-static const int prio_to_weight[40] = {
- /* -20 */ 88761, 71755, 56483, 46273, 36291,
- /* -15 */ 29154, 23254, 18705, 14949, 11916,
- /* -10 */ 9548, 7620, 6100, 4904, 3906,
- /* -5 */ 3121, 2501, 1991, 1586, 1277,
- /* 0 */ 1024, 820, 655, 526, 423,
- /* 5 */ 335, 272, 215, 172, 137,
- /* 10 */ 110, 87, 70, 56, 45,
- /* 15 */ 36, 29, 23, 18, 15,
-};
-
-/*
- * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
- *
- * In cases where the weight does not change often, we can use the
- * precalculated inverse to speed up arithmetics by turning divisions
- * into multiplications:
- */
-static const u32 prio_to_wmult[40] = {
- /* -20 */ 48388, 59856, 76040, 92818, 118348,
- /* -15 */ 147320, 184698, 229616, 287308, 360437,
- /* -10 */ 449829, 563644, 704093, 875809, 1099582,
- /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
- /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
- /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
- /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
- /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
-};
+extern const int sched_prio_to_weight[40];
+extern const u32 sched_prio_to_wmult[40];
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_HEAD 0x02
@@ -1249,16 +1228,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
-extern void idle_enter_fair(struct rq *this_rq);
-extern void idle_exit_fair(struct rq *this_rq);
-
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
-#else
-
-static inline void idle_enter_fair(struct rq *rq) { }
-static inline void idle_exit_fair(struct rq *rq) { }
-
#endif
#ifdef CONFIG_CPU_IDLE
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d12..f15d6b6a538a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -392,7 +392,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
do {
prepare_to_wait(wq, &q->wait, mode);
if (test_bit(q->key.bit_nr, q->key.flags))
- ret = (*action)(&q->key);
+ ret = (*action)(&q->key, mode);
} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
finish_wait(wq, &q->wait);
return ret;
@@ -431,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
prepare_to_wait_exclusive(wq, &q->wait, mode);
if (!test_bit(q->key.bit_nr, q->key.flags))
continue;
- ret = action(&q->key);
+ ret = action(&q->key, mode);
if (!ret)
continue;
abort_exclusive_wait(wq, &q->wait, mode, &q->key);
@@ -581,44 +581,44 @@ void wake_up_atomic_t(atomic_t *p)
}
EXPORT_SYMBOL(wake_up_atomic_t);
-__sched int bit_wait(struct wait_bit_key *word)
+__sched int bit_wait(struct wait_bit_key *word, int mode)
{
- if (signal_pending_state(current->state, current))
- return 1;
schedule();
+ if (signal_pending_state(mode, current))
+ return -EINTR;
return 0;
}
EXPORT_SYMBOL(bit_wait);
-__sched int bit_wait_io(struct wait_bit_key *word)
+__sched int bit_wait_io(struct wait_bit_key *word, int mode)
{
- if (signal_pending_state(current->state, current))
- return 1;
io_schedule();
+ if (signal_pending_state(mode, current))
+ return -EINTR;
return 0;
}
EXPORT_SYMBOL(bit_wait_io);
-__sched int bit_wait_timeout(struct wait_bit_key *word)
+__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
{
unsigned long now = READ_ONCE(jiffies);
- if (signal_pending_state(current->state, current))
- return 1;
if (time_after_eq(now, word->timeout))
return -EAGAIN;
schedule_timeout(word->timeout - now);
+ if (signal_pending_state(mode, current))
+ return -EINTR;
return 0;
}
EXPORT_SYMBOL_GPL(bit_wait_timeout);
-__sched int bit_wait_io_timeout(struct wait_bit_key *word)
+__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
{
unsigned long now = READ_ONCE(jiffies);
- if (signal_pending_state(current->state, current))
- return 1;
if (time_after_eq(now, word->timeout))
return -EAGAIN;
io_schedule_timeout(word->timeout - now);
+ if (signal_pending_state(mode, current))
+ return -EINTR;
return 0;
}
EXPORT_SYMBOL_GPL(bit_wait_io_timeout);