author     Linus Torvalds <torvalds@linux-foundation.org>   2018-08-13 11:25:07 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2018-08-13 11:25:07 -0700
commit     f7951c33f0fed14ee26651a70a46899a59a31e18 (patch)
tree       dff372035ceaa7b3a01e2f15c885ff0ff2510e68 /kernel
parent     2406fb8d94fb17fee3ace0c09427c08825eacb16 (diff)
parent     1b6266ebe3da8198e9a02fbad77bbb56e2f7ce2e (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Thomas Gleixner:

 - Cleanup and improvement of NUMA balancing

 - Refactoring and improvements to the PELT (Per Entity Load Tracking) code

 - Watchdog simplification and related cleanups

 - The usual pile of small incremental fixes and improvements

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
  watchdog: Reduce message verbosity
  stop_machine: Reflow cpu_stop_queue_two_works()
  sched/numa: Move task_numa_placement() closer to numa_migrate_preferred()
  sched/numa: Use group_weights to identify if migration degrades locality
  sched/numa: Update the scan period without holding the numa_group lock
  sched/numa: Remove numa_has_capacity()
  sched/numa: Modify migrate_swap() to accept additional parameters
  sched/numa: Remove unused task_capacity from 'struct numa_stats'
  sched/numa: Skip nodes that are at 'hoplimit'
  sched/debug: Reverse the order of printing faults
  sched/numa: Use task faults only if numa_group is not yet set up
  sched/numa: Set preferred_node based on best_cpu
  sched/numa: Simplify load_too_imbalanced()
  sched/numa: Evaluate move once per node
  sched/numa: Remove redundant field
  sched/debug: Show the sum wait time of a task group
  sched/fair: Remove #ifdefs from scale_rt_capacity()
  sched/core: Remove get_cpu() from sched_fork()
  sched/cpufreq: Clarify sugov_get_util()
  sched/sysctl: Remove unused sched_time_avg_ms sysctl
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpu.c                       |    5
-rw-r--r--  kernel/kthread.c                   |    6
-rw-r--r--  kernel/power/suspend.c             |    4
-rw-r--r--  kernel/rcu/srcutiny.c              |    4
-rw-r--r--  kernel/rcu/tree.c                  |    8
-rw-r--r--  kernel/rcu/tree_exp.h              |    4
-rw-r--r--  kernel/rcu/tree_plugin.h           |   12
-rw-r--r--  kernel/sched/Makefile              |    2
-rw-r--r--  kernel/sched/core.c                |   72
-rw-r--r--  kernel/sched/cpufreq_schedutil.c   |  103
-rw-r--r--  kernel/sched/deadline.c            |    8
-rw-r--r--  kernel/sched/debug.c               |   35
-rw-r--r--  kernel/sched/fair.c                |  663
-rw-r--r--  kernel/sched/pelt.c                |  399
-rw-r--r--  kernel/sched/pelt.h                |   72
-rw-r--r--  kernel/sched/rt.c                  |   15
-rw-r--r--  kernel/sched/sched.h               |   87
-rw-r--r--  kernel/sched/swait.c               |   32
-rw-r--r--  kernel/smpboot.c                   |   54
-rw-r--r--  kernel/stop_machine.c              |   41
-rw-r--r--  kernel/sysctl.c                    |    8
-rw-r--r--  kernel/watchdog.c                  |  147
-rw-r--r--  kernel/watchdog_hld.c              |    4
23 files changed, 966 insertions, 819 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2f8f338e77cf..15be70aae8ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
+ [CPUHP_AP_WATCHDOG_ONLINE] = {
+ .name = "lockup_detector:online",
+ .startup.single = lockup_detector_online_cpu,
+ .teardown.single = lockup_detector_offline_cpu,
+ },
[CPUHP_AP_WORKQUEUE_ONLINE] = {
.name = "workqueue:online",
.startup.single = workqueue_online_cpu,
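
For comparison, code that cannot claim a fixed slot in cpuhp_hp_states[] registers an equivalent online/offline callback pair through cpuhp_setup_state(); a minimal sketch with hypothetical myprobe_* names:

#include <linux/cpuhotplug.h>
#include <linux/init.h>

static int myprobe_cpu_online(unsigned int cpu)
{
	/* bring up per-CPU state for @cpu */
	return 0;
}

static int myprobe_cpu_offline(unsigned int cpu)
{
	/* tear down per-CPU state for @cpu */
	return 0;
}

static int __init myprobe_init(void)
{
	int ret;

	/*
	 * Dynamically allocated slot; built-in users such as the lockup
	 * detector above get a fixed entry in cpuhp_hp_states[] instead.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "myprobe:online",
				myprobe_cpu_online, myprobe_cpu_offline);
	return ret < 0 ? ret : 0;
}
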
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 486dedbd9af5..087d18d771b5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -190,7 +190,7 @@ static void __kthread_parkme(struct kthread *self)
if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
break;
- complete_all(&self->parked);
+ complete(&self->parked);
schedule();
}
__set_current_state(TASK_RUNNING);
@@ -471,7 +471,6 @@ void kthread_unpark(struct task_struct *k)
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
__kthread_bind(k, kthread->cpu, TASK_PARKED);
- reinit_completion(&kthread->parked);
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
/*
* __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
@@ -499,6 +498,9 @@ int kthread_park(struct task_struct *k)
if (WARN_ON(k->flags & PF_EXITING))
return -ENOSYS;
+ if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
+ return -EBUSY;
+
set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
if (k != current) {
wake_up_process(k);
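
A sketch of the calling convention these kthread changes assume (hypothetical caller; kthread_park() and kthread_unpark() are the interfaces touched above, and a second park without an intervening unpark now fails with -EBUSY):

#include <linux/kthread.h>
#include <linux/sched.h>

static int quiesce_worker(struct task_struct *worker)
{
	int ret;

	ret = kthread_park(worker);	/* waits until the thread reaches TASK_PARKED */
	if (ret)
		return ret;		/* -ENOSYS if exiting, -EBUSY if already parked */

	/* ... modify state the worker must not observe while running ... */

	kthread_unpark(worker);		/* clears SHOULD_PARK and wakes the thread */
	return 0;
}
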
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 87331565e505..70178f6ffdc4 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -92,7 +92,7 @@ static void s2idle_enter(void)
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
- swait_event(s2idle_wait_head,
+ swait_event_exclusive(s2idle_wait_head,
s2idle_state == S2IDLE_STATE_WAKE);
cpuidle_pause();
@@ -160,7 +160,7 @@ void s2idle_wake(void)
raw_spin_lock_irqsave(&s2idle_lock, flags);
if (s2idle_state > S2IDLE_STATE_NONE) {
s2idle_state = S2IDLE_STATE_WAKE;
- swake_up(&s2idle_wait_head);
+ swake_up_one(&s2idle_wait_head);
}
raw_spin_unlock_irqrestore(&s2idle_lock, flags);
}
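
A minimal sketch of the renamed swait API used above, with hypothetical demo_* names; swait_event_exclusive() sleeps uninterruptibly until the condition is true and swake_up_one() wakes a single exclusive waiter:

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);	/* hypothetical wait queue */
static bool demo_done;

static void demo_wait_for_event(void)
{
	swait_event_exclusive(demo_wq, READ_ONCE(demo_done));
}

static void demo_signal_event(void)
{
	WRITE_ONCE(demo_done, true);
	swake_up_one(&demo_wq);		/* wake exactly one exclusive waiter */
}
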
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 622792abe41a..04fc2ed71af8 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
if (!newval && READ_ONCE(sp->srcu_gp_waiting))
- swake_up(&sp->srcu_wq);
+ swake_up_one(&sp->srcu_wq);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp)
idx = sp->srcu_idx;
WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
- swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
+ swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
/* Invoke the callbacks we removed above. */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6930934e8b9f..0b760c1369f7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1701,7 +1701,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
!READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
- swake_up(&rsp->gp_wq);
+ swake_up_one(&rsp->gp_wq);
}
/*
@@ -2015,7 +2015,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
}
/*
- * Helper function for swait_event_idle() wakeup at force-quiescent-state
+ * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
* time.
*/
static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
@@ -2163,7 +2163,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gp_seq),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
- swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+ swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
@@ -2191,7 +2191,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gp_seq),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
- ret = swait_event_idle_timeout(rsp->gp_wq,
+ ret = swait_event_idle_timeout_exclusive(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index b3df3b770afb..0b2c2ad69629 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
- swake_up(&rsp->expedited_wq);
+ swake_up_one(&rsp->expedited_wq);
}
break;
}
@@ -526,7 +526,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
jiffies_start = jiffies;
for (;;) {
- ret = swait_event_timeout(
+ ret = swait_event_timeout_exclusive(
rsp->expedited_wq,
sync_rcu_preempt_exp_done_unlocked(rnp_root),
jiffies_stall);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c1b17f5b9361..a97c20ea9bce 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1926,8 +1926,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
del_timer(&rdp->nocb_timer);
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
- swake_up(&rdp_leader->nocb_wq);
+ smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
+ swake_up_one(&rdp_leader->nocb_wq);
} else {
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
@@ -2159,7 +2159,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
*/
trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
- swait_event_interruptible(
+ swait_event_interruptible_exclusive(
rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1],
(d = rcu_seq_done(&rnp->gp_seq, c)));
if (likely(d))
@@ -2188,7 +2188,7 @@ wait_again:
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
- swait_event_interruptible(my_rdp->nocb_wq,
+ swait_event_interruptible_exclusive(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
my_rdp->nocb_leader_sleep = true;
@@ -2253,7 +2253,7 @@ wait_again:
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
/* List was empty, so wake up the follower. */
- swake_up(&rdp->nocb_wq);
+ swake_up_one(&rdp->nocb_wq);
}
}
@@ -2270,7 +2270,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
{
for (;;) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
- swait_event_interruptible(rdp->nocb_wq,
+ swait_event_interruptible_exclusive(rdp->nocb_wq,
READ_ONCE(rdp->nocb_follower_head));
if (smp_load_acquire(&rdp->nocb_follower_head)) {
/* ^^^ Ensure CB invocation follows _head test. */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index d9a02b318108..7fe183404c38 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe365c9a08e9..deafa9fe602b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -17,6 +17,8 @@
#include "../workqueue_internal.h"
#include "../smpboot.h"
+#include "pelt.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -45,14 +47,6 @@ const_debug unsigned int sysctl_sched_features =
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
- * period over which we average the RT time consumption, measured
- * in ms.
- *
- * default: 1s
- */
-const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
-
-/*
* period over which we measure -rt task CPU usage in us.
* default: 1s
*/
@@ -183,9 +177,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#ifdef HAVE_SCHED_AVG_IRQ
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
- sched_rt_avg_update(rq, irq_delta + steal);
+ update_irq_load_avg(rq, irq_delta + steal);
#endif
}
@@ -649,23 +643,6 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
-
-void sched_avg_update(struct rq *rq)
-{
- s64 period = sched_avg_period();
-
- while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
- /*
- * Inline assembly required to prevent the compiler
- * optimising this loop into a divmod call.
- * See __iter_div_u64_rem() for another example of this.
- */
- asm("" : "+rm" (rq->age_stamp));
- rq->age_stamp += period;
- rq->rt_avg /= 2;
- }
-}
-
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -1199,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
+#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
@@ -1280,16 +1258,17 @@ unlock:
/*
* Cross migrate two tasks
*/
-int migrate_swap(struct task_struct *cur, struct task_struct *p)
+int migrate_swap(struct task_struct *cur, struct task_struct *p,
+ int target_cpu, int curr_cpu)
{
struct migration_swap_arg arg;
int ret = -EINVAL;
arg = (struct migration_swap_arg){
.src_task = cur,
- .src_cpu = task_cpu(cur),
+ .src_cpu = curr_cpu,
.dst_task = p,
- .dst_cpu = task_cpu(p),
+ .dst_cpu = target_cpu,
};
if (arg.src_cpu == arg.dst_cpu)
@@ -1314,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
out:
return ret;
}
+#endif /* CONFIG_NUMA_BALANCING */
/*
* wait_task_inactive - wait for a thread to unschedule.
@@ -2317,7 +2297,6 @@ static inline void init_schedstats(void) {}
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
- int cpu = get_cpu();
__sched_fork(clone_flags, p);
/*
@@ -2353,14 +2332,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_reset_on_fork = 0;
}
- if (dl_prio(p->prio)) {
- put_cpu();
+ if (dl_prio(p->prio))
return -EAGAIN;
- } else if (rt_prio(p->prio)) {
+ else if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
- } else {
+ else
p->sched_class = &fair_sched_class;
- }
init_entity_runnable_average(&p->se);
@@ -2376,7 +2353,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
*/
- __set_task_cpu(p, cpu);
+ __set_task_cpu(p, smp_processor_id());
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2393,8 +2370,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
-
- put_cpu();
return 0;
}
@@ -5714,13 +5689,6 @@ void set_rq_offline(struct rq *rq)
}
}
-static void set_cpu_rq_start_time(unsigned int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- rq->age_stamp = sched_clock_cpu(cpu);
-}
-
/*
* used to mark begin/end of suspend/resume:
*/
@@ -5838,7 +5806,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
int sched_cpu_starting(unsigned int cpu)
{
- set_cpu_rq_start_time(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
@@ -6106,7 +6073,6 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
- set_cpu_rq_start_time(smp_processor_id());
#endif
init_sched_fair_class();
@@ -6785,6 +6751,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
+ if (schedstat_enabled() && tg != &root_task_group) {
+ u64 ws = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ ws += schedstat_val(tg->se[i]->statistics.wait_sum);
+
+ seq_printf(sf, "wait_sum %llu\n", ws);
+ }
+
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
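
With schedstats enabled, the cpu.stat file of a non-root task group now carries the aggregated wait time next to the existing bandwidth counters; roughly like the following (all values made up):

  nr_periods 7
  nr_throttled 1
  throttled_time 58104315
  wait_sum 123456789
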
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index c907fde01eaa..3fffad3bc8a8 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,9 +53,7 @@ struct sugov_cpu {
unsigned int iowait_boost_max;
u64 last_update;
- /* The fields below are only needed when sharing a policy: */
- unsigned long util_cfs;
- unsigned long util_dl;
+ unsigned long bw_dl;
unsigned long max;
/* The field below is for single-CPU policies only: */
@@ -179,33 +177,90 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(struct sugov_cpu *sg_cpu)
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the irq utilization.
+ *
+ * The DL bandwidth number otoh is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
+ unsigned long util, irq, max;
- sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
- sg_cpu->util_cfs = cpu_util_cfs(rq);
- sg_cpu->util_dl = cpu_util_dl(rq);
-}
-
-static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
-{
- struct rq *rq = cpu_rq(sg_cpu->cpu);
+ sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+ sg_cpu->bw_dl = cpu_bw_dl(rq);
if (rt_rq_is_runnable(&rq->rt))
- return sg_cpu->max;
+ return max;
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= max))
+ return max;
+
+ /*
+ * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ */
+ util = cpu_util_cfs(rq);
+ util += cpu_util_rt(rq);
+
+ /*
+ * We do not make cpu_util_dl() a permanent part of this sum because we
+ * want to use cpu_bw_dl() later on, but we need to check if the
+ * CFS+RT+DL sum is saturated (ie. no idle time) such that we select
+ * f_max when there is no idle time.
+ *
+ * NOTE: numerical errors or stop class might cause us to not quite hit
+ * saturation when we should -- something for later.
+ */
+ if ((util + cpu_util_dl(rq)) >= max)
+ return max;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * irq metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ *              1 - irq
+ *   U' = irq + ------- * U
+ *                max
+ */
+ util = scale_irq_capacity(util, irq, max);
+ util += irq;
/*
- * Utilization required by DEADLINE must always be granted while, for
- * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
- * gracefully reduce the frequency when no tasks show up for longer
+ * Bandwidth required by DEADLINE must always be granted while, for
+ * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
+ * to gracefully reduce the frequency when no tasks show up for longer
* periods of time.
*
- * Ideally we would like to set util_dl as min/guaranteed freq and
- * util_cfs + util_dl as requested freq. However, cpufreq is not yet
- * ready for such an interface. So, we only do the latter for now.
+ * Ideally we would like to set bw_dl as min/guaranteed freq and util +
+ * bw_dl as requested freq. However, cpufreq is not yet ready for such
+ * an interface. So, we only do the latter for now.
*/
- return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
+ return min(max, util + sg_cpu->bw_dl);
}
/**
@@ -360,7 +415,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
*/
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
- if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl)
+ if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
sg_policy->need_freq_update = true;
}
@@ -383,9 +438,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
busy = sugov_cpu_is_busy(sg_cpu);
- sugov_get_util(sg_cpu);
+ util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
- util = sugov_aggregate_util(sg_cpu);
sugov_iowait_apply(sg_cpu, time, &util, &max);
next_f = get_next_freq(sg_policy, util, max);
/*
@@ -424,9 +478,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
- sugov_get_util(j_sg_cpu);
+ j_util = sugov_get_util(j_sg_cpu);
j_max = j_sg_cpu->max;
- j_util = sugov_aggregate_util(j_sg_cpu);
sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
if (j_util * max > j_max * util) {
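
A worked example (made-up numbers) of the aggregation the new sugov_get_util() documents above, with scale_irq_capacity() written out inline as util * (max - irq) / max:

#include <linux/kernel.h>

/* Hypothetical helper; every number here is illustrative only. */
static unsigned long demo_effective_util(void)
{
	unsigned long max   = 1024;		/* arch_scale_cpu_capacity()      */
	unsigned long irq   = 256;		/* cpu_util_irq()                 */
	unsigned long util  = 400 + 112;	/* cpu_util_cfs() + cpu_util_rt() */
	unsigned long bw_dl = 100;		/* cpu_bw_dl()                    */

	util = util * (max - irq) / max;	/* 512 * 768 / 1024 = 384         */
	util += irq;				/* 384 + 256        = 640         */

	return min(max, util + bw_dl);		/* min(1024, 740)   = 740         */
}
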
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b5fbdde6afa9..997ea7b839fa 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,7 @@
* Fabio Checconi <fchecconi@gmail.com>
*/
#include "sched.h"
+#include "pelt.h"
struct dl_bandwidth def_dl_bandwidth;
@@ -1179,8 +1180,6 @@ static void update_curr_dl(struct rq *rq)
curr->se.exec_start = now;
cgroup_account_cputime(curr, delta_exec);
- sched_rt_avg_update(rq, delta_exec);
-
if (dl_entity_is_special(dl_se))
return;
@@ -1761,6 +1760,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
deadline_queue_push_tasks(rq);
+ if (rq->curr->sched_class != &dl_sched_class)
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+
return p;
}
@@ -1768,6 +1770,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1784,6 +1787,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
/*
* Even when we have runtime, update_curr_dl() might have resulted in us
* not being the leftmost task anymore. In that case NEED_RESCHED will
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e593b4118578..870d4f3da285 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,20 +111,19 @@ static int sched_feat_set(char *cmp)
cmp += 3;
}
- for (i = 0; i < __SCHED_FEAT_NR; i++) {
- if (strcmp(cmp, sched_feat_names[i]) == 0) {
- if (neg) {
- sysctl_sched_features &= ~(1UL << i);
- sched_feat_disable(i);
- } else {
- sysctl_sched_features |= (1UL << i);
- sched_feat_enable(i);
- }
- break;
- }
+ i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp);
+ if (i < 0)
+ return i;
+
+ if (neg) {
+ sysctl_sched_features &= ~(1UL << i);
+ sched_feat_disable(i);
+ } else {
+ sysctl_sched_features |= (1UL << i);
+ sched_feat_enable(i);
}
- return i;
+ return 0;
}
static ssize_t
@@ -133,7 +132,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
{
char buf[64];
char *cmp;
- int i;
+ int ret;
struct inode *inode;
if (cnt > 63)
@@ -148,10 +147,10 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
/* Ensure the static_key remains in a consistent state */
inode = file_inode(filp);
inode_lock(inode);
- i = sched_feat_set(cmp);
+ ret = sched_feat_set(cmp);
inode_unlock(inode);
- if (i == __SCHED_FEAT_NR)
- return -EINVAL;
+ if (ret < 0)
+ return ret;
*ppos += cnt;
@@ -843,8 +842,8 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
unsigned long tpf, unsigned long gsf, unsigned long gpf)
{
SEQ_printf(m, "numa_faults node=%d ", node);
- SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf);
- SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf);
+ SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf);
+ SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf);
}
#endif
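
The sched_feat_set() rewrite above leans on match_string(), which returns the index of the matching entry or -EINVAL; a minimal sketch with a hypothetical keyword table:

#include <linux/kernel.h>
#include <linux/string.h>

static const char * const demo_modes[] = { "off", "on", "auto" };

static int demo_parse_mode(const char *cmp)
{
	int i = match_string(demo_modes, ARRAY_SIZE(demo_modes), cmp);

	return i;	/* matching index, or -EINVAL if cmp is unknown */
}
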
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2f0a0be4d344..309c93fcc604 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
return cfs_rq->rq;
}
-/* An entity is a task if it doesn't "own" a runqueue */
-#define entity_is_task(se) (!se->my_q)
-
static inline struct task_struct *task_of(struct sched_entity *se)
{
SCHED_WARN_ON(!entity_is_task(se));
@@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
return container_of(cfs_rq, struct rq, cfs);
}
-#define entity_is_task(se) 1
#define for_each_sched_entity(se) \
for (; se; se = NULL)
@@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
-
+#include "pelt.h"
#include "sched-pelt.h"
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
@@ -735,11 +731,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
* To solve this problem, we also cap the util_avg of successive tasks to
* only 1/2 of the left utilization budget:
*
- * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
*
- * where n denotes the nth task.
+ * where n denotes the nth task and cpu_scale the CPU capacity.
*
- * For example, a simplest series from the beginning would be like:
+ * For example, for a CPU with 1024 of capacity, a simplest series from
+ * the beginning would be like:
*
* task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
* cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
@@ -751,7 +748,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
- long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+ long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
@@ -1314,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* of each group. Skip other nodes.
*/
if (sched_numa_topology_type == NUMA_BACKPLANE &&
- dist > maxdist)
+ dist >= maxdist)
continue;
/* Add up the faults from nearby nodes. */
@@ -1452,15 +1450,12 @@ static unsigned long capacity_of(int cpu);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
- unsigned long nr_running;
unsigned long load;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
- /* Approximate capacity in terms of runnable tasks on a node */
- unsigned long task_capacity;
- int has_free_capacity;
+ unsigned int nr_running;
};
/*
@@ -1487,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
* the @ns structure is NULL'ed and task_numa_compare() will
* not find this node attractive.
*
- * We'll either bail at !has_free_capacity, or we'll detect a huge
- * imbalance and bail there.
+ * We'll detect a huge imbalance and bail there.
*/
if (!cpus)
return;
@@ -1497,9 +1491,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
capacity = cpus / smt; /* cores */
- ns->task_capacity = min_t(unsigned, capacity,
+ capacity = min_t(unsigned, capacity,
DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
- ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
struct task_numa_env {
@@ -1548,28 +1541,12 @@ static bool load_too_imbalanced(long src_load, long dst_load,
src_capacity = env->src_stats.compute_capacity;
dst_capacity = env->dst_stats.compute_capacity;
- /* We care about the slope of the imbalance, not the direction. */
- if (dst_load < src_load)
- swap(dst_load, src_load);
+ imb = abs(dst_load * src_capacity - src_load * dst_capacity);
- /* Is the difference below the threshold? */
- imb = dst_load * src_capacity * 100 -
- src_load * dst_capacity * env->imbalance_pct;
- if (imb <= 0)
- return false;
-
- /*
- * The imbalance is above the allowed threshold.
- * Compare it with the old imbalance.
- */
orig_src_load = env->src_stats.load;
orig_dst_load = env->dst_stats.load;
- if (orig_dst_load < orig_src_load)
- swap(orig_dst_load, orig_src_load);
-
- old_imb = orig_dst_load * src_capacity * 100 -
- orig_src_load * dst_capacity * env->imbalance_pct;
+ old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
/* Would this change make things worse? */
return (imb > old_imb);
@@ -1582,9 +1559,8 @@ static bool load_too_imbalanced(long src_load, long dst_load,
* be exchanged with the source task
*/
static void task_numa_compare(struct task_numa_env *env,
- long taskimp, long groupimp)
+ long taskimp, long groupimp, bool maymove)
{
- struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
long src_load, dst_load;
@@ -1605,97 +1581,73 @@ static void task_numa_compare(struct task_numa_env *env,
if (cur == env->p)
goto unlock;
+ if (!cur) {
+ if (maymove || imp > env->best_imp)
+ goto assign;
+ else
+ goto unlock;
+ }
+
/*
* "imp" is the fault differential for the source task between the
* source and destination node. Calculate the total differential for
* the source task and potential destination task. The more negative
- * the value is, the more rmeote accesses that would be expected to
+ * the value is, the more remote accesses that would be expected to
* be incurred if the tasks were swapped.
*/
- if (cur) {
- /* Skip this swap candidate if cannot move to the source CPU: */
- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
- goto unlock;
+ /* Skip this swap candidate if cannot move to the source cpu */
+ if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+ goto unlock;
+ /*
+ * If dst and source tasks are in the same NUMA group, or not
+ * in any group then look only at task weights.
+ */
+ if (cur->numa_group == env->p->numa_group) {
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
/*
- * If dst and source tasks are in the same NUMA group, or not
- * in any group then look only at task weights.
+ * Add some hysteresis to prevent swapping the
+ * tasks within a group over tiny differences.
*/
- if (cur->numa_group == env->p->numa_group) {
- imp = taskimp + task_weight(cur, env->src_nid, dist) -
- task_weight(cur, env->dst_nid, dist);
- /*
- * Add some hysteresis to prevent swapping the
- * tasks within a group over tiny differences.
- */
- if (cur->numa_group)
- imp -= imp/16;
- } else {
- /*
- * Compare the group weights. If a task is all by
- * itself (not part of a group), use the task weight
- * instead.
- */
- if (cur->numa_group)
- imp += group_weight(cur, env->src_nid, dist) -
- group_weight(cur, env->dst_nid, dist);
- else
- imp += task_weight(cur, env->src_nid, dist) -
- task_weight(cur, env->dst_nid, dist);
- }
+ if (cur->numa_group)
+ imp -= imp / 16;
+ } else {
+ /*
+ * Compare the group weights. If a task is all by itself
+ * (not part of a group), use the task weight instead.
+ */
+ if (cur->numa_group && env->p->numa_group)
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
+ else
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
}
- if (imp <= env->best_imp && moveimp <= env->best_imp)
+ if (imp <= env->best_imp)
goto unlock;
- if (!cur) {
- /* Is there capacity at our destination? */
- if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
- !env->dst_stats.has_free_capacity)
- goto unlock;
-
- goto balance;
- }
-
- /* Balance doesn't matter much if we're running a task per CPU: */
- if (imp > env->best_imp && src_rq->nr_running == 1 &&
- dst_rq->nr_running == 1)
+ if (maymove && moveimp > imp && moveimp > env->best_imp) {
+ imp = moveimp - 1;
+ cur = NULL;
goto assign;
+ }
/*
* In the overloaded case, try and keep the load balanced.
*/
-balance:
- load = task_h_load(env->p);
+ load = task_h_load(env->p) - task_h_load(cur);
+ if (!load)
+ goto assign;
+
dst_load = env->dst_stats.load + load;
src_load = env->src_stats.load - load;
- if (moveimp > imp && moveimp > env->best_imp) {
- /*
- * If the improvement from just moving env->p direction is
- * better than swapping tasks around, check if a move is
- * possible. Store a slightly smaller score than moveimp,
- * so an actually idle CPU will win.
- */
- if (!load_too_imbalanced(src_load, dst_load, env)) {
- imp = moveimp - 1;
- cur = NULL;
- goto assign;
- }
- }
-
- if (imp <= env->best_imp)
- goto unlock;
-
- if (cur) {
- load = task_h_load(cur);
- dst_load -= load;
- src_load += load;
- }
-
if (load_too_imbalanced(src_load, dst_load, env))
goto unlock;
+assign:
/*
* One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one.
@@ -1711,7 +1663,6 @@ balance:
local_irq_enable();
}
-assign:
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
@@ -1720,43 +1671,30 @@ unlock:
static void task_numa_find_cpu(struct task_numa_env *env,
long taskimp, long groupimp)
{
+ long src_load, dst_load, load;
+ bool maymove = false;
int cpu;
+ load = task_h_load(env->p);
+ dst_load = env->dst_stats.load + load;
+ src_load = env->src_stats.load - load;
+
+ /*
+ * If the improvement from just moving env->p direction is better
+ * than swapping tasks around, check if a move is possible.
+ */
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
+
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
continue;
env->dst_cpu = cpu;
- task_numa_compare(env, taskimp, groupimp);
+ task_numa_compare(env, taskimp, groupimp, maymove);
}
}
-/* Only move tasks to a NUMA node less busy than the current node. */
-static bool numa_has_capacity(struct task_numa_env *env)
-{
- struct numa_stats *src = &env->src_stats;
- struct numa_stats *dst = &env->dst_stats;
-
- if (src->has_free_capacity && !dst->has_free_capacity)
- return false;
-
- /*
- * Only consider a task move if the source has a higher load
- * than the destination, corrected for CPU capacity on each node.
- *
- * src->load dst->load
- * --------------------- vs ---------------------
- * src->compute_capacity dst->compute_capacity
- */
- if (src->load * dst->compute_capacity * env->imbalance_pct >
-
- dst->load * src->compute_capacity * 100)
- return true;
-
- return false;
-}
-
static int task_numa_migrate(struct task_struct *p)
{
struct task_numa_env env = {
@@ -1797,7 +1735,7 @@ static int task_numa_migrate(struct task_struct *p)
* elsewhere, so there is no point in (re)trying.
*/
if (unlikely(!sd)) {
- p->numa_preferred_nid = task_node(p);
+ sched_setnuma(p, task_node(p));
return -EINVAL;
}
@@ -1811,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p)
update_numa_stats(&env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
- if (numa_has_capacity(&env))
- task_numa_find_cpu(&env, taskimp, groupimp);
+ task_numa_find_cpu(&env, taskimp, groupimp);
/*
* Look at other nodes in these cases:
@@ -1842,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
- if (numa_has_capacity(&env))
- task_numa_find_cpu(&env, taskimp, groupimp);
+ task_numa_find_cpu(&env, taskimp, groupimp);
}
}
@@ -1856,15 +1792,13 @@ static int task_numa_migrate(struct task_struct *p)
* trying for a better one later. Do not set the preferred node here.
*/
if (p->numa_group) {
- struct numa_group *ng = p->numa_group;
-
if (env.best_cpu == -1)
nid = env.src_nid;
else
- nid = env.dst_nid;
+ nid = cpu_to_node(env.best_cpu);
- if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
- sched_setnuma(p, env.dst_nid);
+ if (nid != p->numa_preferred_nid)
+ sched_setnuma(p, nid);
}
/* No better CPU than the current one was found. */
@@ -1884,7 +1818,8 @@ static int task_numa_migrate(struct task_struct *p)
return ret;
}
- ret = migrate_swap(p, env.best_task);
+ ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
put_task_struct(env.best_task);
@@ -2144,8 +2079,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
static void task_numa_placement(struct task_struct *p)
{
- int seq, nid, max_nid = -1, max_group_nid = -1;
- unsigned long max_faults = 0, max_group_faults = 0;
+ int seq, nid, max_nid = -1;
+ unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
u64 runtime, period;
@@ -2224,33 +2159,30 @@ static void task_numa_placement(struct task_struct *p)
}
}
- if (faults > max_faults) {
- max_faults = faults;
+ if (!p->numa_group) {
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_nid = nid;
+ }
+ } else if (group_faults > max_faults) {
+ max_faults = group_faults;
max_nid = nid;
}
-
- if (group_faults > max_group_faults) {
- max_group_faults = group_faults;
- max_group_nid = nid;
- }
}
- update_task_scan_period(p, fault_types[0], fault_types[1]);
-
if (p->numa_group) {
numa_group_count_active_nodes(p->numa_group);
spin_unlock_irq(group_lock);
- max_nid = preferred_group_nid(p, max_group_nid);
+ max_nid = preferred_group_nid(p, max_nid);
}
if (max_faults) {
/* Set the new preferred node */
if (max_nid != p->numa_preferred_nid)
sched_setnuma(p, max_nid);
-
- if (task_node(p) != p->numa_preferred_nid)
- numa_migrate_preferred(p);
}
+
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
}
static inline int get_numa_group(struct numa_group *grp)
@@ -2450,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
numa_is_active_node(mem_node, ng))
local = 1;
- task_numa_placement(p);
-
/*
* Retry task to preferred node migration periodically, in case it
* case it previously failed, or the scheduler moved us.
*/
- if (time_after(jiffies, p->numa_migrate_retry))
+ if (time_after(jiffies, p->numa_migrate_retry)) {
+ task_numa_placement(p);
numa_migrate_preferred(p);
+ }
if (migrated)
p->numa_pages_migrated += pages;
@@ -2749,19 +2681,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
} while (0)
#ifdef CONFIG_SMP
-/*
- * XXX we want to get rid of these helpers and use the full load resolution.
- */
-static inline long se_weight(struct sched_entity *se)
-{
- return scale_load_down(se->load.weight);
-}
-
-static inline long se_runnable(struct sched_entity *se)
-{
- return scale_load_down(se->runnable_weight);
-}
-
static inline void
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -3062,314 +2981,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
}
#ifdef CONFIG_SMP
-/*
- * Approximate:
- * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
- */
-static u64 decay_load(u64 val, u64 n)
-{
- unsigned int local_n;
-
- if (unlikely(n > LOAD_AVG_PERIOD * 63))
- return 0;
-
- /* after bounds checking we can collapse to 32-bit */
- local_n = n;
-
- /*
- * As y^PERIOD = 1/2, we can combine
- * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
- * With a look-up table which covers y^n (n<PERIOD)
- *
- * To achieve constant time decay_load.
- */
- if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
- val >>= local_n / LOAD_AVG_PERIOD;
- local_n %= LOAD_AVG_PERIOD;
- }
-
- val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
- return val;
-}
-
-static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
-{
- u32 c1, c2, c3 = d3; /* y^0 == 1 */
-
- /*
- * c1 = d1 y^p
- */
- c1 = decay_load((u64)d1, periods);
-
- /*
- * p-1
- * c2 = 1024 \Sum y^n
- * n=1
- *
- * inf inf
- * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
- * n=0 n=p
- */
- c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
-
- return c1 + c2 + c3;
-}
-
-/*
- * Accumulate the three separate parts of the sum; d1 the remainder
- * of the last (incomplete) period, d2 the span of full periods and d3
- * the remainder of the (incomplete) current period.
- *
- * d1 d2 d3
- * ^ ^ ^
- * | | |
- * |<->|<----------------->|<--->|
- * ... |---x---|------| ... |------|-----x (now)
- *
- * p-1
- * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
- * n=1
- *
- * = u y^p + (Step 1)
- *
- * p-1
- * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
- * n=1
- */
-static __always_inline u32
-accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
- unsigned long load, unsigned long runnable, int running)
-{
- unsigned long scale_freq, scale_cpu;
- u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
- u64 periods;
-
- scale_freq = arch_scale_freq_capacity(cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
- delta += sa->period_contrib;
- periods = delta / 1024; /* A period is 1024us (~1ms) */
-
- /*
- * Step 1: decay old *_sum if we crossed period boundaries.
- */
- if (periods) {
- sa->load_sum = decay_load(sa->load_sum, periods);
- sa->runnable_load_sum =
- decay_load(sa->runnable_load_sum, periods);
- sa->util_sum = decay_load((u64)(sa->util_sum), periods);
-
- /*
- * Step 2
- */
- delta %= 1024;
- contrib = __accumulate_pelt_segments(periods,
- 1024 - sa->period_contrib, delta);
- }
- sa->period_contrib = delta;
-
- contrib = cap_scale(contrib, scale_freq);
- if (load)
- sa->load_sum += load * contrib;
- if (runnable)
- sa->runnable_load_sum += runnable * contrib;
- if (running)
- sa->util_sum += contrib * scale_cpu;
-
- return periods;
-}
-
-/*
- * We can represent the historical contribution to runnable average as the
- * coefficients of a geometric series. To do this we sub-divide our runnable
- * history into segments of approximately 1ms (1024us); label the segment that
- * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
- *
- * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
- * p0 p1 p2
- * (now) (~1ms ago) (~2ms ago)
- *
- * Let u_i denote the fraction of p_i that the entity was runnable.
- *
- * We then designate the fractions u_i as our co-efficients, yielding the
- * following representation of historical load:
- * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
- *
- * We choose y based on the with of a reasonably scheduling period, fixing:
- * y^32 = 0.5
- *
- * This means that the contribution to load ~32ms ago (u_32) will be weighted
- * approximately half as much as the contribution to load within the last ms
- * (u_0).
- *
- * When a period "rolls over" and we have new u_0`, multiplying the previous
- * sum again by y is sufficient to update:
- * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
- * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
- */
-static __always_inline int
-___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
- unsigned long load, unsigned long runnable, int running)
-{
- u64 delta;
-
- delta = now - sa->last_update_time;
- /*
- * This should only happen when time goes backwards, which it
- * unfortunately does during sched clock init when we swap over to TSC.
- */
- if ((s64)delta < 0) {
- sa->last_update_time = now;
- return 0;
- }
-
- /*
- * Use 1024ns as the unit of measurement since it's a reasonable
- * approximation of 1us and fast to compute.
- */
- delta >>= 10;
- if (!delta)
- return 0;
-
- sa->last_update_time += delta << 10;
-
- /*
- * running is a subset of runnable (weight) so running can't be set if
- * runnable is clear. But there are some corner cases where the current
- * se has been already dequeued but cfs_rq->curr still points to it.
- * This means that weight will be 0 but not running for a sched_entity
- * but also for a cfs_rq if the latter becomes idle. As an example,
- * this happens during idle_balance() which calls
- * update_blocked_averages()
- */
- if (!load)
- runnable = running = 0;
-
- /*
- * Now we know we crossed measurement unit boundaries. The *_avg
- * accrues by two steps:
- *
- * Step 1: accumulate *_sum since last_update_time. If we haven't
- * crossed period boundaries, finish.
- */
- if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
- return 0;
-
- return 1;
-}
-
-static __always_inline void
-___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
-{
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
-
- /*
- * Step 2: update *_avg.
- */
- sa->load_avg = div_u64(load * sa->load_sum, divider);
- sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
- sa->util_avg = sa->util_sum / divider;
-}
-
-/*
- * When a task is dequeued, its estimated utilization should not be update if
- * its util_avg has not been updated at least once.
- * This flag is used to synchronize util_avg updates with util_est updates.
- * We map this information into the LSB bit of the utilization saved at
- * dequeue time (i.e. util_est.dequeued).
- */
-#define UTIL_AVG_UNCHANGED 0x1
-
-static inline void cfs_se_util_change(struct sched_avg *avg)
-{
- unsigned int enqueued;
-
- if (!sched_feat(UTIL_EST))
- return;
-
- /* Avoid store if the flag has been already set */
- enqueued = avg->util_est.enqueued;
- if (!(enqueued & UTIL_AVG_UNCHANGED))
- return;
-
- /* Reset flag to report util_avg has been updated */
- enqueued &= ~UTIL_AVG_UNCHANGED;
- WRITE_ONCE(avg->util_est.enqueued, enqueued);
-}
-
-/*
- * sched_entity:
- *
- * task:
- * se_runnable() == se_weight()
- *
- * group: [ see update_cfs_group() ]
- * se_weight() = tg->weight * grq->load_avg / tg->load_avg
- * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
- *
- * load_sum := runnable_sum
- * load_avg = se_weight(se) * runnable_avg
- *
- * runnable_load_sum := runnable_sum
- * runnable_load_avg = se_runnable(se) * runnable_avg
- *
- * XXX collapse load_sum and runnable_load_sum
- *
- * cfq_rs:
- *
- * load_sum = \Sum se_weight(se) * se->avg.load_sum
- * load_avg = \Sum se->avg.load_avg
- *
- * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
- * runnable_load_avg = \Sum se->avg.runable_load_avg
- */
-
-static int
-__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
-{
- if (entity_is_task(se))
- se->runnable_weight = se->load.weight;
-
- if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
- ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
- return 1;
- }
-
- return 0;
-}
-
-static int
-__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- if (entity_is_task(se))
- se->runnable_weight = se->load.weight;
-
- if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
- cfs_rq->curr == se)) {
-
- ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
- cfs_se_util_change(&se->avg);
- return 1;
- }
-
- return 0;
-}
-
-static int
-__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
-{
- if (___update_load_sum(now, cpu, &cfs_rq->avg,
- scale_load_down(cfs_rq->load.weight),
- scale_load_down(cfs_rq->runnable_weight),
- cfs_rq->curr != NULL)) {
-
- ___update_load_avg(&cfs_rq->avg, 1, 1);
- return 1;
- }
-
- return 0;
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
* update_tg_load_avg - update the tg's load avg
@@ -4037,12 +3648,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
#else /* CONFIG_SMP */
-static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
-{
- return 0;
-}
-
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
#define DO_ATTACH 0x0
@@ -4726,7 +4331,6 @@ static inline int throttled_lb_pair(struct task_group *tg,
throttled_hierarchy(dest_cfs_rq);
}
-/* updated child weight may affect parent so we have to do this bottom up */
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
struct rq *rq = data;
@@ -5653,8 +5257,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
}
-
- sched_avg_update(this_rq);
}
/* Used instead of source_load when we know the type == 0 */
@@ -7294,8 +6896,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
- unsigned long src_faults, dst_faults;
- int src_nid, dst_nid;
+ unsigned long src_weight, dst_weight;
+ int src_nid, dst_nid, dist;
if (!static_branch_likely(&sched_numa_balancing))
return -1;
@@ -7322,18 +6924,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
return 0;
/* Leaving a core idle is often worse than degrading locality. */
- if (env->idle != CPU_NOT_IDLE)
+ if (env->idle == CPU_IDLE)
return -1;
+ dist = node_distance(src_nid, dst_nid);
if (numa_group) {
- src_faults = group_faults(p, src_nid);
- dst_faults = group_faults(p, dst_nid);
+ src_weight = group_weight(p, src_nid, dist);
+ dst_weight = group_weight(p, dst_nid, dist);
} else {
- src_faults = task_faults(p, src_nid);
- dst_faults = task_faults(p, dst_nid);
+ src_weight = task_weight(p, src_nid, dist);
+ dst_weight = task_weight(p, dst_nid, dist);
}
- return dst_faults < src_faults;
+ return dst_weight < src_weight;
}
#else
@@ -7620,6 +7223,22 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
return false;
}
+static inline bool others_have_blocked(struct rq *rq)
+{
+ if (READ_ONCE(rq->avg_rt.util_avg))
+ return true;
+
+ if (READ_ONCE(rq->avg_dl.util_avg))
+ return true;
+
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ if (READ_ONCE(rq->avg_irq.util_avg))
+ return true;
+#endif
+
+ return false;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7679,6 +7298,12 @@ static void update_blocked_averages(int cpu)
if (cfs_rq_has_blocked(cfs_rq))
done = false;
}
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_irq_load_avg(rq, 0);
+ /* Don't need periodic decay once load/util_avg are null */
+ if (others_have_blocked(rq))
+ done = false;
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
@@ -7744,9 +7369,12 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
- if (!cfs_rq_has_blocked(cfs_rq))
+ if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
rq->has_blocked_load = 0;
#endif
rq_unlock_irqrestore(rq, &rf);
@@ -7856,39 +7484,32 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, used, age_stamp, avg;
- s64 delta;
+ unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+ unsigned long used, free;
+ unsigned long irq;
- /*
- * Since we're reading these variables without serialization make sure
- * we read them once before doing sanity checks on them.
- */
- age_stamp = READ_ONCE(rq->age_stamp);
- avg = READ_ONCE(rq->rt_avg);
- delta = __rq_clock_broken(rq) - age_stamp;
+ irq = cpu_util_irq(rq);
- if (unlikely(delta < 0))
- delta = 0;
+ if (unlikely(irq >= max))
+ return 1;
- total = sched_avg_period() + delta;
+ used = READ_ONCE(rq->avg_rt.util_avg);
+ used += READ_ONCE(rq->avg_dl.util_avg);
- used = div_u64(avg, total);
+ if (unlikely(used >= max))
+ return 1;
- if (likely(used < SCHED_CAPACITY_SCALE))
- return SCHED_CAPACITY_SCALE - used;
+ free = max - used;
- return 1;
+ return scale_irq_capacity(free, irq, max);
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
+ unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
- cpu_rq(cpu)->cpu_capacity_orig = capacity;
-
- capacity *= scale_rt_capacity(cpu);
- capacity >>= SCHED_CAPACITY_SHIFT;
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
if (!capacity)
capacity = 1;
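
A worked example (made-up numbers) of the rewritten capacity path above: the RT, DL and IRQ PELT signals replace the old rt_avg/age_stamp estimate when computing the capacity left for CFS:

/* Hypothetical helper; every number here is illustrative only. */
static unsigned long demo_cfs_capacity(void)
{
	unsigned long max = 1024;	/* arch_scale_cpu_capacity() */
	unsigned long irq = 128;	/* rq->avg_irq.util_avg      */
	unsigned long rt  = 200;	/* rq->avg_rt.util_avg       */
	unsigned long dl  = 56;		/* rq->avg_dl.util_avg       */
	unsigned long free;

	if (irq >= max || rt + dl >= max)
		return 1;			/* practically no CFS capacity left */

	free = max - (rt + dl);			/* 1024 - 256       = 768           */
	return free * (max - irq) / max;	/* 768 * 896 / 1024 = 672           */
}
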
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
new file mode 100644
index 000000000000..35475c0c5419
--- /dev/null
+++ b/kernel/sched/pelt.c
@@ -0,0 +1,399 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Per Entity Load Tracking
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Interactivity improvements by Mike Galbraith
+ * (C) 2007 Mike Galbraith <efault@gmx.de>
+ *
+ * Various enhancements by Dmitry Adamushko.
+ * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
+ *
+ * Group scheduling enhancements by Srivatsa Vaddagiri
+ * Copyright IBM Corporation, 2007
+ * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
+ *
+ * Scaled math optimizations by Thomas Gleixner
+ * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * Move PELT related code from fair.c into this pelt.c file
+ * Author: Vincent Guittot <vincent.guittot@linaro.org>
+ */
+
+#include <linux/sched.h>
+#include "sched.h"
+#include "sched-pelt.h"
+#include "pelt.h"
+
+/*
+ * Approximate:
+ * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static u64 decay_load(u64 val, u64 n)
+{
+ unsigned int local_n;
+
+ if (unlikely(n > LOAD_AVG_PERIOD * 63))
+ return 0;
+
+ /* after bounds checking we can collapse to 32-bit */
+ local_n = n;
+
+ /*
+ * As y^PERIOD = 1/2, we can combine
+ * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
+ * With a look-up table which covers y^n (n<PERIOD)
+ *
+ * To achieve constant time decay_load.
+ */
+ if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+ val >>= local_n / LOAD_AVG_PERIOD;
+ local_n %= LOAD_AVG_PERIOD;
+ }
+
+ val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+ return val;
+}
+
+static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
+{
+ u32 c1, c2, c3 = d3; /* y^0 == 1 */
+
+ /*
+ * c1 = d1 y^p
+ */
+ c1 = decay_load((u64)d1, periods);
+
+ /*
+ * p-1
+ * c2 = 1024 \Sum y^n
+ * n=1
+ *
+ * inf inf
+ * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
+ * n=0 n=p
+ */
+ c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
+
+ return c1 + c2 + c3;
+}
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+/*
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ * d1 d2 d3
+ * ^ ^ ^
+ * | | |
+ * |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ * p-1
+ * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
+ * n=1
+ *
+ * = u y^p + (Step 1)
+ *
+ * p-1
+ * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
+ * n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+ unsigned long load, unsigned long runnable, int running)
+{
+ unsigned long scale_freq, scale_cpu;
+ u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
+ u64 periods;
+
+ scale_freq = arch_scale_freq_capacity(cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ delta += sa->period_contrib;
+ periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+ /*
+ * Step 1: decay old *_sum if we crossed period boundaries.
+ */
+ if (periods) {
+ sa->load_sum = decay_load(sa->load_sum, periods);
+ sa->runnable_load_sum =
+ decay_load(sa->runnable_load_sum, periods);
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+
+ /*
+ * Step 2
+ */
+ delta %= 1024;
+ contrib = __accumulate_pelt_segments(periods,
+ 1024 - sa->period_contrib, delta);
+ }
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (load)
+ sa->load_sum += load * contrib;
+ if (runnable)
+ sa->runnable_load_sum += runnable * contrib;
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+
+ return periods;
+}
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the with of a reasonably scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int
+___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long load, unsigned long runnable, int running)
+{
+ u64 delta;
+
+ delta = now - sa->last_update_time;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_update_time = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+
+ sa->last_update_time += delta << 10;
+
+ /*
+	 * running is a subset of runnable (weight) so running can't be set if
+	 * runnable is clear. But there are some corner cases where the current
+	 * se has already been dequeued while cfs_rq->curr still points to it.
+	 * This means that weight will be 0 while running is still set, for a
+	 * sched_entity but also for a cfs_rq if the latter becomes idle. As an
+	 * example, this happens during idle_balance(), which calls
+	 * update_blocked_averages().
+ */
+ if (!load)
+ runnable = running = 0;
+
+ /*
+ * Now we know we crossed measurement unit boundaries. The *_avg
+ * accrues by two steps:
+ *
+ * Step 1: accumulate *_sum since last_update_time. If we haven't
+ * crossed period boundaries, finish.
+ */
+ if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
+ return 0;
+
+ return 1;
+}
+
+static __always_inline void
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
+{
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
+ /*
+ * Step 2: update *_avg.
+ */
+ sa->load_avg = div_u64(load * sa->load_sum, divider);
+ sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
+ WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
+}
+
+/*
+ * sched_entity:
+ *
+ * task:
+ * se_runnable() == se_weight()
+ *
+ * group: [ see update_cfs_group() ]
+ * se_weight() = tg->weight * grq->load_avg / tg->load_avg
+ * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
+ * load_sum := runnable_sum
+ * load_avg = se_weight(se) * runnable_avg
+ *
+ * runnable_load_sum := runnable_sum
+ * runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ * XXX collapse load_sum and runnable_load_sum
+ *
+ * cfs_rq:
+ *
+ * load_sum = \Sum se_weight(se) * se->avg.load_sum
+ * load_avg = \Sum se->avg.load_avg
+ *
+ * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ * runnable_load_avg = \Sum se->avg.runnable_load_avg
+ */
+
+int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+{
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ return 1;
+ }
+
+ return 0;
+}
+
+int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (entity_is_task(se))
+ se->runnable_weight = se->load.weight;
+
+ if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+ cfs_rq->curr == se)) {
+
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ cfs_se_util_change(&se->avg);
+ return 1;
+ }
+
+ return 0;
+}
+
+int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+{
+ if (___update_load_sum(now, cpu, &cfs_rq->avg,
+ scale_load_down(cfs_rq->load.weight),
+ scale_load_down(cfs_rq->runnable_weight),
+ cfs_rq->curr != NULL)) {
+
+ ___update_load_avg(&cfs_rq->avg, 1, 1);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * rt_rq:
+ *
+ * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
+ * util_sum = cpu_scale * load_sum
+ * runnable_load_sum = load_sum
+ *
+ * load_avg and runnable_load_avg are not supported and meaningless.
+ *
+ */
+
+int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
+{
+ if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
+ running,
+ running,
+ running)) {
+
+ ___update_load_avg(&rq->avg_rt, 1, 1);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * dl_rq:
+ *
+ * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
+ * util_sum = cpu_scale * load_sum
+ * runnable_load_sum = load_sum
+ *
+ */
+
+int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
+{
+ if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
+ running,
+ running,
+ running)) {
+
+ ___update_load_avg(&rq->avg_dl, 1, 1);
+ return 1;
+ }
+
+ return 0;
+}
+
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+/*
+ * irq:
+ *
+ * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
+ * util_sum = cpu_scale * load_sum
+ * runnable_load_sum = load_sum
+ *
+ */
+
+int update_irq_load_avg(struct rq *rq, u64 running)
+{
+ int ret = 0;
+ /*
+	 * We know how much time has been spent in interrupt context since the
+	 * last update, but not when it happened. Be pessimistic and assume the
+	 * interrupt happened just before this update. This is not far from
+	 * reality because the interrupt will most probably wake up a task and
+	 * trigger an update of the rq clock, during which the metric is updated.
+	 * We first decay with the normal context time and then add the
+	 * interrupt context time.
+	 * We can safely subtract running from rq->clock because
+	 * rq->clock += delta with delta >= running
+ */
+ ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
+ 0,
+ 0,
+ 0);
+ ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
+ 1,
+ 1,
+ 1);
+
+ if (ret)
+ ___update_load_avg(&rq->avg_irq, 1, 1);
+
+ return ret;
+}
+#endif
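The decay and accumulation rules in the comments above are easier to check outside the kernel's fixed-point code. Below is a minimal user-space sketch, in plain C with floating point, of the same update rule; PERIOD, UNIT, decay() and the loop are illustrative stand-ins, not the kernel's LOAD_AVG_PERIOD machinery. Each ~1ms period contributes up to 1024 units, all history decays by y per period with y^32 = 0.5, and y^n is split as (1/2)^(n/32) * y^(n%32) so only exponents below the half-life are needed.

#include <math.h>
#include <stdio.h>

#define PERIOD	32		/* half-life of the average, in 1024us periods */
#define UNIT	1024.0		/* maximum contribution of one period */

/* Split y^n = (1/2)^(n/PERIOD) * y^(n%PERIOD); the kernel uses a LUT here. */
static double decay(double val, unsigned int n)
{
	double y = pow(0.5, 1.0 / PERIOD);

	return val * pow(0.5, (double)(n / PERIOD)) * pow(y, (double)(n % PERIOD));
}

int main(void)
{
	double sum = 0.0;
	int i;

	/* an always-runnable entity: decay one period, add a full contribution */
	for (i = 0; i < 1000; i++)
		sum = decay(sum, 1) + UNIT;

	printf("saturated sum ~= %.0f\n", sum);			/* ~= 47788 */
	printf("after 32 idle periods: %.0f\n", decay(sum, 32));	/* half of that */
	return 0;
}

In floating point the always-runnable sum converges to about 1024/(1 - y), roughly 47.8k, in line with the kernel's fixed-point LOAD_AVG_MAX of 47742, and 32 fully idle periods halve it (build with -lm).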
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
new file mode 100644
index 000000000000..d2894db28955
--- /dev/null
+++ b/kernel/sched/pelt.h
@@ -0,0 +1,72 @@
+#ifdef CONFIG_SMP
+
+int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
+int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
+int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
+int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
+int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
+
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+int update_irq_load_avg(struct rq *rq, u64 running);
+#else
+static inline int
+update_irq_load_avg(struct rq *rq, u64 running)
+{
+ return 0;
+}
+#endif
+
+/*
+ * When a task is dequeued, its estimated utilization should not be updated if
+ * its util_avg has not been updated at least once.
+ * This flag is used to synchronize util_avg updates with util_est updates.
+ * We map this information into the LSB bit of the utilization saved at
+ * dequeue time (i.e. util_est.dequeued).
+ */
+#define UTIL_AVG_UNCHANGED 0x1
+
+static inline void cfs_se_util_change(struct sched_avg *avg)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+	/* Avoid the store if the flag has already been set */
+ enqueued = avg->util_est.enqueued;
+ if (!(enqueued & UTIL_AVG_UNCHANGED))
+ return;
+
+ /* Reset flag to report util_avg has been updated */
+ enqueued &= ~UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(avg->util_est.enqueued, enqueued);
+}
+
+#else
+
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static inline int
+update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
+{
+ return 0;
+}
+
+static inline int
+update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
+{
+ return 0;
+}
+
+static inline int
+update_irq_load_avg(struct rq *rq, u64 running)
+{
+ return 0;
+}
+#endif
+
+
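The UTIL_AVG_UNCHANGED scheme above packs a one-bit "util_avg not yet updated" flag into bit 0 of the stored utilization, trading the low bit of resolution for a free synchronization flag. A tiny hedged sketch of the same packing, with an illustrative value rather than the kernel's util_est fields:

#include <stdio.h>

#define UTIL_AVG_UNCHANGED 0x1

int main(void)
{
	/* value saved at dequeue time, with the "unchanged" flag set in bit 0 */
	unsigned int enqueued = (300U & ~UTIL_AVG_UNCHANGED) | UTIL_AVG_UNCHANGED;

	printf("util=%u unchanged=%u\n",
	       enqueued & ~UTIL_AVG_UNCHANGED, enqueued & UTIL_AVG_UNCHANGED);

	/* first util_avg update since enqueue: clear the flag, exactly once */
	if (enqueued & UTIL_AVG_UNCHANGED)
		enqueued &= ~UTIL_AVG_UNCHANGED;

	printf("util=%u unchanged=%u\n",
	       enqueued & ~UTIL_AVG_UNCHANGED, enqueued & UTIL_AVG_UNCHANGED);
	return 0;
}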
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index eaaec8364f96..2e2955a8cf8f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -5,6 +5,8 @@
*/
#include "sched.h"
+#include "pelt.h"
+
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
@@ -973,8 +975,6 @@ static void update_curr_rt(struct rq *rq)
curr->se.exec_start = now;
cgroup_account_cputime(curr, delta_exec);
- sched_rt_avg_update(rq, delta_exec);
-
if (!rt_bandwidth_enabled())
return;
@@ -1578,6 +1578,14 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rt_queue_push_tasks(rq);
+ /*
+	 * If the previous task was rt, put_prev_task() has already updated the
+	 * utilization. We only care about the case where we start to schedule
+	 * an rt task.
+ */
+ if (rq->curr->sched_class != &rt_sched_class)
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+
return p;
}
@@ -1585,6 +1593,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
update_curr_rt(rq);
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+
/*
* The previous task needs to be made eligible for pushing
* if it is still active
@@ -2314,6 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
+ update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
watchdog(rq, p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c7742dcc136c..4a2e8cae63c4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -594,6 +594,7 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
+
#endif /* CONFIG_SMP */
int rt_queued;
@@ -673,7 +674,26 @@ struct dl_rq {
u64 bw_ratio;
};
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* An entity is a task if it doesn't "own" a runqueue */
+#define entity_is_task(se) (!se->my_q)
+#else
+#define entity_is_task(se) 1
+#endif
+
#ifdef CONFIG_SMP
+/*
+ * XXX we want to get rid of these helpers and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+ return scale_load_down(se->load.weight);
+}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ return scale_load_down(se->runnable_weight);
+}
static inline bool sched_asym_prefer(int a, int b)
{
@@ -833,8 +853,12 @@ struct rq {
struct list_head cfs_tasks;
- u64 rt_avg;
- u64 age_stamp;
+ struct sched_avg avg_rt;
+ struct sched_avg avg_dl;
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#define HAVE_SCHED_AVG_IRQ
+ struct sched_avg avg_irq;
+#endif
u64 idle_stamp;
u64 avg_idle;
@@ -1075,7 +1099,8 @@ enum numa_faults_stats {
};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
-extern int migrate_swap(struct task_struct *, struct task_struct *);
+extern int migrate_swap(struct task_struct *p, struct task_struct *t,
+ int cpu, int scpu);
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
#else
static inline void
@@ -1690,15 +1715,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
-extern const_debug unsigned int sysctl_sched_time_avg;
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
-static inline u64 sched_avg_period(void)
-{
- return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
-}
-
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1735,8 +1754,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
#endif
#ifdef CONFIG_SMP
-extern void sched_avg_update(struct rq *rq);
-
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -1747,12 +1764,6 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
return SCHED_CAPACITY_SCALE;
}
#endif
-
-static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
-{
- rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
- sched_avg_update(rq);
-}
#else
#ifndef arch_scale_cpu_capacity
static __always_inline
@@ -1761,8 +1772,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
return SCHED_CAPACITY_SCALE;
}
#endif
-static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
-static inline void sched_avg_update(struct rq *rq) { }
#endif
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -2177,11 +2186,16 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
-static inline unsigned long cpu_util_dl(struct rq *rq)
+static inline unsigned long cpu_bw_dl(struct rq *rq)
{
return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}
+static inline unsigned long cpu_util_dl(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_dl.util_avg);
+}
+
static inline unsigned long cpu_util_cfs(struct rq *rq)
{
unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
@@ -2193,4 +2207,37 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
return util;
}
+
+static inline unsigned long cpu_util_rt(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_rt.util_avg);
+}
+#endif
+
+#ifdef HAVE_SCHED_AVG_IRQ
+static inline unsigned long cpu_util_irq(struct rq *rq)
+{
+ return rq->avg_irq.util_avg;
+}
+
+static inline
+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
+{
+ util *= (max - irq);
+ util /= max;
+
+ return util;
+
+}
+#else
+static inline unsigned long cpu_util_irq(struct rq *rq)
+{
+ return 0;
+}
+
+static inline
+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
+{
+ return util;
+}
#endif
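scale_irq_capacity() above is plain proportional scaling: utilization measured while interrupts took part of the CPU is reduced by the fraction of capacity that remains, (max - irq) / max. A quick user-space sketch with made-up numbers, where 1024 stands in for SCHED_CAPACITY_SCALE:

#include <stdio.h>

static unsigned long scale_irq_capacity(unsigned long util,
					unsigned long irq,
					unsigned long max)
{
	/* keep only the (max - irq)/max fraction of the measured utilization */
	util *= (max - irq);
	util /= max;
	return util;
}

int main(void)
{
	unsigned long max = 1024;	/* stand-in for SCHED_CAPACITY_SCALE */
	unsigned long irq = 128;	/* ~12.5% of the CPU spent in irq */
	unsigned long util = 512;	/* cfs+rt+dl utilization, pre-scaling */

	printf("scaled util = %lu\n", scale_irq_capacity(util, irq, max));
	return 0;
}

With 128/1024 (~12.5%) of the CPU spent in irq, a utilization of 512 scales down to 448.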
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index b6fb2c3b3ff7..66b59ac77c22 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_locked);
-void swake_up(struct swait_queue_head *q)
+void swake_up_one(struct swait_queue_head *q)
{
unsigned long flags;
@@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q)
swake_up_locked(q);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
-EXPORT_SYMBOL(swake_up);
+EXPORT_SYMBOL(swake_up_one);
/*
* Does not allow usage from IRQ disabled, since we must be able to
@@ -69,14 +69,14 @@ void swake_up_all(struct swait_queue_head *q)
}
EXPORT_SYMBOL(swake_up_all);
-void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
+static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
wait->task = current;
if (list_empty(&wait->task_list))
- list_add(&wait->task_list, &q->task_list);
+ list_add_tail(&wait->task_list, &q->task_list);
}
-void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
+void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state)
{
unsigned long flags;
@@ -85,16 +85,28 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int
set_current_state(state);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
-EXPORT_SYMBOL(prepare_to_swait);
+EXPORT_SYMBOL(prepare_to_swait_exclusive);
long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
{
- if (signal_pending_state(state, current))
- return -ERESTARTSYS;
+ unsigned long flags;
+ long ret = 0;
- prepare_to_swait(q, wait, state);
+ raw_spin_lock_irqsave(&q->lock, flags);
+ if (unlikely(signal_pending_state(state, current))) {
+ /*
+ * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one()
+ * must not see us.
+ */
+ list_del_init(&wait->task_list);
+ ret = -ERESTARTSYS;
+ } else {
+ __prepare_to_swait(q, wait);
+ set_current_state(state);
+ }
+ raw_spin_unlock_irqrestore(&q->lock, flags);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(prepare_to_swait_event);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 5043e7433f4b..c230c2dd48e1 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -238,8 +238,7 @@ int smpboot_unpark_threads(unsigned int cpu)
mutex_lock(&smpboot_threads_lock);
list_for_each_entry(cur, &hotplug_threads, list)
- if (cpumask_test_cpu(cpu, cur->cpumask))
- smpboot_unpark_thread(cur, cpu);
+ smpboot_unpark_thread(cur, cpu);
mutex_unlock(&smpboot_threads_lock);
return 0;
}
@@ -280,34 +279,26 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
}
/**
- * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
+ * smpboot_register_percpu_thread - Register a per_cpu thread related
* to hotplug
* @plug_thread: Hotplug thread descriptor
- * @cpumask: The cpumask where threads run
*
* Creates and starts the threads on all online cpus.
*/
-int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
- const struct cpumask *cpumask)
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
unsigned int cpu;
int ret = 0;
- if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
- return -ENOMEM;
- cpumask_copy(plug_thread->cpumask, cpumask);
-
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
- free_cpumask_var(plug_thread->cpumask);
goto out;
}
- if (cpumask_test_cpu(cpu, cpumask))
- smpboot_unpark_thread(plug_thread, cpu);
+ smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
@@ -315,7 +306,7 @@ out:
put_online_cpus();
return ret;
}
-EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
/**
* smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
@@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
smpboot_destroy_threads(plug_thread);
mutex_unlock(&smpboot_threads_lock);
put_online_cpus();
- free_cpumask_var(plug_thread->cpumask);
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
-/**
- * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
- * @plug_thread: Hotplug thread descriptor
- * @new: Revised mask to use
- *
- * The cpumask field in the smp_hotplug_thread must not be updated directly
- * by the client, but only by calling this function.
- * This function can only be called on a registered smp_hotplug_thread.
- */
-void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
- const struct cpumask *new)
-{
- struct cpumask *old = plug_thread->cpumask;
- static struct cpumask tmp;
- unsigned int cpu;
-
- lockdep_assert_cpus_held();
- mutex_lock(&smpboot_threads_lock);
-
- /* Park threads that were exclusively enabled on the old mask. */
- cpumask_andnot(&tmp, old, new);
- for_each_cpu_and(cpu, &tmp, cpu_online_mask)
- smpboot_park_thread(plug_thread, cpu);
-
- /* Unpark threads that are exclusively enabled on the new mask. */
- cpumask_andnot(&tmp, new, old);
- for_each_cpu_and(cpu, &tmp, cpu_online_mask)
- smpboot_unpark_thread(plug_thread, cpu);
-
- cpumask_copy(old, new);
-
- mutex_unlock(&smpboot_threads_lock);
-}
-
static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
/*
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 69eb76daed34..067cb83f37ea 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -238,13 +238,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
DEFINE_WAKE_Q(wakeq);
int err;
+
retry:
+ /*
+ * The waking up of stopper threads has to happen in the same
+ * scheduling context as the queueing. Otherwise, there is a
+	 * possibility of one of the above stoppers being woken up by another
+	 * CPU and preempting us; we would then never wake up the other
+	 * stopper.
+ */
+ preempt_disable();
raw_spin_lock_irq(&stopper1->lock);
raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
- err = -ENOENT;
- if (!stopper1->enabled || !stopper2->enabled)
+ if (!stopper1->enabled || !stopper2->enabled) {
+ err = -ENOENT;
goto unlock;
+ }
+
/*
* Ensure that if we race with __stop_cpus() the stoppers won't get
* queued up in reverse order leading to system deadlock.
@@ -255,36 +266,30 @@ retry:
* It can be falsely true but it is safe to spin until it is cleared,
* queue_stop_cpus_work() does everything under preempt_disable().
*/
- err = -EDEADLK;
- if (unlikely(stop_cpus_in_progress))
- goto unlock;
+ if (unlikely(stop_cpus_in_progress)) {
+ err = -EDEADLK;
+ goto unlock;
+ }
err = 0;
__cpu_stop_queue_work(stopper1, work1, &wakeq);
__cpu_stop_queue_work(stopper2, work2, &wakeq);
- /*
- * The waking up of stopper threads has to happen
- * in the same scheduling context as the queueing.
- * Otherwise, there is a possibility of one of the
- * above stoppers being woken up by another CPU,
- * and preempting us. This will cause us to n ot
- * wake up the other stopper forever.
- */
- preempt_disable();
+
unlock:
raw_spin_unlock(&stopper2->lock);
raw_spin_unlock_irq(&stopper1->lock);
if (unlikely(err == -EDEADLK)) {
+ preempt_enable();
+
while (stop_cpus_in_progress)
cpu_relax();
+
goto retry;
}
- if (!err) {
- wake_up_q(&wakeq);
- preempt_enable();
- }
+ wake_up_q(&wakeq);
+ preempt_enable();
return err;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2d9837c0aff4..f22f76b7a138 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -368,14 +368,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "sched_time_avg_ms",
- .data = &sysctl_sched_time_avg,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- },
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 576d18045811..5470dce212c0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -18,18 +18,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
-#include <linux/smpboot.h>
-#include <linux/sched/rt.h>
-#include <uapi/linux/sched/types.h>
#include <linux/tick.h>
-#include <linux/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
+#include <linux/stop_machine.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
-#include <linux/kthread.h>
static DEFINE_MUTEX(watchdog_mutex);
@@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void)
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-static bool softlockup_threads_initialized __read_mostly;
+static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
-static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
@@ -335,6 +330,27 @@ static void watchdog_interrupt_count(void)
__this_cpu_inc(hrtimer_interrupts);
}
+static DEFINE_PER_CPU(struct completion, softlockup_completion);
+static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
+
+/*
+ * The watchdog function - touches the timestamp.
+ *
+ * It only runs once every sample_period seconds (4 seconds by
+ * default) to reset the softlockup timestamp. If this gets delayed
+ * for more than 2*watchdog_thresh seconds then the debug-printout
+ * triggers in watchdog_timer_fn().
+ */
+static int softlockup_fn(void *data)
+{
+ __this_cpu_write(soft_lockup_hrtimer_cnt,
+ __this_cpu_read(hrtimer_interrupts));
+ __touch_watchdog();
+ complete(this_cpu_ptr(&softlockup_completion));
+
+ return 0;
+}
+
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
@@ -350,7 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
watchdog_interrupt_count();
/* kick the softlockup detector */
- wake_up_process(__this_cpu_read(softlockup_watchdog));
+ if (completion_done(this_cpu_ptr(&softlockup_completion))) {
+ reinit_completion(this_cpu_ptr(&softlockup_completion));
+ stop_one_cpu_nowait(smp_processor_id(),
+ softlockup_fn, NULL,
+ this_cpu_ptr(&softlockup_stop_work));
+ }
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
@@ -448,16 +469,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
}
-static void watchdog_set_prio(unsigned int policy, unsigned int prio)
-{
- struct sched_param param = { .sched_priority = prio };
-
- sched_setscheduler(current, policy, &param);
-}
-
static void watchdog_enable(unsigned int cpu)
{
struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
+ struct completion *done = this_cpu_ptr(&softlockup_completion);
+
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
+ init_completion(done);
+ complete(done);
/*
* Start the timer first to prevent the NMI watchdog triggering
@@ -473,15 +493,14 @@ static void watchdog_enable(unsigned int cpu)
/* Enable the perf event */
if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
watchdog_nmi_enable(cpu);
-
- watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
}
static void watchdog_disable(unsigned int cpu)
{
struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
- watchdog_set_prio(SCHED_NORMAL, 0);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
/*
* Disable the perf event first. That prevents that a large delay
* between disabling the timer and disabling the perf event causes
@@ -489,79 +508,66 @@ static void watchdog_disable(unsigned int cpu)
*/
watchdog_nmi_disable(cpu);
hrtimer_cancel(hrtimer);
+ wait_for_completion(this_cpu_ptr(&softlockup_completion));
}
-static void watchdog_cleanup(unsigned int cpu, bool online)
+static int softlockup_stop_fn(void *data)
{
- watchdog_disable(cpu);
+ watchdog_disable(smp_processor_id());
+ return 0;
}
-static int watchdog_should_run(unsigned int cpu)
+static void softlockup_stop_all(void)
{
- return __this_cpu_read(hrtimer_interrupts) !=
- __this_cpu_read(soft_lockup_hrtimer_cnt);
+ int cpu;
+
+ if (!softlockup_initialized)
+ return;
+
+ for_each_cpu(cpu, &watchdog_allowed_mask)
+ smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);
+
+ cpumask_clear(&watchdog_allowed_mask);
}
-/*
- * The watchdog thread function - touches the timestamp.
- *
- * It only runs once every sample_period seconds (4 seconds by
- * default) to reset the softlockup timestamp. If this gets delayed
- * for more than 2*watchdog_thresh seconds then the debug-printout
- * triggers in watchdog_timer_fn().
- */
-static void watchdog(unsigned int cpu)
+static int softlockup_start_fn(void *data)
{
- __this_cpu_write(soft_lockup_hrtimer_cnt,
- __this_cpu_read(hrtimer_interrupts));
- __touch_watchdog();
+ watchdog_enable(smp_processor_id());
+ return 0;
}
-static struct smp_hotplug_thread watchdog_threads = {
- .store = &softlockup_watchdog,
- .thread_should_run = watchdog_should_run,
- .thread_fn = watchdog,
- .thread_comm = "watchdog/%u",
- .setup = watchdog_enable,
- .cleanup = watchdog_cleanup,
- .park = watchdog_disable,
- .unpark = watchdog_enable,
-};
-
-static void softlockup_update_smpboot_threads(void)
+static void softlockup_start_all(void)
{
- lockdep_assert_held(&watchdog_mutex);
-
- if (!softlockup_threads_initialized)
- return;
+ int cpu;
- smpboot_update_cpumask_percpu_thread(&watchdog_threads,
- &watchdog_allowed_mask);
+ cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
+ for_each_cpu(cpu, &watchdog_allowed_mask)
+ smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
}
-/* Temporarily park all watchdog threads */
-static void softlockup_park_all_threads(void)
+int lockup_detector_online_cpu(unsigned int cpu)
{
- cpumask_clear(&watchdog_allowed_mask);
- softlockup_update_smpboot_threads();
+ watchdog_enable(cpu);
+ return 0;
}
-/* Unpark enabled threads */
-static void softlockup_unpark_threads(void)
+int lockup_detector_offline_cpu(unsigned int cpu)
{
- cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
- softlockup_update_smpboot_threads();
+ watchdog_disable(cpu);
+ return 0;
}
static void lockup_detector_reconfigure(void)
{
cpus_read_lock();
watchdog_nmi_stop();
- softlockup_park_all_threads();
+
+ softlockup_stop_all();
set_sample_period();
lockup_detector_update_enable();
if (watchdog_enabled && watchdog_thresh)
- softlockup_unpark_threads();
+ softlockup_start_all();
+
watchdog_nmi_start();
cpus_read_unlock();
/*
@@ -580,8 +586,6 @@ static void lockup_detector_reconfigure(void)
*/
static __init void lockup_detector_setup(void)
{
- int ret;
-
/*
* If sysctl is off and watchdog got disabled on the command line,
* nothing to do here.
@@ -592,24 +596,13 @@ static __init void lockup_detector_setup(void)
!(watchdog_enabled && watchdog_thresh))
return;
- ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
- &watchdog_allowed_mask);
- if (ret) {
- pr_err("Failed to initialize soft lockup detector threads\n");
- return;
- }
-
mutex_lock(&watchdog_mutex);
- softlockup_threads_initialized = true;
lockup_detector_reconfigure();
+ softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static inline int watchdog_park_threads(void) { return 0; }
-static inline void watchdog_unpark_threads(void) { }
-static inline int watchdog_enable_all_cpus(void) { return 0; }
-static inline void watchdog_disable_all_cpus(void) { }
static void lockup_detector_reconfigure(void)
{
cpus_read_lock();
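The comments around softlockup_fn() and watchdog_timer_fn() above describe a simple timing contract: the stop-work refreshes a per-CPU touch timestamp every sample_period, and the hrtimer side declares a soft lockup once that timestamp is older than 2*watchdog_thresh seconds. A minimal user-space sketch of just that check, using coarse time() and illustrative names rather than the kernel's per-CPU state:

#include <stdio.h>
#include <time.h>

static unsigned int watchdog_thresh = 10;	/* seconds; illustrative default */
static time_t watchdog_touch_ts;

/* what the stop-work does: refresh the timestamp */
static void touch_watchdog(void)
{
	watchdog_touch_ts = time(NULL);
}

/* what the timer side checks: has the touch been delayed too long? */
static long is_softlockup(time_t touch_ts)
{
	time_t now = time(NULL);

	if (now > touch_ts + 2 * watchdog_thresh)
		return (long)(now - touch_ts);	/* stall duration in seconds */
	return 0;
}

int main(void)
{
	touch_watchdog();
	printf("stalled for %ld seconds\n", is_softlockup(watchdog_touch_ts));
	return 0;
}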
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index e449a23e9d59..1f7020d65d0a 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -175,8 +175,8 @@ static int hardlockup_detector_event_create(void)
evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
watchdog_overflow_callback, NULL);
if (IS_ERR(evt)) {
- pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
- PTR_ERR(evt));
+ pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
+ PTR_ERR(evt));
return PTR_ERR(evt);
}
this_cpu_write(watchdog_ev, evt);