summaryrefslogtreecommitdiff
path: root/kernel/sched/idle.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/idle.c')
-rw-r--r--kernel/sched/idle.c197
1 files changed, 118 insertions, 79 deletions
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f26ab2675f7d..c174afe1dd17 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -6,6 +6,11 @@
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/
+#include <linux/cpuidle.h>
+#include <linux/suspend.h>
+#include <linux/livepatch.h>
+#include "sched.h"
+#include "smp.h"
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
@@ -47,22 +52,26 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
-#endif
+#endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */
static noinline int __cpuidle cpu_idle_poll(void)
{
+ instrumentation_begin();
trace_cpu_idle(0, smp_processor_id());
stop_critical_timings();
- ct_idle_enter();
- local_irq_enable();
+ ct_cpuidle_enter();
+ raw_local_irq_enable();
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
+ raw_local_irq_disable();
- ct_idle_exit();
+ ct_cpuidle_exit();
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ local_irq_enable();
+ instrumentation_end();
return 1;
}
@@ -71,13 +80,31 @@ static noinline int __cpuidle cpu_idle_poll(void)
void __weak arch_cpu_idle_prepare(void) { }
void __weak arch_cpu_idle_enter(void) { }
void __weak arch_cpu_idle_exit(void) { }
-void __weak arch_cpu_idle_dead(void) { }
+void __weak __noreturn arch_cpu_idle_dead(void) { while (1); }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
- raw_local_irq_enable();
}
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE
+DEFINE_STATIC_KEY_FALSE(arch_needs_tick_broadcast);
+
+static inline void cond_tick_broadcast_enter(void)
+{
+ if (static_branch_unlikely(&arch_needs_tick_broadcast))
+ tick_broadcast_enter();
+}
+
+static inline void cond_tick_broadcast_exit(void)
+{
+ if (static_branch_unlikely(&arch_needs_tick_broadcast))
+ tick_broadcast_exit();
+}
+#else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */
+static inline void cond_tick_broadcast_enter(void) { }
+static inline void cond_tick_broadcast_exit(void) { }
+#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */
+
/**
* default_idle_call - Default CPU idle routine.
*
@@ -85,53 +112,32 @@ void __weak arch_cpu_idle(void)
*/
void __cpuidle default_idle_call(void)
{
- if (current_clr_polling_and_test()) {
- local_irq_enable();
- } else {
-
+ instrumentation_begin();
+ if (!current_clr_polling_and_test()) {
+ cond_tick_broadcast_enter();
trace_cpu_idle(1, smp_processor_id());
stop_critical_timings();
- /*
- * arch_cpu_idle() is supposed to enable IRQs, however
- * we can't do that because of RCU and tracing.
- *
- * Trace IRQs enable here, then switch off RCU, and have
- * arch_cpu_idle() use raw_local_irq_enable(). Note that
- * ct_idle_enter() relies on lockdep IRQ state, so switch that
- * last -- this is very similar to the entry code.
- */
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare();
- ct_idle_enter();
- lockdep_hardirqs_on(_THIS_IP_);
-
+ ct_cpuidle_enter();
arch_cpu_idle();
-
- /*
- * OK, so IRQs are enabled here, but RCU needs them disabled to
- * turn itself back on.. funny thing is that disabling IRQs
- * will cause tracing, which needs RCU. Jump through hoops to
- * make it 'work'.
- */
- raw_local_irq_disable();
- lockdep_hardirqs_off(_THIS_IP_);
- ct_idle_exit();
- lockdep_hardirqs_on(_THIS_IP_);
- raw_local_irq_enable();
+ ct_cpuidle_exit();
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ cond_tick_broadcast_exit();
}
+ local_irq_enable();
+ instrumentation_end();
}
static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+ struct cpuidle_device *dev,
+ u64 max_latency_ns)
{
if (current_clr_polling_and_test())
return -EBUSY;
- return cpuidle_enter_s2idle(drv, dev);
+ return cpuidle_enter_s2idle(drv, dev, max_latency_ns);
}
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
@@ -172,19 +178,13 @@ static void cpuidle_idle_call(void)
/*
* Check if the idle task must be rescheduled. If it is the
- * case, exit the function after re-enabling the local irq.
+ * case, exit the function after re-enabling the local IRQ.
*/
if (need_resched()) {
local_irq_enable();
return;
}
- /*
- * The RCU framework needs to be told that we are entering an idle
- * section, so no more rcu read side critical sections and one more
- * step to the grace period
- */
-
if (cpuidle_not_available(drv, dev)) {
tick_nohz_idle_stop_tick();
@@ -206,12 +206,13 @@ static void cpuidle_idle_call(void)
u64 max_latency_ns;
if (idle_should_enter_s2idle()) {
+ max_latency_ns = cpu_wakeup_latency_qos_limit() *
+ NSEC_PER_USEC;
- entered_state = call_cpuidle_s2idle(drv, dev);
+ entered_state = call_cpuidle_s2idle(drv, dev,
+ max_latency_ns);
if (entered_state > 0)
goto exit_idle;
-
- max_latency_ns = U64_MAX;
} else {
max_latency_ns = dev->forced_idle_latency_limit_ns;
}
@@ -244,7 +245,7 @@ exit_idle:
__current_set_polling();
/*
- * It is up to the idle functions to reenable local interrupts
+ * It is up to the idle functions to re-enable local interrupts
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
@@ -277,12 +278,40 @@ static void do_idle(void)
tick_nohz_idle_enter();
while (!need_resched()) {
- rmb();
+ /*
+ * Interrupts shouldn't be re-enabled from that point on until
+ * the CPU sleeping instruction is reached. Otherwise an interrupt
+ * may fire and queue a timer that would be ignored until the CPU
+ * wakes from the sleeping instruction. And testing need_resched()
+ * doesn't tell about pending needed timer reprogram.
+ *
+ * Several cases to consider:
+ *
+ * - SLEEP-UNTIL-PENDING-INTERRUPT based instructions such as
+ * "wfi" or "mwait" are fine because they can be entered with
+ * interrupt disabled.
+ *
+ * - sti;mwait() couple is fine because the interrupts are
+ * re-enabled only upon the execution of mwait, leaving no gap
+ * in-between.
+ *
+ * - ROLLBACK based idle handlers with the sleeping instruction
+ * called with interrupts enabled are NOT fine. In this scheme
+ * when the interrupt detects it has interrupted an idle handler,
+ * it rolls back to its beginning which performs the
+ * need_resched() check before re-executing the sleeping
+ * instruction. This can leak a pending needed timer reprogram.
+ * If such a scheme is really mandatory due to the lack of an
+ * appropriate CPU sleeping instruction, then a FAST-FORWARD
+ * must instead be applied: when the interrupt detects it has
+ * interrupted an idle handler, it must resume to the end of
+ * this idle handler so that the generic idle loop is iterated
+ * again to reprogram the tick.
+ */
local_irq_disable();
if (cpu_is_offline(cpu)) {
- tick_nohz_idle_stop_tick();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
@@ -291,7 +320,7 @@ static void do_idle(void)
rcu_nocb_flush_deferred_wakeup();
/*
- * In poll mode we reenable interrupts and spin. Also if we
+ * In poll mode we re-enable interrupts and spin. Also if we
* detected in the wakeup from idle path that the tick
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
@@ -376,8 +405,8 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
cpuidle_use_deepest_state(latency_ns);
it.done = 0;
- hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- it.timer.function = idle_inject_timer_fn;
+ hrtimer_setup_on_stack(&it.timer, idle_inject_timer_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD);
hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
HRTIMER_MODE_REL_PINNED_HARD);
@@ -394,6 +423,7 @@ EXPORT_SYMBOL_GPL(play_idle_precise);
void cpu_startup_entry(enum cpuhp_state state)
{
+ current->flags |= PF_IDLE;
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)
@@ -404,7 +434,6 @@ void cpu_startup_entry(enum cpuhp_state state)
* idle-task scheduling class.
*/
-#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int flags)
{
@@ -416,53 +445,49 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
return WARN_ON_ONCE(1);
}
-#endif
/*
* Idle tasks are unconditionally rescheduled:
*/
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_curr(rq);
}
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+static void update_curr_idle(struct rq *rq);
+
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
+ update_curr_idle(rq);
+ scx_update_idle(rq, false, true);
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
+ scx_update_idle(rq, true, true);
schedstat_inc(rq->sched_goidle);
+ next->se.exec_start = rq_clock_task(rq);
}
-#ifdef CONFIG_SMP
-static struct task_struct *pick_task_idle(struct rq *rq)
+struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf)
{
+ scx_update_idle(rq, true, false);
return rq->idle;
}
-#endif
-
-struct task_struct *pick_next_task_idle(struct rq *rq)
-{
- struct task_struct *next = rq->idle;
-
- set_next_task_idle(rq, next, true);
-
- return next;
-}
/*
* It is not legal to sleep in the idle task - print a warning
* message if some code attempts to do it:
*/
-static void
+static bool
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_rq_unlock_irq(rq);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
raw_spin_rq_lock_irq(rq);
+ return true;
}
/*
@@ -475,21 +500,36 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
*/
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
+ update_curr_idle(rq);
}
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
+static void switching_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
}
static void
-prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio)
{
+ if (p->prio == oldprio)
+ return;
+
BUG();
}
static void update_curr_idle(struct rq *rq)
{
+ struct sched_entity *se = &rq->idle->se;
+ u64 now = rq_clock_task(rq);
+ s64 delta_exec;
+
+ delta_exec = now - se->exec_start;
+ if (unlikely(delta_exec <= 0))
+ return;
+
+ se->exec_start = now;
+
+ dl_server_update_idle(&rq->fair_server, delta_exec);
}
/*
@@ -497,27 +537,26 @@ static void update_curr_idle(struct rq *rq)
*/
DEFINE_SCHED_CLASS(idle) = {
+ .queue_mask = 0,
+
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
- .check_preempt_curr = check_preempt_curr_idle,
+ .wakeup_preempt = wakeup_preempt_idle,
- .pick_next_task = pick_next_task_idle,
+ .pick_task = pick_task_idle,
.put_prev_task = put_prev_task_idle,
.set_next_task = set_next_task_idle,
-#ifdef CONFIG_SMP
.balance = balance_idle,
- .pick_task = pick_task_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
-#endif
.task_tick = task_tick_idle,
.prio_changed = prio_changed_idle,
- .switched_to = switched_to_idle,
+ .switching_to = switching_to_idle,
.update_curr = update_curr_idle,
};