From da6043fe85eb5ec621e34a92540735dcebbea134 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Wed, 25 Sep 2019 15:39:12 +0100
Subject: PM / hibernate: memory_bm_find_bit(): Tighten node optimisation

When looking for a bit by number we make use of the cached result from
the preceding lookup to speed up operation.  Firstly we check if the
requested pfn is within the cached zone and, if not, look up the new
zone.  We then check if the offset for that pfn falls within the
existing cached node.  This happens regardless of whether the node is
within the zone we are now scanning.  With certain memory layouts it is
possible for this to false trigger, creating a temporary alias for the
pfn to a different bit.  This leads the hibernation code to free memory
that was never allocated to it, with the expected fallout.

Ensure the zone we are scanning matches the cached zone before
considering the cached node.

Deep thanks go to Andrea for many, many, many hours of hacking and
testing that went into cornering this bug.

Reported-by: Andrea Righi
Tested-by: Andrea Righi
Signed-off-by: Andy Whitcroft
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/snapshot.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 83105874f255..26b9168321e7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -734,8 +734,15 @@ zone_found:
 	 * We have found the zone. Now walk the radix tree to find the leaf node
 	 * for our PFN.
 	 */
+
+	/*
+	 * If the zone we wish to scan is the current zone and the
+	 * pfn falls into the current node then we do not need to walk
+	 * the tree.
+	 */
 	node = bm->cur.node;
-	if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
+	if (zone == bm->cur.zone &&
+	    ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
 		goto node_found;
 
 	node = zone->rtree;
--
cgit

From c1d51f684c72b5eb2aecbbd47be3a2977a2dc903 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Thu, 7 Nov 2019 15:25:12 +0100
Subject: cpuidle: Use nanoseconds as the unit of time

Currently, the cpuidle subsystem uses microseconds as the unit of
time which (among other things) causes the idle loop to incur some
integer division overhead for no clear benefit.

In order to allow cpuidle to measure time in nanoseconds, add two new
fields, exit_latency_ns and target_residency_ns, to represent the exit
latency and target residency of an idle state in nanoseconds,
respectively, to struct cpuidle_state and initialize them with the
help of the corresponding values in microseconds provided by drivers.
Additionally, change cpuidle_governor_latency_req() to return the idle
state exit latency constraint in nanoseconds.

Also measure idle state residency (last_residency_ns in struct
cpuidle_device and time_ns in struct cpuidle_driver) in nanoseconds
and update the cpuidle core and governors accordingly.

However, the menu governor still computes typical intervals in
microseconds to avoid integer overflows.

Signed-off-by: Rafael J. Wysocki
Acked-by: Peter Zijlstra (Intel)
Acked-by: Doug Smythies
Tested-by: Doug Smythies
---
 kernel/sched/idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8dad5aa600ea..1aa260702b38 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -104,7 +104,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	 * update no idle residency and return.
 	 */
	if (current_clr_polling_and_test()) {
-		dev->last_residency = 0;
+		dev->last_residency_ns = 0;
 		local_irq_enable();
 		return -EBUSY;
 	}
--
cgit
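The conversion described above amounts to computing the new nanosecond
fields once, when the driver's state table is registered, from the
microsecond values drivers already supply, so that the idle loop can
later compare raw nanosecond timestamps without dividing on every
wakeup.  A minimal sketch of that idea, using reduced stand-in types
rather than the real kernel structures (cpuidle_state_sketch and
init_state_ns_fields are illustrative names, not kernel APIs):

	/* Reduced stand-ins for the cpuidle structures; illustrative only. */
	#include <stdint.h>

	#define NSEC_PER_USEC	1000ULL

	struct cpuidle_state_sketch {
		unsigned int	exit_latency;		/* us, provided by the driver */
		unsigned int	target_residency;	/* us, provided by the driver */
		uint64_t	exit_latency_ns;	/* derived nanosecond value */
		uint64_t	target_residency_ns;	/* derived nanosecond value */
	};

	/* Derive the ns fields once so no per-wakeup division is needed. */
	static void init_state_ns_fields(struct cpuidle_state_sketch *s)
	{
		s->exit_latency_ns = (uint64_t)s->exit_latency * NSEC_PER_USEC;
		s->target_residency_ns = (uint64_t)s->target_residency * NSEC_PER_USEC;
	}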
From c55b51a06b01d67a99457bb82a8c31081c7faa23 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano
Date: Sat, 16 Nov 2019 14:16:12 +0100
Subject: cpuidle: Allow idle injection to apply exit latency limit

In some cases it may be useful to specify an exit latency limit for
the idle state to be used during CPU idle time injection.

Instead of duplicating the information in struct cpuidle_device or
propagating the latency limit in the call stack, replace the
use_deepest_state field with forced_idle_latency_limit_ns to represent
that limit, so that the deepest idle state with exit latency within
that limit is forced (i.e. no governors) when it is set.

A zero exit latency limit for forced idle means to use governors in
the usual way (analogous to use_deepest_state equal to "false" before
this change).

Additionally, add play_idle_precise() taking two arguments, the
duration of forced idle and the idle state exit latency limit, both in
nanoseconds, and redefine play_idle() as a wrapper around that new
function.

This change is preparatory; no functional impact is expected.

Suggested-by: Rafael J. Wysocki
Signed-off-by: Daniel Lezcano
[ rjw: Subject, changelog, cpuidle_use_deepest_state() kerneldoc, whitespace ]
Signed-off-by: Rafael J. Wysocki
---
 kernel/sched/idle.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1aa260702b38..cd05ffa0abfe 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -165,7 +165,7 @@ static void cpuidle_idle_call(void)
 	 * until a proper wakeup interrupt happens.
 	 */
 
-	if (idle_should_enter_s2idle() || dev->use_deepest_state) {
+	if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) {
 		if (idle_should_enter_s2idle()) {
 			rcu_idle_enter();
 
@@ -311,7 +311,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-void play_idle(unsigned long duration_us)
+void play_idle_precise(u64 duration_ns, u64 latency_ns)
 {
 	struct idle_timer it;
 
@@ -323,29 +323,29 @@ void play_idle(unsigned long duration_us)
 	WARN_ON_ONCE(current->nr_cpus_allowed != 1);
 	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
 	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
-	WARN_ON_ONCE(!duration_us);
+	WARN_ON_ONCE(!duration_ns);
 	rcu_sleep_check();
 
 	preempt_disable();
 	current->flags |= PF_IDLE;
-	cpuidle_use_deepest_state(true);
+	cpuidle_use_deepest_state(latency_ns);
 
 	it.done = 0;
 	hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	it.timer.function = idle_inject_timer_fn;
-	hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC),
+	hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
 		      HRTIMER_MODE_REL_PINNED);
 
 	while (!READ_ONCE(it.done))
 		do_idle();
 
-	cpuidle_use_deepest_state(false);
+	cpuidle_use_deepest_state(0);
 	current->flags &= ~PF_IDLE;
 
 	preempt_fold_need_resched();
 	preempt_enable();
 }
-EXPORT_SYMBOL_GPL(play_idle);
+EXPORT_SYMBOL_GPL(play_idle_precise);
 
 void cpu_startup_entry(enum cpuhp_state state)
 {
--
cgit
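To see how the new interface composes, consider a hypothetical
idle-injection caller (the durations and limits below are example
values, not taken from the patch).  Forcing 1 ms of idle with no exit
latency cap, and then with a 100 us cap, would look like:

	/* Hypothetical callers; values chosen purely for illustration. */
	play_idle_precise(1000 * NSEC_PER_USEC, U64_MAX);		/* 1 ms, no latency cap */
	play_idle_precise(1000 * NSEC_PER_USEC, 100 * NSEC_PER_USEC);	/* cap exit latency at 100 us */

and the play_idle() wrapper the changelog mentions can be expected to
reduce to something like:

	static inline void play_idle(unsigned long duration_us)
	{
		/* Forced idle with no exit latency constraint. */
		play_idle_precise(duration_us * NSEC_PER_USEC, U64_MAX);
	}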
From 5aa9ba6312e36c18626e73506b92d1513d815435 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano
Date: Sat, 16 Nov 2019 14:16:13 +0100
Subject: cpuidle: Pass exit latency limit to cpuidle_use_deepest_state()

Modify cpuidle_use_deepest_state() to take an additional exit latency
limit argument to be passed to find_deepest_idle_state() and make
cpuidle_idle_call() pass dev->forced_idle_latency_limit_ns to it for
forced idle.

Suggested-by: Rafael J. Wysocki
Signed-off-by: Daniel Lezcano
[ rjw: Rebase and rearrange code, subject & changelog ]
Signed-off-by: Rafael J. Wysocki
---
 kernel/sched/idle.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cd05ffa0abfe..fc9604ddd802 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -166,6 +166,8 @@ static void cpuidle_idle_call(void)
 	 */
 
 	if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) {
+		u64 max_latency_ns;
+
 		if (idle_should_enter_s2idle()) {
 			rcu_idle_enter();
 
@@ -176,12 +178,16 @@ static void cpuidle_idle_call(void)
 			}
 
 			rcu_idle_exit();
+
+			max_latency_ns = U64_MAX;
+		} else {
+			max_latency_ns = dev->forced_idle_latency_limit_ns;
 		}
 
 		tick_nohz_idle_stop_tick();
 		rcu_idle_enter();
 
-		next_state = cpuidle_find_deepest_state(drv, dev);
+		next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
 		call_cpuidle(drv, dev, next_state);
 	} else {
 		bool stop_tick = true;
--
cgit
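The effect of the new argument on state selection can be pictured as a
linear scan that skips states whose exit latency exceeds the limit.
The following is a sketch under stated assumptions, not the kernel's
implementation: it reuses the reduced struct from the earlier sketch,
assumes states are ordered shallow to deep as cpuidle conventionally
arranges them, and find_deepest_state_sketch is an illustrative name:

	/*
	 * Pick the deepest state whose exit latency fits within the limit.
	 * With latency_limit_ns == U64_MAX every state qualifies, which
	 * corresponds to the s2idle branch in the hunk above.
	 */
	static int find_deepest_state_sketch(const struct cpuidle_state_sketch *states,
					     int nr_states, uint64_t latency_limit_ns)
	{
		int i, deepest = 0;

		for (i = 1; i < nr_states; i++) {
			if (states[i].exit_latency_ns <= latency_limit_ns)
				deepest = i;	/* deeper state still within the limit */
		}
		return deepest;
	}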