From 0e7767687fdabfc58d5046e7488632bf2ecd4d0c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 5 Apr 2018 18:58:27 +0200
Subject: time: tick-sched: Reorganize idle tick management code

Prepare the scheduler tick code for reworking the idle loop to
avoid stopping the tick in some cases.

The idea is to split the nohz idle entry call to decouple the idle
time stats accounting and preparatory work from the actual tick stop
code, in order to later be able to delay the tick stop once we reach
more power-knowledgeable callers.

Move away the tick_nohz_start_idle() invocation from
__tick_nohz_idle_enter(), rename the latter to
__tick_nohz_idle_stop_tick() and define tick_nohz_idle_stop_tick()
as a wrapper around it for calling it from the outside.

Make tick_nohz_idle_enter() only call tick_nohz_start_idle() instead
of calling the entire __tick_nohz_idle_enter(), add another wrapper
disabling and enabling interrupts around tick_nohz_idle_stop_tick()
and make the current callers of tick_nohz_idle_enter() call it too
to retain their current functionality.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/xen/smp_pv.c    |  1 +
 include/linux/tick.h     | 12 ++++++++++++
 kernel/sched/idle.c      |  1 +
 kernel/time/tick-sched.c | 46 +++++++++++++++++++++++++---------------------
 4 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index c0c756c76afe..2e20ae2fa2d6 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -425,6 +425,7 @@ static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
 	 * data back is to call:
 	 */
 	tick_nohz_idle_enter();
+	tick_nohz_idle_stop_tick_protected();
 
 	cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
 }
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7f8c9a127f5a..1d253df9ea3c 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -115,6 +115,7 @@ enum tick_dep_bits {
 extern bool tick_nohz_enabled;
 extern bool tick_nohz_tick_stopped(void);
 extern bool tick_nohz_tick_stopped_cpu(int cpu);
+extern void tick_nohz_idle_stop_tick(void);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
@@ -123,10 +124,19 @@ extern unsigned long tick_nohz_get_idle_calls(void);
 extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
+
+static inline void tick_nohz_idle_stop_tick_protected(void)
+{
+	local_irq_disable();
+	tick_nohz_idle_stop_tick();
+	local_irq_enable();
+}
+
 #else /* !CONFIG_NO_HZ_COMMON */
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
 static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
+static inline void tick_nohz_idle_stop_tick(void) { }
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
 
@@ -136,6 +146,8 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
+
+static inline void tick_nohz_idle_stop_tick_protected(void) { }
 #endif /* !CONFIG_NO_HZ_COMMON */
 
 #ifdef CONFIG_NO_HZ_FULL
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2975f195e1c4..c0bc111878e6 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -216,6 +216,7 @@ static void do_idle(void)
 
 	__current_set_polling();
 	tick_nohz_idle_enter();
+	tick_nohz_idle_stop_tick_protected();
 
 	while (!need_resched()) {
 		check_pgt_cache();
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 5d4a0342f934..678349aec483 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -528,14 +528,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 	sched_clock_idle_wakeup_event();
 }
 
-static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+static void tick_nohz_start_idle(struct tick_sched *ts)
 {
-	ktime_t now = ktime_get();
-
-	ts->idle_entrytime = now;
+	ts->idle_entrytime = ktime_get();
 	ts->idle_active = 1;
 	sched_clock_idle_sleep_event();
-	return now;
 }
 
 /**
@@ -894,19 +891,21 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 	return true;
 }
 
-static void __tick_nohz_idle_enter(struct tick_sched *ts)
+static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
 {
-	ktime_t now, expires;
+	ktime_t expires;
 	int cpu = smp_processor_id();
 
-	now = tick_nohz_start_idle(ts);
-
 	if (can_stop_idle_tick(cpu, ts)) {
 		int was_stopped = ts->tick_stopped;
 
 		ts->idle_calls++;
 
-		expires = tick_nohz_stop_sched_tick(ts, now, cpu);
+		/*
+		 * The idle entry time should be a sufficient approximation of
+		 * the current time at this point.
+		 */
+		expires = tick_nohz_stop_sched_tick(ts, ts->idle_entrytime, cpu);
 		if (expires > 0LL) {
 			ts->idle_sleeps++;
 			ts->idle_expires = expires;
@@ -920,16 +919,19 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
 }
 
 /**
- * tick_nohz_idle_enter - stop the idle tick from the idle task
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
  *
  * When the next event is more than a tick into the future, stop the idle tick
- * Called when we start the idle loop.
- *
- * The arch is responsible of calling:
+ */
+void tick_nohz_idle_stop_tick(void)
+{
+	__tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
+}
+
+/**
+ * tick_nohz_idle_enter - prepare for entering idle on the current CPU
  *
- * - rcu_idle_enter() after its last use of RCU before the CPU is put
- *  to sleep.
- * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
+ * Called when we start the idle loop.
  */
 void tick_nohz_idle_enter(void)
 {
@@ -941,7 +943,7 @@ void tick_nohz_idle_enter(void)
 
 	ts = this_cpu_ptr(&tick_cpu_sched);
 	ts->inidle = 1;
-	__tick_nohz_idle_enter(ts);
+	tick_nohz_start_idle(ts);
 
 	local_irq_enable();
 }
@@ -958,10 +960,12 @@ void tick_nohz_irq_exit(void)
 {
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 
-	if (ts->inidle)
-		__tick_nohz_idle_enter(ts);
-	else
+	if (ts->inidle) {
+		tick_nohz_start_idle(ts);
+		__tick_nohz_idle_stop_tick(ts);
+	} else {
 		tick_nohz_full_update_tick(ts);
+	}
 }
 
 /**
-- 
cgit 


From 2aaf709a518d26563b80fd7a42379d7aa7ffed4a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 15 Mar 2018 23:05:50 +0100
Subject: sched: idle: Do not stop the tick upfront in the idle loop

Push the decision whether or not to stop the tick somewhat deeper
into the idle loop.

Stopping the tick upfront leads to unpleasant outcomes in case the
idle governor doesn't agree with the nohz code on the duration of the
upcoming idle period.  Specifically, if the tick has been stopped and
the idle governor predicts short idle, the situation is bad regardless
of whether or not the prediction is accurate.  If it is accurate, the
tick has been stopped unnecessarily which means excessive overhead.
If it is not accurate, the CPU is likely to spend too much time in
the (shallow, because short idle has been predicted) idle state
selected by the governor [1].

As the first step towards addressing this problem, change the code
to make the tick stopping decision inside of the loop in do_idle().
In particular, do not stop the tick in the cpu_idle_poll() code path.
Also don't do that in tick_nohz_irq_exit() which doesn't really have
enough information on whether or not to stop the tick.

Link: https://marc.info/?l=linux-pm&m=150116085925208&w=2 # [1]
Link: https://tu-dresden.de/zih/forschung/ressourcen/dateien/projekte/haec/powernightmares.pdf
Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/tick.h     |  2 ++
 kernel/sched/idle.c      |  9 ++++++---
 kernel/time/tick-sched.c | 26 ++++++++++++++++++--------
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 1d253df9ea3c..fccebfba167e 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -116,6 +116,7 @@ extern bool tick_nohz_enabled;
 extern bool tick_nohz_tick_stopped(void);
 extern bool tick_nohz_tick_stopped_cpu(int cpu);
 extern void tick_nohz_idle_stop_tick(void);
+extern void tick_nohz_idle_restart_tick(void);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
@@ -137,6 +138,7 @@ static inline void tick_nohz_idle_stop_tick_protected(void)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
 static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
 static inline void tick_nohz_idle_stop_tick(void) { }
+static inline void tick_nohz_idle_restart_tick(void) { }
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c0bc111878e6..3777e83c0b5a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -216,13 +216,13 @@ static void do_idle(void)
 
 	__current_set_polling();
 	tick_nohz_idle_enter();
-	tick_nohz_idle_stop_tick_protected();
 
 	while (!need_resched()) {
 		check_pgt_cache();
 		rmb();
 
 		if (cpu_is_offline(cpu)) {
+			tick_nohz_idle_stop_tick_protected();
 			cpuhp_report_idle_dead();
 			arch_cpu_idle_dead();
 		}
@@ -236,10 +236,13 @@ static void do_idle(void)
 		 * broadcast device expired for us, we don't want to go deep
 		 * idle as we know that the IPI is going to arrive right away.
 		 */
-		if (cpu_idle_force_poll || tick_check_broadcast_expired())
+		if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+			tick_nohz_idle_restart_tick();
 			cpu_idle_poll();
-		else
+		} else {
+			tick_nohz_idle_stop_tick();
 			cpuidle_idle_call();
+		}
 		arch_cpu_idle_exit();
 	}
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 678349aec483..f5d37788ea85 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -960,12 +960,10 @@ void tick_nohz_irq_exit(void)
 {
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 
-	if (ts->inidle) {
+	if (ts->inidle)
 		tick_nohz_start_idle(ts);
-		__tick_nohz_idle_stop_tick(ts);
-	} else {
+	else
 		tick_nohz_full_update_tick(ts);
-	}
 }
 
 /**
@@ -1026,6 +1024,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
 #endif
 }
 
+static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now)
+{
+	tick_nohz_restart_sched_tick(ts, now);
+	tick_nohz_account_idle_ticks(ts);
+}
+
+void tick_nohz_idle_restart_tick(void)
+{
+	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+	if (ts->tick_stopped)
+		__tick_nohz_idle_restart_tick(ts, ktime_get());
+}
+
 /**
  * tick_nohz_idle_exit - restart the idle tick from the idle task
  *
@@ -1050,10 +1062,8 @@ void tick_nohz_idle_exit(void)
 	if (ts->idle_active)
 		tick_nohz_stop_idle(ts, now);
 
-	if (ts->tick_stopped) {
-		tick_nohz_restart_sched_tick(ts, now);
-		tick_nohz_account_idle_ticks(ts);
-	}
+	if (ts->tick_stopped)
+		__tick_nohz_idle_restart_tick(ts, now);
 
 	local_irq_enable();
 }
-- 
cgit 


From ed98c34919985a9f87c3edacb9a8d8c283c9e243 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 15 Mar 2018 23:07:41 +0100
Subject: sched: idle: Do not stop the tick before cpuidle_idle_call()

Make cpuidle_idle_call() decide whether or not to stop the tick.

First, the cpuidle_enter_s2idle() path deals with the tick (and with
the entire timekeeping for that matter) by itself and it doesn't need
the tick to be stopped beforehand.

Second, to address the issue with short idle duration predictions
by the idle governor after the tick has been stopped, it will be
necessary to change the ordering of cpuidle_select() with respect
to tick_nohz_idle_stop_tick().  To prepare for that, put a
tick_nohz_idle_stop_tick() call in the same branch in which
cpuidle_select() is called.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/idle.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 3777e83c0b5a..4f64835d38a8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -141,13 +141,15 @@ static void cpuidle_idle_call(void)
 	}
 
 	/*
-	 * Tell the RCU framework we are entering an idle section,
-	 * so no more rcu read side critical sections and one more
+	 * The RCU framework needs to be told that we are entering an idle
+	 * section, so no more rcu read side critical sections and one more
 	 * step to the grace period
 	 */
-	rcu_idle_enter();
 
 	if (cpuidle_not_available(drv, dev)) {
+		tick_nohz_idle_stop_tick();
+		rcu_idle_enter();
+
 		default_idle_call();
 		goto exit_idle;
 	}
@@ -164,16 +166,26 @@ static void cpuidle_idle_call(void)
 
 	if (idle_should_enter_s2idle() || dev->use_deepest_state) {
 		if (idle_should_enter_s2idle()) {
+			rcu_idle_enter();
+
 			entered_state = cpuidle_enter_s2idle(drv, dev);
 			if (entered_state > 0) {
 				local_irq_enable();
 				goto exit_idle;
 			}
+
+			rcu_idle_exit();
 		}
 
+		tick_nohz_idle_stop_tick();
+		rcu_idle_enter();
+
 		next_state = cpuidle_find_deepest_state(drv, dev);
 		call_cpuidle(drv, dev, next_state);
 	} else {
+		tick_nohz_idle_stop_tick();
+		rcu_idle_enter();
+
 		/*
 		 * Ask the cpuidle framework to choose a convenient idle state.
 		 */
@@ -240,7 +252,6 @@ static void do_idle(void)
 			tick_nohz_idle_restart_tick();
 			cpu_idle_poll();
 		} else {
-			tick_nohz_idle_stop_tick();
 			cpuidle_idle_call();
 		}
 		arch_cpu_idle_exit();
-- 
cgit 


From efefc97736e6f3261879bc9dddcb161224a455f5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 20 Mar 2018 10:11:28 +0100
Subject: jiffies: Introduce USER_TICK_USEC and redefine TICK_USEC

Since the subsequent changes will need a TICK_USEC definition
analogous to TICK_NSEC, rename the existing TICK_USEC as
USER_TICK_USEC, update its users and redefine TICK_USEC
accordingly.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
---
 drivers/net/ethernet/sfc/mcdi.c | 2 +-
 include/linux/jiffies.h         | 7 +++++--
 kernel/time/ntp.c               | 2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c
index 9c2567b0d93e..dfad93fca0a6 100644
--- a/drivers/net/ethernet/sfc/mcdi.c
+++ b/drivers/net/ethernet/sfc/mcdi.c
@@ -375,7 +375,7 @@ static int efx_mcdi_poll(struct efx_nic *efx)
 	 * because generally mcdi responses are fast. After that, back off
 	 * and poll once a jiffy (approximately)
 	 */
-	spins = TICK_USEC;
+	spins = USER_TICK_USEC;
 	finish = jiffies + MCDI_RPC_TIMEOUT;
 
 	while (1) {
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 9385aa57497b..a27cf6652327 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -62,8 +62,11 @@ extern int register_refined_jiffies(long clock_tick_rate);
 /* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */
 #define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ)
 
-/* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
-#define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)
+/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */
+#define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ)
+
+/* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
+#define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)
 
 #ifndef __jiffy_arch_data
 #define __jiffy_arch_data
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8d70da1b9a0d..a09ded765f6c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,7 +31,7 @@
 
 
 /* USER_HZ period (usecs): */
-unsigned long			tick_usec = TICK_USEC;
+unsigned long			tick_usec = USER_TICK_USEC;
 
 /* SHIFTED_HZ period (nsecs): */
 unsigned long			tick_nsec;
-- 
cgit 


From 45f1ff59e27ca59d33cc1a317e669d90022ccf7d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 22 Mar 2018 17:50:49 +0100
Subject: cpuidle: Return nohz hint from cpuidle_select()

Add a new pointer argument to cpuidle_select() and to the ->select
cpuidle governor callback to allow a boolean value indicating
whether or not the tick should be stopped before entering the
selected state to be returned from there.

Make the ladder governor ignore that pointer (to preserve its
current behavior) and make the menu governor return 'false" through
it if:
 (1) the idle exit latency is constrained at 0, or
 (2) the selected state is a polling one, or
 (3) the expected idle period duration is within the tick period
     range.

In addition to that, the correction factor computations in the menu
governor need to take the possibility that the tick may not be
stopped into account to avoid artificially small correction factor
values.  To that end, add a mechanism to record tick wakeups, as
suggested by Peter Zijlstra, and use it to modify the menu_update()
behavior when tick wakeup occurs.  Namely, if the CPU is woken up by
the tick and the return value of tick_nohz_get_sleep_length() is not
within the tick boundary, the predicted idle duration is likely too
short, so make menu_update() try to compensate for that by updating
the governor statistics as though the CPU was idle for a long time.

Since the value returned through the new argument pointer of
cpuidle_select() is not used by its caller yet, this change by
itself is not expected to alter the functionality of the code.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/cpuidle/cpuidle.c          | 10 +++++--
 drivers/cpuidle/governors/ladder.c |  3 +-
 drivers/cpuidle/governors/menu.c   | 59 ++++++++++++++++++++++++++++++--------
 include/linux/cpuidle.h            |  8 ++++--
 include/linux/tick.h               |  2 ++
 kernel/sched/idle.c                |  4 ++-
 kernel/time/tick-sched.c           | 20 +++++++++++++
 7 files changed, 87 insertions(+), 19 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 0003e9a02637..6df894d65d9e 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -272,12 +272,18 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
  *
  * @drv: the cpuidle driver
  * @dev: the cpuidle device
+ * @stop_tick: indication on whether or not to stop the tick
  *
  * Returns the index of the idle state.  The return value must not be negative.
+ *
+ * The memory location pointed to by @stop_tick is expected to be written the
+ * 'false' boolean value if the scheduler tick should not be stopped before
+ * entering the returned state.
  */
-int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		   bool *stop_tick)
 {
-	return cpuidle_curr_governor->select(drv, dev);
+	return cpuidle_curr_governor->select(drv, dev, stop_tick);
 }
 
 /**
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 1ad8745fd6d6..b24883f85c99 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -63,9 +63,10 @@ static inline void ladder_do_selection(struct ladder_device *ldev,
  * ladder_select_state - selects the next state to enter
  * @drv: cpuidle driver
  * @dev: the CPU
+ * @dummy: not used
  */
 static int ladder_select_state(struct cpuidle_driver *drv,
-				struct cpuidle_device *dev)
+			       struct cpuidle_device *dev, bool *dummy)
 {
 	struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
 	struct device *device = get_cpu_device(dev->cpu);
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index aa390404e85f..f53a929bd2bd 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -123,6 +123,7 @@
 struct menu_device {
 	int		last_state_idx;
 	int             needs_update;
+	int             tick_wakeup;
 
 	unsigned int	next_timer_us;
 	unsigned int	predicted_us;
@@ -279,8 +280,10 @@ again:
  * menu_select - selects the next idle state to enter
  * @drv: cpuidle driver containing state data
  * @dev: the CPU
+ * @stop_tick: indication on whether or not to stop the tick
  */
-static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		       bool *stop_tick)
 {
 	struct menu_device *data = this_cpu_ptr(&menu_devices);
 	struct device *device = get_cpu_device(dev->cpu);
@@ -303,8 +306,10 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 		latency_req = resume_latency;
 
 	/* Special case when user has set very strict latency requirement */
-	if (unlikely(latency_req == 0))
+	if (unlikely(latency_req == 0)) {
+		*stop_tick = false;
 		return 0;
+	}
 
 	/* determine the expected residency time, round up */
 	data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
@@ -354,6 +359,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	if (latency_req > interactivity_req)
 		latency_req = interactivity_req;
 
+	expected_interval = data->predicted_us;
 	/*
 	 * Find the idle state with the lowest power while satisfying
 	 * our constraints.
@@ -369,15 +375,30 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 			idx = i; /* first enabled state */
 		if (s->target_residency > data->predicted_us)
 			break;
-		if (s->exit_latency > latency_req)
+		if (s->exit_latency > latency_req) {
+			/*
+			 * If we break out of the loop for latency reasons, use
+			 * the target residency of the selected state as the
+			 * expected idle duration so that the tick is retained
+			 * as long as that target residency is low enough.
+			 */
+			expected_interval = drv->states[idx].target_residency;
 			break;
-
+		}
 		idx = i;
 	}
 
 	if (idx == -1)
 		idx = 0; /* No states enabled. Must use 0. */
 
+	/*
+	 * Don't stop the tick if the selected state is a polling one or if the
+	 * expected idle duration is shorter than the tick period length.
+	 */
+	if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
+	    expected_interval < TICK_USEC)
+		*stop_tick = false;
+
 	data->last_state_idx = idx;
 
 	return data->last_state_idx;
@@ -397,6 +418,7 @@ static void menu_reflect(struct cpuidle_device *dev, int index)
 
 	data->last_state_idx = index;
 	data->needs_update = 1;
+	data->tick_wakeup = tick_nohz_idle_got_tick();
 }
 
 /**
@@ -427,14 +449,27 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 	 * assume the state was never reached and the exit latency is 0.
 	 */
 
-	/* measured value */
-	measured_us = cpuidle_get_last_residency(dev);
-
-	/* Deduct exit latency */
-	if (measured_us > 2 * target->exit_latency)
-		measured_us -= target->exit_latency;
-	else
-		measured_us /= 2;
+	if (data->tick_wakeup && data->next_timer_us > TICK_USEC) {
+		/*
+		 * The nohz code said that there wouldn't be any events within
+		 * the tick boundary (if the tick was stopped), but the idle
+		 * duration predictor had a differing opinion.  Since the CPU
+		 * was woken up by a tick (that wasn't stopped after all), the
+		 * predictor was not quite right, so assume that the CPU could
+		 * have been idle long (but not forever) to help the idle
+		 * duration predictor do a better job next time.
+		 */
+		measured_us = 9 * MAX_INTERESTING / 10;
+	} else {
+		/* measured value */
+		measured_us = cpuidle_get_last_residency(dev);
+
+		/* Deduct exit latency */
+		if (measured_us > 2 * target->exit_latency)
+			measured_us -= target->exit_latency;
+		else
+			measured_us /= 2;
+	}
 
 	/* Make sure our coefficients do not exceed unity */
 	if (measured_us > data->next_timer_us)
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index a806e94c482f..1eefabf1621f 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -135,7 +135,8 @@ extern bool cpuidle_not_available(struct cpuidle_driver *drv,
 				  struct cpuidle_device *dev);
 
 extern int cpuidle_select(struct cpuidle_driver *drv,
-			  struct cpuidle_device *dev);
+			  struct cpuidle_device *dev,
+			  bool *stop_tick);
 extern int cpuidle_enter(struct cpuidle_driver *drv,
 			 struct cpuidle_device *dev, int index);
 extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
@@ -167,7 +168,7 @@ static inline bool cpuidle_not_available(struct cpuidle_driver *drv,
 					 struct cpuidle_device *dev)
 {return true; }
 static inline int cpuidle_select(struct cpuidle_driver *drv,
-				 struct cpuidle_device *dev)
+				 struct cpuidle_device *dev, bool *stop_tick)
 {return -ENODEV; }
 static inline int cpuidle_enter(struct cpuidle_driver *drv,
 				struct cpuidle_device *dev, int index)
@@ -250,7 +251,8 @@ struct cpuidle_governor {
 					struct cpuidle_device *dev);
 
 	int  (*select)		(struct cpuidle_driver *drv,
-					struct cpuidle_device *dev);
+					struct cpuidle_device *dev,
+					bool *stop_tick);
 	void (*reflect)		(struct cpuidle_device *dev, int index);
 };
 
diff --git a/include/linux/tick.h b/include/linux/tick.h
index fccebfba167e..ef0717e5e526 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -120,6 +120,7 @@ extern void tick_nohz_idle_restart_tick(void);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
+extern bool tick_nohz_idle_got_tick(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
 extern unsigned long tick_nohz_get_idle_calls(void);
 extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
@@ -141,6 +142,7 @@ static inline void tick_nohz_idle_stop_tick(void) { }
 static inline void tick_nohz_idle_restart_tick(void) { }
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
+static inline bool tick_nohz_idle_got_tick(void) { return false; }
 
 static inline ktime_t tick_nohz_get_sleep_length(void)
 {
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 4f64835d38a8..a966bd2a6fa0 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -183,13 +183,15 @@ static void cpuidle_idle_call(void)
 		next_state = cpuidle_find_deepest_state(drv, dev);
 		call_cpuidle(drv, dev, next_state);
 	} else {
+		bool stop_tick = true;
+
 		tick_nohz_idle_stop_tick();
 		rcu_idle_enter();
 
 		/*
 		 * Ask the cpuidle framework to choose a convenient idle state.
 		 */
-		next_state = cpuidle_select(drv, dev);
+		next_state = cpuidle_select(drv, dev, &stop_tick);
 		entered_state = call_cpuidle(drv, dev, next_state);
 		/*
 		 * Give the governor an opportunity to reflect on the outcome
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f5d37788ea85..69fe113cfc7f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -966,6 +966,20 @@ void tick_nohz_irq_exit(void)
 		tick_nohz_full_update_tick(ts);
 }
 
+/**
+ * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
+ */
+bool tick_nohz_idle_got_tick(void)
+{
+	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+	if (ts->inidle > 1) {
+		ts->inidle = 1;
+		return true;
+	}
+	return false;
+}
+
 /**
  * tick_nohz_get_sleep_length - return the length of the current sleep
  *
@@ -1077,6 +1091,9 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	struct pt_regs *regs = get_irq_regs();
 	ktime_t now = ktime_get();
 
+	if (ts->inidle)
+		ts->inidle = 2;
+
 	dev->next_event = KTIME_MAX;
 
 	tick_sched_do_timer(now);
@@ -1174,6 +1191,9 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 	struct pt_regs *regs = get_irq_regs();
 	ktime_t now = ktime_get();
 
+	if (ts->inidle)
+		ts->inidle = 2;
+
 	tick_sched_do_timer(now);
 
 	/*
-- 
cgit 


From 23a8d888107ce4ce444eab2dcebf4cfb3578770b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 5 Apr 2018 19:07:57 +0200
Subject: time: tick-sched: Split tick_nohz_stop_sched_tick()

In order to address the issue with short idle duration predictions
by the idle governor after the scheduler tick has been stopped, split
tick_nohz_stop_sched_tick() into two separate routines, one computing
the time to the next timer event and the other simply stopping the
tick when the time to the next timer event is known.

Prepare these two routines to be called separately, as one of them
will be called by the idle governor in the cpuidle_select() code
path after subsequent changes.

Update the former callers of tick_nohz_stop_sched_tick() to use
the new routines, tick_nohz_next_event() and tick_nohz_stop_tick(),
instead of it and move the updates of the sleep_length field in
struct tick_sched into __tick_nohz_idle_stop_tick() as it doesn't
need to be updated anywhere else.

There should be no intentional visible changes in functionality
resulting from this change.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/time/tick-sched.c | 124 +++++++++++++++++++++++++++++------------------
 kernel/time/tick-sched.h |   4 ++
 2 files changed, 82 insertions(+), 46 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 69fe113cfc7f..f56d2c695712 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -641,13 +641,10 @@ static inline bool local_timer_softirq_pending(void)
 	return local_softirq_pending() & TIMER_SOFTIRQ;
 }
 
-static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
-					 ktime_t now, int cpu)
+static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 {
-	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 	u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
 	unsigned long seq, basejiff;
-	ktime_t	tick;
 
 	/* Read jiffies and the time when jiffies were updated last */
 	do {
@@ -656,6 +653,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		basejiff = jiffies;
 	} while (read_seqretry(&jiffies_lock, seq));
 	ts->last_jiffies = basejiff;
+	ts->timer_expires_base = basemono;
 
 	/*
 	 * Keep the periodic tick, when RCU, architecture or irq_work
@@ -700,47 +698,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		 * next period, so no point in stopping it either, bail.
 		 */
 		if (!ts->tick_stopped) {
-			tick = 0;
+			ts->timer_expires = 0;
 			goto out;
 		}
 	}
 
+	/*
+	 * If this CPU is the one which had the do_timer() duty last, we limit
+	 * the sleep time to the timekeeping max_deferment value.
+	 * Otherwise we can sleep as long as we want.
+	 */
+	delta = timekeeping_max_deferment();
+	if (cpu != tick_do_timer_cpu &&
+	    (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last))
+		delta = KTIME_MAX;
+
+	/* Calculate the next expiry time */
+	if (delta < (KTIME_MAX - basemono))
+		expires = basemono + delta;
+	else
+		expires = KTIME_MAX;
+
+	ts->timer_expires = min_t(u64, expires, next_tick);
+
+out:
+	return ts->timer_expires;
+}
+
+static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
+{
+	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+	u64 basemono = ts->timer_expires_base;
+	u64 expires = ts->timer_expires;
+	ktime_t tick = expires;
+
+	/* Make sure we won't be trying to stop it twice in a row. */
+	ts->timer_expires_base = 0;
+
 	/*
 	 * If this CPU is the one which updates jiffies, then give up
 	 * the assignment and let it be taken by the CPU which runs
 	 * the tick timer next, which might be this CPU as well. If we
 	 * don't drop this here the jiffies might be stale and
 	 * do_timer() never invoked. Keep track of the fact that it
-	 * was the one which had the do_timer() duty last. If this CPU
-	 * is the one which had the do_timer() duty last, we limit the
-	 * sleep time to the timekeeping max_deferment value.
-	 * Otherwise we can sleep as long as we want.
+	 * was the one which had the do_timer() duty last.
 	 */
-	delta = timekeeping_max_deferment();
 	if (cpu == tick_do_timer_cpu) {
 		tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 		ts->do_timer_last = 1;
 	} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
-		delta = KTIME_MAX;
 		ts->do_timer_last = 0;
-	} else if (!ts->do_timer_last) {
-		delta = KTIME_MAX;
 	}
 
-	/* Calculate the next expiry time */
-	if (delta < (KTIME_MAX - basemono))
-		expires = basemono + delta;
-	else
-		expires = KTIME_MAX;
-
-	expires = min_t(u64, expires, next_tick);
-	tick = expires;
-
 	/* Skip reprogram of event if its not changed */
 	if (ts->tick_stopped && (expires == ts->next_tick)) {
 		/* Sanity check: make sure clockevent is actually programmed */
 		if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
-			goto out;
+			return;
 
 		WARN_ON_ONCE(1);
 		printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
@@ -774,7 +788,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	if (unlikely(expires == KTIME_MAX)) {
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 			hrtimer_cancel(&ts->sched_timer);
-		goto out;
+		return;
 	}
 
 	hrtimer_set_expires(&ts->sched_timer, tick);
@@ -783,15 +797,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
 	else
 		tick_program_event(tick, 1);
-out:
-	/*
-	 * Update the estimated sleep length until the next timer
-	 * (not only the tick).
-	 */
-	ts->sleep_length = ktime_sub(dev->next_event, now);
-	return tick;
 }
 
+static void tick_nohz_retain_tick(struct tick_sched *ts)
+{
+	ts->timer_expires_base = 0;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu)
+{
+	if (tick_nohz_next_event(ts, cpu))
+		tick_nohz_stop_tick(ts, cpu);
+	else
+		tick_nohz_retain_tick(ts);
+}
+#endif /* CONFIG_NO_HZ_FULL */
+
 static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 {
 	/* Update jiffies first */
@@ -827,7 +849,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
 		return;
 
 	if (can_stop_full_tick(cpu, ts))
-		tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+		tick_nohz_stop_sched_tick(ts, cpu);
 	else if (ts->tick_stopped)
 		tick_nohz_restart_sched_tick(ts, ktime_get());
 #endif
@@ -853,10 +875,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 		return false;
 	}
 
-	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
-		ts->sleep_length = NSEC_PER_SEC / HZ;
+	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
 		return false;
-	}
 
 	if (need_resched())
 		return false;
@@ -893,29 +913,37 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 
 static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
 {
+	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 	ktime_t expires;
 	int cpu = smp_processor_id();
 
-	if (can_stop_idle_tick(cpu, ts)) {
+	WARN_ON_ONCE(ts->timer_expires_base);
+
+	if (!can_stop_idle_tick(cpu, ts))
+		goto out;
+
+	expires = tick_nohz_next_event(ts, cpu);
+
+	ts->idle_calls++;
+
+	if (expires > 0LL) {
 		int was_stopped = ts->tick_stopped;
 
-		ts->idle_calls++;
+		tick_nohz_stop_tick(ts, cpu);
 
-		/*
-		 * The idle entry time should be a sufficient approximation of
-		 * the current time at this point.
-		 */
-		expires = tick_nohz_stop_sched_tick(ts, ts->idle_entrytime, cpu);
-		if (expires > 0LL) {
-			ts->idle_sleeps++;
-			ts->idle_expires = expires;
-		}
+		ts->idle_sleeps++;
+		ts->idle_expires = expires;
 
 		if (!was_stopped && ts->tick_stopped) {
 			ts->idle_jiffies = ts->last_jiffies;
 			nohz_balance_enter_idle(cpu);
 		}
+	} else {
+		tick_nohz_retain_tick(ts);
 	}
+
+out:
+	ts->sleep_length = ktime_sub(dev->next_event, ts->idle_entrytime);
 }
 
 /**
@@ -942,6 +970,9 @@ void tick_nohz_idle_enter(void)
 	local_irq_disable();
 
 	ts = this_cpu_ptr(&tick_cpu_sched);
+
+	WARN_ON_ONCE(ts->timer_expires_base);
+
 	ts->inidle = 1;
 	tick_nohz_start_idle(ts);
 
@@ -1067,6 +1098,7 @@ void tick_nohz_idle_exit(void)
 	local_irq_disable();
 
 	WARN_ON_ONCE(!ts->inidle);
+	WARN_ON_ONCE(ts->timer_expires_base);
 
 	ts->inidle = 0;
 
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 954b43dbf21c..53e45a39bdbc 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -39,6 +39,8 @@ enum tick_nohz_mode {
  * @idle_sleeptime:	Sum of the time slept in idle with sched tick stopped
  * @iowait_sleeptime:	Sum of the time slept in idle with sched tick stopped, with IO outstanding
  * @sleep_length:	Duration of the current idle sleep
+ * @timer_expires:	Anticipated timer expiration time (in case sched tick is stopped)
+ * @timer_expires_base:	Base time clock monotonic for @timer_expires
  * @do_timer_lst:	CPU was the last one doing do_timer before going idle
  */
 struct tick_sched {
@@ -60,6 +62,8 @@ struct tick_sched {
 	ktime_t				iowait_sleeptime;
 	ktime_t				sleep_length;
 	unsigned long			last_jiffies;
+	u64				timer_expires;
+	u64				timer_expires_base;
 	u64				next_timer;
 	ktime_t				idle_expires;
 	int				do_timer_last;
-- 
cgit 


From a59855cd8c613ba4bb95147f6176360d95f75e60 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 3 Apr 2018 23:17:00 +0200
Subject: time: hrtimer: Introduce hrtimer_next_event_without()

The next set of changes will need to compute the time to the next
hrtimer event over all hrtimers except for the scheduler tick one.

To that end introduce a new helper function,
hrtimer_next_event_without(), for computing the time until the next
hrtimer event over all timers except for one and modify the underlying
code in __hrtimer_next_event_base() to prepare it for being called by
that new function.

No intentional changes in functionality.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/hrtimer.h |  1 +
 kernel/time/hrtimer.c   | 55 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index c7902ca7c9f4..3892e9c8b2de 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -426,6 +426,7 @@ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
 }
 
 extern u64 hrtimer_get_next_event(void);
+extern u64 hrtimer_next_event_without(const struct hrtimer *exclude);
 
 extern bool hrtimer_active(const struct hrtimer *timer);
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 23788100e214..6d387dbd7304 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -490,6 +490,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
 	while ((base = __next_base((cpu_base), &(active))))
 
 static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
+					 const struct hrtimer *exclude,
 					 unsigned int active,
 					 ktime_t expires_next)
 {
@@ -502,9 +503,24 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
 
 		next = timerqueue_getnext(&base->active);
 		timer = container_of(next, struct hrtimer, node);
+		if (timer == exclude) {
+			/* Get to the next timer in the queue. */
+			struct rb_node *rbn = rb_next(&next->node);
+
+			next = rb_entry_safe(rbn, struct timerqueue_node, node);
+			if (!next)
+				continue;
+
+			timer = container_of(next, struct hrtimer, node);
+		}
 		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
 		if (expires < expires_next) {
 			expires_next = expires;
+
+			/* Skip cpu_base update if a timer is being excluded. */
+			if (exclude)
+				continue;
+
 			if (timer->is_soft)
 				cpu_base->softirq_next_timer = timer;
 			else
@@ -548,7 +564,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
 	if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
 		active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
 		cpu_base->softirq_next_timer = NULL;
-		expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
+		expires_next = __hrtimer_next_event_base(cpu_base, NULL,
+							 active, KTIME_MAX);
 
 		next_timer = cpu_base->softirq_next_timer;
 	}
@@ -556,7 +573,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
 	if (active_mask & HRTIMER_ACTIVE_HARD) {
 		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
 		cpu_base->next_timer = next_timer;
-		expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
+		expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
+							 expires_next);
 	}
 
 	return expires_next;
@@ -1202,6 +1220,39 @@ u64 hrtimer_get_next_event(void)
 
 	return expires;
 }
+
+/**
+ * hrtimer_next_event_without - time until next expiry event w/o one timer
+ * @exclude:	timer to exclude
+ *
+ * Returns the next expiry time over all timers except for the @exclude one or
+ * KTIME_MAX if none of them is pending.
+ */
+u64 hrtimer_next_event_without(const struct hrtimer *exclude)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	u64 expires = KTIME_MAX;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+	if (__hrtimer_hres_active(cpu_base)) {
+		unsigned int active;
+
+		if (!cpu_base->softirq_activated) {
+			active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+			expires = __hrtimer_next_event_base(cpu_base, exclude,
+							    active, KTIME_MAX);
+		}
+		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+		expires = __hrtimer_next_event_base(cpu_base, exclude, active,
+						    expires);
+	}
+
+	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+
+	return expires;
+}
 #endif
 
 static inline int hrtimer_clockid_to_base(clockid_t clock_id)
-- 
cgit 


From 554c8aa8ecade210d58a252173bb8f2106552a44 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 3 Apr 2018 23:17:11 +0200
Subject: sched: idle: Select idle state before stopping the tick

In order to address the issue with short idle duration predictions
by the idle governor after the scheduler tick has been stopped,
reorder the code in cpuidle_idle_call() so that the governor idle
state selection runs before tick_nohz_idle_go_idle() and use the
"nohz" hint returned by cpuidle_select() to decide whether or not
to stop the tick.

This isn't straightforward, because menu_select() invokes
tick_nohz_get_sleep_length() to get the time to the next timer
event and the number returned by the latter comes from
__tick_nohz_idle_stop_tick().  Fortunately, however, it is possible
to compute that number without actually stopping the tick and with
the help of the existing code.

Namely, tick_nohz_get_sleep_length() can be made call
tick_nohz_next_event(), introduced earlier, to get the time to the
next non-highres timer event.  If that happens, tick_nohz_next_event()
need not be called by __tick_nohz_idle_stop_tick() again.

If it turns out that the scheduler tick cannot be stopped going
forward or the next timer event is too close for the tick to be
stopped, tick_nohz_get_sleep_length() can simply return the time to
the next event currently programmed into the corresponding clock
event device.

In addition to knowing the return value of tick_nohz_next_event(),
however, tick_nohz_get_sleep_length() needs to know the time to the
next highres timer event, but with the scheduler tick timer excluded,
which can be computed with the help of hrtimer_get_next_event().

That minimum of that number and the tick_nohz_next_event() return
value is the total time to the next timer event with the assumption
that the tick will be stopped.  It can be returned to the idle
governor which can use it for predicting idle duration (under the
assumption that the tick will be stopped) and deciding whether or
not it makes sense to stop the tick before putting the CPU into the
selected idle state.

With the above, the sleep_length field in struct tick_sched is not
necessary any more, so drop it.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227
Reported-by: Doug Smythies <dsmythies@telus.net>
Reported-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/tick.h     |  2 ++
 kernel/sched/idle.c      | 11 ++++++---
 kernel/time/tick-sched.c | 61 ++++++++++++++++++++++++++++++++++++++----------
 kernel/time/tick-sched.h |  2 --
 4 files changed, 59 insertions(+), 17 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index ef0717e5e526..e8e7ff16b929 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -116,6 +116,7 @@ extern bool tick_nohz_enabled;
 extern bool tick_nohz_tick_stopped(void);
 extern bool tick_nohz_tick_stopped_cpu(int cpu);
 extern void tick_nohz_idle_stop_tick(void);
+extern void tick_nohz_idle_retain_tick(void);
 extern void tick_nohz_idle_restart_tick(void);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
@@ -139,6 +140,7 @@ static inline void tick_nohz_idle_stop_tick_protected(void)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
 static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
 static inline void tick_nohz_idle_stop_tick(void) { }
+static inline void tick_nohz_idle_retain_tick(void) { }
 static inline void tick_nohz_idle_restart_tick(void) { }
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index a966bd2a6fa0..1a3e9bddd17b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -185,13 +185,18 @@ static void cpuidle_idle_call(void)
 	} else {
 		bool stop_tick = true;
 
-		tick_nohz_idle_stop_tick();
-		rcu_idle_enter();
-
 		/*
 		 * Ask the cpuidle framework to choose a convenient idle state.
 		 */
 		next_state = cpuidle_select(drv, dev, &stop_tick);
+
+		if (stop_tick)
+			tick_nohz_idle_stop_tick();
+		else
+			tick_nohz_idle_retain_tick();
+
+		rcu_idle_enter();
+
 		entered_state = call_cpuidle(drv, dev, next_state);
 		/*
 		 * Give the governor an opportunity to reflect on the outcome
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f56d2c695712..c57c98c7e953 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -913,16 +913,19 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 
 static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
 {
-	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 	ktime_t expires;
 	int cpu = smp_processor_id();
 
-	WARN_ON_ONCE(ts->timer_expires_base);
-
-	if (!can_stop_idle_tick(cpu, ts))
-		goto out;
-
-	expires = tick_nohz_next_event(ts, cpu);
+	/*
+	 * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
+	 * tick timer expiration time is known already.
+	 */
+	if (ts->timer_expires_base)
+		expires = ts->timer_expires;
+	else if (can_stop_idle_tick(cpu, ts))
+		expires = tick_nohz_next_event(ts, cpu);
+	else
+		return;
 
 	ts->idle_calls++;
 
@@ -941,9 +944,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
 	} else {
 		tick_nohz_retain_tick(ts);
 	}
-
-out:
-	ts->sleep_length = ktime_sub(dev->next_event, ts->idle_entrytime);
 }
 
 /**
@@ -956,6 +956,16 @@ void tick_nohz_idle_stop_tick(void)
 	__tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
 }
 
+void tick_nohz_idle_retain_tick(void)
+{
+	tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
+	/*
+	 * Undo the effect of get_next_timer_interrupt() called from
+	 * tick_nohz_next_event().
+	 */
+	timer_clear_idle();
+}
+
 /**
  * tick_nohz_idle_enter - prepare for entering idle on the current CPU
  *
@@ -1012,15 +1022,42 @@ bool tick_nohz_idle_got_tick(void)
 }
 
 /**
- * tick_nohz_get_sleep_length - return the length of the current sleep
+ * tick_nohz_get_sleep_length - return the expected length of the current sleep
  *
  * Called from power state control code with interrupts disabled
  */
 ktime_t tick_nohz_get_sleep_length(void)
 {
+	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+	int cpu = smp_processor_id();
+	/*
+	 * The idle entry time is expected to be a sufficient approximation of
+	 * the current time at this point.
+	 */
+	ktime_t now = ts->idle_entrytime;
+	ktime_t next_event;
+
+	WARN_ON_ONCE(!ts->inidle);
+
+	if (!can_stop_idle_tick(cpu, ts))
+		goto out_dev;
+
+	next_event = tick_nohz_next_event(ts, cpu);
+	if (!next_event)
+		goto out_dev;
+
+	/*
+	 * If the next highres timer to expire is earlier than next_event, the
+	 * idle governor needs to know that.
+	 */
+	next_event = min_t(u64, next_event,
+			   hrtimer_next_event_without(&ts->sched_timer));
+
+	return ktime_sub(next_event, now);
 
-	return ts->sleep_length;
+out_dev:
+	return ktime_sub(dev->next_event, now);
 }
 
 /**
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 53e45a39bdbc..2b845f2c44b1 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -38,7 +38,6 @@ enum tick_nohz_mode {
  * @idle_exittime:	Time when the idle state was left
  * @idle_sleeptime:	Sum of the time slept in idle with sched tick stopped
  * @iowait_sleeptime:	Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @sleep_length:	Duration of the current idle sleep
  * @timer_expires:	Anticipated timer expiration time (in case sched tick is stopped)
  * @timer_expires_base:	Base time clock monotonic for @timer_expires
  * @do_timer_lst:	CPU was the last one doing do_timer before going idle
@@ -60,7 +59,6 @@ struct tick_sched {
 	ktime_t				idle_exittime;
 	ktime_t				idle_sleeptime;
 	ktime_t				iowait_sleeptime;
-	ktime_t				sleep_length;
 	unsigned long			last_jiffies;
 	u64				timer_expires;
 	u64				timer_expires_base;
-- 
cgit 


From 296bb1e51a4838a6488ec5ce676607093482ecbc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 5 Apr 2018 19:12:34 +0200
Subject: cpuidle: menu: Refine idle state selection for running tick

If the tick isn't stopped, the target residency of the state selected
by the menu governor may be greater than the actual time to the next
tick and that means lost energy.

To avoid that, make tick_nohz_get_sleep_length() return the current
time to the next event (before stopping the tick) in addition to the
estimated one via an extra pointer argument and make menu_select()
use that value to refine the state selection when necessary.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/cpuidle/governors/menu.c | 27 +++++++++++++++++++++++++--
 include/linux/tick.h             |  7 ++++---
 kernel/time/tick-sched.c         | 12 ++++++------
 3 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index f53a929bd2bd..267982e471e0 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -295,6 +295,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	unsigned int expected_interval;
 	unsigned long nr_iowaiters, cpu_load;
 	int resume_latency = dev_pm_qos_raw_read_value(device);
+	ktime_t delta_next;
 
 	if (data->needs_update) {
 		menu_update(drv, dev);
@@ -312,7 +313,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	}
 
 	/* determine the expected residency time, round up */
-	data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
+	data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next));
 
 	get_iowait_load(&nr_iowaiters, &cpu_load);
 	data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
@@ -396,9 +397,31 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	 * expected idle duration is shorter than the tick period length.
 	 */
 	if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
-	    expected_interval < TICK_USEC)
+	    expected_interval < TICK_USEC) {
+		unsigned int delta_next_us = ktime_to_us(delta_next);
+
 		*stop_tick = false;
 
+		if (!tick_nohz_tick_stopped() && idx > 0 &&
+		    drv->states[idx].target_residency > delta_next_us) {
+			/*
+			 * The tick is not going to be stopped and the target
+			 * residency of the state to be returned is not within
+			 * the time until the next timer event including the
+			 * tick, so try to correct that.
+			 */
+			for (i = idx - 1; i >= 0; i--) {
+			    if (drv->states[i].disabled ||
+			        dev->states_usage[i].disable)
+					continue;
+
+				idx = i;
+				if (drv->states[i].target_residency <= delta_next_us)
+					break;
+			}
+		}
+	}
+
 	data->last_state_idx = idx;
 
 	return data->last_state_idx;
diff --git a/include/linux/tick.h b/include/linux/tick.h
index e8e7ff16b929..55388ab45fd4 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -122,7 +122,7 @@ extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
 extern bool tick_nohz_idle_got_tick(void);
-extern ktime_t tick_nohz_get_sleep_length(void);
+extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next);
 extern unsigned long tick_nohz_get_idle_calls(void);
 extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
@@ -146,9 +146,10 @@ static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
 static inline bool tick_nohz_idle_got_tick(void) { return false; }
 
-static inline ktime_t tick_nohz_get_sleep_length(void)
+static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 {
-	return NSEC_PER_SEC / HZ;
+	*delta_next = TICK_NSEC;
+	return *delta_next;
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c57c98c7e953..edb9d49b4996 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1023,10 +1023,11 @@ bool tick_nohz_idle_got_tick(void)
 
 /**
  * tick_nohz_get_sleep_length - return the expected length of the current sleep
+ * @delta_next: duration until the next event if the tick cannot be stopped
  *
  * Called from power state control code with interrupts disabled
  */
-ktime_t tick_nohz_get_sleep_length(void)
+ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 {
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
@@ -1040,12 +1041,14 @@ ktime_t tick_nohz_get_sleep_length(void)
 
 	WARN_ON_ONCE(!ts->inidle);
 
+	*delta_next = ktime_sub(dev->next_event, now);
+
 	if (!can_stop_idle_tick(cpu, ts))
-		goto out_dev;
+		return *delta_next;
 
 	next_event = tick_nohz_next_event(ts, cpu);
 	if (!next_event)
-		goto out_dev;
+		return *delta_next;
 
 	/*
 	 * If the next highres timer to expire is earlier than next_event, the
@@ -1055,9 +1058,6 @@ ktime_t tick_nohz_get_sleep_length(void)
 			   hrtimer_next_event_without(&ts->sched_timer));
 
 	return ktime_sub(next_event, now);
-
-out_dev:
-	return ktime_sub(dev->next_event, now);
 }
 
 /**
-- 
cgit 


From 87c9fe6ee495f78f36d39cb37f6a714444a093ee Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 5 Apr 2018 19:12:43 +0200
Subject: cpuidle: menu: Avoid selecting shallow states with stopped tick

If the scheduler tick has been stopped already and the governor
selects a shallow idle state, the CPU can spend a long time in that
state if the selection is based on an inaccurate prediction of idle
time.  That effect turns out to be relevant, so it needs to be
mitigated.

To that end, modify the menu governor to discard the result of the
idle time prediction if the tick is stopped and the predicted idle
time is less than the tick period length, unless the tick timer is
going to expire soon.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/cpuidle/governors/menu.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 267982e471e0..1bfe03ceb236 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -352,13 +352,28 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	 */
 	data->predicted_us = min(data->predicted_us, expected_interval);
 
-	/*
-	 * Use the performance multiplier and the user-configurable
-	 * latency_req to determine the maximum exit latency.
-	 */
-	interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
-	if (latency_req > interactivity_req)
-		latency_req = interactivity_req;
+	if (tick_nohz_tick_stopped()) {
+		/*
+		 * If the tick is already stopped, the cost of possible short
+		 * idle duration misprediction is much higher, because the CPU
+		 * may be stuck in a shallow idle state for a long time as a
+		 * result of it.  In that case say we might mispredict and try
+		 * to force the CPU into a state for which we would have stopped
+		 * the tick, unless a timer is going to expire really soon
+		 * anyway.
+		 */
+		if (data->predicted_us < TICK_USEC)
+			data->predicted_us = min_t(unsigned int, TICK_USEC,
+						   ktime_to_us(delta_next));
+	} else {
+		/*
+		 * Use the performance multiplier and the user-configurable
+		 * latency_req to determine the maximum exit latency.
+		 */
+		interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
+		if (latency_req > interactivity_req)
+			latency_req = interactivity_req;
+	}
 
 	expected_interval = data->predicted_us;
 	/*
-- 
cgit 


From 2bc629a692a76b9ee3dab9c303e3f501bece66a4 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 6 Apr 2018 04:32:37 +0200
Subject: nohz: Gather tick_sched booleans under a common flag field

Optimize the space and leave plenty of room for further flags.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
[ rjw: Do not use __this_cpu_read() to access tick_stopped and add
       got_idle_tick to avoid overloading inidle ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/time/tick-sched.c | 12 +++++++-----
 kernel/time/tick-sched.h | 12 ++++++++----
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index edb9d49b4996..a9d5cc7406d3 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -465,7 +465,9 @@ __setup("nohz=", setup_tick_nohz);
 
 bool tick_nohz_tick_stopped(void)
 {
-	return __this_cpu_read(tick_cpu_sched.tick_stopped);
+	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+	return ts->tick_stopped;
 }
 
 bool tick_nohz_tick_stopped_cpu(int cpu)
@@ -1014,8 +1016,8 @@ bool tick_nohz_idle_got_tick(void)
 {
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 
-	if (ts->inidle > 1) {
-		ts->inidle = 1;
+	if (ts->got_idle_tick) {
+		ts->got_idle_tick = 0;
 		return true;
 	}
 	return false;
@@ -1161,7 +1163,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	ktime_t now = ktime_get();
 
 	if (ts->inidle)
-		ts->inidle = 2;
+		ts->got_idle_tick = 1;
 
 	dev->next_event = KTIME_MAX;
 
@@ -1261,7 +1263,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 	ktime_t now = ktime_get();
 
 	if (ts->inidle)
-		ts->inidle = 2;
+		ts->got_idle_tick = 1;
 
 	tick_sched_do_timer(now);
 
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 2b845f2c44b1..6de959a854b2 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -41,19 +41,24 @@ enum tick_nohz_mode {
  * @timer_expires:	Anticipated timer expiration time (in case sched tick is stopped)
  * @timer_expires_base:	Base time clock monotonic for @timer_expires
  * @do_timer_lst:	CPU was the last one doing do_timer before going idle
+ * @got_idle_tick:	Tick timer function has run with @inidle set
  */
 struct tick_sched {
 	struct hrtimer			sched_timer;
 	unsigned long			check_clocks;
 	enum tick_nohz_mode		nohz_mode;
+
+	unsigned int			inidle		: 1;
+	unsigned int			tick_stopped	: 1;
+	unsigned int			idle_active	: 1;
+	unsigned int			do_timer_last	: 1;
+	unsigned int			got_idle_tick	: 1;
+
 	ktime_t				last_tick;
 	ktime_t				next_tick;
-	int				inidle;
-	int				tick_stopped;
 	unsigned long			idle_jiffies;
 	unsigned long			idle_calls;
 	unsigned long			idle_sleeps;
-	int				idle_active;
 	ktime_t				idle_entrytime;
 	ktime_t				idle_waketime;
 	ktime_t				idle_exittime;
@@ -64,7 +69,6 @@ struct tick_sched {
 	u64				timer_expires_base;
 	u64				next_timer;
 	ktime_t				idle_expires;
-	int				do_timer_last;
 	atomic_t			tick_dep_mask;
 };
 
-- 
cgit 


From ff7de6203131e3d60cda60aeda12c69373ca5d43 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 6 Apr 2018 14:59:13 +0200
Subject: nohz: Avoid duplication of code related to got_idle_tick

Move the code setting ts->got_idle_tick into tick_sched_do_timer() to
avoid code duplication.

No intentional changes in functionality.

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/time/tick-sched.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a9d5cc7406d3..956831cf6cfb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -113,8 +113,7 @@ static ktime_t tick_init_jiffy_update(void)
 	return period;
 }
 
-
-static void tick_sched_do_timer(ktime_t now)
+static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
 {
 	int cpu = smp_processor_id();
 
@@ -134,6 +133,9 @@ static void tick_sched_do_timer(ktime_t now)
 	/* Check, if the jiffies need an update */
 	if (tick_do_timer_cpu == cpu)
 		tick_do_update_jiffies64(now);
+
+	if (ts->inidle)
+		ts->got_idle_tick = 1;
 }
 
 static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
@@ -1162,12 +1164,9 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	struct pt_regs *regs = get_irq_regs();
 	ktime_t now = ktime_get();
 
-	if (ts->inidle)
-		ts->got_idle_tick = 1;
-
 	dev->next_event = KTIME_MAX;
 
-	tick_sched_do_timer(now);
+	tick_sched_do_timer(ts, now);
 	tick_sched_handle(ts, regs);
 
 	/* No need to reprogram if we are running tickless  */
@@ -1262,10 +1261,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 	struct pt_regs *regs = get_irq_regs();
 	ktime_t now = ktime_get();
 
-	if (ts->inidle)
-		ts->got_idle_tick = 1;
-
-	tick_sched_do_timer(now);
+	tick_sched_do_timer(ts, now);
 
 	/*
 	 * Do not call, when we are not in irq context and have
-- 
cgit 


From 7d2f6abb402ae38ec4bb7beabb3980bb834b1e0d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 7 Apr 2018 19:11:55 +0200
Subject: time: hrtimer: Use timerqueue_iterate_next() to get to the next timer

Use timerqueue_iterate_next() to get to the next timer in
__hrtimer_next_event_base() without browsing the timerqueue
details diredctly.

No intentional changes in functionality.

Suggested-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/time/hrtimer.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 6d387dbd7304..14e858753d76 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -505,9 +505,7 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
 		timer = container_of(next, struct hrtimer, node);
 		if (timer == exclude) {
 			/* Get to the next timer in the queue. */
-			struct rb_node *rbn = rb_next(&next->node);
-
-			next = rb_entry_safe(rbn, struct timerqueue_node, node);
+			next = timerqueue_iterate_next(next);
 			if (!next)
 				continue;
 
-- 
cgit 


From 792ccb457ade4107ad9f1892cb23b78c13a10fb1 Mon Sep 17 00:00:00 2001
From: "Prakash, Prashanth" <pprakash@codeaurora.org>
Date: Wed, 4 Apr 2018 12:10:44 -0600
Subject: cpuidle: Add definition of residency to sysfs documentation

Add definition of minimum residency to sysfs documentation and
update the tree to include the residency sysfs entry.

Signed-off-by: Prashanth Prakash <pprakash@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpuidle/sysfs.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/cpuidle/sysfs.txt b/Documentation/cpuidle/sysfs.txt
index b6f44f490ed7..d1587f434e7b 100644
--- a/Documentation/cpuidle/sysfs.txt
+++ b/Documentation/cpuidle/sysfs.txt
@@ -40,6 +40,7 @@ total 0
 -r--r--r-- 1 root root 4096 Feb  8 10:42 latency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 name
 -r--r--r-- 1 root root 4096 Feb  8 10:42 power
+-r--r--r-- 1 root root 4096 Feb  8 10:42 residency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 time
 -r--r--r-- 1 root root 4096 Feb  8 10:42 usage
 
@@ -50,6 +51,7 @@ total 0
 -r--r--r-- 1 root root 4096 Feb  8 10:42 latency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 name
 -r--r--r-- 1 root root 4096 Feb  8 10:42 power
+-r--r--r-- 1 root root 4096 Feb  8 10:42 residency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 time
 -r--r--r-- 1 root root 4096 Feb  8 10:42 usage
 
@@ -60,6 +62,7 @@ total 0
 -r--r--r-- 1 root root 4096 Feb  8 10:42 latency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 name
 -r--r--r-- 1 root root 4096 Feb  8 10:42 power
+-r--r--r-- 1 root root 4096 Feb  8 10:42 residency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 time
 -r--r--r-- 1 root root 4096 Feb  8 10:42 usage
 
@@ -70,6 +73,7 @@ total 0
 -r--r--r-- 1 root root 4096 Feb  8 10:42 latency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 name
 -r--r--r-- 1 root root 4096 Feb  8 10:42 power
+-r--r--r-- 1 root root 4096 Feb  8 10:42 residency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 time
 -r--r--r-- 1 root root 4096 Feb  8 10:42 usage
 --------------------------------------------------------------------------------
@@ -78,6 +82,8 @@ total 0
 * desc : Small description about the idle state (string)
 * disable : Option to disable this idle state (bool) -> see note below
 * latency : Latency to exit out of this idle state (in microseconds)
+* residency : Time after which a state becomes more effecient than any
+  shallower state (in microseconds)
 * name : Name of the idle state (string)
 * power : Power consumed while in this idle state (in milliwatts)
 * time : Total time spent in this idle state (in microseconds)
-- 
cgit 


From fe43e2ce526979271f6c682727c6af9e7c19bec5 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Fri, 30 Mar 2018 16:18:44 -0500
Subject: PM / QoS: mark expected switch fall-throughs

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/qos.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9d7503910ce2..fa39092b7aea 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -295,6 +295,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
 		 * changed
 		 */
 		plist_del(node, &c->list);
+		/* fall through */
 	case PM_QOS_ADD_REQ:
 		plist_node_init(node, new_value);
 		plist_add(node, &c->list);
@@ -367,6 +368,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
 		break;
 	case PM_QOS_UPDATE_REQ:
 		pm_qos_flags_remove_req(pqf, req);
+		/* fall through */
 	case PM_QOS_ADD_REQ:
 		req->flags = val;
 		INIT_LIST_HEAD(&req->node);
-- 
cgit 


From b258dfeafb4a7a6b6c81fb80e6d3df8e8235524f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 30 Mar 2018 11:52:19 +0200
Subject: cpufreq: intel_pstate: Do not include debugfs.h

The intel_pstate driver doesn't use debugfs any more, so drop
linux/debugfs.h from the list of included headers in it.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/intel_pstate.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 6d084c61ee25..17e566afbb41 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -26,7 +26,6 @@
 #include <linux/sysfs.h>
 #include <linux/types.h>
 #include <linux/fs.h>
-#include <linux/debugfs.h>
 #include <linux/acpi.h>
 #include <linux/vmalloc.h>
 #include <trace/events/power.h>
-- 
cgit 


From 44a264ee1e91316b24a808d2b5b6e1815bcb0fee Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 2 Apr 2018 10:58:38 +0530
Subject: cpufreq: ti-cpufreq: Use builtin_platform_driver()

This driver can not be built as a module and there is no need of the
platform driver unregister part. Use builtin_platform_driver() instead
of module_platform_driver().

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/ti-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c
index a099b7bf74cd..6ba709b6f095 100644
--- a/drivers/cpufreq/ti-cpufreq.c
+++ b/drivers/cpufreq/ti-cpufreq.c
@@ -304,7 +304,7 @@ static struct platform_driver ti_cpufreq_driver = {
 		.name = "ti-cpufreq",
 	},
 };
-module_platform_driver(ti_cpufreq_driver);
+builtin_platform_driver(ti_cpufreq_driver);
 
 MODULE_DESCRIPTION("TI CPUFreq/OPP hw-supported driver");
 MODULE_AUTHOR("Dave Gerlach <d-gerlach@ti.com>");
-- 
cgit 


From b8b10bc2015cd91350aac68447377c8410a48865 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 2 Apr 2018 12:50:53 +0530
Subject: cpufreq: CPPC: Don't set transition_latency

Now that the driver has started to set transition_delay_us directly,
there is no need to set transition_latency along with it, as it is not
used by the cpufreq core.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cppc_cpufreq.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 8300a9fcb80c..38f65c371139 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -162,7 +162,6 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		cpu->perf_caps.highest_perf;
 	policy->cpuinfo.max_freq = cppc_dmi_max_khz;
 
-	policy->cpuinfo.transition_latency = cppc_get_transition_latency(cpu_num);
 	policy->transition_delay_us = cppc_get_transition_latency(cpu_num) /
 		NSEC_PER_USEC;
 	policy->shared_type = cpu->shared_type;
-- 
cgit 


From bbcc328561040292f7d6796954d478e4a2335e6f Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@bootlin.com>
Date: Wed, 4 Apr 2018 16:44:44 +0200
Subject: cpufreq: armada-37xx: Fix clock leak

There was no clk_put() balancing the clk_get(). This commit fixes it.

Fixes: 92ce45fb875d (cpufreq: Add DVFS support for Armada 37xx)
Cc: 4.16+ <stable@vger.kernel.org> # 4.16+
Reported-by: Thomas Petazzoni <thomas.petazzoni@bootlin.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/armada-37xx-cpufreq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/armada-37xx-cpufreq.c b/drivers/cpufreq/armada-37xx-cpufreq.c
index c6ebc88a7d8d..72a2975499db 100644
--- a/drivers/cpufreq/armada-37xx-cpufreq.c
+++ b/drivers/cpufreq/armada-37xx-cpufreq.c
@@ -202,6 +202,7 @@ static int __init armada37xx_cpufreq_driver_init(void)
 	cur_frequency = clk_get_rate(clk);
 	if (!cur_frequency) {
 		dev_err(cpu_dev, "Failed to get clock rate for CPU\n");
+		clk_put(clk);
 		return -EINVAL;
 	}
 
@@ -210,6 +211,7 @@ static int __init armada37xx_cpufreq_driver_init(void)
 		return -EINVAL;
 
 	armada37xx_cpufreq_dvfs_setup(nb_pm_base, clk, dvfs->divider);
+	clk_put(clk);
 
 	for (load_lvl = ARMADA_37XX_DVFS_LOAD_0; load_lvl < LOAD_LEVEL_NR;
 	     load_lvl++) {
-- 
cgit 


From 8913315e9459b146e5888ab5138e10daa061b885 Mon Sep 17 00:00:00 2001
From: Shunyong Yang <shunyong.yang@hxt-semitech.com>
Date: Fri, 6 Apr 2018 10:43:49 +0800
Subject: cpufreq: CPPC: Initialize shared perf capabilities of CPUs

When multiple CPUs are related in one cpufreq policy, the first online
CPU will be chosen by default to handle cpufreq operations. Let's take
cpu0 and cpu1 as an example.

When cpu0 is offline, policy->cpu will be shifted to cpu1. cpu1's perf
capabilities should be initialized. Otherwise, perf capabilities are 0s
and speed change can not take effect.

This patch copies perf capabilities of the first online CPU to other
shared CPUs when policy shared type is CPUFREQ_SHARED_TYPE_ANY.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Shunyong Yang <shunyong.yang@hxt-semitech.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cppc_cpufreq.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 38f65c371139..bc5fc1630876 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -166,9 +166,19 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		NSEC_PER_USEC;
 	policy->shared_type = cpu->shared_type;
 
-	if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
+	if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
+		int i;
+
 		cpumask_copy(policy->cpus, cpu->shared_cpu_map);
-	else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
+
+		for_each_cpu(i, policy->cpus) {
+			if (unlikely(i == policy->cpu))
+				continue;
+
+			memcpy(&all_cpu_data[i]->perf_caps, &cpu->perf_caps,
+			       sizeof(cpu->perf_caps));
+		}
+	} else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
 		/* Support only SW_ANY for now. */
 		pr_debug("Unsupported CPU co-ord type\n");
 		return -EFAULT;
-- 
cgit 


From d983af9864b273e2fa19f564e6fa80d67638e539 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 3 Apr 2018 15:37:38 +0530
Subject: cpufreq: SCMI: Don't validate the frequency table twice

The cpufreq core is already validating the CPU frequency table after
calling the ->init() callback of the cpufreq drivers and the drivers
don't need to do the same anymore. Though they need to set the
policy->freq_table field directly from the ->init() callback now.

Stop validating the frequency table from SCMI driver.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/scmi-cpufreq.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index 959a1dbe3835..b4dbc77459b6 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -159,13 +159,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 	priv->domain_id = handle->perf_ops->device_domain_id(cpu_dev);
 
 	policy->driver_data = priv;
-
-	ret = cpufreq_table_validate_and_show(policy, freq_table);
-	if (ret) {
-		dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__,
-			ret);
-		goto out_free_cpufreq_table;
-	}
+	policy->freq_table = freq_table;
 
 	/* SCMI allows DVFS request for any domain from any CPU */
 	policy->dvfs_possible_from_any_cpu = true;
@@ -179,8 +173,6 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 	policy->fast_switch_possible = true;
 	return 0;
 
-out_free_cpufreq_table:
-	dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table);
 out_free_priv:
 	kfree(priv);
 out_free_opp:
-- 
cgit 


From 2dd0df8472ff9bb520673cb5862b08be9290c9fa Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 3 Apr 2018 15:37:39 +0530
Subject: cpufreq: Drop cpufreq_table_validate_and_show()

This isn't used anymore. Remove the helper and update documentation
accordingly.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/core.txt        | 12 +++++-------
 Documentation/cpu-freq/cpu-drivers.txt |  6 ++----
 drivers/cpufreq/freq_table.c           | 14 --------------
 include/linux/cpufreq.h                |  2 --
 4 files changed, 7 insertions(+), 27 deletions(-)

diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt
index 978463a7c81e..073f128af5a7 100644
--- a/Documentation/cpu-freq/core.txt
+++ b/Documentation/cpu-freq/core.txt
@@ -97,12 +97,10 @@ flags	- flags of the cpufreq driver
 ==================================================================
 For details about OPP, see Documentation/power/opp.txt
 
-dev_pm_opp_init_cpufreq_table - cpufreq framework typically is initialized with
-	cpufreq_table_validate_and_show() which is provided with the list of
-	frequencies that are available for operation. This function provides
-	a ready to use conversion routine to translate the OPP layer's internal
-	information about the available frequencies into a format readily
-	providable to cpufreq.
+dev_pm_opp_init_cpufreq_table -
+	This function provides a ready to use conversion routine to translate
+	the OPP layer's internal information about the available frequencies
+	into a format readily providable to cpufreq.
 
 	WARNING: Do not use this function in interrupt context.
 
@@ -112,7 +110,7 @@ dev_pm_opp_init_cpufreq_table - cpufreq framework typically is initialized with
 		/* Do things */
 		r = dev_pm_opp_init_cpufreq_table(dev, &freq_table);
 		if (!r)
-			cpufreq_table_validate_and_show(policy, freq_table);
+			policy->freq_table = freq_table;
 		/* Do other things */
 	 }
 
diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt
index 61546ac578d6..6e353d00cdc6 100644
--- a/Documentation/cpu-freq/cpu-drivers.txt
+++ b/Documentation/cpu-freq/cpu-drivers.txt
@@ -259,10 +259,8 @@ CPUFREQ_ENTRY_INVALID. The entries don't need to be in sorted in any
 particular order, but if they are cpufreq core will do DVFS a bit
 quickly for them as search for best match is faster.
 
-By calling cpufreq_table_validate_and_show(), the cpuinfo.min_freq and
-cpuinfo.max_freq values are detected, and policy->min and policy->max
-are set to the same values. This is helpful for the per-CPU
-initialization stage.
+The cpufreq table is verified automatically by the core if the policy contains a
+valid pointer in its policy->freq_table field.
 
 cpufreq_frequency_table_verify() assures that at least one valid
 frequency is within policy->min and policy->max, and all other criteria
diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c
index 10e119ae66dd..3a8cc99e6815 100644
--- a/drivers/cpufreq/freq_table.c
+++ b/drivers/cpufreq/freq_table.c
@@ -352,20 +352,6 @@ static int set_freq_table_sorted(struct cpufreq_policy *policy)
 	return 0;
 }
 
-int cpufreq_table_validate_and_show(struct cpufreq_policy *policy,
-				      struct cpufreq_frequency_table *table)
-{
-	int ret;
-
-	ret = cpufreq_frequency_table_cpuinfo(policy, table);
-	if (ret)
-		return ret;
-
-	policy->freq_table = table;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(cpufreq_table_validate_and_show);
-
 int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy)
 {
 	int ret;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 1fe49724da9e..87f48dd932eb 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -960,8 +960,6 @@ extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
 extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
 extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
 extern struct freq_attr *cpufreq_generic_attr[];
-int cpufreq_table_validate_and_show(struct cpufreq_policy *policy,
-				      struct cpufreq_frequency_table *table);
 int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy);
 
 unsigned int cpufreq_generic_get(unsigned int cpu);
-- 
cgit 


From bbe9a70a478129f3f9b2003415d0c36afcea210f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 9 Apr 2018 14:23:33 +0200
Subject: tick-sched: avoid a maybe-uninitialized warning

The use of bitfields seems to confuse gcc, leading to a false-positive
warning in all compiler versions:

kernel/time/tick-sched.c: In function 'tick_nohz_idle_exit':
kernel/time/tick-sched.c:538:2: error: 'now' may be used uninitialized in this function [-Werror=maybe-uninitialized]

This introduces a temporary variable to track the flags so gcc
doesn't have to evaluate twice, eliminating the code path that
leads to the warning.

Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85301
Fixes: 1cae544d42d2 ("nohz: Gather tick_sched booleans under a common flag field")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/time/tick-sched.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 956831cf6cfb..e35a6fced00c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1134,6 +1134,7 @@ void tick_nohz_idle_restart_tick(void)
 void tick_nohz_idle_exit(void)
 {
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+	bool idle_active, tick_stopped;
 	ktime_t now;
 
 	local_irq_disable();
@@ -1142,14 +1143,16 @@ void tick_nohz_idle_exit(void)
 	WARN_ON_ONCE(ts->timer_expires_base);
 
 	ts->inidle = 0;
+	idle_active = ts->idle_active;
+	tick_stopped = ts->tick_stopped;
 
-	if (ts->idle_active || ts->tick_stopped)
+	if (idle_active || tick_stopped)
 		now = ktime_get();
 
-	if (ts->idle_active)
+	if (idle_active)
 		tick_nohz_stop_idle(ts, now);
 
-	if (ts->tick_stopped)
+	if (tick_stopped)
 		__tick_nohz_idle_restart_tick(ts, now);
 
 	local_irq_enable();
-- 
cgit