summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/cpu-freq/core.txt12
-rw-r--r--Documentation/cpu-freq/cpu-drivers.txt6
-rw-r--r--Documentation/cpuidle/sysfs.txt6
-rw-r--r--arch/x86/xen/smp_pv.c1
-rw-r--r--drivers/cpufreq/armada-37xx-cpufreq.c2
-rw-r--r--drivers/cpufreq/cppc_cpufreq.c15
-rw-r--r--drivers/cpufreq/freq_table.c14
-rw-r--r--drivers/cpufreq/intel_pstate.c1
-rw-r--r--drivers/cpufreq/scmi-cpufreq.c10
-rw-r--r--drivers/cpufreq/ti-cpufreq.c2
-rw-r--r--drivers/cpuidle/cpuidle.c10
-rw-r--r--drivers/cpuidle/governors/ladder.c3
-rw-r--r--drivers/cpuidle/governors/menu.c113
-rw-r--r--drivers/net/ethernet/sfc/mcdi.c2
-rw-r--r--include/linux/cpufreq.h2
-rw-r--r--include/linux/cpuidle.h8
-rw-r--r--include/linux/hrtimer.h1
-rw-r--r--include/linux/jiffies.h7
-rw-r--r--include/linux/tick.h25
-rw-r--r--kernel/power/qos.c2
-rw-r--r--kernel/sched/idle.c34
-rw-r--r--kernel/time/hrtimer.c53
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/tick-sched.c250
-rw-r--r--kernel/time/tick-sched.h18
25 files changed, 438 insertions, 161 deletions
diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt
index 978463a7c81e..073f128af5a7 100644
--- a/Documentation/cpu-freq/core.txt
+++ b/Documentation/cpu-freq/core.txt
@@ -97,12 +97,10 @@ flags - flags of the cpufreq driver
==================================================================
For details about OPP, see Documentation/power/opp.txt
-dev_pm_opp_init_cpufreq_table - cpufreq framework typically is initialized with
- cpufreq_table_validate_and_show() which is provided with the list of
- frequencies that are available for operation. This function provides
- a ready to use conversion routine to translate the OPP layer's internal
- information about the available frequencies into a format readily
- providable to cpufreq.
+dev_pm_opp_init_cpufreq_table -
+ This function provides a ready to use conversion routine to translate
+ the OPP layer's internal information about the available frequencies
+ into a format readily providable to cpufreq.
WARNING: Do not use this function in interrupt context.
@@ -112,7 +110,7 @@ dev_pm_opp_init_cpufreq_table - cpufreq framework typically is initialized with
/* Do things */
r = dev_pm_opp_init_cpufreq_table(dev, &freq_table);
if (!r)
- cpufreq_table_validate_and_show(policy, freq_table);
+ policy->freq_table = freq_table;
/* Do other things */
}
diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt
index 61546ac578d6..6e353d00cdc6 100644
--- a/Documentation/cpu-freq/cpu-drivers.txt
+++ b/Documentation/cpu-freq/cpu-drivers.txt
@@ -259,10 +259,8 @@ CPUFREQ_ENTRY_INVALID. The entries don't need to be in sorted in any
particular order, but if they are cpufreq core will do DVFS a bit
quickly for them as search for best match is faster.
-By calling cpufreq_table_validate_and_show(), the cpuinfo.min_freq and
-cpuinfo.max_freq values are detected, and policy->min and policy->max
-are set to the same values. This is helpful for the per-CPU
-initialization stage.
+The cpufreq table is verified automatically by the core if the policy contains a
+valid pointer in its policy->freq_table field.
cpufreq_frequency_table_verify() assures that at least one valid
frequency is within policy->min and policy->max, and all other criteria
diff --git a/Documentation/cpuidle/sysfs.txt b/Documentation/cpuidle/sysfs.txt
index b6f44f490ed7..d1587f434e7b 100644
--- a/Documentation/cpuidle/sysfs.txt
+++ b/Documentation/cpuidle/sysfs.txt
@@ -40,6 +40,7 @@ total 0
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
@@ -50,6 +51,7 @@ total 0
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
@@ -60,6 +62,7 @@ total 0
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
@@ -70,6 +73,7 @@ total 0
-r--r--r-- 1 root root 4096 Feb 8 10:42 latency
-r--r--r-- 1 root root 4096 Feb 8 10:42 name
-r--r--r-- 1 root root 4096 Feb 8 10:42 power
+-r--r--r-- 1 root root 4096 Feb 8 10:42 residency
-r--r--r-- 1 root root 4096 Feb 8 10:42 time
-r--r--r-- 1 root root 4096 Feb 8 10:42 usage
--------------------------------------------------------------------------------
@@ -78,6 +82,8 @@ total 0
* desc : Small description about the idle state (string)
* disable : Option to disable this idle state (bool) -> see note below
* latency : Latency to exit out of this idle state (in microseconds)
+* residency : Time after which a state becomes more effecient than any
+ shallower state (in microseconds)
* name : Name of the idle state (string)
* power : Power consumed while in this idle state (in milliwatts)
* time : Total time spent in this idle state (in microseconds)
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index c0c756c76afe..2e20ae2fa2d6 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -425,6 +425,7 @@ static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
* data back is to call:
*/
tick_nohz_idle_enter();
+ tick_nohz_idle_stop_tick_protected();
cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
}
diff --git a/drivers/cpufreq/armada-37xx-cpufreq.c b/drivers/cpufreq/armada-37xx-cpufreq.c
index c6ebc88a7d8d..72a2975499db 100644
--- a/drivers/cpufreq/armada-37xx-cpufreq.c
+++ b/drivers/cpufreq/armada-37xx-cpufreq.c
@@ -202,6 +202,7 @@ static int __init armada37xx_cpufreq_driver_init(void)
cur_frequency = clk_get_rate(clk);
if (!cur_frequency) {
dev_err(cpu_dev, "Failed to get clock rate for CPU\n");
+ clk_put(clk);
return -EINVAL;
}
@@ -210,6 +211,7 @@ static int __init armada37xx_cpufreq_driver_init(void)
return -EINVAL;
armada37xx_cpufreq_dvfs_setup(nb_pm_base, clk, dvfs->divider);
+ clk_put(clk);
for (load_lvl = ARMADA_37XX_DVFS_LOAD_0; load_lvl < LOAD_LEVEL_NR;
load_lvl++) {
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 8300a9fcb80c..bc5fc1630876 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -162,14 +162,23 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
cpu->perf_caps.highest_perf;
policy->cpuinfo.max_freq = cppc_dmi_max_khz;
- policy->cpuinfo.transition_latency = cppc_get_transition_latency(cpu_num);
policy->transition_delay_us = cppc_get_transition_latency(cpu_num) /
NSEC_PER_USEC;
policy->shared_type = cpu->shared_type;
- if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
+ if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
+ int i;
+
cpumask_copy(policy->cpus, cpu->shared_cpu_map);
- else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
+
+ for_each_cpu(i, policy->cpus) {
+ if (unlikely(i == policy->cpu))
+ continue;
+
+ memcpy(&all_cpu_data[i]->perf_caps, &cpu->perf_caps,
+ sizeof(cpu->perf_caps));
+ }
+ } else if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL) {
/* Support only SW_ANY for now. */
pr_debug("Unsupported CPU co-ord type\n");
return -EFAULT;
diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c
index 10e119ae66dd..3a8cc99e6815 100644
--- a/drivers/cpufreq/freq_table.c
+++ b/drivers/cpufreq/freq_table.c
@@ -352,20 +352,6 @@ static int set_freq_table_sorted(struct cpufreq_policy *policy)
return 0;
}
-int cpufreq_table_validate_and_show(struct cpufreq_policy *policy,
- struct cpufreq_frequency_table *table)
-{
- int ret;
-
- ret = cpufreq_frequency_table_cpuinfo(policy, table);
- if (ret)
- return ret;
-
- policy->freq_table = table;
- return 0;
-}
-EXPORT_SYMBOL_GPL(cpufreq_table_validate_and_show);
-
int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy)
{
int ret;
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 6d084c61ee25..17e566afbb41 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -26,7 +26,6 @@
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/fs.h>
-#include <linux/debugfs.h>
#include <linux/acpi.h>
#include <linux/vmalloc.h>
#include <trace/events/power.h>
diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index 959a1dbe3835..b4dbc77459b6 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -159,13 +159,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
priv->domain_id = handle->perf_ops->device_domain_id(cpu_dev);
policy->driver_data = priv;
-
- ret = cpufreq_table_validate_and_show(policy, freq_table);
- if (ret) {
- dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__,
- ret);
- goto out_free_cpufreq_table;
- }
+ policy->freq_table = freq_table;
/* SCMI allows DVFS request for any domain from any CPU */
policy->dvfs_possible_from_any_cpu = true;
@@ -179,8 +173,6 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
policy->fast_switch_possible = true;
return 0;
-out_free_cpufreq_table:
- dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table);
out_free_priv:
kfree(priv);
out_free_opp:
diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c
index a099b7bf74cd..6ba709b6f095 100644
--- a/drivers/cpufreq/ti-cpufreq.c
+++ b/drivers/cpufreq/ti-cpufreq.c
@@ -304,7 +304,7 @@ static struct platform_driver ti_cpufreq_driver = {
.name = "ti-cpufreq",
},
};
-module_platform_driver(ti_cpufreq_driver);
+builtin_platform_driver(ti_cpufreq_driver);
MODULE_DESCRIPTION("TI CPUFreq/OPP hw-supported driver");
MODULE_AUTHOR("Dave Gerlach <d-gerlach@ti.com>");
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 0003e9a02637..6df894d65d9e 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -272,12 +272,18 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
*
* @drv: the cpuidle driver
* @dev: the cpuidle device
+ * @stop_tick: indication on whether or not to stop the tick
*
* Returns the index of the idle state. The return value must not be negative.
+ *
+ * The memory location pointed to by @stop_tick is expected to be written the
+ * 'false' boolean value if the scheduler tick should not be stopped before
+ * entering the returned state.
*/
-int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ bool *stop_tick)
{
- return cpuidle_curr_governor->select(drv, dev);
+ return cpuidle_curr_governor->select(drv, dev, stop_tick);
}
/**
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 1ad8745fd6d6..b24883f85c99 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -63,9 +63,10 @@ static inline void ladder_do_selection(struct ladder_device *ldev,
* ladder_select_state - selects the next state to enter
* @drv: cpuidle driver
* @dev: the CPU
+ * @dummy: not used
*/
static int ladder_select_state(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+ struct cpuidle_device *dev, bool *dummy)
{
struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
struct device *device = get_cpu_device(dev->cpu);
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index aa390404e85f..1bfe03ceb236 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -123,6 +123,7 @@
struct menu_device {
int last_state_idx;
int needs_update;
+ int tick_wakeup;
unsigned int next_timer_us;
unsigned int predicted_us;
@@ -279,8 +280,10 @@ again:
* menu_select - selects the next idle state to enter
* @drv: cpuidle driver containing state data
* @dev: the CPU
+ * @stop_tick: indication on whether or not to stop the tick
*/
-static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ bool *stop_tick)
{
struct menu_device *data = this_cpu_ptr(&menu_devices);
struct device *device = get_cpu_device(dev->cpu);
@@ -292,6 +295,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
unsigned int expected_interval;
unsigned long nr_iowaiters, cpu_load;
int resume_latency = dev_pm_qos_raw_read_value(device);
+ ktime_t delta_next;
if (data->needs_update) {
menu_update(drv, dev);
@@ -303,11 +307,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
latency_req = resume_latency;
/* Special case when user has set very strict latency requirement */
- if (unlikely(latency_req == 0))
+ if (unlikely(latency_req == 0)) {
+ *stop_tick = false;
return 0;
+ }
/* determine the expected residency time, round up */
- data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
+ data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next));
get_iowait_load(&nr_iowaiters, &cpu_load);
data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
@@ -346,14 +352,30 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
*/
data->predicted_us = min(data->predicted_us, expected_interval);
- /*
- * Use the performance multiplier and the user-configurable
- * latency_req to determine the maximum exit latency.
- */
- interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
- if (latency_req > interactivity_req)
- latency_req = interactivity_req;
+ if (tick_nohz_tick_stopped()) {
+ /*
+ * If the tick is already stopped, the cost of possible short
+ * idle duration misprediction is much higher, because the CPU
+ * may be stuck in a shallow idle state for a long time as a
+ * result of it. In that case say we might mispredict and try
+ * to force the CPU into a state for which we would have stopped
+ * the tick, unless a timer is going to expire really soon
+ * anyway.
+ */
+ if (data->predicted_us < TICK_USEC)
+ data->predicted_us = min_t(unsigned int, TICK_USEC,
+ ktime_to_us(delta_next));
+ } else {
+ /*
+ * Use the performance multiplier and the user-configurable
+ * latency_req to determine the maximum exit latency.
+ */
+ interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
+ if (latency_req > interactivity_req)
+ latency_req = interactivity_req;
+ }
+ expected_interval = data->predicted_us;
/*
* Find the idle state with the lowest power while satisfying
* our constraints.
@@ -369,15 +391,52 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
idx = i; /* first enabled state */
if (s->target_residency > data->predicted_us)
break;
- if (s->exit_latency > latency_req)
+ if (s->exit_latency > latency_req) {
+ /*
+ * If we break out of the loop for latency reasons, use
+ * the target residency of the selected state as the
+ * expected idle duration so that the tick is retained
+ * as long as that target residency is low enough.
+ */
+ expected_interval = drv->states[idx].target_residency;
break;
-
+ }
idx = i;
}
if (idx == -1)
idx = 0; /* No states enabled. Must use 0. */
+ /*
+ * Don't stop the tick if the selected state is a polling one or if the
+ * expected idle duration is shorter than the tick period length.
+ */
+ if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
+ expected_interval < TICK_USEC) {
+ unsigned int delta_next_us = ktime_to_us(delta_next);
+
+ *stop_tick = false;
+
+ if (!tick_nohz_tick_stopped() && idx > 0 &&
+ drv->states[idx].target_residency > delta_next_us) {
+ /*
+ * The tick is not going to be stopped and the target
+ * residency of the state to be returned is not within
+ * the time until the next timer event including the
+ * tick, so try to correct that.
+ */
+ for (i = idx - 1; i >= 0; i--) {
+ if (drv->states[i].disabled ||
+ dev->states_usage[i].disable)
+ continue;
+
+ idx = i;
+ if (drv->states[i].target_residency <= delta_next_us)
+ break;
+ }
+ }
+ }
+
data->last_state_idx = idx;
return data->last_state_idx;
@@ -397,6 +456,7 @@ static void menu_reflect(struct cpuidle_device *dev, int index)
data->last_state_idx = index;
data->needs_update = 1;
+ data->tick_wakeup = tick_nohz_idle_got_tick();
}
/**
@@ -427,14 +487,27 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
* assume the state was never reached and the exit latency is 0.
*/
- /* measured value */
- measured_us = cpuidle_get_last_residency(dev);
-
- /* Deduct exit latency */
- if (measured_us > 2 * target->exit_latency)
- measured_us -= target->exit_latency;
- else
- measured_us /= 2;
+ if (data->tick_wakeup && data->next_timer_us > TICK_USEC) {
+ /*
+ * The nohz code said that there wouldn't be any events within
+ * the tick boundary (if the tick was stopped), but the idle
+ * duration predictor had a differing opinion. Since the CPU
+ * was woken up by a tick (that wasn't stopped after all), the
+ * predictor was not quite right, so assume that the CPU could
+ * have been idle long (but not forever) to help the idle
+ * duration predictor do a better job next time.
+ */
+ measured_us = 9 * MAX_INTERESTING / 10;
+ } else {
+ /* measured value */
+ measured_us = cpuidle_get_last_residency(dev);
+
+ /* Deduct exit latency */
+ if (measured_us > 2 * target->exit_latency)
+ measured_us -= target->exit_latency;
+ else
+ measured_us /= 2;
+ }
/* Make sure our coefficients do not exceed unity */
if (measured_us > data->next_timer_us)
diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c
index 9c2567b0d93e..dfad93fca0a6 100644
--- a/drivers/net/ethernet/sfc/mcdi.c
+++ b/drivers/net/ethernet/sfc/mcdi.c
@@ -375,7 +375,7 @@ static int efx_mcdi_poll(struct efx_nic *efx)
* because generally mcdi responses are fast. After that, back off
* and poll once a jiffy (approximately)
*/
- spins = TICK_USEC;
+ spins = USER_TICK_USEC;
finish = jiffies + MCDI_RPC_TIMEOUT;
while (1) {
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 1fe49724da9e..87f48dd932eb 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -960,8 +960,6 @@ extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
extern struct freq_attr *cpufreq_generic_attr[];
-int cpufreq_table_validate_and_show(struct cpufreq_policy *policy,
- struct cpufreq_frequency_table *table);
int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy);
unsigned int cpufreq_generic_get(unsigned int cpu);
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index a806e94c482f..1eefabf1621f 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -135,7 +135,8 @@ extern bool cpuidle_not_available(struct cpuidle_driver *drv,
struct cpuidle_device *dev);
extern int cpuidle_select(struct cpuidle_driver *drv,
- struct cpuidle_device *dev);
+ struct cpuidle_device *dev,
+ bool *stop_tick);
extern int cpuidle_enter(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index);
extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
@@ -167,7 +168,7 @@ static inline bool cpuidle_not_available(struct cpuidle_driver *drv,
struct cpuidle_device *dev)
{return true; }
static inline int cpuidle_select(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+ struct cpuidle_device *dev, bool *stop_tick)
{return -ENODEV; }
static inline int cpuidle_enter(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index)
@@ -250,7 +251,8 @@ struct cpuidle_governor {
struct cpuidle_device *dev);
int (*select) (struct cpuidle_driver *drv,
- struct cpuidle_device *dev);
+ struct cpuidle_device *dev,
+ bool *stop_tick);
void (*reflect) (struct cpuidle_device *dev, int index);
};
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 78f456fcd242..a2656c3ebe81 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -424,6 +424,7 @@ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
}
extern u64 hrtimer_get_next_event(void);
+extern u64 hrtimer_next_event_without(const struct hrtimer *exclude);
extern bool hrtimer_active(const struct hrtimer *timer);
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 9385aa57497b..a27cf6652327 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -62,8 +62,11 @@ extern int register_refined_jiffies(long clock_tick_rate);
/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */
#define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ)
-/* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
-#define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)
+/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */
+#define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ)
+
+/* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
+#define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)
#ifndef __jiffy_arch_data
#define __jiffy_arch_data
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7f8c9a127f5a..55388ab45fd4 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -115,27 +115,46 @@ enum tick_dep_bits {
extern bool tick_nohz_enabled;
extern bool tick_nohz_tick_stopped(void);
extern bool tick_nohz_tick_stopped_cpu(int cpu);
+extern void tick_nohz_idle_stop_tick(void);
+extern void tick_nohz_idle_retain_tick(void);
+extern void tick_nohz_idle_restart_tick(void);
extern void tick_nohz_idle_enter(void);
extern void tick_nohz_idle_exit(void);
extern void tick_nohz_irq_exit(void);
-extern ktime_t tick_nohz_get_sleep_length(void);
+extern bool tick_nohz_idle_got_tick(void);
+extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next);
extern unsigned long tick_nohz_get_idle_calls(void);
extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
+
+static inline void tick_nohz_idle_stop_tick_protected(void)
+{
+ local_irq_disable();
+ tick_nohz_idle_stop_tick();
+ local_irq_enable();
+}
+
#else /* !CONFIG_NO_HZ_COMMON */
#define tick_nohz_enabled (0)
static inline int tick_nohz_tick_stopped(void) { return 0; }
static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
+static inline void tick_nohz_idle_stop_tick(void) { }
+static inline void tick_nohz_idle_retain_tick(void) { }
+static inline void tick_nohz_idle_restart_tick(void) { }
static inline void tick_nohz_idle_enter(void) { }
static inline void tick_nohz_idle_exit(void) { }
+static inline bool tick_nohz_idle_got_tick(void) { return false; }
-static inline ktime_t tick_nohz_get_sleep_length(void)
+static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
- return NSEC_PER_SEC / HZ;
+ *delta_next = TICK_NSEC;
+ return *delta_next;
}
static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
+
+static inline void tick_nohz_idle_stop_tick_protected(void) { }
#endif /* !CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9d7503910ce2..fa39092b7aea 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -295,6 +295,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
* changed
*/
plist_del(node, &c->list);
+ /* fall through */
case PM_QOS_ADD_REQ:
plist_node_init(node, new_value);
plist_add(node, &c->list);
@@ -367,6 +368,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
break;
case PM_QOS_UPDATE_REQ:
pm_qos_flags_remove_req(pqf, req);
+ /* fall through */
case PM_QOS_ADD_REQ:
req->flags = val;
INIT_LIST_HEAD(&req->node);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2975f195e1c4..1a3e9bddd17b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -141,13 +141,15 @@ static void cpuidle_idle_call(void)
}
/*
- * Tell the RCU framework we are entering an idle section,
- * so no more rcu read side critical sections and one more
+ * The RCU framework needs to be told that we are entering an idle
+ * section, so no more rcu read side critical sections and one more
* step to the grace period
*/
- rcu_idle_enter();
if (cpuidle_not_available(drv, dev)) {
+ tick_nohz_idle_stop_tick();
+ rcu_idle_enter();
+
default_idle_call();
goto exit_idle;
}
@@ -164,20 +166,37 @@ static void cpuidle_idle_call(void)
if (idle_should_enter_s2idle() || dev->use_deepest_state) {
if (idle_should_enter_s2idle()) {
+ rcu_idle_enter();
+
entered_state = cpuidle_enter_s2idle(drv, dev);
if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
}
+
+ rcu_idle_exit();
}
+ tick_nohz_idle_stop_tick();
+ rcu_idle_enter();
+
next_state = cpuidle_find_deepest_state(drv, dev);
call_cpuidle(drv, dev, next_state);
} else {
+ bool stop_tick = true;
+
/*
* Ask the cpuidle framework to choose a convenient idle state.
*/
- next_state = cpuidle_select(drv, dev);
+ next_state = cpuidle_select(drv, dev, &stop_tick);
+
+ if (stop_tick)
+ tick_nohz_idle_stop_tick();
+ else
+ tick_nohz_idle_retain_tick();
+
+ rcu_idle_enter();
+
entered_state = call_cpuidle(drv, dev, next_state);
/*
* Give the governor an opportunity to reflect on the outcome
@@ -222,6 +241,7 @@ static void do_idle(void)
rmb();
if (cpu_is_offline(cpu)) {
+ tick_nohz_idle_stop_tick_protected();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
@@ -235,10 +255,12 @@ static void do_idle(void)
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
*/
- if (cpu_idle_force_poll || tick_check_broadcast_expired())
+ if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+ tick_nohz_idle_restart_tick();
cpu_idle_poll();
- else
+ } else {
cpuidle_idle_call();
+ }
arch_cpu_idle_exit();
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9b082ce86325..eda1210ce50f 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -480,6 +480,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
while ((base = __next_base((cpu_base), &(active))))
static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
+ const struct hrtimer *exclude,
unsigned int active,
ktime_t expires_next)
{
@@ -492,9 +493,22 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
+ if (timer == exclude) {
+ /* Get to the next timer in the queue. */
+ next = timerqueue_iterate_next(next);
+ if (!next)
+ continue;
+
+ timer = container_of(next, struct hrtimer, node);
+ }
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires < expires_next) {
expires_next = expires;
+
+ /* Skip cpu_base update if a timer is being excluded. */
+ if (exclude)
+ continue;
+
if (timer->is_soft)
cpu_base->softirq_next_timer = timer;
else
@@ -538,7 +552,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
cpu_base->softirq_next_timer = NULL;
- expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
+ expires_next = __hrtimer_next_event_base(cpu_base, NULL,
+ active, KTIME_MAX);
next_timer = cpu_base->softirq_next_timer;
}
@@ -546,7 +561,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
if (active_mask & HRTIMER_ACTIVE_HARD) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
cpu_base->next_timer = next_timer;
- expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
+ expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
+ expires_next);
}
return expires_next;
@@ -1190,6 +1206,39 @@ u64 hrtimer_get_next_event(void)
return expires;
}
+
+/**
+ * hrtimer_next_event_without - time until next expiry event w/o one timer
+ * @exclude: timer to exclude
+ *
+ * Returns the next expiry time over all timers except for the @exclude one or
+ * KTIME_MAX if none of them is pending.
+ */
+u64 hrtimer_next_event_without(const struct hrtimer *exclude)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ u64 expires = KTIME_MAX;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+ if (__hrtimer_hres_active(cpu_base)) {
+ unsigned int active;
+
+ if (!cpu_base->softirq_activated) {
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+ expires = __hrtimer_next_event_base(cpu_base, exclude,
+ active, KTIME_MAX);
+ }
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ expires = __hrtimer_next_event_base(cpu_base, exclude, active,
+ expires);
+ }
+
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+
+ return expires;
+}
#endif
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8d70da1b9a0d..a09ded765f6c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,7 +31,7 @@
/* USER_HZ period (usecs): */
-unsigned long tick_usec = TICK_USEC;
+unsigned long tick_usec = USER_TICK_USEC;
/* SHIFTED_HZ period (nsecs): */
unsigned long tick_nsec;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f3ab08caa2c3..646645e981f9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -122,8 +122,7 @@ static ktime_t tick_init_jiffy_update(void)
return period;
}
-
-static void tick_sched_do_timer(ktime_t now)
+static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
int cpu = smp_processor_id();
@@ -143,6 +142,9 @@ static void tick_sched_do_timer(ktime_t now)
/* Check, if the jiffies need an update */
if (tick_do_timer_cpu == cpu)
tick_do_update_jiffies64(now);
+
+ if (ts->inidle)
+ ts->got_idle_tick = 1;
}
static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
@@ -474,7 +476,9 @@ __setup("nohz=", setup_tick_nohz);
bool tick_nohz_tick_stopped(void)
{
- return __this_cpu_read(tick_cpu_sched.tick_stopped);
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ return ts->tick_stopped;
}
bool tick_nohz_tick_stopped_cpu(int cpu)
@@ -537,14 +541,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
sched_clock_idle_wakeup_event();
}
-static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+static void tick_nohz_start_idle(struct tick_sched *ts)
{
- ktime_t now = ktime_get();
-
- ts->idle_entrytime = now;
+ ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
sched_clock_idle_sleep_event();
- return now;
}
/**
@@ -653,13 +654,10 @@ static inline bool local_timer_softirq_pending(void)
return local_softirq_pending() & TIMER_SOFTIRQ;
}
-static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
- ktime_t now, int cpu)
+static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
- struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
unsigned long seq, basejiff;
- ktime_t tick;
/* Read jiffies and the time when jiffies were updated last */
do {
@@ -668,6 +666,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
basejiff = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
+ ts->timer_expires_base = basemono;
/*
* Keep the periodic tick, when RCU, architecture or irq_work
@@ -712,47 +711,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
* next period, so no point in stopping it either, bail.
*/
if (!ts->tick_stopped) {
- tick = 0;
+ ts->timer_expires = 0;
goto out;
}
}
/*
+ * If this CPU is the one which had the do_timer() duty last, we limit
+ * the sleep time to the timekeeping max_deferment value.
+ * Otherwise we can sleep as long as we want.
+ */
+ delta = timekeeping_max_deferment();
+ if (cpu != tick_do_timer_cpu &&
+ (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last))
+ delta = KTIME_MAX;
+
+ /* Calculate the next expiry time */
+ if (delta < (KTIME_MAX - basemono))
+ expires = basemono + delta;
+ else
+ expires = KTIME_MAX;
+
+ ts->timer_expires = min_t(u64, expires, next_tick);
+
+out:
+ return ts->timer_expires;
+}
+
+static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
+{
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+ u64 basemono = ts->timer_expires_base;
+ u64 expires = ts->timer_expires;
+ ktime_t tick = expires;
+
+ /* Make sure we won't be trying to stop it twice in a row. */
+ ts->timer_expires_base = 0;
+
+ /*
* If this CPU is the one which updates jiffies, then give up
* the assignment and let it be taken by the CPU which runs
* the tick timer next, which might be this CPU as well. If we
* don't drop this here the jiffies might be stale and
* do_timer() never invoked. Keep track of the fact that it
- * was the one which had the do_timer() duty last. If this CPU
- * is the one which had the do_timer() duty last, we limit the
- * sleep time to the timekeeping max_deferment value.
- * Otherwise we can sleep as long as we want.
+ * was the one which had the do_timer() duty last.
*/
- delta = timekeeping_max_deferment();
if (cpu == tick_do_timer_cpu) {
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
ts->do_timer_last = 1;
} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
- delta = KTIME_MAX;
ts->do_timer_last = 0;
- } else if (!ts->do_timer_last) {
- delta = KTIME_MAX;
}
- /* Calculate the next expiry time */
- if (delta < (KTIME_MAX - basemono))
- expires = basemono + delta;
- else
- expires = KTIME_MAX;
-
- expires = min_t(u64, expires, next_tick);
- tick = expires;
-
/* Skip reprogram of event if its not changed */
if (ts->tick_stopped && (expires == ts->next_tick)) {
/* Sanity check: make sure clockevent is actually programmed */
if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
- goto out;
+ return;
WARN_ON_ONCE(1);
printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
@@ -786,7 +801,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
if (unlikely(expires == KTIME_MAX)) {
if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
hrtimer_cancel(&ts->sched_timer);
- goto out;
+ return;
}
hrtimer_set_expires(&ts->sched_timer, tick);
@@ -795,15 +810,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
else
tick_program_event(tick, 1);
-out:
- /*
- * Update the estimated sleep length until the next timer
- * (not only the tick).
- */
- ts->sleep_length = ktime_sub(dev->next_event, now);
- return tick;
}
+static void tick_nohz_retain_tick(struct tick_sched *ts)
+{
+ ts->timer_expires_base = 0;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu)
+{
+ if (tick_nohz_next_event(ts, cpu))
+ tick_nohz_stop_tick(ts, cpu);
+ else
+ tick_nohz_retain_tick(ts);
+}
+#endif /* CONFIG_NO_HZ_FULL */
+
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
@@ -839,7 +862,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
return;
if (can_stop_full_tick(cpu, ts))
- tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+ tick_nohz_stop_sched_tick(ts, cpu);
else if (ts->tick_stopped)
tick_nohz_restart_sched_tick(ts, ktime_get());
#endif
@@ -865,10 +888,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return false;
}
- if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
- ts->sleep_length = NSEC_PER_SEC / HZ;
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
return false;
- }
if (need_resched())
return false;
@@ -903,42 +924,65 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return true;
}
-static void __tick_nohz_idle_enter(struct tick_sched *ts)
+static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
{
- ktime_t now, expires;
+ ktime_t expires;
int cpu = smp_processor_id();
- now = tick_nohz_start_idle(ts);
+ /*
+ * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
+ * tick timer expiration time is known already.
+ */
+ if (ts->timer_expires_base)
+ expires = ts->timer_expires;
+ else if (can_stop_idle_tick(cpu, ts))
+ expires = tick_nohz_next_event(ts, cpu);
+ else
+ return;
+
+ ts->idle_calls++;
- if (can_stop_idle_tick(cpu, ts)) {
+ if (expires > 0LL) {
int was_stopped = ts->tick_stopped;
- ts->idle_calls++;
+ tick_nohz_stop_tick(ts, cpu);
- expires = tick_nohz_stop_sched_tick(ts, now, cpu);
- if (expires > 0LL) {
- ts->idle_sleeps++;
- ts->idle_expires = expires;
- }
+ ts->idle_sleeps++;
+ ts->idle_expires = expires;
if (!was_stopped && ts->tick_stopped) {
ts->idle_jiffies = ts->last_jiffies;
nohz_balance_enter_idle(cpu);
}
+ } else {
+ tick_nohz_retain_tick(ts);
}
}
/**
- * tick_nohz_idle_enter - stop the idle tick from the idle task
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
*
* When the next event is more than a tick into the future, stop the idle tick
- * Called when we start the idle loop.
- *
- * The arch is responsible of calling:
+ */
+void tick_nohz_idle_stop_tick(void)
+{
+ __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
+}
+
+void tick_nohz_idle_retain_tick(void)
+{
+ tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
+ /*
+ * Undo the effect of get_next_timer_interrupt() called from
+ * tick_nohz_next_event().
+ */
+ timer_clear_idle();
+}
+
+/**
+ * tick_nohz_idle_enter - prepare for entering idle on the current CPU
*
- * - rcu_idle_enter() after its last use of RCU before the CPU is put
- * to sleep.
- * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
+ * Called when we start the idle loop.
*/
void tick_nohz_idle_enter(void)
{
@@ -949,8 +993,11 @@ void tick_nohz_idle_enter(void)
local_irq_disable();
ts = this_cpu_ptr(&tick_cpu_sched);
+
+ WARN_ON_ONCE(ts->timer_expires_base);
+
ts->inidle = 1;
- __tick_nohz_idle_enter(ts);
+ tick_nohz_start_idle(ts);
local_irq_enable();
}
@@ -968,21 +1015,62 @@ void tick_nohz_irq_exit(void)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->inidle)
- __tick_nohz_idle_enter(ts);
+ tick_nohz_start_idle(ts);
else
tick_nohz_full_update_tick(ts);
}
/**
- * tick_nohz_get_sleep_length - return the length of the current sleep
+ * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
+ */
+bool tick_nohz_idle_got_tick(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ if (ts->got_idle_tick) {
+ ts->got_idle_tick = 0;
+ return true;
+ }
+ return false;
+}
+
+/**
+ * tick_nohz_get_sleep_length - return the expected length of the current sleep
+ * @delta_next: duration until the next event if the tick cannot be stopped
*
* Called from power state control code with interrupts disabled
*/
-ktime_t tick_nohz_get_sleep_length(void)
+ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
+ struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ int cpu = smp_processor_id();
+ /*
+ * The idle entry time is expected to be a sufficient approximation of
+ * the current time at this point.
+ */
+ ktime_t now = ts->idle_entrytime;
+ ktime_t next_event;
+
+ WARN_ON_ONCE(!ts->inidle);
+
+ *delta_next = ktime_sub(dev->next_event, now);
+
+ if (!can_stop_idle_tick(cpu, ts))
+ return *delta_next;
+
+ next_event = tick_nohz_next_event(ts, cpu);
+ if (!next_event)
+ return *delta_next;
+
+ /*
+ * If the next highres timer to expire is earlier than next_event, the
+ * idle governor needs to know that.
+ */
+ next_event = min_t(u64, next_event,
+ hrtimer_next_event_without(&ts->sched_timer));
- return ts->sleep_length;
+ return ktime_sub(next_event, now);
}
/**
@@ -1031,6 +1119,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
#endif
}
+static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now)
+{
+ tick_nohz_restart_sched_tick(ts, now);
+ tick_nohz_account_idle_ticks(ts);
+}
+
+void tick_nohz_idle_restart_tick(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ if (ts->tick_stopped)
+ __tick_nohz_idle_restart_tick(ts, ktime_get());
+}
+
/**
* tick_nohz_idle_exit - restart the idle tick from the idle task
*
@@ -1041,24 +1143,26 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
void tick_nohz_idle_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+ bool idle_active, tick_stopped;
ktime_t now;
local_irq_disable();
WARN_ON_ONCE(!ts->inidle);
+ WARN_ON_ONCE(ts->timer_expires_base);
ts->inidle = 0;
+ idle_active = ts->idle_active;
+ tick_stopped = ts->tick_stopped;
- if (ts->idle_active || ts->tick_stopped)
+ if (idle_active || tick_stopped)
now = ktime_get();
- if (ts->idle_active)
+ if (idle_active)
tick_nohz_stop_idle(ts, now);
- if (ts->tick_stopped) {
- tick_nohz_restart_sched_tick(ts, now);
- tick_nohz_account_idle_ticks(ts);
- }
+ if (tick_stopped)
+ __tick_nohz_idle_restart_tick(ts, now);
local_irq_enable();
}
@@ -1074,7 +1178,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
dev->next_event = KTIME_MAX;
- tick_sched_do_timer(now);
+ tick_sched_do_timer(ts, now);
tick_sched_handle(ts, regs);
/* No need to reprogram if we are running tickless */
@@ -1169,7 +1273,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
- tick_sched_do_timer(now);
+ tick_sched_do_timer(ts, now);
/*
* Do not call, when we are not in irq context and have
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 954b43dbf21c..6de959a854b2 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -38,31 +38,37 @@ enum tick_nohz_mode {
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @sleep_length: Duration of the current idle sleep
+ * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
+ * @timer_expires_base: Base time clock monotonic for @timer_expires
* @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * @got_idle_tick: Tick timer function has run with @inidle set
*/
struct tick_sched {
struct hrtimer sched_timer;
unsigned long check_clocks;
enum tick_nohz_mode nohz_mode;
+
+ unsigned int inidle : 1;
+ unsigned int tick_stopped : 1;
+ unsigned int idle_active : 1;
+ unsigned int do_timer_last : 1;
+ unsigned int got_idle_tick : 1;
+
ktime_t last_tick;
ktime_t next_tick;
- int inidle;
- int tick_stopped;
unsigned long idle_jiffies;
unsigned long idle_calls;
unsigned long idle_sleeps;
- int idle_active;
ktime_t idle_entrytime;
ktime_t idle_waketime;
ktime_t idle_exittime;
ktime_t idle_sleeptime;
ktime_t iowait_sleeptime;
- ktime_t sleep_length;
unsigned long last_jiffies;
+ u64 timer_expires;
+ u64 timer_expires_base;
u64 next_timer;
ktime_t idle_expires;
- int do_timer_last;
atomic_t tick_dep_mask;
};