diff options
Diffstat (limited to 'drivers/cpuidle')
-rw-r--r-- | drivers/cpuidle/coupled.c | 13 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-arm.c | 2 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-haltpoll.c | 1 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-kirkwood.c | 3 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-psci-domain.c | 21 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-psci.c | 31 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-psci.h | 20 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-pseries.c | 1 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-qcom-spm.c | 2 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle-riscv-sbi.c | 92 | ||||
-rw-r--r-- | drivers/cpuidle/cpuidle.c | 19 | ||||
-rw-r--r-- | drivers/cpuidle/driver.c | 7 | ||||
-rw-r--r-- | drivers/cpuidle/dt_idle_genpd.c | 14 | ||||
-rw-r--r-- | drivers/cpuidle/governors/haltpoll.c | 9 | ||||
-rw-r--r-- | drivers/cpuidle/governors/ladder.c | 1 | ||||
-rw-r--r-- | drivers/cpuidle/governors/menu.c | 83 | ||||
-rw-r--r-- | drivers/cpuidle/governors/teo.c | 420 |
17 files changed, 238 insertions, 501 deletions
diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index 9acde71558d5..bb8761c8a42e 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -439,13 +439,8 @@ static int cpuidle_coupled_clear_pokes(int cpu) static bool cpuidle_coupled_any_pokes_pending(struct cpuidle_coupled *coupled) { - cpumask_t cpus; - int ret; - - cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus); - ret = cpumask_and(&cpus, &cpuidle_coupled_poke_pending, &cpus); - - return ret; + return cpumask_first_and_and(cpu_online_mask, &coupled->coupled_cpus, + &cpuidle_coupled_poke_pending) < nr_cpu_ids; } /** @@ -626,9 +621,7 @@ out: static void cpuidle_coupled_update_online_cpus(struct cpuidle_coupled *coupled) { - cpumask_t cpus; - cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus); - coupled->online_count = cpumask_weight(&cpus); + coupled->online_count = cpumask_weight_and(cpu_online_mask, &coupled->coupled_cpus); } /** diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c index 7cfb980a357d..caba6f4bb1b7 100644 --- a/drivers/cpuidle/cpuidle-arm.c +++ b/drivers/cpuidle/cpuidle-arm.c @@ -139,7 +139,7 @@ out_kfree_drv: * * Initializes arm cpuidle driver for all CPUs, if any CPU fails * to register cpuidle driver then rollback to cancel all CPUs - * registeration. + * registration. */ static int __init arm_idle_init(void) { diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c index d8515d5c0853..bcd03e893a0a 100644 --- a/drivers/cpuidle/cpuidle-haltpoll.c +++ b/drivers/cpuidle/cpuidle-haltpoll.c @@ -141,5 +141,6 @@ static void __exit haltpoll_exit(void) module_init(haltpoll_init); module_exit(haltpoll_exit); +MODULE_DESCRIPTION("cpuidle driver for haltpoll governor"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>"); diff --git a/drivers/cpuidle/cpuidle-kirkwood.c b/drivers/cpuidle/cpuidle-kirkwood.c index 13bf743f885b..5235e6e8f360 100644 --- a/drivers/cpuidle/cpuidle-kirkwood.c +++ b/drivers/cpuidle/cpuidle-kirkwood.c @@ -59,10 +59,9 @@ static int kirkwood_cpuidle_probe(struct platform_device *pdev) return cpuidle_register(&kirkwood_idle_driver, NULL); } -static int kirkwood_cpuidle_remove(struct platform_device *pdev) +static void kirkwood_cpuidle_remove(struct platform_device *pdev) { cpuidle_unregister(&kirkwood_idle_driver); - return 0; } static struct platform_driver kirkwood_cpuidle_driver = { diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c index b88af1262f1a..5fb5228f6bf1 100644 --- a/drivers/cpuidle/cpuidle-psci-domain.c +++ b/drivers/cpuidle/cpuidle-psci-domain.c @@ -20,6 +20,7 @@ #include <linux/string.h> #include "cpuidle-psci.h" +#include "dt_idle_genpd.h" struct psci_pd_provider { struct list_head link; @@ -66,12 +67,17 @@ static int psci_pd_init(struct device_node *np, bool use_osi) /* * Allow power off when OSI has been successfully enabled. - * PREEMPT_RT is not yet ready to enter domain idle states. + * On a PREEMPT_RT based configuration the domain idle states are + * supported, but only during system-wide suspend. */ - if (use_osi && !IS_ENABLED(CONFIG_PREEMPT_RT)) + if (use_osi) { pd->power_off = psci_pd_power_off; - else + pd->flags |= GENPD_FLAG_ACTIVE_WAKEUP; + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + pd->flags |= GENPD_FLAG_RPM_ALWAYS_ON; + } else { pd->flags |= GENPD_FLAG_ALWAYS_ON; + } /* Use governor for CPU PM domains if it has some states to manage. */ pd_gov = pd->states ? &pm_domain_cpu_gov : NULL; @@ -137,7 +143,6 @@ static const struct of_device_id psci_of_match[] = { static int psci_cpuidle_domain_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; - struct device_node *node; bool use_osi = psci_has_osi_support(); int ret = 0, pd_count = 0; @@ -148,15 +153,13 @@ static int psci_cpuidle_domain_probe(struct platform_device *pdev) * Parse child nodes for the "#power-domain-cells" property and * initialize a genpd/genpd-of-provider pair when it's found. */ - for_each_child_of_node(np, node) { + for_each_child_of_node_scoped(np, node) { if (!of_property_present(node, "#power-domain-cells")) continue; ret = psci_pd_init(node, use_osi); - if (ret) { - of_node_put(node); + if (ret) goto exit; - } pd_count++; } @@ -200,4 +203,4 @@ static int __init psci_idle_init_domains(void) { return platform_driver_register(&psci_cpuidle_domain_driver); } -subsys_initcall(psci_idle_init_domains); +core_initcall(psci_idle_init_domains); diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c index bf68920d038a..2562dc001fc1 100644 --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -28,6 +28,7 @@ #include "cpuidle-psci.h" #include "dt_idle_states.h" +#include "dt_idle_genpd.h" struct psci_cpuidle_data { u32 *psci_states; @@ -36,6 +37,7 @@ struct psci_cpuidle_data { static DEFINE_PER_CPU_READ_MOSTLY(struct psci_cpuidle_data, psci_cpuidle_data); static DEFINE_PER_CPU(u32, domain_state); +static bool psci_cpuidle_use_syscore; static bool psci_cpuidle_use_cpuhp; void psci_set_domain_state(u32 state) @@ -165,6 +167,12 @@ static struct syscore_ops psci_idle_syscore_ops = { .resume = psci_idle_syscore_resume, }; +static void psci_idle_init_syscore(void) +{ + if (psci_cpuidle_use_syscore) + register_syscore_ops(&psci_idle_syscore_ops); +} + static void psci_idle_init_cpuhp(void) { int err; @@ -172,8 +180,6 @@ static void psci_idle_init_cpuhp(void) if (!psci_cpuidle_use_cpuhp) return; - register_syscore_ops(&psci_idle_syscore_ops); - err = cpuhp_setup_state_nocalls(CPUHP_AP_CPU_PM_STARTING, "cpuidle/psci:online", psci_idle_cpuhp_up, @@ -221,22 +227,23 @@ static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv, if (!psci_has_osi_support()) return 0; - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - return 0; - - data->dev = psci_dt_attach_cpu(cpu); + data->dev = dt_idle_attach_cpu(cpu, "psci"); if (IS_ERR_OR_NULL(data->dev)) return PTR_ERR_OR_ZERO(data->dev); + psci_cpuidle_use_syscore = true; + /* * Using the deepest state for the CPU to trigger a potential selection * of a shared state for the domain, assumes the domain states are all - * deeper states. + * deeper states. On PREEMPT_RT the hierarchical topology is limited to + * s2ram and s2idle. */ - drv->states[state_count - 1].flags |= CPUIDLE_FLAG_RCU_IDLE; - drv->states[state_count - 1].enter = psci_enter_domain_idle_state; drv->states[state_count - 1].enter_s2idle = psci_enter_s2idle_domain_idle_state; - psci_cpuidle_use_cpuhp = true; + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + drv->states[state_count - 1].enter = psci_enter_domain_idle_state; + psci_cpuidle_use_cpuhp = true; + } return 0; } @@ -311,7 +318,8 @@ static void psci_cpu_deinit_idle(int cpu) { struct psci_cpuidle_data *data = per_cpu_ptr(&psci_cpuidle_data, cpu); - psci_dt_detach_cpu(data->dev); + dt_idle_detach_cpu(data->dev); + psci_cpuidle_use_syscore = false; psci_cpuidle_use_cpuhp = false; } @@ -408,6 +416,7 @@ static int psci_cpuidle_probe(struct platform_device *pdev) goto out_fail; } + psci_idle_init_syscore(); psci_idle_init_cpuhp(); return 0; diff --git a/drivers/cpuidle/cpuidle-psci.h b/drivers/cpuidle/cpuidle-psci.h index 4e132640ed64..ef004ec7a7c5 100644 --- a/drivers/cpuidle/cpuidle-psci.h +++ b/drivers/cpuidle/cpuidle-psci.h @@ -3,29 +3,9 @@ #ifndef __CPUIDLE_PSCI_H #define __CPUIDLE_PSCI_H -struct device; struct device_node; void psci_set_domain_state(u32 state); int psci_dt_parse_state_node(struct device_node *np, u32 *state); -#ifdef CONFIG_ARM_PSCI_CPUIDLE_DOMAIN - -#include "dt_idle_genpd.h" - -static inline struct device *psci_dt_attach_cpu(int cpu) -{ - return dt_idle_attach_cpu(cpu, "psci"); -} - -static inline void psci_dt_detach_cpu(struct device *dev) -{ - dt_idle_detach_cpu(dev); -} - -#else -static inline struct device *psci_dt_attach_cpu(int cpu) { return NULL; } -static inline void psci_dt_detach_cpu(struct device *dev) { } -#endif - #endif /* __CPUIDLE_PSCI_H */ diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index 14db9b7d985d..f68c65f1d023 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -22,6 +22,7 @@ #include <asm/idle.h> #include <asm/plpar_wrappers.h> #include <asm/rtas.h> +#include <asm/time.h> static struct cpuidle_driver pseries_idle_driver = { .name = "pseries_idle", diff --git a/drivers/cpuidle/cpuidle-qcom-spm.c b/drivers/cpuidle/cpuidle-qcom-spm.c index 1fc9968eae19..3ab240e0e122 100644 --- a/drivers/cpuidle/cpuidle-qcom-spm.c +++ b/drivers/cpuidle/cpuidle-qcom-spm.c @@ -48,7 +48,7 @@ static int qcom_cpu_spc(struct spm_driver_data *drv) ret = cpu_suspend(0, qcom_pm_collapse); /* * ARM common code executes WFI without calling into our driver and - * if the SPM mode is not reset, then we may accidently power down the + * if the SPM mode is not reset, then we may accidentally power down the * cpu when we intended only to gate the cpu clock. * Ensure the state is set to standby before returning. */ diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c index e8094fc92491..0c92a628bbd4 100644 --- a/drivers/cpuidle/cpuidle-riscv-sbi.c +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "cpuidle-riscv-sbi: " fmt +#include <linux/cleanup.h> #include <linux/cpuhotplug.h> #include <linux/cpuidle.h> #include <linux/cpumask.h> @@ -25,6 +26,7 @@ #include <asm/smp.h> #include <asm/suspend.h> +#include "cpuidle.h" #include "dt_idle_states.h" #include "dt_idle_genpd.h" @@ -73,26 +75,6 @@ static inline bool sbi_is_domain_state_available(void) return data->available; } -static int sbi_suspend_finisher(unsigned long suspend_type, - unsigned long resume_addr, - unsigned long opaque) -{ - struct sbiret ret; - - ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND, - suspend_type, resume_addr, opaque, 0, 0, 0); - - return (ret.error) ? sbi_err_map_linux_errno(ret.error) : 0; -} - -static int sbi_suspend(u32 state) -{ - if (state & SBI_HSM_SUSP_NON_RET_BIT) - return cpu_suspend(state, sbi_suspend_finisher); - else - return sbi_suspend_finisher(state, 0, 0); -} - static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, int idx) { @@ -100,9 +82,9 @@ static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev, u32 state = states[idx]; if (state & SBI_HSM_SUSP_NON_RET_BIT) - return CPU_PM_CPU_IDLE_ENTER_PARAM(sbi_suspend, idx, state); + return CPU_PM_CPU_IDLE_ENTER_PARAM(riscv_sbi_hart_suspend, idx, state); else - return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(sbi_suspend, + return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(riscv_sbi_hart_suspend, idx, state); } @@ -133,7 +115,7 @@ static __cpuidle int __sbi_enter_domain_idle_state(struct cpuidle_device *dev, else state = states[idx]; - ret = sbi_suspend(state) ? -1 : idx; + ret = riscv_sbi_hart_suspend(state) ? -1 : idx; ct_cpuidle_exit(); @@ -206,17 +188,6 @@ static const struct of_device_id sbi_cpuidle_state_match[] = { { }, }; -static bool sbi_suspend_state_is_valid(u32 state) -{ - if (state > SBI_HSM_SUSPEND_RET_DEFAULT && - state < SBI_HSM_SUSPEND_RET_PLATFORM) - return false; - if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT && - state < SBI_HSM_SUSPEND_NON_RET_PLATFORM) - return false; - return true; -} - static int sbi_dt_parse_state_node(struct device_node *np, u32 *state) { int err = of_property_read_u32(np, "riscv,sbi-suspend-param", state); @@ -226,7 +197,7 @@ static int sbi_dt_parse_state_node(struct device_node *np, u32 *state) return err; } - if (!sbi_suspend_state_is_valid(*state)) { + if (!riscv_sbi_suspend_state_is_valid(*state)) { pr_warn("Invalid SBI suspend state %#x\n", *state); return -EINVAL; } @@ -267,19 +238,16 @@ static int sbi_cpuidle_dt_init_states(struct device *dev, { struct sbi_cpuidle_data *data = per_cpu_ptr(&sbi_cpuidle_data, cpu); struct device_node *state_node; - struct device_node *cpu_node; u32 *states; int i, ret; - cpu_node = of_cpu_device_node_get(cpu); + struct device_node *cpu_node __free(device_node) = of_cpu_device_node_get(cpu); if (!cpu_node) return -ENODEV; states = devm_kcalloc(dev, state_count, sizeof(*states), GFP_KERNEL); - if (!states) { - ret = -ENOMEM; - goto fail; - } + if (!states) + return -ENOMEM; /* Parse SBI specific details from state DT nodes */ for (i = 1; i < state_count; i++) { @@ -295,10 +263,8 @@ static int sbi_cpuidle_dt_init_states(struct device *dev, pr_debug("sbi-state %#x index %d\n", states[i], i); } - if (i != state_count) { - ret = -ENODEV; - goto fail; - } + if (i != state_count) + return -ENODEV; /* Initialize optional data, used for the hierarchical topology. */ ret = sbi_dt_cpu_init_topology(drv, data, state_count, cpu); @@ -308,10 +274,7 @@ static int sbi_cpuidle_dt_init_states(struct device *dev, /* Store states in the per-cpu struct. */ data->states = states; -fail: - of_node_put(cpu_node); - - return ret; + return 0; } static void sbi_cpuidle_deinit_cpu(int cpu) @@ -367,6 +330,9 @@ static int sbi_cpuidle_init_cpu(struct device *dev, int cpu) return ret; } + if (cpuidle_disabled()) + return 0; + ret = cpuidle_register(drv, NULL); if (ret) goto deinit; @@ -486,7 +452,6 @@ static void sbi_pd_remove(void) static int sbi_genpd_probe(struct device_node *np) { - struct device_node *node; int ret = 0, pd_count = 0; if (!np) @@ -496,13 +461,13 @@ static int sbi_genpd_probe(struct device_node *np) * Parse child nodes for the "#power-domain-cells" property and * initialize a genpd/genpd-of-provider pair when it's found. */ - for_each_child_of_node(np, node) { + for_each_child_of_node_scoped(np, node) { if (!of_property_present(node, "#power-domain-cells")) continue; ret = sbi_pd_init(node); if (ret) - goto put_node; + goto remove_pd; pd_count++; } @@ -518,8 +483,6 @@ static int sbi_genpd_probe(struct device_node *np) return 0; -put_node: - of_node_put(node); remove_pd: sbi_pd_remove(); pr_err("failed to create CPU PM domains ret=%d\n", ret); @@ -541,12 +504,12 @@ static int sbi_cpuidle_probe(struct platform_device *pdev) int cpu, ret; struct cpuidle_driver *drv; struct cpuidle_device *dev; - struct device_node *np, *pds_node; + struct device_node *pds_node; /* Detect OSI support based on CPU DT nodes */ sbi_cpuidle_use_osi = true; for_each_possible_cpu(cpu) { - np = of_cpu_device_node_get(cpu); + struct device_node *np __free(device_node) = of_cpu_device_node_get(cpu); if (np && of_property_present(np, "power-domains") && of_property_present(np, "power-domain-names")) { @@ -579,7 +542,10 @@ static int sbi_cpuidle_probe(struct platform_device *pdev) /* Setup CPU hotplut notifiers */ sbi_idle_init_cpuhp(); - pr_info("idle driver registered for all CPUs\n"); + if (cpuidle_disabled()) + pr_info("cpuidle is disabled\n"); + else + pr_info("idle driver registered for all CPUs\n"); return 0; @@ -607,16 +573,8 @@ static int __init sbi_cpuidle_init(void) int ret; struct platform_device *pdev; - /* - * The SBI HSM suspend function is only available when: - * 1) SBI version is 0.3 or higher - * 2) SBI HSM extension is available - */ - if ((sbi_spec_version < sbi_mk_version(0, 3)) || - !sbi_probe_extension(SBI_EXT_HSM)) { - pr_info("HSM suspend not available\n"); + if (!riscv_sbi_hsm_is_supported()) return 0; - } ret = platform_driver_register(&sbi_cpuidle_driver); if (ret) @@ -631,4 +589,4 @@ static int __init sbi_cpuidle_init(void) return 0; } -device_initcall(sbi_cpuidle_init); +arch_initcall(sbi_cpuidle_init); diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 737a026ef58a..0835da449db8 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -69,11 +69,15 @@ int cpuidle_play_dead(void) if (!drv) return -ENODEV; - /* Find lowest-power state that supports long-term idle */ - for (i = drv->state_count - 1; i >= 0; i--) + for (i = drv->state_count - 1; i >= 0; i--) { if (drv->states[i].enter_dead) - return drv->states[i].enter_dead(dev, i); + drv->states[i].enter_dead(dev, i); + } + /* + * If :enter_dead() is successful, it will never return, so reaching + * here means that all of them failed above or were not present. + */ return -ENODEV; } @@ -228,16 +232,13 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev, if (broadcast && tick_broadcast_enter()) { index = find_deepest_state(drv, dev, target_state->exit_latency_ns, CPUIDLE_FLAG_TIMER_STOP, false); - if (index < 0) { - default_idle_call(); - return -EBUSY; - } + target_state = &drv->states[index]; broadcast = false; } if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED) - leave_mm(dev->cpu); + leave_mm(); /* Take note of the planned idle state. */ sched_idle_set_state(target_state); @@ -409,7 +410,7 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index) * Min polling interval of 10usec is a guess. It is assuming that * for most users, the time for a single ping-pong workload like * perf bench pipe would generally complete within 10usec but - * this is hardware dependant. Actual time can be estimated with + * this is hardware dependent. Actual time can be estimated with * * perf bench sched pipe -l 10000 * diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index d9cda7f6ccb9..9bbfa594c442 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -16,6 +16,7 @@ #include <linux/cpumask.h> #include <linux/tick.h> #include <linux/cpu.h> +#include <linux/math64.h> #include "cpuidle.h" @@ -187,7 +188,7 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) s->target_residency = div_u64(s->target_residency_ns, NSEC_PER_USEC); if (s->exit_latency > 0) - s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC; + s->exit_latency_ns = mul_u32_u32(s->exit_latency, NSEC_PER_USEC); else if (s->exit_latency_ns < 0) s->exit_latency_ns = 0; else @@ -260,7 +261,7 @@ static void __cpuidle_unregister_driver(struct cpuidle_driver *drv) * @drv: a pointer to a valid struct cpuidle_driver * * Register the driver under a lock to prevent concurrent attempts to - * [un]register the driver from occuring at the same time. + * [un]register the driver from occurring at the same time. * * Returns 0 on success, a negative error code (returned by * __cpuidle_register_driver()) otherwise. @@ -295,7 +296,7 @@ EXPORT_SYMBOL_GPL(cpuidle_register_driver); * @drv: a pointer to a valid struct cpuidle_driver * * Unregisters the cpuidle driver under a lock to prevent concurrent attempts - * to [un]register the driver from occuring at the same time. @drv has to + * to [un]register the driver from occurring at the same time. @drv has to * match the currently registered driver. */ void cpuidle_unregister_driver(struct cpuidle_driver *drv) diff --git a/drivers/cpuidle/dt_idle_genpd.c b/drivers/cpuidle/dt_idle_genpd.c index 1af63c189039..203e9b754aea 100644 --- a/drivers/cpuidle/dt_idle_genpd.c +++ b/drivers/cpuidle/dt_idle_genpd.c @@ -130,11 +130,10 @@ out: int dt_idle_pd_init_topology(struct device_node *np) { - struct device_node *node; struct of_phandle_args child, parent; int ret; - for_each_child_of_node(np, node) { + for_each_child_of_node_scoped(np, node) { if (of_parse_phandle_with_args(node, "power-domains", "#power-domain-cells", 0, &parent)) continue; @@ -143,10 +142,8 @@ int dt_idle_pd_init_topology(struct device_node *np) child.args_count = 0; ret = of_genpd_add_subdomain(&parent, &child); of_node_put(parent.np); - if (ret) { - of_node_put(node); + if (ret) return ret; - } } return 0; @@ -154,11 +151,10 @@ int dt_idle_pd_init_topology(struct device_node *np) int dt_idle_pd_remove_topology(struct device_node *np) { - struct device_node *node; struct of_phandle_args child, parent; int ret; - for_each_child_of_node(np, node) { + for_each_child_of_node_scoped(np, node) { if (of_parse_phandle_with_args(node, "power-domains", "#power-domain-cells", 0, &parent)) continue; @@ -167,10 +163,8 @@ int dt_idle_pd_remove_topology(struct device_node *np) child.args_count = 0; ret = of_genpd_remove_subdomain(&parent, &child); of_node_put(parent.np); - if (ret) { - of_node_put(node); + if (ret) return ret; - } } return 0; diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c index 1dff3a52917d..663b7f164d20 100644 --- a/drivers/cpuidle/governors/haltpoll.c +++ b/drivers/cpuidle/governors/haltpoll.c @@ -98,10 +98,15 @@ static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns) unsigned int shrink = guest_halt_poll_shrink; val = dev->poll_limit_ns; - if (shrink == 0) + if (shrink == 0) { val = 0; - else + } else { val /= shrink; + /* Reset value to 0 if shrunk below grow_start */ + if (val < guest_halt_poll_grow_start) + val = 0; + } + trace_guest_halt_poll_ns_shrink(val, dev->poll_limit_ns); dev->poll_limit_ns = val; } diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 8e9058c4ea63..6617eb494a11 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -44,6 +44,7 @@ static DEFINE_PER_CPU(struct ladder_device, ladder_devices); /** * ladder_do_selection - prepares private data for a state change + * @dev: the CPU * @ldev: the ladder device * @old_idx: the current state index * @new_idx: the new target state index diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index b96e3da0fedd..28363bfa3e4c 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -14,14 +14,12 @@ #include <linux/ktime.h> #include <linux/hrtimer.h> #include <linux/tick.h> -#include <linux/sched.h> -#include <linux/sched/loadavg.h> #include <linux/sched/stat.h> #include <linux/math64.h> #include "gov.h" -#define BUCKETS 12 +#define BUCKETS 6 #define INTERVAL_SHIFT 3 #define INTERVALS (1UL << INTERVAL_SHIFT) #define RESOLUTION 1024 @@ -31,12 +29,11 @@ /* * Concepts and ideas behind the menu governor * - * For the menu governor, there are 3 decision factors for picking a C + * For the menu governor, there are 2 decision factors for picking a C * state: * 1) Energy break even point - * 2) Performance impact - * 3) Latency tolerance (from pmqos infrastructure) - * These three factors are treated independently. + * 2) Latency tolerance (from pmqos infrastructure) + * These two factors are treated independently. * * Energy break even point * ----------------------- @@ -77,35 +74,6 @@ * intervals and if the stand deviation of these 8 intervals is below a * threshold value, we use the average of these intervals as prediction. * - * Limiting Performance Impact - * --------------------------- - * C states, especially those with large exit latencies, can have a real - * noticeable impact on workloads, which is not acceptable for most sysadmins, - * and in addition, less performance has a power price of its own. - * - * As a general rule of thumb, menu assumes that the following heuristic - * holds: - * The busier the system, the less impact of C states is acceptable - * - * This rule-of-thumb is implemented using a performance-multiplier: - * If the exit latency times the performance multiplier is longer than - * the predicted duration, the C state is not considered a candidate - * for selection due to a too high performance impact. So the higher - * this multiplier is, the longer we need to be idle to pick a deep C - * state, and thus the less likely a busy CPU will hit such a deep - * C state. - * - * Two factors are used in determing this multiplier: - * a value of 10 is added for each point of "per cpu load average" we have. - * a value of 5 points is added for each process that is waiting for - * IO on this CPU. - * (these values are experimentally determined) - * - * The load average factor gives a longer term (few seconds) input to the - * decision, while the iowait value gives a cpu local instantanious input. - * The iowait factor may look low, but realize that this is also already - * represented in the system load average. - * */ struct menu_device { @@ -119,19 +87,10 @@ struct menu_device { int interval_ptr; }; -static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters) +static inline int which_bucket(u64 duration_ns) { int bucket = 0; - /* - * We keep two groups of stats; one with no - * IO pending, one without. - * This allows us to calculate - * E(duration)|iowait - */ - if (nr_iowaiters) - bucket = BUCKETS/2; - if (duration_ns < 10ULL * NSEC_PER_USEC) return bucket; if (duration_ns < 100ULL * NSEC_PER_USEC) @@ -145,19 +104,6 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters) return bucket + 5; } -/* - * Return a multiplier for the exit latency that is intended - * to take performance requirements into account. - * The more performance critical we estimate the system - * to be, the higher this multiplier, and thus the higher - * the barrier to go to an expensive C state. - */ -static inline int performance_multiplier(unsigned int nr_iowaiters) -{ - /* for IO wait tasks (per cpu!) we add 10x each */ - return 1 + 10 * nr_iowaiters; -} - static DEFINE_PER_CPU(struct menu_device, menu_devices); static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); @@ -265,8 +211,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, struct menu_device *data = this_cpu_ptr(&menu_devices); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); u64 predicted_ns; - u64 interactivity_req; - unsigned int nr_iowaiters; ktime_t delta, delta_tick; int i, idx; @@ -275,8 +219,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, data->needs_update = 0; } - nr_iowaiters = nr_iowait_cpu(dev->cpu); - /* Find the shortest expected idle interval. */ predicted_ns = get_typical_interval(data) * NSEC_PER_USEC; if (predicted_ns > RESIDENCY_THRESHOLD_NS) { @@ -290,7 +232,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } data->next_timer_ns = delta; - data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); + data->bucket = which_bucket(data->next_timer_ns); /* Round up the result for half microseconds. */ timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 + @@ -308,7 +250,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ data->next_timer_ns = KTIME_MAX; delta_tick = TICK_NSEC / 2; - data->bucket = which_bucket(KTIME_MAX, nr_iowaiters); + data->bucket = which_bucket(KTIME_MAX); } if (unlikely(drv->state_count <= 1 || latency_req == 0) || @@ -335,15 +277,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ if (predicted_ns < TICK_NSEC) predicted_ns = data->next_timer_ns; - } else { - /* - * Use the performance multiplier and the user-configurable - * latency_req to determine the maximum exit latency. - */ - interactivity_req = div64_u64(predicted_ns, - performance_multiplier(nr_iowaiters)); - if (latency_req > interactivity_req) - latency_req = interactivity_req; + } else if (latency_req > predicted_ns) { + latency_req = predicted_ns; } /* diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 7244f71c59c5..8fe5e1b47ef9 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -2,38 +2,35 @@ /* * Timer events oriented CPU idle governor * - * TEO governor: * Copyright (C) 2018 - 2021 Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> - * - * Util-awareness mechanism: - * Copyright (C) 2022 Arm Ltd. - * Author: Kajetan Puchalski <kajetan.puchalski@arm.com> */ /** * DOC: teo-description * * The idea of this governor is based on the observation that on many systems - * timer events are two or more orders of magnitude more frequent than any - * other interrupts, so they are likely to be the most significant cause of CPU - * wakeups from idle states. Moreover, information about what happened in the - * (relatively recent) past can be used to estimate whether or not the deepest - * idle state with target residency within the (known) time till the closest - * timer event, referred to as the sleep length, is likely to be suitable for - * the upcoming CPU idle period and, if not, then which of the shallower idle - * states to choose instead of it. + * timer interrupts are two or more orders of magnitude more frequent than any + * other interrupt types, so they are likely to dominate CPU wakeup patterns. + * Moreover, in principle, the time when the next timer event is going to occur + * can be determined at the idle state selection time, although doing that may + * be costly, so it can be regarded as the most reliable source of information + * for idle state selection. * - * Of course, non-timer wakeup sources are more important in some use cases - * which can be covered by taking a few most recent idle time intervals of the - * CPU into account. However, even in that context it is not necessary to - * consider idle duration values greater than the sleep length, because the - * closest timer will ultimately wake up the CPU anyway unless it is woken up - * earlier. + * Of course, non-timer wakeup sources are more important in some use cases, + * but even then it is generally unnecessary to consider idle duration values + * greater than the time time till the next timer event, referred as the sleep + * length in what follows, because the closest timer will ultimately wake up the + * CPU anyway unless it is woken up earlier. * - * Thus this governor estimates whether or not the prospective idle duration of - * a CPU is likely to be significantly shorter than the sleep length and selects - * an idle state for it accordingly. + * However, since obtaining the sleep length may be costly, the governor first + * checks if it can select a shallow idle state using wakeup pattern information + * from recent times, in which case it can do without knowing the sleep length + * at all. For this purpose, it counts CPU wakeup events and looks for an idle + * state whose target residency has not exceeded the idle duration (measured + * after wakeup) in the majority of relevant recent cases. If the target + * residency of that state is small enough, it may be used right away and the + * sleep length need not be determined. * * The computations carried out by this governor are based on using bins whose * boundaries are aligned with the target residency parameter values of the CPU @@ -54,105 +51,65 @@ * sleep length and the idle duration measured after CPU wakeup fall into the * same bin (that is, the CPU appears to wake up "on time" relative to the sleep * length). In turn, the "intercepts" metric reflects the relative frequency of - * situations in which the measured idle duration is so much shorter than the - * sleep length that the bin it falls into corresponds to an idle state - * shallower than the one whose bin is fallen into by the sleep length (these - * situations are referred to as "intercepts" below). + * non-timer wakeup events for which the measured idle duration falls into a bin + * that corresponds to an idle state shallower than the one whose bin is fallen + * into by the sleep length (these events are also referred to as "intercepts" + * below). * - * In addition to the metrics described above, the governor counts recent - * intercepts (that is, intercepts that have occurred during the last - * %NR_RECENT invocations of it for the given CPU) for each bin. + * The governor also counts "intercepts" with the measured idle duration below + * the tick period length and uses this information when deciding whether or not + * to stop the scheduler tick. * * In order to select an idle state for a CPU, the governor takes the following * steps (modulo the possible latency constraint that must be taken into account * too): * - * 1. Find the deepest CPU idle state whose target residency does not exceed - * the current sleep length (the candidate idle state) and compute 3 sums as - * follows: - * - * - The sum of the "hits" and "intercepts" metrics for the candidate state - * and all of the deeper idle states (it represents the cases in which the - * CPU was idle long enough to avoid being intercepted if the sleep length - * had been equal to the current one). + * 1. Find the deepest enabled CPU idle state (the candidate idle state) and + * compute 2 sums as follows: * - * - The sum of the "intercepts" metrics for all of the idle states shallower - * than the candidate one (it represents the cases in which the CPU was not - * idle long enough to avoid being intercepted if the sleep length had been - * equal to the current one). + * - The sum of the "hits" metric for all of the idle states shallower than + * the candidate one (it represents the cases in which the CPU was likely + * woken up by a timer). * - * - The sum of the numbers of recent intercepts for all of the idle states - * shallower than the candidate one. + * - The sum of the "intercepts" metric for all of the idle states shallower + * than the candidate one (it represents the cases in which the CPU was + * likely woken up by a non-timer wakeup source). * - * 2. If the second sum is greater than the first one or the third sum is - * greater than %NR_RECENT / 2, the CPU is likely to wake up early, so look - * for an alternative idle state to select. + * 2. If the second sum computed in step 1 is greater than a half of the sum of + * both metrics for the candidate state bin and all subsequent bins(if any), + * a shallower idle state is likely to be more suitable, so look for it. * - * - Traverse the idle states shallower than the candidate one in the + * - Traverse the enabled idle states shallower than the candidate one in the * descending order. * - * - For each of them compute the sum of the "intercepts" metrics and the sum - * of the numbers of recent intercepts over all of the idle states between - * it and the candidate one (including the former and excluding the - * latter). - * - * - If each of these sums that needs to be taken into account (because the - * check related to it has indicated that the CPU is likely to wake up - * early) is greater than a half of the corresponding sum computed in step - * 1 (which means that the target residency of the state in question had - * not exceeded the idle duration in over a half of the relevant cases), - * select the given idle state instead of the candidate one. - * - * 3. By default, select the candidate state. - * - * Util-awareness mechanism: + * - For each of them compute the sum of the "intercepts" metrics over all + * of the idle states between it and the candidate one (including the + * former and excluding the latter). * - * The idea behind the util-awareness extension is that there are two distinct - * scenarios for the CPU which should result in two different approaches to idle - * state selection - utilized and not utilized. + * - If this sum is greater than a half of the second sum computed in step 1, + * use the given idle state as the new candidate one. * - * In this case, 'utilized' means that the average runqueue util of the CPU is - * above a certain threshold. + * 3. If the current candidate state is state 0 or its target residency is short + * enough, return it and prevent the scheduler tick from being stopped. * - * When the CPU is utilized while going into idle, more likely than not it will - * be woken up to do more work soon and so a shallower idle state should be - * selected to minimise latency and maximise performance. When the CPU is not - * being utilized, the usual metrics-based approach to selecting the deepest - * available idle state should be preferred to take advantage of the power - * saving. - * - * In order to achieve this, the governor uses a utilization threshold. - * The threshold is computed per-CPU as a percentage of the CPU's capacity - * by bit shifting the capacity value. Based on testing, the shift of 6 (~1.56%) - * seems to be getting the best results. - * - * Before selecting the next idle state, the governor compares the current CPU - * util to the precomputed util threshold. If it's below, it defaults to the - * TEO metrics mechanism. If it's above, the closest shallower idle state will - * be selected instead, as long as is not a polling state. + * 4. Obtain the sleep length value and check if it is below the target + * residency of the current candidate state, in which case a new shallower + * candidate state needs to be found, so look for it. */ #include <linux/cpuidle.h> #include <linux/jiffies.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/sched/clock.h> -#include <linux/sched/topology.h> #include <linux/tick.h> #include "gov.h" /* - * The number of bits to shift the CPU's capacity by in order to determine - * the utilized threshold. - * - * 6 was chosen based on testing as the number that achieved the best balance - * of power and performance on average. - * - * The resulting threshold is high enough to not be triggered by background - * noise and low enough to react quickly when activity starts to ramp up. + * Idle state exit latency threshold used for deciding whether or not to check + * the time till the closest expected timer event. */ -#define UTIL_THRESHOLD_SHIFT 6 +#define LATENCY_THRESHOLD_NS (RESIDENCY_THRESHOLD_NS / 2) /* * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value @@ -161,66 +118,37 @@ #define PULSE 1024 #define DECAY_SHIFT 3 -/* - * Number of the most recent idle duration values to take into consideration for - * the detection of recent early wakeup patterns. - */ -#define NR_RECENT 9 - /** * struct teo_bin - Metrics used by the TEO cpuidle governor. * @intercepts: The "intercepts" metric. * @hits: The "hits" metric. - * @recent: The number of recent "intercepts". */ struct teo_bin { unsigned int intercepts; unsigned int hits; - unsigned int recent; }; /** * struct teo_cpu - CPU data used by the TEO cpuidle governor. - * @time_span_ns: Time between idle state selection and post-wakeup update. * @sleep_length_ns: Time till the closest timer event (at the selection time). * @state_bins: Idle state data bins for this CPU. * @total: Grand total of the "intercepts" and "hits" metrics for all bins. - * @next_recent_idx: Index of the next @recent_idx entry to update. - * @recent_idx: Indices of bins corresponding to recent "intercepts". - * @tick_hits: Number of "hits" after TICK_NSEC. - * @util_threshold: Threshold above which the CPU is considered utilized + * @tick_intercepts: "Intercepts" before TICK_NSEC. + * @short_idles: Wakeups after short idle periods. + * @artificial_wakeup: Set if the wakeup has been triggered by a safety net. */ struct teo_cpu { - s64 time_span_ns; s64 sleep_length_ns; struct teo_bin state_bins[CPUIDLE_STATE_MAX]; unsigned int total; - int next_recent_idx; - int recent_idx[NR_RECENT]; - unsigned int tick_hits; - unsigned long util_threshold; + unsigned int tick_intercepts; + unsigned int short_idles; + bool artificial_wakeup; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); /** - * teo_cpu_is_utilized - Check if the CPU's util is above the threshold - * @cpu: Target CPU - * @cpu_data: Governor CPU data for the target CPU - */ -#ifdef CONFIG_SMP -static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data) -{ - return sched_cpu_util(cpu) > cpu_data->util_threshold; -} -#else -static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data) -{ - return false; -} -#endif - -/** * teo_update - Update CPU metrics after wakeup. * @drv: cpuidle driver containing state data. * @dev: Target CPU. @@ -232,23 +160,17 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) s64 target_residency_ns; u64 measured_ns; - if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { + cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT; + + if (cpu_data->artificial_wakeup) { /* - * One of the safety nets has triggered or the wakeup was close - * enough to the closest timer event expected at the idle state - * selection time to be discarded. + * If one of the safety nets has triggered, assume that this + * might have been a long sleep. */ measured_ns = U64_MAX; } else { u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; - /* - * The computations below are to determine whether or not the - * (saved) time till the next timer event and the measured idle - * duration fall into the same "bin", so use last_residency_ns - * for that instead of time_span_ns which includes the cpuidle - * overhead. - */ measured_ns = dev->last_residency_ns; /* * The delay between the wakeup and the first instruction @@ -256,14 +178,16 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * time, so take 1/2 of the exit latency as a very rough * approximation of the average of it. */ - if (measured_ns >= lat_ns) + if (measured_ns >= lat_ns) { measured_ns -= lat_ns / 2; - else + if (measured_ns < RESIDENCY_THRESHOLD_NS) + cpu_data->short_idles += PULSE; + } else { measured_ns /= 2; + cpu_data->short_idles += PULSE; + } } - cpu_data->total = 0; - /* * Decay the "hits" and "intercepts" metrics for all of the bins and * find the bins that the sleep length and the measured idle duration @@ -275,8 +199,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) bin->hits -= bin->hits >> DECAY_SHIFT; bin->intercepts -= bin->intercepts >> DECAY_SHIFT; - cpu_data->total += bin->hits + bin->intercepts; - target_residency_ns = drv->states[i].target_residency_ns; if (target_residency_ns <= cpu_data->sleep_length_ns) { @@ -286,33 +208,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } } - i = cpu_data->next_recent_idx++; - if (cpu_data->next_recent_idx >= NR_RECENT) - cpu_data->next_recent_idx = 0; - - if (cpu_data->recent_idx[i] >= 0) - cpu_data->state_bins[cpu_data->recent_idx[i]].recent--; - - /* - * If the deepest state's target residency is below the tick length, - * make a record of it to help teo_select() decide whether or not - * to stop the tick. This effectively adds an extra hits-only bin - * beyond the last state-related one. - */ - if (target_residency_ns < TICK_NSEC) { - cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT; - - cpu_data->total += cpu_data->tick_hits; - - if (TICK_NSEC <= cpu_data->sleep_length_ns) { - idx_timer = drv->state_count; - if (TICK_NSEC <= measured_ns) { - cpu_data->tick_hits += PULSE; - goto end; - } - } - } - + cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT; /* * If the measured idle duration falls into the same bin as the sleep * length, this is a "hit", so update the "hits" metric for that bin. @@ -321,14 +217,13 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) */ if (idx_timer == idx_duration) { cpu_data->state_bins[idx_timer].hits += PULSE; - cpu_data->recent_idx[i] = -1; } else { cpu_data->state_bins[idx_duration].intercepts += PULSE; - cpu_data->state_bins[idx_duration].recent++; - cpu_data->recent_idx[i] = idx_duration; + if (TICK_NSEC <= measured_ns) + cpu_data->tick_intercepts += PULSE; } -end: + cpu_data->total -= cpu_data->total >> DECAY_SHIFT; cpu_data->total += PULSE; } @@ -376,17 +271,12 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); ktime_t delta_tick = TICK_NSEC / 2; - unsigned int tick_intercept_sum = 0; unsigned int idx_intercept_sum = 0; unsigned int intercept_sum = 0; - unsigned int idx_recent_sum = 0; - unsigned int recent_sum = 0; unsigned int idx_hit_sum = 0; unsigned int hit_sum = 0; int constraint_idx = 0; int idx0 = 0, idx = -1; - bool alt_intercepts, alt_recent; - bool cpu_utilized; s64 duration_ns; int i; @@ -395,10 +285,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, dev->last_state_idx = -1; } - cpu_data->time_span_ns = local_clock(); /* - * Set the expected sleep length to infinity in case of an early - * return. + * Set the sleep length to infinity in case the invocation of + * tick_nohz_get_sleep_length() below is skipped, in which case it won't + * be known whether or not the subsequent wakeup is caused by a timer. + * It is generally fine to count the wakeup as an intercept then, except + * for the cases when the CPU is mostly woken up by timers and there may + * be opportunities to ask for a deeper idle state when no imminent + * timers are scheduled which may be missed. */ cpu_data->sleep_length_ns = KTIME_MAX; @@ -411,32 +305,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (!dev->states_usage[0].disable) idx = 0; - cpu_utilized = teo_cpu_is_utilized(dev->cpu, cpu_data); - /* - * If the CPU is being utilized over the threshold and there are only 2 - * states to choose from, the metrics need not be considered, so choose - * the shallowest non-polling state and exit. - */ - if (drv->state_count < 3 && cpu_utilized) { - /* - * If state 0 is enabled and it is not a polling one, select it - * right away unless the scheduler tick has been stopped, in - * which case care needs to be taken to leave the CPU in a deep - * enough state in case it is not woken up any time soon after - * all. If state 1 is disabled, though, state 0 must be used - * anyway. - */ - if ((!idx && !(drv->states[0].flags & CPUIDLE_FLAG_POLLING) && - teo_state_ok(0, drv)) || dev->states_usage[1].disable) { - idx = 0; - goto out_tick; - } - /* Assume that state 1 is not a polling one and use it. */ - idx = 1; - duration_ns = drv->states[1].target_residency_ns; - goto end; - } - /* Compute the sums of metrics for early wakeup pattern detection. */ for (i = 1; i < drv->state_count; i++) { struct teo_bin *prev_bin = &cpu_data->state_bins[i-1]; @@ -448,7 +316,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ intercept_sum += prev_bin->intercepts; hit_sum += prev_bin->hits; - recent_sum += prev_bin->recent; if (dev->states_usage[i].disable) continue; @@ -464,7 +331,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, /* Save the sums for the current state. */ idx_intercept_sum = intercept_sum; idx_hit_sum = hit_sum; - idx_recent_sum = recent_sum; } /* Avoid unnecessary overhead. */ @@ -482,74 +348,68 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, goto end; } - tick_intercept_sum = intercept_sum + - cpu_data->state_bins[drv->state_count-1].intercepts; - /* * If the sum of the intercepts metric for all of the idle states * shallower than the current candidate one (idx) is greater than the * sum of the intercepts and hits metrics for the candidate state and - * all of the deeper states, or the sum of the numbers of recent - * intercepts over all of the states shallower than the candidate one - * is greater than a half of the number of recent events taken into - * account, a shallower idle state is likely to be a better choice. + * all of the deeper states, a shallower idle state is likely to be a + * better choice. */ - alt_intercepts = 2 * idx_intercept_sum > cpu_data->total - idx_hit_sum; - alt_recent = idx_recent_sum > NR_RECENT / 2; - if (alt_recent || alt_intercepts) { + if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) { int first_suitable_idx = idx; /* * Look for the deepest idle state whose target residency had * not exceeded the idle duration in over a half of the relevant - * cases (both with respect to intercepts overall and with - * respect to the recent intercepts only) in the past. + * cases in the past. * * Take the possible duration limitation present if the tick * has been stopped already into account. */ intercept_sum = 0; - recent_sum = 0; for (i = idx - 1; i >= 0; i--) { struct teo_bin *bin = &cpu_data->state_bins[i]; intercept_sum += bin->intercepts; - recent_sum += bin->recent; - if ((!alt_recent || 2 * recent_sum > idx_recent_sum) && - (!alt_intercepts || - 2 * intercept_sum > idx_intercept_sum)) { + if (2 * intercept_sum > idx_intercept_sum) { /* * Use the current state unless it is too * shallow or disabled, in which case take the * first enabled state that is deep enough. */ if (teo_state_ok(i, drv) && - !dev->states_usage[i].disable) + !dev->states_usage[i].disable) { idx = i; - else - idx = first_suitable_idx; - + break; + } + idx = first_suitable_idx; break; } if (dev->states_usage[i].disable) continue; - if (!teo_state_ok(i, drv)) { + if (teo_state_ok(i, drv)) { /* - * The current state is too shallow, but if an - * alternative candidate state has been found, - * it may still turn out to be a better choice. + * The current state is deep enough, but still + * there may be a better one. */ - if (first_suitable_idx != idx) - continue; - - break; + first_suitable_idx = i; + continue; } - first_suitable_idx = i; + /* + * The current state is too shallow, so if no suitable + * states other than the initial candidate have been + * found, give up (the remaining states to check are + * shallower still), but otherwise the first suitable + * state other than the initial candidate may turn out + * to be preferable. + */ + if (first_suitable_idx == idx) + break; } } @@ -561,38 +421,41 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, idx = constraint_idx; /* - * If the CPU is being utilized over the threshold, choose a shallower - * non-polling state to improve latency, unless the scheduler tick has - * been stopped already and the shallower state's target residency is - * not sufficiently large. + * If either the candidate state is state 0 or its target residency is + * low enough, there is basically nothing more to do, but if the sleep + * length is not updated, the subsequent wakeup will be counted as an + * "intercept" which may be problematic in the cases when timer wakeups + * are dominant. Namely, it may effectively prevent deeper idle states + * from being selected at one point even if no imminent timers are + * scheduled. + * + * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one + * CPU are unlikely (user space has a default 50 us slack value for + * hrtimers and there are relatively few timers with a lower deadline + * value in the kernel), and even if they did happen, the potential + * benefit from using a deep idle state in that case would be + * questionable anyway for latency reasons. Thus if the measured idle + * duration falls into that range in the majority of cases, assume + * non-timer wakeups to be dominant and skip updating the sleep length + * to reduce latency. + * + * Also, if the latency constraint is sufficiently low, it will force + * shallow idle states regardless of the wakeup type, so the sleep + * length need not be known in that case. */ - if (cpu_utilized) { - i = teo_find_shallower_state(drv, dev, idx, KTIME_MAX, true); - if (teo_state_ok(i, drv)) - idx = i; - } - - /* - * Skip the timers check if state 0 is the current candidate one, - * because an immediate non-timer wakeup is expected in that case. - */ - if (!idx) - goto out_tick; - - /* - * If state 0 is a polling one, check if the target residency of - * the current candidate state is low enough and skip the timers - * check in that case too. - */ - if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) && - drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) + if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) && + (2 * cpu_data->short_idles >= cpu_data->total || + latency_req < LATENCY_THRESHOLD_NS)) goto out_tick; duration_ns = tick_nohz_get_sleep_length(&delta_tick); cpu_data->sleep_length_ns = duration_ns; + if (!idx) + goto out_tick; + /* - * If the closest expected timer is before the terget residency of the + * If the closest expected timer is before the target residency of the * candidate state, a shallower one needs to be found. */ if (drv->states[idx].target_residency_ns > duration_ns) { @@ -607,7 +470,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * total wakeup events, do not stop the tick. */ if (drv->states[idx].target_residency_ns < TICK_NSEC && - tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8) + cpu_data->tick_intercepts > cpu_data->total / 2 + cpu_data->total / 8) duration_ns = TICK_NSEC / 2; end: @@ -644,17 +507,16 @@ static void teo_reflect(struct cpuidle_device *dev, int state) struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); dev->last_state_idx = state; - /* - * If the wakeup was not "natural", but triggered by one of the safety - * nets, assume that the CPU might have been idle for the entire sleep - * length time. - */ if (dev->poll_time_limit || (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { + /* + * The wakeup was not "genuine", but triggered by one of the + * safety nets. + */ dev->poll_time_limit = false; - cpu_data->time_span_ns = cpu_data->sleep_length_ns; + cpu_data->artificial_wakeup = true; } else { - cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; + cpu_data->artificial_wakeup = false; } } @@ -667,14 +529,8 @@ static int teo_enable_device(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); - unsigned long max_capacity = arch_scale_cpu_capacity(dev->cpu); - int i; memset(cpu_data, 0, sizeof(*cpu_data)); - cpu_data->util_threshold = max_capacity >> UTIL_THRESHOLD_SHIFT; - - for (i = 0; i < NR_RECENT; i++) - cpu_data->recent_idx[i] = -1; return 0; } |