diff options
Diffstat (limited to 'kernel/sched/cpufreq_schedutil.c')
| -rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 259 |
1 files changed, 165 insertions, 94 deletions
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 4492608b7d7f..0ab5f9d4bc59 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -5,6 +5,8 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ +#include <uapi/linux/sched/types.h> +#include "sched.h" #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) @@ -47,7 +49,7 @@ struct sugov_cpu { u64 last_update; unsigned long util; - unsigned long bw_dl; + unsigned long bw_min; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -81,9 +83,23 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (!cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (unlikely(sg_policy->limits_changed)) { - sg_policy->limits_changed = false; + if (unlikely(READ_ONCE(sg_policy->limits_changed))) { + WRITE_ONCE(sg_policy->limits_changed, false); sg_policy->need_freq_update = true; + + /* + * The above limits_changed update must occur before the reads + * of policy limits in cpufreq_driver_resolve_freq() or a policy + * limits update might be missed, so use a memory barrier to + * ensure it. + * + * This pairs with the write memory barrier in sugov_limits(). + */ + smp_mb(); + + return true; + } else if (sg_policy->need_freq_update) { + /* ignore_dl_rate_limit() wants a new frequency to be found. */ return true; } @@ -95,10 +111,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->need_freq_update) - sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); - else if (sg_policy->next_freq == next_freq) + if (sg_policy->need_freq_update) { + sg_policy->need_freq_update = false; + /* + * The policy limits have changed, but if the return value of + * cpufreq_driver_resolve_freq() after applying the new limits + * is still equal to the previously selected frequency, the + * driver callback need not be invoked unless the driver + * specifically wants that to happen on every update of the + * policy limits. + */ + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) + return false; + } else if (sg_policy->next_freq == next_freq) { return false; + } sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -115,6 +143,32 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy) } /** + * get_capacity_ref_freq - get the reference frequency that has been used to + * correlate frequency and compute capacity for a given cpufreq policy. We use + * the CPU managing it for the arch_scale_freq_ref() call in the function. + * @policy: the cpufreq policy of the CPU in question. + * + * Return: the reference CPU frequency to compute a capacity. + */ +static __always_inline +unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy) +{ + unsigned int freq = arch_scale_freq_ref(policy->cpu); + + if (freq) + return freq; + + if (arch_scale_freq_invariant()) + return policy->cpuinfo.max_freq; + + /* + * Apply a 25% margin so that we select a higher frequency than + * the current one before the CPU is fully busy: + */ + return policy->cur + (policy->cur >> 2); +} + +/** * get_next_freq - Compute a new frequency for a given cpufreq policy. * @sg_policy: schedutil policy object to compute the new frequency for. * @util: Current CPU utilization. @@ -140,10 +194,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned long util, unsigned long max) { struct cpufreq_policy *policy = sg_policy->policy; - unsigned int freq = arch_scale_freq_invariant() ? - policy->cpuinfo.max_freq : policy->cur; + unsigned int freq; - util = map_util_perf(util); + freq = get_capacity_ref_freq(policy); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) @@ -153,14 +206,33 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(struct sugov_cpu *sg_cpu) +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long max) { - unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu); - struct rq *rq = cpu_rq(sg_cpu->cpu); + /* Add dvfs headroom to actual utilization */ + actual = map_util_perf(actual); + /* Actually we don't need to target the max performance */ + if (actual < max) + max = actual; - sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util, - FREQUENCY_UTIL, NULL); + /* + * Ensure at least minimum performance while providing more compute + * capacity when possible. + */ + return max(min, max); +} + +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) +{ + unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu); + + if (!scx_switched_all()) + util += cpu_util_cfs_boost(sg_cpu->cpu); + util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); + util = max(util, boost); + sg_cpu->bw_min = min; + sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max); } /** @@ -251,18 +323,16 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, unsigned long max_cap) { - unsigned long boost; - /* No boost currently required */ if (!sg_cpu->iowait_boost) - return; + return 0; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return; + return 0; if (!sg_cpu->iowait_boost_pending) { /* @@ -271,7 +341,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return; + return 0; } } @@ -281,24 +351,40 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; - boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); - if (sg_cpu->util < boost) - sg_cpu->util = boost; + return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; } #ifdef CONFIG_NO_HZ_COMMON -static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); - bool ret = idle_calls == sg_cpu->saved_idle_calls; + unsigned long idle_calls; + bool ret; + + /* + * The heuristics in this function is for the fair class. For SCX, the + * performance target comes directly from the BPF scheduler. Let's just + * follow it. + */ + if (scx_switched_all()) + return false; + + /* if capped by uclamp_max, always update to be in compliance */ + if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) + return false; + + /* + * Maintain the frequency if the CPU has not been idle recently, as + * reduction is likely to be premature. + */ + idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); + ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; return ret; } -#else -static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } -#endif /* CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ +static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } +#endif /* !CONFIG_NO_HZ_COMMON */ /* * Make sugov_should_update_freq() ignore the rate limit when DL @@ -306,14 +392,16 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } */ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) - sg_cpu->sg_policy->limits_changed = true; + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) + sg_cpu->sg_policy->need_freq_update = true; } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, u64 time, unsigned long max_cap, unsigned int flags) { + unsigned long boost; + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -322,8 +410,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, if (!sugov_should_update_freq(sg_cpu->sg_policy, time)) return false; - sugov_get_util(sg_cpu); - sugov_iowait_apply(sg_cpu, time, max_cap); + boost = sugov_iowait_apply(sg_cpu, time, max_cap); + sugov_get_util(sg_cpu, boost); return true; } @@ -343,14 +431,9 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, return; next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. - */ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { + + if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq && + !sg_policy->need_freq_update) { next_f = sg_policy->next_freq; /* Restore cached freq as next_freq has changed */ @@ -396,18 +479,11 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; - /* - * Do not reduce the target performance level if the CPU has not been - * idle recently, as the reduction is likely to be premature then. - * - * Except when the rq is capped by uclamp_max. - */ - if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; - cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), - map_util_perf(sg_cpu->util), max_cap); + cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, + sg_cpu->util, max_cap); sg_cpu->sg_policy->last_freq_update_time = time; } @@ -423,9 +499,10 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); + unsigned long boost; - sugov_get_util(j_sg_cpu); - sugov_iowait_apply(j_sg_cpu, time, max_cap); + boost = sugov_iowait_apply(j_sg_cpu, time, max_cap); + sugov_get_util(j_sg_cpu, boost); util = max(j_sg_cpu->util, util); } @@ -555,7 +632,7 @@ static const struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ -struct cpufreq_governor schedutil_gov; +static struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) { @@ -588,9 +665,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) * Fake (unused) bandwidth; workaround to "fix" * priority inheritance. */ - .sched_runtime = 1000000, - .sched_deadline = 10000000, - .sched_period = 10000000, + .sched_runtime = NSEC_PER_MSEC, + .sched_deadline = 10 * NSEC_PER_MSEC, + .sched_period = 10 * NSEC_PER_MSEC, }; struct cpufreq_policy *policy = sg_policy->policy; int ret; @@ -617,7 +694,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + if (policy->dvfs_possible_from_any_cpu) + set_cpus_allowed_ptr(thread, policy->related_cpus); + else + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -710,6 +791,11 @@ static int sugov_init(struct cpufreq_policy *policy) goto fail; out: + /* + * Schedutil is the preferred governor for EAS, so rebuild sched domains + * on governor changes to make sure the scheduler knows about them. + */ + em_rebuild_sched_domains(); mutex_unlock(&global_tunables_lock); return 0; @@ -750,6 +836,8 @@ static void sugov_exit(struct cpufreq_policy *policy) sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); cpufreq_disable_fast_switch(policy); + + em_rebuild_sched_domains(); } static int sugov_start(struct cpufreq_policy *policy) @@ -767,14 +855,6 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); - for_each_cpu(cpu, policy->cpus) { - struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); - - memset(sg_cpu, 0, sizeof(*sg_cpu)); - sg_cpu->cpu = cpu; - sg_cpu->sg_policy = sg_policy; - } - if (policy_is_shared(policy)) uu = sugov_update_shared; else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf()) @@ -785,6 +865,9 @@ static int sugov_start(struct cpufreq_policy *policy) for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + memset(sg_cpu, 0, sizeof(*sg_cpu)); + sg_cpu->cpu = cpu; + sg_cpu->sg_policy = sg_policy; cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu); } return 0; @@ -816,10 +899,19 @@ static void sugov_limits(struct cpufreq_policy *policy) mutex_unlock(&sg_policy->work_lock); } - sg_policy->limits_changed = true; + /* + * The limits_changed update below must take place before the updates + * of policy limits in cpufreq_set_policy() or a policy limits update + * might be missed, so use a memory barrier to ensure it. + * + * This pairs with the memory barrier in sugov_should_update_freq(). + */ + smp_wmb(); + + WRITE_ONCE(sg_policy->limits_changed, true); } -struct cpufreq_governor schedutil_gov = { +static struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, @@ -837,30 +929,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif -cpufreq_governor_init(schedutil_gov); - -#ifdef CONFIG_ENERGY_MODEL -static void rebuild_sd_workfn(struct work_struct *work) +bool sugov_is_governor(struct cpufreq_policy *policy) { - rebuild_sched_domains_energy(); + return policy->governor == &schedutil_gov; } -static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); -/* - * EAS shouldn't be attempted without sugov, so rebuild the sched_domains - * on governor changes to make sure the scheduler knows about it. - */ -void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - struct cpufreq_governor *old_gov) -{ - if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) { - /* - * When called from the cpufreq_register_driver() path, the - * cpu_hotplug_lock is already held, so use a work item to - * avoid nested locking in rebuild_sched_domains(). - */ - schedule_work(&rebuild_sd_work); - } - -} -#endif +cpufreq_governor_init(schedutil_gov); |
