diff options
Diffstat (limited to 'kernel/sched/cpufreq_schedutil.c')
| -rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 571 |
1 files changed, 285 insertions, 286 deletions
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 033ec7c45f13..0ab5f9d4bc59 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -5,13 +5,10 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - +#include <uapi/linux/sched/types.h> #include "sched.h" -#include <linux/sched/cpufreq.h> -#include <trace/events/power.h> +#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) struct sugov_tunables { struct gov_attr_set attr_set; @@ -24,7 +21,7 @@ struct sugov_policy { struct sugov_tunables *tunables; struct list_head tunables_hook; - raw_spinlock_t update_lock; /* For shared policies */ + raw_spinlock_t update_lock; u64 last_freq_update_time; s64 freq_update_delay_ns; unsigned int next_freq; @@ -38,6 +35,7 @@ struct sugov_policy { struct task_struct *thread; bool work_in_progress; + bool limits_changed; bool need_freq_update; }; @@ -48,11 +46,10 @@ struct sugov_cpu { bool iowait_boost_pending; unsigned int iowait_boost; - unsigned int iowait_boost_max; u64 last_update; - unsigned long bw_dl; - unsigned long max; + unsigned long util; + unsigned long bw_min; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -80,16 +77,31 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) * by the hardware, as calculating the frequency is pointless if * we cannot in fact act on it. * - * For the slow switching platforms, the kthread is always scheduled on - * the right set of CPUs and any CPU can find the next frequency and - * schedule the kthread. + * This is needed on the slow switching platforms too to prevent CPUs + * going offline from leaving stale IRQ work items behind. */ - if (sg_policy->policy->fast_switch_enabled && - !cpufreq_this_cpu_can_update(sg_policy->policy)) + if (!cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (unlikely(sg_policy->need_freq_update)) + if (unlikely(READ_ONCE(sg_policy->limits_changed))) { + WRITE_ONCE(sg_policy->limits_changed, false); + sg_policy->need_freq_update = true; + + /* + * The above limits_changed update must occur before the reads + * of policy limits in cpufreq_driver_resolve_freq() or a policy + * limits update might be missed, so use a memory barrier to + * ensure it. + * + * This pairs with the write memory barrier in sugov_limits(). + */ + smp_mb(); + + return true; + } else if (sg_policy->need_freq_update) { + /* ignore_dl_rate_limit() wants a new frequency to be found. */ return true; + } delta_ns = time - sg_policy->last_freq_update_time; @@ -99,8 +111,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->next_freq == next_freq) + if (sg_policy->need_freq_update) { + sg_policy->need_freq_update = false; + /* + * The policy limits have changed, but if the return value of + * cpufreq_driver_resolve_freq() after applying the new limits + * is still equal to the previously selected frequency, the + * driver callback need not be invoked unless the driver + * specifically wants that to happen on every update of the + * policy limits. + */ + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) + return false; + } else if (sg_policy->next_freq == next_freq) { return false; + } sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -108,28 +134,8 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, return true; } -static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) +static void sugov_deferred_update(struct sugov_policy *sg_policy) { - struct cpufreq_policy *policy = sg_policy->policy; - - if (!sugov_update_next_freq(sg_policy, time, next_freq)) - return; - - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; - - policy->cur = next_freq; - trace_cpu_frequency(next_freq, smp_processor_id()); -} - -static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) -{ - if (!sugov_update_next_freq(sg_policy, time, next_freq)) - return; - if (!sg_policy->work_in_progress) { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); @@ -137,6 +143,32 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, } /** + * get_capacity_ref_freq - get the reference frequency that has been used to + * correlate frequency and compute capacity for a given cpufreq policy. We use + * the CPU managing it for the arch_scale_freq_ref() call in the function. + * @policy: the cpufreq policy of the CPU in question. + * + * Return: the reference CPU frequency to compute a capacity. + */ +static __always_inline +unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy) +{ + unsigned int freq = arch_scale_freq_ref(policy->cpu); + + if (freq) + return freq; + + if (arch_scale_freq_invariant()) + return policy->cpuinfo.max_freq; + + /* + * Apply a 25% margin so that we select a higher frequency than + * the current one before the CPU is fully busy: + */ + return policy->cur + (policy->cur >> 2); +} + +/** * get_next_freq - Compute a new frequency for a given cpufreq policy. * @sg_policy: schedutil policy object to compute the new frequency for. * @util: Current CPU utilization. @@ -162,125 +194,45 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned long util, unsigned long max) { struct cpufreq_policy *policy = sg_policy->policy; - unsigned int freq = arch_scale_freq_invariant() ? - policy->cpuinfo.max_freq : policy->cur; + unsigned int freq; + freq = get_capacity_ref_freq(policy); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; - sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } -/* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. - * - * The scheduler tracks the following metrics: - * - * cpu_util_{cfs,rt,dl,irq}() - * cpu_bw_dl() - * - * Where the cfs,rt and dl util numbers are tracked with the same metric and - * synchronized windows and are thus directly comparable. - * - * The cfs,rt,dl utilization are the running times measured with rq->clock_task - * which excludes things like IRQ and steal-time. These latter are then accrued - * in the irq utilization. - * - * The DL bandwidth number otoh is not a measured metric but a value computed - * based on the task model parameters and gives the minimal utilization - * required to meet deadlines. - */ -unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type) +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long max) { - unsigned long dl_util, util, irq; - struct rq *rq = cpu_rq(cpu); - - if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) - return max; - - /* - * Early check to see if IRQ/steal time saturates the CPU, can be - * because of inaccuracies in how we track these -- see - * update_irq_load_avg(). - */ - irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) - return max; - - /* - * Because the time spend on RT/DL tasks is visible as 'lost' time to - * CFS tasks and we use the same metric to track the effective - * utilization (PELT windows are synchronized) we can directly add them - * to obtain the CPU's actual utilization. - */ - util = util_cfs; - util += cpu_util_rt(rq); - - dl_util = cpu_util_dl(rq); - - /* - * For frequency selection we do not make cpu_util_dl() a permanent part - * of this sum because we want to use cpu_bw_dl() later on, but we need - * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such - * that we select f_max when there is no idle time. - * - * NOTE: numerical errors or stop class might cause us to not quite hit - * saturation when we should -- something for later. - */ - if (util + dl_util >= max) - return max; - - /* - * OTOH, for energy computation we need the estimated running time, so - * include util_dl and ignore dl_bw. - */ - if (type == ENERGY_UTIL) - util += dl_util; - - /* - * There is still idle time; further improve the number by using the - * irq metric. Because IRQ/steal time is hidden from the task clock we - * need to scale the task numbers: - * - * 1 - irq - * U' = irq + ------- * U - * max - */ - util = scale_irq_capacity(util, irq, max); - util += irq; + /* Add dvfs headroom to actual utilization */ + actual = map_util_perf(actual); + /* Actually we don't need to target the max performance */ + if (actual < max) + max = actual; /* - * Bandwidth required by DEADLINE must always be granted while, for - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism - * to gracefully reduce the frequency when no tasks show up for longer - * periods of time. - * - * Ideally we would like to set bw_dl as min/guaranteed freq and util + - * bw_dl as requested freq. However, cpufreq is not yet ready for such - * an interface. So, we only do the latter for now. + * Ensure at least minimum performance while providing more compute + * capacity when possible. */ - if (type == FREQUENCY_UTIL) - util += cpu_bw_dl(rq); - - return min(max, util); + return max(min, max); } -static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) { - struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util = cpu_util_cfs(rq); - unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu); - sg_cpu->max = max; - sg_cpu->bw_dl = cpu_bw_dl(rq); - - return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); + if (!scx_switched_all()) + util += cpu_util_cfs_boost(sg_cpu->cpu); + util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); + util = max(util, boost); + sg_cpu->bw_min = min; + sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max); } /** @@ -291,8 +243,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * * The IO wait boost of a task is disabled after a tick since the last update * of a CPU. If a new IO wait boost is requested after more then a tick, then - * we enable the boost starting from the minimum frequency, which improves - * energy efficiency by ignoring sporadic wakeups from IO. + * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy + * efficiency by ignoring sporadic wakeups from IO. */ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, bool set_iowait_boost) @@ -303,8 +255,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, if (delta_ns <= TICK_NSEC) return false; - sg_cpu->iowait_boost = set_iowait_boost - ? sg_cpu->sg_policy->policy->min : 0; + sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0; sg_cpu->iowait_boost_pending = set_iowait_boost; return true; @@ -318,8 +269,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, * * Each time a task wakes up after an IO operation, the CPU utilization can be * boosted to a certain utilization which doubles at each "frequent and - * successive" wakeup from IO, ranging from the utilization of the minimum - * OPP to the utilization of the maximum OPP. + * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization + * of the maximum OPP. + * * To keep doubling, an IO boost has to be requested at least once per tick, * otherwise we restart from the utilization of the minimum OPP. */ @@ -344,22 +296,20 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, /* Double the boost at each request */ if (sg_cpu->iowait_boost) { - sg_cpu->iowait_boost <<= 1; - if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + sg_cpu->iowait_boost = + min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE); return; } /* First wakeup after IO: start with minimum boost */ - sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + sg_cpu->iowait_boost = IOWAIT_BOOST_MIN; } /** * sugov_iowait_apply() - Apply the IO boost to a CPU. * @sg_cpu: the sugov data for the cpu to boost * @time: the update time from the caller - * @util: the utilization to (eventually) boost - * @max: the maximum value the utilization can be boosted to + * @max_cap: the max CPU capacity * * A CPU running a task which woken up after an IO operation can have its * utilization boosted to speed up the completion of those IO operations. @@ -373,142 +323,191 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - unsigned long *util, unsigned long *max) +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long max_cap) { - unsigned int boost_util, boost_max; - /* No boost currently required */ if (!sg_cpu->iowait_boost) - return; + return 0; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return; + return 0; - /* - * An IO waiting task has just woken up: - * allow to further double the boost value - */ - if (sg_cpu->iowait_boost_pending) { - sg_cpu->iowait_boost_pending = false; - } else { + if (!sg_cpu->iowait_boost_pending) { /* - * Otherwise: reduce the boost value and disable it when we - * reach the minimum. + * No boost pending; reduce the boost value. */ sg_cpu->iowait_boost >>= 1; - if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return; + return 0; } } + sg_cpu->iowait_boost_pending = false; + /* - * Apply the current boost value: a CPU is boosted only if its current - * utilization is smaller then the current IO boost level. + * sg_cpu->util is already in capacity scale; convert iowait_boost + * into the same scale so we can compare. */ - boost_util = sg_cpu->iowait_boost; - boost_max = sg_cpu->iowait_boost_max; - if (*util * boost_max < *max * boost_util) { - *util = boost_util; - *max = boost_max; - } + return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; } #ifdef CONFIG_NO_HZ_COMMON -static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); - bool ret = idle_calls == sg_cpu->saved_idle_calls; + unsigned long idle_calls; + bool ret; + + /* + * The heuristics in this function is for the fair class. For SCX, the + * performance target comes directly from the BPF scheduler. Let's just + * follow it. + */ + if (scx_switched_all()) + return false; + + /* if capped by uclamp_max, always update to be in compliance */ + if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) + return false; + + /* + * Maintain the frequency if the CPU has not been idle recently, as + * reduction is likely to be premature. + */ + idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); + ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; return ret; } -#else -static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } -#endif /* CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ +static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } +#endif /* !CONFIG_NO_HZ_COMMON */ /* * Make sugov_should_update_freq() ignore the rate limit when DL * has increased the utilization. */ -static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) +static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) - sg_policy->need_freq_update = true; + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) + sg_cpu->sg_policy->need_freq_update = true; } -static void sugov_update_single(struct update_util_data *hook, u64 time, - unsigned int flags) +static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, + u64 time, unsigned long max_cap, + unsigned int flags) { - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); - struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned long util, max; - unsigned int next_f; - bool busy; + unsigned long boost; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; - ignore_dl_rate_limit(sg_cpu, sg_policy); + ignore_dl_rate_limit(sg_cpu); + + if (!sugov_should_update_freq(sg_cpu->sg_policy, time)) + return false; + + boost = sugov_iowait_apply(sg_cpu, time, max_cap); + sugov_get_util(sg_cpu, boost); + + return true; +} - if (!sugov_should_update_freq(sg_policy, time)) +static void sugov_update_single_freq(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned long max_cap; + unsigned int next_f; + + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; - busy = sugov_cpu_is_busy(sg_cpu); + next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); - util = sugov_get_util(sg_cpu); - max = sg_cpu->max; - sugov_iowait_apply(sg_cpu, time, &util, &max); - next_f = get_next_freq(sg_policy, util, max); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. - */ - if (busy && next_f < sg_policy->next_freq) { + if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq && + !sg_policy->need_freq_update) { next_f = sg_policy->next_freq; - /* Reset cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = 0; + /* Restore cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = cached_freq; } + if (!sugov_update_next_freq(sg_policy, time, next_f)) + return; + /* * This code runs under rq->lock for the target CPU, so it won't run * concurrently on two different CPUs for the same target and it is not * necessary to acquire the lock in the fast switch case. */ if (sg_policy->policy->fast_switch_enabled) { - sugov_fast_switch(sg_policy, time, next_f); + cpufreq_driver_fast_switch(sg_policy->policy, next_f); } else { raw_spin_lock(&sg_policy->update_lock); - sugov_deferred_update(sg_policy, time, next_f); + sugov_deferred_update(sg_policy); raw_spin_unlock(&sg_policy->update_lock); } } +static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + unsigned long prev_util = sg_cpu->util; + unsigned long max_cap; + + /* + * Fall back to the "frequency" path if frequency invariance is not + * supported, because the direct mapping between the utilization and + * the performance levels depends on the frequency invariance. + */ + if (!arch_scale_freq_invariant()) { + sugov_update_single_freq(hook, time, flags); + return; + } + + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) + return; + + if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) + sg_cpu->util = prev_util; + + cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, + sg_cpu->util, max_cap); + + sg_cpu->sg_policy->last_freq_update_time = time; +} + static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned long util = 0, max = 1; + unsigned long util = 0, max_cap; unsigned int j; + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long j_util, j_max; + unsigned long boost; - j_util = sugov_get_util(j_sg_cpu); - j_max = j_sg_cpu->max; - sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); + boost = sugov_iowait_apply(j_sg_cpu, time, max_cap); + sugov_get_util(j_sg_cpu, boost); - if (j_util * max > j_max * util) { - util = j_util; - max = j_max; - } + util = max(j_sg_cpu->util, util); } - return get_next_freq(sg_policy, util, max); + return get_next_freq(sg_policy, util, max_cap); } static void @@ -523,17 +522,20 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; - ignore_dl_rate_limit(sg_cpu, sg_policy); + ignore_dl_rate_limit(sg_cpu); if (sugov_should_update_freq(sg_policy, time)) { next_f = sugov_next_freq_shared(sg_cpu, time); + if (!sugov_update_next_freq(sg_policy, time, next_f)) + goto unlock; + if (sg_policy->policy->fast_switch_enabled) - sugov_fast_switch(sg_policy, time, next_f); + cpufreq_driver_fast_switch(sg_policy->policy, next_f); else - sugov_deferred_update(sg_policy, time, next_f); + sugov_deferred_update(sg_policy); } - +unlock: raw_spin_unlock(&sg_policy->update_lock); } @@ -545,7 +547,7 @@ static void sugov_work(struct kthread_work *work) /* * Hold sg_policy->update_lock shortly to handle the case where: - * incase sg_policy->next_freq is read here, and then updated by + * in case sg_policy->next_freq is read here, and then updated by * sugov_deferred_update() just before work_in_progress is set to false * here, we may miss queueing the new update. * @@ -609,19 +611,28 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); -static struct attribute *sugov_attributes[] = { +static struct attribute *sugov_attrs[] = { &rate_limit_us.attr, NULL }; +ATTRIBUTE_GROUPS(sugov); -static struct kobj_type sugov_tunables_ktype = { - .default_attrs = sugov_attributes, +static void sugov_tunables_free(struct kobject *kobj) +{ + struct gov_attr_set *attr_set = to_gov_attr_set(kobj); + + kfree(to_sugov_tunables(attr_set)); +} + +static const struct kobj_type sugov_tunables_ktype = { + .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, + .release = &sugov_tunables_free, }; /********************** cpufreq governor interface *********************/ -struct cpufreq_governor schedutil_gov; +static struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) { @@ -654,9 +665,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) * Fake (unused) bandwidth; workaround to "fix" * priority inheritance. */ - .sched_runtime = 1000000, - .sched_deadline = 10000000, - .sched_period = 10000000, + .sched_runtime = NSEC_PER_MSEC, + .sched_deadline = 10 * NSEC_PER_MSEC, + .sched_period = 10 * NSEC_PER_MSEC, }; struct cpufreq_policy *policy = sg_policy->policy; int ret; @@ -683,7 +694,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + if (policy->dvfs_possible_from_any_cpu) + set_cpus_allowed_ptr(thread, policy->related_cpus); + else + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -716,12 +731,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_free(struct sugov_tunables *tunables) +static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; - - kfree(tunables); } static int sugov_init(struct cpufreq_policy *policy) @@ -778,12 +791,18 @@ static int sugov_init(struct cpufreq_policy *policy) goto fail; out: + /* + * Schedutil is the preferred governor for EAS, so rebuild sched domains + * on governor changes to make sure the scheduler knows about them. + */ + em_rebuild_sched_domains(); mutex_unlock(&global_tunables_lock); return 0; fail: + kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); stop_kthread: sugov_kthread_stop(sg_policy); @@ -810,43 +829,46 @@ static void sugov_exit(struct cpufreq_policy *policy) count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; if (!count) - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); mutex_unlock(&global_tunables_lock); sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); cpufreq_disable_fast_switch(policy); + + em_rebuild_sched_domains(); } static int sugov_start(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; + void (*uu)(struct update_util_data *data, u64 time, unsigned int flags); unsigned int cpu; sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; sg_policy->last_freq_update_time = 0; sg_policy->next_freq = 0; sg_policy->work_in_progress = false; - sg_policy->need_freq_update = false; + sg_policy->limits_changed = false; sg_policy->cached_raw_freq = 0; - for_each_cpu(cpu, policy->cpus) { - struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); - memset(sg_cpu, 0, sizeof(*sg_cpu)); - sg_cpu->cpu = cpu; - sg_cpu->sg_policy = sg_policy; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; - } + if (policy_is_shared(policy)) + uu = sugov_update_shared; + else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf()) + uu = sugov_update_single_perf; + else + uu = sugov_update_single_freq; for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - policy_is_shared(policy) ? - sugov_update_shared : - sugov_update_single); + memset(sg_cpu, 0, sizeof(*sg_cpu)); + sg_cpu->cpu = cpu; + sg_cpu->sg_policy = sg_policy; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu); } return 0; } @@ -859,7 +881,7 @@ static void sugov_stop(struct cpufreq_policy *policy) for_each_cpu(cpu, policy->cpus) cpufreq_remove_update_util_hook(cpu); - synchronize_sched(); + synchronize_rcu(); if (!policy->fast_switch_enabled) { irq_work_sync(&sg_policy->irq_work); @@ -877,13 +899,22 @@ static void sugov_limits(struct cpufreq_policy *policy) mutex_unlock(&sg_policy->work_lock); } - sg_policy->need_freq_update = true; + /* + * The limits_changed update below must take place before the updates + * of policy limits in cpufreq_set_policy() or a policy limits update + * might be missed, so use a memory barrier to ensure it. + * + * This pairs with the memory barrier in sugov_should_update_freq(). + */ + smp_wmb(); + + WRITE_ONCE(sg_policy->limits_changed, true); } -struct cpufreq_governor schedutil_gov = { +static struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, - .dynamic_switching = true, + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, @@ -898,41 +929,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif -static int __init sugov_register(void) -{ - return cpufreq_register_governor(&schedutil_gov); -} -fs_initcall(sugov_register); - -#ifdef CONFIG_ENERGY_MODEL -extern bool sched_energy_update; -extern struct mutex sched_energy_mutex; - -static void rebuild_sd_workfn(struct work_struct *work) +bool sugov_is_governor(struct cpufreq_policy *policy) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = true; - rebuild_sched_domains(); - sched_energy_update = false; - mutex_unlock(&sched_energy_mutex); + return policy->governor == &schedutil_gov; } -static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); -/* - * EAS shouldn't be attempted without sugov, so rebuild the sched_domains - * on governor changes to make sure the scheduler knows about it. - */ -void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - struct cpufreq_governor *old_gov) -{ - if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) { - /* - * When called from the cpufreq_register_driver() path, the - * cpu_hotplug_lock is already held, so use a work item to - * avoid nested locking in rebuild_sched_domains(). - */ - schedule_work(&rebuild_sd_work); - } - -} -#endif +cpufreq_governor_init(schedutil_gov); |
