diff options
author | Tejun Heo <tj@kernel.org> | 2024-06-21 12:37:22 -1000 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2024-06-21 12:37:22 -1000 |
commit | d86adb4fc0655a0867da811d000df75d2a325ef6 (patch) | |
tree | f2024987668b3421ea8bd60ce71bd6b313b6ec53 /kernel/sched/ext.c | |
parent | 8988cad8d06eb6097667925d2eb0522850bb0aac (diff) |
sched_ext: Add cpuperf support
sched_ext currently does not integrate with schedutil. When schedutil is the
governor, frequencies are left unregulated and usually get stuck close to
the highest performance level from running RT tasks.
Add CPU performance monitoring and scaling support by integrating into
schedutil. The following kfuncs are added:
- scx_bpf_cpuperf_cap(): Query the relative performance capacity of
different CPUs in the system.
- scx_bpf_cpuperf_cur(): Query the current performance level of a CPU
relative to its max performance.
- scx_bpf_cpuperf_set(): Set the current target performance level of a CPU.
This gives direct control over CPU performance setting to the BPF scheduler.
The only changes on the schedutil side are accounting for the utilization
factor from sched_ext and disabling frequency holding heuristics as it may
not apply well to sched_ext schedulers which may have a lot weaker
connection between tasks and their current / last CPU.
With cpuperf support added, there is no reason to block uclamp. Enable while
at it.
A toy implementation of cpuperf is added to scx_qmap as a demonstration of
the feature.
v2: Ignore cpu_util_cfs_boost() when scx_switched_all() in sugov_get_util()
to avoid factoring in stale util metric. (Christian)
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Christian Loehle <christian.loehle@arm.com>
Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r-- | kernel/sched/ext.c | 83 |
1 files changed, 81 insertions, 2 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 390623a4a376..28f7a4266fde 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -16,6 +16,8 @@ enum scx_consts { SCX_EXIT_BT_LEN = 64, SCX_EXIT_MSG_LEN = 1024, SCX_EXIT_DUMP_DFL_LEN = 32768, + + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, }; enum scx_exit_kind { @@ -3520,7 +3522,7 @@ DEFINE_SCHED_CLASS(ext) = { .update_curr = update_curr_scx, #ifdef CONFIG_UCLAMP_TASK - .uclamp_enabled = 0, + .uclamp_enabled = 1, #endif }; @@ -4393,7 +4395,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; - int i, ret; + int i, cpu, ret; mutex_lock(&scx_ops_enable_mutex); @@ -4442,6 +4444,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) atomic_long_set(&scx_nr_rejected, 0); + for_each_possible_cpu(cpu) + cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + /* * Keep CPUs stable during enable so that the BPF scheduler can track * online CPUs by watching ->on/offline_cpu() after ->init(). @@ -5836,6 +5841,77 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, } /** + * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU + * @cpu: CPU of interest + * + * Return the maximum relative capacity of @cpu in relation to the most + * performant CPU in the system. The return value is in the range [1, + * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). + */ +__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) +{ + if (ops_cpu_valid(cpu, NULL)) + return arch_scale_cpu_capacity(cpu); + else + return SCX_CPUPERF_ONE; +} + +/** + * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU + * @cpu: CPU of interest + * + * Return the current relative performance of @cpu in relation to its maximum. + * The return value is in the range [1, %SCX_CPUPERF_ONE]. + * + * The current performance level of a CPU in relation to the maximum performance + * available in the system can be calculated as follows: + * + * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE + * + * The result is in the range [1, %SCX_CPUPERF_ONE]. + */ +__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) +{ + if (ops_cpu_valid(cpu, NULL)) + return arch_scale_freq_capacity(cpu); + else + return SCX_CPUPERF_ONE; +} + +/** + * scx_bpf_cpuperf_set - Set the relative performance target of a CPU + * @cpu: CPU of interest + * @perf: target performance level [0, %SCX_CPUPERF_ONE] + * @flags: %SCX_CPUPERF_* flags + * + * Set the target performance level of @cpu to @perf. @perf is in linear + * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the + * schedutil cpufreq governor chooses the target frequency. + * + * The actual performance level chosen, CPU grouping, and the overhead and + * latency of the operations are dependent on the hardware and cpufreq driver in + * use. Consult hardware and cpufreq documentation for more information. The + * current performance level can be monitored using scx_bpf_cpuperf_cur(). + */ +__bpf_kfunc void scx_bpf_cpuperf_set(u32 cpu, u32 perf) +{ + if (unlikely(perf > SCX_CPUPERF_ONE)) { + scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + return; + } + + if (ops_cpu_valid(cpu, NULL)) { + struct rq *rq = cpu_rq(cpu); + + rq->scx.cpuperf_target = perf; + + rcu_read_lock_sched_notrace(); + cpufreq_update_util(cpu_rq(cpu), 0); + rcu_read_unlock_sched_notrace(); + } +} + +/** * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs * * All valid CPU IDs in the system are smaller than the returned value. @@ -6045,6 +6121,9 @@ BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) |