diff options
Diffstat (limited to 'kernel/sched')
40 files changed, 28056 insertions, 8843 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5fc9c9b70862..976092b7bd45 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,17 +1,17 @@ # SPDX-License-Identifier: GPL-2.0 -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) -endif + +# The compilers are complaining about unused variables inside an if(0) scope +# block. This is daft, shut them up. +ccflags-y += $(call cc-disable-warning, unused-but-set-variable) # These files are disabled because they produce non-interesting flaky coverage # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n -# There are numerous data races here, however, most of them are due to plain accesses. -# This would make it even harder for syzbot to find reproducers, because these -# bugs trigger without specific input. Disable by default, but should re-enable -# eventually. +# Disable KCSAN to avoid excessive noise and performance degradation. To avoid +# false positives ensure barriers implied by sched functions are instrumented. KCSAN_SANITIZE := n +KCSAN_INSTRUMENT_BARRIERS := y ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is @@ -22,17 +22,13 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle.o fair.o rt.o deadline.o -obj-y += wait.o wait_bit.o swait.o completion.o - -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o -obj-$(CONFIG_SCHEDSTATS) += stats.o -obj-$(CONFIG_SCHED_DEBUG) += debug.o -obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -obj-$(CONFIG_CPU_FREQ) += cpufreq.o -obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o -obj-$(CONFIG_CPU_ISOLATION) += isolation.o -obj-$(CONFIG_PSI) += psi.o +# +# Build efficiency: +# +# These compilation units have roughly the same size and complexity - so their +# build parallelizes well and finishes roughly at once: +# +obj-y += core.o +obj-y += fair.o +obj-y += build_policy.o +obj-y += build_utility.o diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2067080bb235..2b331822c7e7 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,20 +1,41 @@ // SPDX-License-Identifier: GPL-2.0 + /* * Auto-group scheduling implementation: */ -#include <linux/nospec.h> -#include "sched.h" unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; +#ifdef CONFIG_SYSCTL +static const struct ctl_table sched_autogroup_sysctls[] = { + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static void __init sched_autogroup_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_autogroup_sysctls); +} +#else +#define sched_autogroup_sysctl_init() do { } while (0) +#endif + void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; + sched_autogroup_sysctl_init(); } void autogroup_free(struct task_group *tg) @@ -31,7 +52,7 @@ static inline void autogroup_destroy(struct kref *kref) ag->tg->rt_se = NULL; ag->tg->rt_rq = NULL; #endif - sched_offline_group(ag->tg); + sched_release_group(ag->tg); sched_destroy_group(ag->tg); } @@ -129,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p) * see this thread after that: we can no longer use signal->autogroup. * See the PF_EXITING check in task_wants_autogroup(). */ - sched_move_task(p); + sched_move_task(p, true); } static void @@ -139,7 +160,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) struct task_struct *t; unsigned long flags; - BUG_ON(!lock_task_sighand(p, &flags)); + if (WARN_ON_ONCE(!lock_task_sighand(p, &flags))) + return; prev = p->signal->autogroup; if (prev == ag) { @@ -160,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * sched_autogroup_exit_task(). */ for_each_thread(p, t) - sched_move_task(t); + sched_move_task(t, true); unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index b96419974a1f..90d69f2c5eaf 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_SCHED_AUTOGROUP_H +#define _KERNEL_SCHED_AUTOGROUP_H + #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { @@ -27,6 +30,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { + extern unsigned int sysctl_sched_autogroup_enabled; int enabled = READ_ONCE(sysctl_sched_autogroup_enabled); if (enabled && task_wants_autogroup(p, tg)) @@ -58,3 +62,5 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) } #endif /* CONFIG_SCHED_AUTOGROUP */ + +#endif /* _KERNEL_SCHED_AUTOGROUP_H */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c new file mode 100644 index 000000000000..fae1f5c921eb --- /dev/null +++ b/kernel/sched/build_policy.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * These are the scheduling policy related scheduler files, built + * in a single compilation unit for build efficiency reasons. + * + * ( Incidentally, the size of the compilation unit is roughly + * comparable to core.c and fair.c, the other two big + * compilation units. This helps balance build time, while + * coalescing source files to amortize header inclusion + * cost. ) + * + * core.c and fair.c are built separately. + */ + +/* Headers: */ +#include <linux/sched/clock.h> +#include <linux/sched/cputime.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/isolation.h> +#include <linux/sched/posix-timers.h> +#include <linux/sched/rt.h> + +#include <linux/cpuidle.h> +#include <linux/jiffies.h> +#include <linux/kobject.h> +#include <linux/livepatch.h> +#include <linux/pm.h> +#include <linux/psi.h> +#include <linux/rhashtable.h> +#include <linux/seq_buf.h> +#include <linux/seqlock_api.h> +#include <linux/slab.h> +#include <linux/suspend.h> +#include <linux/tsacct_kern.h> +#include <linux/vtime.h> +#include <linux/sysrq.h> +#include <linux/percpu-rwsem.h> + +#include <uapi/linux/sched/types.h> + +#include "sched.h" +#include "smp.h" + +#include "autogroup.h" +#include "stats.h" +#include "pelt.h" + +/* Source code modules: */ + +#include "idle.c" + +#include "rt.c" + +#ifdef CONFIG_SMP +# include "cpudeadline.c" +# include "pelt.c" +#endif + +#include "cputime.c" +#include "deadline.c" + +#ifdef CONFIG_SCHED_CLASS_EXT +# include "ext.c" +#endif + +#include "syscalls.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c new file mode 100644 index 000000000000..80a3df49ab47 --- /dev/null +++ b/kernel/sched/build_utility.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * These are various utility functions of the scheduler, + * built in a single compilation unit for build efficiency reasons. + * + * ( Incidentally, the size of the compilation unit is roughly + * comparable to core.c, fair.c, smp.c and policy.c, the other + * big compilation units. This helps balance build time, while + * coalescing source files to amortize header inclusion + * cost. ) + */ +#include <linux/sched/clock.h> +#include <linux/sched/cputime.h> +#include <linux/sched/debug.h> +#include <linux/sched/isolation.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/nohz.h> +#include <linux/sched/mm.h> +#include <linux/sched/rseq_api.h> +#include <linux/sched/task_stack.h> + +#include <linux/cpufreq.h> +#include <linux/cpumask_api.h> +#include <linux/cpuset.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/energy_model.h> +#include <linux/hashtable_api.h> +#include <linux/irq.h> +#include <linux/kobject_api.h> +#include <linux/membarrier.h> +#include <linux/mempolicy.h> +#include <linux/nmi.h> +#include <linux/nospec.h> +#include <linux/proc_fs.h> +#include <linux/psi.h> +#include <linux/ptrace_api.h> +#include <linux/sched_clock.h> +#include <linux/security.h> +#include <linux/spinlock_api.h> +#include <linux/swait_api.h> +#include <linux/timex.h> +#include <linux/utsname.h> +#include <linux/wait_api.h> +#include <linux/workqueue_api.h> + +#include <uapi/linux/prctl.h> +#include <uapi/linux/sched/types.h> + +#include <asm/switch_to.h> + +#include "sched.h" +#include "sched-pelt.h" +#include "stats.h" +#include "autogroup.h" + +#include "clock.c" + +#ifdef CONFIG_CGROUP_CPUACCT +# include "cpuacct.c" +#endif + +#ifdef CONFIG_CPU_FREQ +# include "cpufreq.c" +#endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +# include "cpufreq_schedutil.c" +#endif + +#ifdef CONFIG_SCHED_DEBUG +# include "debug.c" +#endif + +#ifdef CONFIG_SCHEDSTATS +# include "stats.c" +#endif + +#include "loadavg.c" +#include "completion.c" +#include "swait.c" +#include "wait_bit.c" +#include "wait.c" + +#ifdef CONFIG_SMP +# include "cpupri.c" +# include "stop_task.c" +# include "topology.c" +#endif + +#ifdef CONFIG_SCHED_CORE +# include "core_sched.c" +#endif + +#ifdef CONFIG_PSI +# include "psi.c" +#endif + +#ifdef CONFIG_MEMBARRIER +# include "membarrier.c" +#endif + +#ifdef CONFIG_CPU_ISOLATION +# include "isolation.c" +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +# include "autogroup.c" +#endif diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 12bca64dff73..a09655b48140 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -41,7 +41,7 @@ * Otherwise it tries to create a semi stable clock from a mixture of other * clocks, including: * - * - GTOD (clock monotomic) + * - GTOD (clock monotonic) * - sched_clock() * - explicit idle events * @@ -53,15 +53,13 @@ * that is otherwise invisible (TSC gets stopped). * */ -#include "sched.h" -#include <linux/sched_clock.h> /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __weak sched_clock(void) +notrace unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); @@ -95,28 +93,28 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -static inline struct sched_clock_data *this_scd(void) +static __always_inline struct sched_clock_data *this_scd(void) { return this_cpu_ptr(&sched_clock_data); } -static inline struct sched_clock_data *cpu_sdc(int cpu) +notrace static inline struct sched_clock_data *cpu_sdc(int cpu) { return &per_cpu(sched_clock_data, cpu); } -int sched_clock_stable(void) +notrace int sched_clock_stable(void) { return static_branch_likely(&__sched_clock_stable); } -static void __scd_stamp(struct sched_clock_data *scd) +notrace static void __scd_stamp(struct sched_clock_data *scd) { scd->tick_gtod = ktime_get_ns(); scd->tick_raw = sched_clock(); } -static void __set_sched_clock_stable(void) +notrace static void __set_sched_clock_stable(void) { struct sched_clock_data *scd; @@ -151,7 +149,7 @@ static void __set_sched_clock_stable(void) * The only way to fully avoid random clock jumps is to boot with: * "tsc=unstable". */ -static void __sched_clock_work(struct work_struct *work) +notrace static void __sched_clock_work(struct work_struct *work) { struct sched_clock_data *scd; int cpu; @@ -177,7 +175,7 @@ static void __sched_clock_work(struct work_struct *work) static DECLARE_WORK(sched_clock_work, __sched_clock_work); -static void __clear_sched_clock_stable(void) +notrace static void __clear_sched_clock_stable(void) { if (!sched_clock_stable()) return; @@ -186,7 +184,7 @@ static void __clear_sched_clock_stable(void) schedule_work(&sched_clock_work); } -void clear_sched_clock_stable(void) +notrace void clear_sched_clock_stable(void) { __sched_clock_stable_early = 0; @@ -196,7 +194,7 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } -static void __sched_clock_gtod_offset(void) +notrace static void __sched_clock_gtod_offset(void) { struct sched_clock_data *scd = this_scd(); @@ -246,12 +244,12 @@ late_initcall(sched_clock_init_late); * min, max except they take wrapping into account */ -static inline u64 wrap_min(u64 x, u64 y) +static __always_inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -static inline u64 wrap_max(u64 x, u64 y) +static __always_inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -262,13 +260,13 @@ static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -static u64 sched_clock_local(struct sched_clock_data *scd) +static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) { u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; again: - now = sched_clock(); + now = sched_clock_noinstr(); delta = now - scd->tick_raw; if (unlikely(delta < 0)) delta = 0; @@ -289,13 +287,38 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) + if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; } -static u64 sched_clock_remote(struct sched_clock_data *scd) +noinstr u64 local_clock_noinstr(void) +{ + u64 clock; + + if (static_branch_likely(&__sched_clock_stable)) + return sched_clock_noinstr() + __sched_clock_offset; + + if (!static_branch_likely(&sched_clock_running)) + return sched_clock_noinstr(); + + clock = sched_clock_local(this_scd()); + + return clock; +} + +u64 local_clock(void) +{ + u64 now; + preempt_disable_notrace(); + now = local_clock_noinstr(); + preempt_enable_notrace(); + return now; +} +EXPORT_SYMBOL_GPL(local_clock); + +static notrace u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock; @@ -317,7 +340,7 @@ again: this_clock = sched_clock_local(my_scd); /* * We must enforce atomic readout on 32-bit, otherwise the - * update on the remote CPU can hit inbetween the readout of + * update on the remote CPU can hit in between the readout of * the low 32-bit and the high 32-bit portion. */ remote_clock = cmpxchg64(&scd->clock, 0, 0); @@ -351,7 +374,7 @@ again: val = remote_clock; } - if (cmpxchg64(ptr, old_val, val) != old_val) + if (!try_cmpxchg64(ptr, &old_val, val)) goto again; return val; @@ -362,7 +385,7 @@ again: * * See cpu_clock(). */ -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; u64 clock; @@ -386,7 +409,7 @@ u64 sched_clock_cpu(int cpu) } EXPORT_SYMBOL_GPL(sched_clock_cpu); -void sched_clock_tick(void) +notrace void sched_clock_tick(void) { struct sched_clock_data *scd; @@ -403,7 +426,7 @@ void sched_clock_tick(void) sched_clock_local(scd); } -void sched_clock_tick_stable(void) +notrace void sched_clock_tick_stable(void) { if (!sched_clock_stable()) return; @@ -421,9 +444,9 @@ void sched_clock_tick_stable(void) } /* - * We are going deep-idle (irqs are disabled): + * We are going deep-idle (IRQs are disabled): */ -void sched_clock_idle_sleep_event(void) +notrace void sched_clock_idle_sleep_event(void) { sched_clock_cpu(smp_processor_id()); } @@ -432,7 +455,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* * We just idled; resync with ktime. */ -void sched_clock_idle_wakeup_event(void) +notrace void sched_clock_idle_wakeup_event(void) { unsigned long flags; @@ -458,7 +481,7 @@ void __init sched_clock_init(void) local_irq_enable(); } -u64 sched_clock_cpu(int cpu) +notrace u64 sched_clock_cpu(int cpu) { if (!static_branch_likely(&sched_clock_running)) return 0; @@ -476,7 +499,7 @@ u64 sched_clock_cpu(int cpu) * On bare metal this function should return the same as local_clock. * Architectures and sub-architectures can override this. */ -u64 __weak running_clock(void) +notrace u64 __weak running_clock(void) { return local_clock(); } diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a778554f9dad..3561ab533dd4 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 + /* * Generic wait-for-completion handler; * @@ -11,7 +12,23 @@ * typically be used for exclusion which gives rise to priority inversion. * Waiting for completion is a typically sync point, but not an exclusion point. */ -#include "sched.h" + +static void complete_with_flags(struct completion *x, int wake_flags) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&x->wait.lock, flags); + + if (x->done != UINT_MAX) + x->done++; + swake_up_locked(&x->wait, wake_flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); +} + +void complete_on_current_cpu(struct completion *x) +{ + return complete_with_flags(x, WF_CURRENT_CPU); +} /** * complete: - signals a single thread waiting on this completion @@ -27,14 +44,7 @@ */ void complete(struct completion *x) { - unsigned long flags; - - raw_spin_lock_irqsave(&x->wait.lock, flags); - - if (x->done != UINT_MAX) - x->done++; - swake_up_locked(&x->wait); - raw_spin_unlock_irqrestore(&x->wait.lock, flags); + complete_with_flags(x, 0); } EXPORT_SYMBOL(complete); @@ -204,6 +214,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout); int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) return t; return 0; @@ -241,12 +252,23 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) return t; return 0; } EXPORT_SYMBOL(wait_for_completion_killable); +int __sched wait_for_completion_state(struct completion *x, unsigned int state) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state); + + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_state); + /** * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) * @x: holds the state of this particular completion diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2142c6767682..042351c7afce 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2,29 +2,103 @@ /* * kernel/sched/core.c * - * Core kernel scheduler code and related syscalls + * Core kernel CPU scheduler code * * Copyright (C) 1991-2002 Linus Torvalds - */ -#include "sched.h" - -#include <linux/nospec.h> - + * Copyright (C) 1998-2024 Ingo Molnar, Red Hat + */ +#include <linux/highmem.h> +#include <linux/hrtimer_api.h> +#include <linux/ktime_api.h> +#include <linux/sched/signal.h> +#include <linux/syscalls_api.h> +#include <linux/debug_locks.h> +#include <linux/prefetch.h> +#include <linux/capability.h> +#include <linux/pgtable_api.h> +#include <linux/wait_bit.h> +#include <linux/jiffies.h> +#include <linux/spinlock_api.h> +#include <linux/cpumask_api.h> +#include <linux/lockdep_api.h> +#include <linux/hardirq.h> +#include <linux/softirq.h> +#include <linux/refcount_api.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <linux/sched/cond_resched.h> +#include <linux/sched/cputime.h> +#include <linux/sched/debug.h> +#include <linux/sched/hotplug.h> +#include <linux/sched/init.h> +#include <linux/sched/isolation.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/mm.h> +#include <linux/sched/nohz.h> +#include <linux/sched/rseq_api.h> +#include <linux/sched/rt.h> + +#include <linux/blkdev.h> +#include <linux/context_tracking.h> +#include <linux/cpuset.h> +#include <linux/delayacct.h> +#include <linux/init_task.h> +#include <linux/interrupt.h> +#include <linux/ioprio.h> +#include <linux/kallsyms.h> #include <linux/kcov.h> +#include <linux/kprobes.h> +#include <linux/llist_api.h> +#include <linux/mmu_context.h> +#include <linux/mmzone.h> +#include <linux/mutex_api.h> +#include <linux/nmi.h> +#include <linux/nospec.h> +#include <linux/perf_event_api.h> +#include <linux/profile.h> +#include <linux/psi.h> +#include <linux/rcuwait_api.h> +#include <linux/rseq.h> +#include <linux/sched/wake_q.h> #include <linux/scs.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/vtime.h> +#include <linux/wait_api.h> +#include <linux/workqueue_api.h> + +#ifdef CONFIG_PREEMPT_DYNAMIC +# ifdef CONFIG_GENERIC_ENTRY +# include <linux/entry-common.h> +# endif +#endif +#include <uapi/linux/sched/types.h> + +#include <asm/irq_regs.h> #include <asm/switch_to.h> #include <asm/tlb.h> -#include "../workqueue_internal.h" -#include "../../fs/io-wq.h" -#include "../smpboot.h" +#define CREATE_TRACE_POINTS +#include <linux/sched/rseq_api.h> +#include <trace/events/sched.h> +#include <trace/events/ipi.h> +#undef CREATE_TRACE_POINTS +#include "sched.h" +#include "stats.h" + +#include "autogroup.h" #include "pelt.h" #include "smp.h" +#include "stats.h" -#define CREATE_TRACE_POINTS -#include <trace/events/sched.h> +#include "../workqueue_internal.h" +#include "../../io_uring/io-wq.h" +#include "../smpboot.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); +EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -35,11 +109,17 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * @@ -53,27 +133,533 @@ const_debug unsigned int sysctl_sched_features = #include "features.h" 0; #undef SCHED_FEAT -#endif + +/* + * Print a warning if need_resched is set for the given duration (if + * LATENCY_WARN is enabled). + * + * If sysctl_resched_latency_warn_once is set, only one warning will be shown + * per boot. + */ +__read_mostly int sysctl_resched_latency_warn_ms = 100; +__read_mostly int sysctl_resched_latency_warn_once = 1; +#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ -const_debug unsigned int sysctl_sched_nr_migrate = 32; +const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; + +__read_mostly int scheduler_running; + +#ifdef CONFIG_SCHED_CORE + +DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); + +/* kernel prio, less is more */ +static inline int __task_prio(const struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) /* trumps deadline */ + return -2; + + if (p->dl_server) + return -1; /* deadline */ + + if (rt_or_dl_prio(p->prio)) + return p->prio; /* [-1, 99] */ + + if (p->sched_class == &idle_sched_class) + return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ + + if (task_on_scx(p)) + return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ + + return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ +} /* - * period over which we measure -rt task CPU usage in us. - * default: 1s + * l(a,b) + * le(a,b) := !l(b,a) + * g(a,b) := l(b,a) + * ge(a,b) := !l(a,b) */ -unsigned int sysctl_sched_rt_period = 1000000; -__read_mostly int scheduler_running; +/* real prio, less is less */ +static inline bool prio_less(const struct task_struct *a, + const struct task_struct *b, bool in_fi) +{ + + int pa = __task_prio(a), pb = __task_prio(b); + + if (-pa < -pb) + return true; + + if (-pb < -pa) + return false; + + if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ + const struct sched_dl_entity *a_dl, *b_dl; + + a_dl = &a->dl; + /* + * Since,'a' and 'b' can be CFS tasks served by DL server, + * __task_prio() can return -1 (for DL) even for those. In that + * case, get to the dl_server's DL entity. + */ + if (a->dl_server) + a_dl = a->dl_server; + + b_dl = &b->dl; + if (b->dl_server) + b_dl = b->dl_server; + + return !dl_time_before(a_dl->deadline, b_dl->deadline); + } + + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ + return cfs_prio_less(a, b, in_fi); + +#ifdef CONFIG_SCHED_CLASS_EXT + if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ + return scx_prio_less(a, b, in_fi); +#endif + + return false; +} + +static inline bool __sched_core_less(const struct task_struct *a, + const struct task_struct *b) +{ + if (a->core_cookie < b->core_cookie) + return true; + + if (a->core_cookie > b->core_cookie) + return false; + + /* flip prio, so high prio is leftmost */ + if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count)) + return true; + + return false; +} + +#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node) + +static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b) +{ + return __sched_core_less(__node_2_sc(a), __node_2_sc(b)); +} + +static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) +{ + const struct task_struct *p = __node_2_sc(node); + unsigned long cookie = (unsigned long)key; + + if (cookie < p->core_cookie) + return -1; + + if (cookie > p->core_cookie) + return 1; + + return 0; +} + +void sched_core_enqueue(struct rq *rq, struct task_struct *p) +{ + if (p->se.sched_delayed) + return; + + rq->core->core_task_seq++; + + if (!p->core_cookie) + return; + + rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); +} + +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) +{ + if (p->se.sched_delayed) + return; + + rq->core->core_task_seq++; + + if (sched_core_enqueued(p)) { + rb_erase(&p->core_node, &rq->core_tree); + RB_CLEAR_NODE(&p->core_node); + } + + /* + * Migrating the last task off the cpu, with the cpu in forced idle + * state. Reschedule to create an accounting edge for forced idle, + * and re-examine whether the core is still in forced idle state. + */ + if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 && + rq->core->core_forceidle_count && rq->curr == rq->idle) + resched_curr(rq); +} + +static int sched_task_is_throttled(struct task_struct *p, int cpu) +{ + if (p->sched_class->task_is_throttled) + return p->sched_class->task_is_throttled(p, cpu); + + return 0; +} + +static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie) +{ + struct rb_node *node = &p->core_node; + int cpu = task_cpu(p); + + do { + node = rb_next(node); + if (!node) + return NULL; + + p = __node_2_sc(node); + if (p->core_cookie != cookie) + return NULL; + + } while (sched_task_is_throttled(p, cpu)); + + return p; +} + +/* + * Find left-most (aka, highest priority) and unthrottled task matching @cookie. + * If no suitable task is found, NULL will be returned. + */ +static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) +{ + struct task_struct *p; + struct rb_node *node; + + node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); + if (!node) + return NULL; + + p = __node_2_sc(node); + if (!sched_task_is_throttled(p, rq->cpu)) + return p; + + return sched_core_next(p, cookie); +} + +/* + * Magic required such that: + * + * raw_spin_rq_lock(rq); + * ... + * raw_spin_rq_unlock(rq); + * + * ends up locking and unlocking the _same_ lock, and all CPUs + * always agree on what rq has what lock. + * + * XXX entirely possible to selectively enable cores, don't bother for now. + */ + +static DEFINE_MUTEX(sched_core_mutex); +static atomic_t sched_core_count; +static struct cpumask sched_core_mask; + +static void sched_core_lock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t, i = 0; + + local_irq_save(*flags); + for_each_cpu(t, smt_mask) + raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); +} + +static void sched_core_unlock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t; + + for_each_cpu(t, smt_mask) + raw_spin_unlock(&cpu_rq(t)->__lock); + local_irq_restore(*flags); +} + +static void __sched_core_flip(bool enabled) +{ + unsigned long flags; + int cpu, t; + + cpus_read_lock(); + + /* + * Toggle the online cores, one by one. + */ + cpumask_copy(&sched_core_mask, cpu_online_mask); + for_each_cpu(cpu, &sched_core_mask) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + + sched_core_lock(cpu, &flags); + + for_each_cpu(t, smt_mask) + cpu_rq(t)->core_enabled = enabled; + + cpu_rq(cpu)->core->core_forceidle_start = 0; + + sched_core_unlock(cpu, &flags); + + cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); + } + + /* + * Toggle the offline CPUs. + */ + for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask) + cpu_rq(cpu)->core_enabled = enabled; + + cpus_read_unlock(); +} + +static void sched_core_assert_empty(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree)); +} + +static void __sched_core_enable(void) +{ + static_branch_enable(&__sched_core_enabled); + /* + * Ensure all previous instances of raw_spin_rq_*lock() have finished + * and future ones will observe !sched_core_disabled(). + */ + synchronize_rcu(); + __sched_core_flip(true); + sched_core_assert_empty(); +} + +static void __sched_core_disable(void) +{ + sched_core_assert_empty(); + __sched_core_flip(false); + static_branch_disable(&__sched_core_enabled); +} + +void sched_core_get(void) +{ + if (atomic_inc_not_zero(&sched_core_count)) + return; + + mutex_lock(&sched_core_mutex); + if (!atomic_read(&sched_core_count)) + __sched_core_enable(); + + smp_mb__before_atomic(); + atomic_inc(&sched_core_count); + mutex_unlock(&sched_core_mutex); +} + +static void __sched_core_put(struct work_struct *work) +{ + if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) { + __sched_core_disable(); + mutex_unlock(&sched_core_mutex); + } +} + +void sched_core_put(void) +{ + static DECLARE_WORK(_work, __sched_core_put); + + /* + * "There can be only one" + * + * Either this is the last one, or we don't actually need to do any + * 'work'. If it is the last *again*, we rely on + * WORK_STRUCT_PENDING_BIT. + */ + if (!atomic_add_unless(&sched_core_count, -1, 1)) + schedule_work(&_work); +} + +#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } +static inline void +sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } + +#endif /* CONFIG_SCHED_CORE */ + +/* + * Serialization rules: + * + * Lock order: + * + * p->pi_lock + * rq->lock + * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) + * + * rq1->lock + * rq2->lock where: rq1 < rq2 + * + * Regular state: + * + * Normal scheduling state is serialized by rq->lock. __schedule() takes the + * local CPU's rq->lock, it optionally removes the task from the runqueue and + * always looks at the local rq data structures to find the most eligible task + * to run next. + * + * Task enqueue is also under rq->lock, possibly taken from another CPU. + * Wakeups from another LLC domain might use an IPI to transfer the enqueue to + * the local CPU to avoid bouncing the runqueue state around [ see + * ttwu_queue_wakelist() ] + * + * Task wakeup, specifically wakeups that involve migration, are horribly + * complicated to avoid having to take two rq->locks. + * + * Special state: + * + * System-calls and anything external will use task_rq_lock() which acquires + * both p->pi_lock and rq->lock. As a consequence the state they change is + * stable while holding either lock: + * + * - sched_setaffinity()/ + * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed + * - set_user_nice(): p->se.load, p->*prio + * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, + * p->se.load, p->rt_priority, + * p->dl.dl_{runtime, deadline, period, flags, bw, density} + * - sched_setnuma(): p->numa_preferred_nid + * - sched_move_task(): p->sched_task_group + * - uclamp_update_active() p->uclamp* + * + * p->state <- TASK_*: + * + * is changed locklessly using set_current_state(), __set_current_state() or + * set_special_state(), see their respective comments, or by + * try_to_wake_up(). This latter uses p->pi_lock to serialize against + * concurrent self. + * + * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: + * + * is set by activate_task() and cleared by deactivate_task(), under + * rq->lock. Non-zero indicates the task is runnable, the special + * ON_RQ_MIGRATING state is used for migration without holding both + * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). + * + * Additionally it is possible to be ->on_rq but still be considered not + * runnable when p->se.sched_delayed is true. These tasks are on the runqueue + * but will be dequeued as soon as they get picked again. See the + * task_is_runnable() helper. + * + * p->on_cpu <- { 0, 1 }: + * + * is set by prepare_task() and cleared by finish_task() such that it will be + * set before p is scheduled-in and cleared after p is scheduled-out, both + * under rq->lock. Non-zero indicates the task is running on its CPU. + * + * [ The astute reader will observe that it is possible for two tasks on one + * CPU to have ->on_cpu = 1 at the same time. ] + * + * task_cpu(p): is changed by set_task_cpu(), the rules are: + * + * - Don't call set_task_cpu() on a blocked task: + * + * We don't care what CPU we're not running on, this simplifies hotplug, + * the CPU assignment of blocked tasks isn't required to be valid. + * + * - for try_to_wake_up(), called under p->pi_lock: + * + * This allows try_to_wake_up() to only take one rq->lock, see its comment. + * + * - for migration called under rq->lock: + * [ see task_on_rq_migrating() in task_rq_lock() ] + * + * o move_queued_task() + * o detach_task() + * + * - for migration called under double_rq_lock(): + * + * o __migrate_swap_task() + * o push_rt_task() / pull_rt_task() + * o push_dl_task() / pull_dl_task() + * o dl_task_offline_migration() + * + */ + +void raw_spin_rq_lock_nested(struct rq *rq, int subclass) +{ + raw_spinlock_t *lock; + + /* Matches synchronize_rcu() in __sched_core_enable() */ + preempt_disable(); + if (sched_core_disabled()) { + raw_spin_lock_nested(&rq->__lock, subclass); + /* preempt_count *MUST* be > 1 */ + preempt_enable_no_resched(); + return; + } + + for (;;) { + lock = __rq_lockp(rq); + raw_spin_lock_nested(lock, subclass); + if (likely(lock == __rq_lockp(rq))) { + /* preempt_count *MUST* be > 1 */ + preempt_enable_no_resched(); + return; + } + raw_spin_unlock(lock); + } +} + +bool raw_spin_rq_trylock(struct rq *rq) +{ + raw_spinlock_t *lock; + bool ret; + + /* Matches synchronize_rcu() in __sched_core_enable() */ + preempt_disable(); + if (sched_core_disabled()) { + ret = raw_spin_trylock(&rq->__lock); + preempt_enable(); + return ret; + } + + for (;;) { + lock = __rq_lockp(rq); + ret = raw_spin_trylock(lock); + if (!ret || (likely(lock == __rq_lockp(rq)))) { + preempt_enable(); + return ret; + } + raw_spin_unlock(lock); + } +} +void raw_spin_rq_unlock(struct rq *rq) +{ + raw_spin_unlock(rq_lockp(rq)); +} + +#ifdef CONFIG_SMP /* - * part of the period that we allow rt tasks to run in us. - * default: 0.95s + * double_rq_lock - safely lock two runqueues */ -int sysctl_sched_rt_runtime = 950000; +void double_rq_lock(struct rq *rq1, struct rq *rq2) +{ + lockdep_assert_irqs_disabled(); + + if (rq_order_less(rq2, rq1)) + swap(rq1, rq2); + + raw_spin_rq_lock(rq1); + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); + + double_rq_clock_clear_update(rq1, rq2); +} +#endif /* * __task_rq_lock - lock the rq @p resides on. @@ -87,12 +673,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) for (;;) { rq = task_rq(p); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { rq_pin_lock(rq, rf); return rq; } - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); while (unlikely(task_on_rq_migrating(p))) cpu_relax(); @@ -111,7 +697,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) for (;;) { raw_spin_lock_irqsave(&p->pi_lock, rf->flags); rq = task_rq(p); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); /* * move_queued_task() task_rq_lock() * @@ -133,7 +719,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) rq_pin_lock(rq, rf); return rq; } - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); while (unlikely(task_on_rq_migrating(p))) @@ -154,38 +740,43 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) s64 __maybe_unused steal = 0, irq_delta = 0; #ifdef CONFIG_IRQ_TIME_ACCOUNTING - irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + if (irqtime_enabled()) { + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; - /* - * Since irq_time is only updated on {soft,}irq_exit, we might run into - * this case when a previous update_rq_clock() happened inside a - * {soft,}irq region. - * - * When this happens, we stop ->clock_task and only update the - * prev_irq_time stamp to account for the part that fit, so that a next - * update will consume the rest. This ensures ->clock_task is - * monotonic. - * - * It does however cause some slight miss-attribution of {soft,}irq - * time, a more accurate solution would be to update the irq_time using - * the current rq->clock timestamp, except that would require using - * atomic ops. - */ - if (irq_delta > delta) - irq_delta = delta; + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}IRQ region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}IRQ + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; - rq->prev_irq_time += irq_delta; - delta -= irq_delta; + rq->prev_irq_time += irq_delta; + delta -= irq_delta; + delayacct_irq(rq->curr, irq_delta); + } #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { - steal = paravirt_steal_clock(cpu_of(rq)); + u64 prev_steal; + + steal = prev_steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - rq->prev_steal_time_rq += steal; + rq->prev_steal_time_rq = prev_steal; delta -= steal; } #endif @@ -202,8 +793,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) void update_rq_clock(struct rq *rq) { s64 delta; + u64 clock; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (rq->clock_update_flags & RQCF_ACT_SKIP) return; @@ -213,20 +805,15 @@ void update_rq_clock(struct rq *rq) SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); rq->clock_update_flags |= RQCF_UPDATED; #endif + clock = sched_clock_cpu(cpu_of(rq)); + scx_rq_clock_update(rq, clock); - delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + delta = clock - rq->clock; if (delta < 0) return; rq->clock += delta; - update_rq_clock_task(rq, delta); -} -static inline void -rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) -{ - csd->flags = 0; - csd->func = func; - csd->info = rq; + update_rq_clock_task(rq, delta); } #ifdef CONFIG_SCHED_HRTICK @@ -253,7 +840,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) rq_lock(rq, &rf); update_rq_clock(rq); - rq->curr->sched_class->task_tick(rq, rq->curr, 1); + rq->donor->sched_class->task_tick(rq, rq->curr, 1); rq_unlock(rq, &rf); return HRTIMER_NORESTART; @@ -264,8 +851,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = rq->hrtick_time; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -284,12 +872,11 @@ static void __hrtick_start(void *arg) /* * Called to set the hrtick timer state. * - * called with rq->lock held and irqs disabled + * called with rq->lock held and IRQs disabled */ void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time; s64 delta; /* @@ -297,9 +884,7 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - time = ktime_add_ns(timer->base->get_time(), delta); - - hrtimer_set_expires(timer, time); + rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); if (rq == this_rq()) __hrtick_restart(rq); @@ -311,7 +896,7 @@ void hrtick_start(struct rq *rq, u64 delay) /* * Called to set the hrtick timer state. * - * called with rq->lock held and irqs disabled + * called with rq->lock held and IRQs disabled */ void hrtick_start(struct rq *rq, u64 delay) { @@ -329,7 +914,7 @@ void hrtick_start(struct rq *rq, u64 delay) static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); + INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; @@ -345,21 +930,17 @@ static inline void hrtick_rq_init(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ /* - * cmpxchg based fetch_or, macro so it works for different integer types + * try_cmpxchg based fetch_or() macro so it works for different integer types: */ #define fetch_or(ptr, mask) \ ({ \ typeof(ptr) _ptr = (ptr); \ typeof(mask) _mask = (mask); \ - typeof(*_ptr) _old, _val = *_ptr; \ + typeof(*_ptr) _val = *_ptr; \ \ - for (;;) { \ - _old = cmpxchg(_ptr, _val, _val | _mask); \ - if (_old == _val) \ - break; \ - _val = _old; \ - } \ - _old; \ + do { \ + } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \ + _val; \ }) #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) @@ -368,10 +949,9 @@ static inline void hrtick_rq_init(struct rq *rq) * this avoids any races wrt polling state changes and thereby avoids * spurious IPIs. */ -static bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - struct thread_info *ti = task_thread_info(p); - return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); + return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG); } /* @@ -383,30 +963,27 @@ static bool set_nr_and_not_polling(struct task_struct *p) static bool set_nr_if_polling(struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - typeof(ti->flags) old, val = READ_ONCE(ti->flags); + typeof(ti->flags) val = READ_ONCE(ti->flags); - for (;;) { + do { if (!(val & _TIF_POLLING_NRFLAG)) return false; if (val & _TIF_NEED_RESCHED) return true; - old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); - if (old == val) - break; - val = old; - } + } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)); + return true; } #else -static bool set_nr_and_not_polling(struct task_struct *p) +static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) { - set_tsk_need_resched(p); + set_ti_thread_flag(ti, tif); return true; } #ifdef CONFIG_SMP -static bool set_nr_if_polling(struct task_struct *p) +static inline bool set_nr_if_polling(struct task_struct *p) { return false; } @@ -419,7 +996,7 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) /* * Atomically grab the task, if ->wake_q is !nil already it means - * its already queued (either by us or someone else) and will get the + * it's already queued (either by us or someone else) and will get the * wakeup due to that. * * In order to ensure that a pending wakeup will observe our pending @@ -486,10 +1063,10 @@ void wake_up_q(struct wake_q_head *head) struct task_struct *task; task = container_of(node, struct task_struct, wake_q); - BUG_ON(!task); - /* Task can safely be re-inserted now: */ node = node->next; - task->wake_q.next = NULL; + /* pairs with cmpxchg_relaxed() in __wake_q_add() */ + WRITE_ONCE(task->wake_q.next, NULL); + /* Task can safely be re-inserted now. */ /* * wake_up_process() executes a full barrier, which pairs with @@ -507,28 +1084,70 @@ void wake_up_q(struct wake_q_head *head) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_curr(struct rq *rq) +static void __resched_curr(struct rq *rq, int tif) { struct task_struct *curr = rq->curr; + struct thread_info *cti = task_thread_info(curr); int cpu; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); + + /* + * Always immediately preempt the idle task; no point in delaying doing + * actual work. + */ + if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY) + tif = TIF_NEED_RESCHED; - if (test_tsk_need_resched(curr)) + if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED)) return; cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(curr); - set_preempt_need_resched(); + set_ti_thread_flag(cti, tif); + if (tif == TIF_NEED_RESCHED) + set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(curr)) - smp_send_reschedule(cpu); - else + if (set_nr_and_not_polling(cti, tif)) { + if (tif == TIF_NEED_RESCHED) + smp_send_reschedule(cpu); + } else { trace_sched_wake_idle_without_ipi(cpu); + } +} + +void resched_curr(struct rq *rq) +{ + __resched_curr(rq, TIF_NEED_RESCHED); +} + +#ifdef CONFIG_PREEMPT_DYNAMIC +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy); +static __always_inline bool dynamic_preempt_lazy(void) +{ + return static_branch_unlikely(&sk_dynamic_preempt_lazy); +} +#else +static __always_inline bool dynamic_preempt_lazy(void) +{ + return IS_ENABLED(CONFIG_PREEMPT_LAZY); +} +#endif + +static __always_inline int get_lazy_tif_bit(void) +{ + if (dynamic_preempt_lazy()) + return TIF_NEED_RESCHED_LAZY; + + return TIF_NEED_RESCHED; +} + +void resched_curr_lazy(struct rq *rq) +{ + __resched_curr(rq, get_lazy_tif_bit()); } void resched_cpu(int cpu) @@ -536,10 +1155,10 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); if (cpu_online(cpu) || cpu == smp_processor_id()) resched_curr(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); } #ifdef CONFIG_SMP @@ -550,39 +1169,38 @@ void resched_cpu(int cpu) * * We don't do similar optimization for completely idle system, as * selecting an idle CPU will add more delays to the timers than intended - * (as that CPU's timer base may not be uptodate wrt jiffies etc). + * (as that CPU's timer base may not be up to date wrt jiffies etc). */ int get_nohz_timer_target(void) { int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; + const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } - rcu_read_lock(); + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); + + guard(rcu)(); + for_each_domain(cpu, sd) { - for_each_cpu_and(i, sched_domain_span(sd), - housekeeping_cpumask(HK_FLAG_TIMER)) { + for_each_cpu_and(i, sched_domain_span(sd), hk_mask) { if (cpu == i) continue; - if (!idle_cpu(i)) { - cpu = i; - goto unlock; - } + if (!idle_cpu(i)) + return i; } } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); - cpu = default_cpu; -unlock: - rcu_read_unlock(); - return cpu; + default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE); + + return default_cpu; } /* @@ -602,7 +1220,29 @@ static void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; - if (set_nr_and_not_polling(rq->idle)) + /* + * Set TIF_NEED_RESCHED and send an IPI if in the non-polling + * part of the idle loop. This forces an exit from the idle loop + * and a round trip to schedule(). Now this could be optimized + * because a simple new idle loop iteration is enough to + * re-evaluate the next tick. Provided some re-ordering of tick + * nohz functions that would need to follow TIF_NR_POLLING + * clearing: + * + * - On most architectures, a simple fetch_or on ti::flags with a + * "0" value would be enough to know if an IPI needs to be sent. + * + * - x86 needs to perform a last need_resched() check between + * monitor and mwait which doesn't take timers into account. + * There a dedicated TIF_TIMER flag would be required to + * fetch_or here and be checked along with TIF_NEED_RESCHED + * before mwait(). + * + * However, remote timer enqueue is not such a frequent event + * and testing of the above solutions didn't appear to report + * much benefits. + */ + if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -648,19 +1288,33 @@ static void nohz_csd_func(void *info) /* * Release the rq::nohz_csd. */ - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); + flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu)); WARN_ON(!(flags & NOHZ_KICK_MASK)); rq->idle_balance = idle_cpu(cpu); - if (rq->idle_balance && !need_resched()) { + if (rq->idle_balance) { rq->nohz_idle_balance = flags; - raise_softirq_irqoff(SCHED_SOFTIRQ); + __raise_softirq_irqoff(SCHED_SOFTIRQ); } } #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL +static inline bool __need_bw_check(struct rq *rq, struct task_struct *p) +{ + if (rq->nr_running != 1) + return false; + + if (p->sched_class != &fair_sched_class) + return false; + + if (!task_on_rq_queued(p)) + return false; + + return true; +} + bool sched_can_stop_tick(struct rq *rq) { int fifo_nr_running; @@ -670,7 +1324,7 @@ bool sched_can_stop_tick(struct rq *rq) return false; /* - * If there are more than one RR tasks, we need the tick to effect the + * If there are more than one RR tasks, we need the tick to affect the * actual RR behaviour. */ if (rq->rt.rr_nr_running) { @@ -689,13 +1343,28 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; - * if there's more than one we need the tick for involuntary - * preemption. + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks + * left. For CFS, if there's more than one we need the tick for + * involuntary preemption. For SCX, ask. */ - if (rq->nr_running > 1) + if (scx_enabled() && !scx_can_stop_tick(rq)) return false; + if (rq->cfs.h_nr_queued > 1) + return false; + + /* + * If there is one task and it has CFS runtime bandwidth constraints + * and it's on the cpu now we don't want to stop the tick. + * This check prevents clearing the bit if a newly enqueued task here is + * dequeued by migrating while the constrained task continues to run. + * E.g. going from 2->1 without going through pick_next_task(). + */ + if (__need_bw_check(rq, rq->curr)) { + if (cfs_task_bw_constrained(rq->curr)) + return false; + } + return true; } #endif /* CONFIG_NO_HZ_FULL */ @@ -746,30 +1415,27 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p, bool update_load) +void set_load_weight(struct task_struct *p, bool update_load) { int prio = p->static_prio - MAX_RT_PRIO; - struct load_weight *load = &p->se.load; + struct load_weight lw; - /* - * SCHED_IDLE tasks get minimal weight: - */ if (task_has_idle_policy(p)) { - load->weight = scale_load(WEIGHT_IDLEPRIO); - load->inv_weight = WMULT_IDLEPRIO; - return; + lw.weight = scale_load(WEIGHT_IDLEPRIO); + lw.inv_weight = WMULT_IDLEPRIO; + } else { + lw.weight = scale_load(sched_prio_to_weight[prio]); + lw.inv_weight = sched_prio_to_wmult[prio]; } /* * SCHED_OTHER tasks have to update their load when changing their * weight */ - if (update_load && p->sched_class == &fair_sched_class) { - reweight_task(p, prio); - } else { - load->weight = scale_load(sched_prio_to_weight[prio]); - load->inv_weight = sched_prio_to_wmult[prio]; - } + if (update_load && p->sched_class->reweight_task) + p->sched_class->reweight_task(task_rq(p), p, &lw); + else + p->se.load = lw; } #ifdef CONFIG_UCLAMP_TASK @@ -783,47 +1449,53 @@ static void set_load_weight(struct task_struct *p, bool update_load) * requests are serialized using a mutex to reduce the risk of conflicting * updates or API abuses. */ -static DEFINE_MUTEX(uclamp_mutex); +static __maybe_unused DEFINE_MUTEX(uclamp_mutex); /* Max allowed minimum utilization */ -unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; +static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ -unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; + +/* + * By default RT tasks run at the maximum performance point/capacity of the + * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to + * SCHED_CAPACITY_SCALE. + * + * This knob allows admins to change the default behavior when uclamp is being + * used. In battery powered devices, particularly, running at the maximum + * capacity and frequency will increase energy consumption and shorten the + * battery life. + * + * This knob only affects RT tasks that their uclamp_se->user_defined == false. + * + * This knob will not override the system default sched_util_clamp_min defined + * above. + */ +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; -/* Integer rounded range for each bucket */ -#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) - -#define for_each_clamp_id(clamp_id) \ - for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) - -static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) -{ - return clamp_value / UCLAMP_BUCKET_DELTA; -} - -static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) -{ - return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); -} - -static inline unsigned int uclamp_none(enum uclamp_id clamp_id) -{ - if (clamp_id == UCLAMP_MIN) - return 0; - return SCHED_CAPACITY_SCALE; -} - -static inline void uclamp_se_set(struct uclamp_se *uc_se, - unsigned int value, bool user_defined) -{ - uc_se->value = value; - uc_se->bucket_id = uclamp_bucket_id(value); - uc_se->user_defined = user_defined; -} +/* + * This static key is used to reduce the uclamp overhead in the fast path. It + * primarily disables the call to uclamp_rq_{inc, dec}() in + * enqueue/dequeue_task(). + * + * This allows users to continue to enable uclamp in their kernel config with + * minimum uclamp overhead in the fast path. + * + * As soon as userspace modifies any of the uclamp knobs, the static key is + * enabled, since we have an actual users that make use of uclamp + * functionality. + * + * The knobs that would enable this static key are: + * + * * A task modifying its uclamp value with sched_setattr(). + * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs. + * * An admin modifying the cgroup cpu.uclamp.{min, max} + */ +DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); static inline unsigned int uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, @@ -849,7 +1521,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) return; - WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); + uclamp_rq_set(rq, clamp_id, clamp_value); } static inline @@ -873,12 +1545,40 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static void __uclamp_update_util_min_rt_default(struct task_struct *p) +{ + unsigned int default_util_min; + struct uclamp_se *uc_se; + + lockdep_assert_held(&p->pi_lock); + + uc_se = &p->uclamp_req[UCLAMP_MIN]; + + /* Only sync if user didn't override the default */ + if (uc_se->user_defined) + return; + + default_util_min = sysctl_sched_uclamp_util_min_rt_default; + uclamp_se_set(uc_se, default_util_min, false); +} + +static void uclamp_update_util_min_rt_default(struct task_struct *p) +{ + if (!rt_task(p)) + return; + + /* Protect updates to p->uclamp_* */ + guard(task_rq_lock)(p); + __uclamp_update_util_min_rt_default(p); +} + static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { + /* Copy by value as we could modify it */ struct uclamp_se uc_req = p->uclamp_req[clamp_id]; #ifdef CONFIG_UCLAMP_TASK_GROUP - struct uclamp_se uc_max; + unsigned int tg_min, tg_max, value; /* * Tasks in autogroups or root task group will be @@ -889,9 +1589,11 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) if (task_group(p) == &root_task_group) return uc_req; - uc_max = task_group(p)->uclamp[clamp_id]; - if (uc_req.value > uc_max.value || !uc_req.user_defined) - return uc_max; + tg_min = task_group(p)->uclamp[UCLAMP_MIN].value; + tg_max = task_group(p)->uclamp[UCLAMP_MAX].value; + value = uc_req.value; + value = clamp(value, tg_min, tg_max); + uclamp_se_set(&uc_req, value, false); #endif return uc_req; @@ -948,7 +1650,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, struct uclamp_se *uc_se = &p->uclamp[clamp_id]; struct uclamp_bucket *bucket; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); /* Update task effective clamp */ p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); @@ -966,8 +1668,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, if (bucket->tasks == 1 || uc_se->value > bucket->value) bucket->value = uc_se->value; - if (uc_se->value > READ_ONCE(uc_rq->value)) - WRITE_ONCE(uc_rq->value, uc_se->value); + if (uc_se->value > uclamp_rq_get(rq, clamp_id)) + uclamp_rq_set(rq, clamp_id, uc_se->value); } /* @@ -988,12 +1690,40 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, unsigned int bkt_clamp; unsigned int rq_clamp; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); + + /* + * If sched_uclamp_used was enabled after task @p was enqueued, + * we could end up with unbalanced call to uclamp_rq_dec_id(). + * + * In this case the uc_se->active flag should be false since no uclamp + * accounting was performed at enqueue time and we can just return + * here. + * + * Need to be careful of the following enqueue/dequeue ordering + * problem too + * + * enqueue(taskA) + * // sched_uclamp_used gets enabled + * enqueue(taskB) + * dequeue(taskA) + * // Must not decrement bucket->tasks here + * dequeue(taskB) + * + * where we could end up with stale data in uc_se and + * bucket[uc_se->bucket_id]. + * + * The following check here eliminates the possibility of such race. + */ + if (unlikely(!uc_se->active)) + return; bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; + uc_se->active = false; /* @@ -1005,15 +1735,15 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, if (likely(bucket->tasks)) return; - rq_clamp = READ_ONCE(uc_rq->value); + rq_clamp = uclamp_rq_get(rq, clamp_id); /* * Defensive programming: this should never happen. If it happens, - * e.g. due to future modification, warn and fixup the expected value. + * e.g. due to future modification, warn and fix up the expected value. */ SCHED_WARN_ON(bucket->value > rq_clamp); if (bucket->value >= rq_clamp) { bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); - WRITE_ONCE(uc_rq->value, bkt_clamp); + uclamp_rq_set(rq, clamp_id, bkt_clamp); } } @@ -1021,9 +1751,21 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; + if (p->se.sched_delayed) + return; + for_each_clamp_id(clamp_id) uclamp_rq_inc_id(rq, p, clamp_id); @@ -1036,16 +1778,46 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { enum uclamp_id clamp_id; + /* + * Avoid any overhead until uclamp is actually used by the userspace. + * + * The condition is constructed such that a NOP is generated when + * sched_uclamp_used is disabled. + */ + if (!static_branch_unlikely(&sched_uclamp_used)) + return; + if (unlikely(!p->sched_class->uclamp_enabled)) return; + if (p->se.sched_delayed) + return; + for_each_clamp_id(clamp_id) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (!p->uclamp[clamp_id].active) + return; + + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + + /* + * Make sure to clear the idle flag if we've transiently reached 0 + * active tasks on rq. + */ + if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} + static inline void -uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) +uclamp_update_active(struct task_struct *p) { + enum uclamp_id clamp_id; struct rq_flags rf; struct rq *rq; @@ -1065,34 +1837,30 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - } + for_each_clamp_id(clamp_id) + uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); } #ifdef CONFIG_UCLAMP_TASK_GROUP static inline void -uclamp_update_active_tasks(struct cgroup_subsys_state *css, - unsigned int clamps) +uclamp_update_active_tasks(struct cgroup_subsys_state *css) { - enum uclamp_id clamp_id; struct css_task_iter it; struct task_struct *p; css_task_iter_start(css, 0, &it); - while ((p = css_task_iter_next(&it))) { - for_each_clamp_id(clamp_id) { - if ((0x1 << clamp_id) & clamps) - uclamp_update_active(p, clamp_id); - } - } + while ((p = css_task_iter_next(&it))) + uclamp_update_active(p); css_task_iter_end(&it); } static void cpu_util_update_eff(struct cgroup_subsys_state *css); +#endif + +#ifdef CONFIG_SYSCTL +#ifdef CONFIG_UCLAMP_TASK_GROUP static void uclamp_update_root_tg(void) { struct task_group *tg = &root_task_group; @@ -1102,33 +1870,62 @@ static void uclamp_update_root_tg(void) uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); - rcu_read_lock(); + guard(rcu)(); cpu_util_update_eff(&root_task_group.css); - rcu_read_unlock(); } #else static void uclamp_update_root_tg(void) { } #endif -int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + guard(rcu)(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); +} + +static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; - int old_min, old_max; + int old_min, old_max, old_min_rt; int result; - mutex_lock(&uclamp_mutex); + guard(mutex)(&uclamp_mutex); + old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; + old_min_rt = sysctl_sched_uclamp_util_min_rt_default; result = proc_dointvec(table, write, buffer, lenp, ppos); if (result) goto undo; if (!write) - goto done; + return 0; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || - sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || + sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) { + result = -EINVAL; goto undo; } @@ -1144,97 +1941,68 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, update_root_tg = true; } - if (update_root_tg) + if (update_root_tg) { + static_branch_enable(&sched_uclamp_used); uclamp_update_root_tg(); + } + + if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { + static_branch_enable(&sched_uclamp_used); + uclamp_sync_util_min_rt_default(); + } /* * We update all RUNNABLE tasks only when task groups are in use. * Otherwise, keep it simple and do just a lazy update at each next * task enqueue time. */ - - goto done; + return 0; undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; -done: - mutex_unlock(&uclamp_mutex); - + sysctl_sched_uclamp_util_min_rt_default = old_min_rt; return result; } +#endif -static int uclamp_validate(struct task_struct *p, - const struct sched_attr *attr) -{ - unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; - unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; - - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) - lower_bound = attr->sched_util_min; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) - upper_bound = attr->sched_util_max; - - if (lower_bound > upper_bound) - return -EINVAL; - if (upper_bound > SCHED_CAPACITY_SCALE) - return -EINVAL; - - return 0; -} - -static void __setscheduler_uclamp(struct task_struct *p, - const struct sched_attr *attr) +static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; /* - * On scheduling class change, reset to default clamps for tasks - * without a task-specific value. + * We don't need to hold task_rq_lock() when updating p->uclamp_* here + * as the task is still at its early fork stages. */ - for_each_clamp_id(clamp_id) { - struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; - unsigned int clamp_value = uclamp_none(clamp_id); - - /* Keep using defined clamps across class changes */ - if (uc_se->user_defined) - continue; - - /* By default, RT tasks always get 100% boost */ - if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - clamp_value = uclamp_none(UCLAMP_MAX); - - uclamp_se_set(uc_se, clamp_value, false); - } + for_each_clamp_id(clamp_id) + p->uclamp[clamp_id].active = false; - if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) + if (likely(!p->sched_reset_on_fork)) return; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { - uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], - attr->sched_util_min, true); + for_each_clamp_id(clamp_id) { + uclamp_se_set(&p->uclamp_req[clamp_id], + uclamp_none(clamp_id), false); } +} - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { - uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], - attr->sched_util_max, true); - } +static void uclamp_post_fork(struct task_struct *p) +{ + uclamp_update_util_min_rt_default(p); } -static void uclamp_fork(struct task_struct *p) +static void __init init_uclamp_rq(struct rq *rq) { enum uclamp_id clamp_id; - - for_each_clamp_id(clamp_id) - p->uclamp[clamp_id].active = false; - - if (likely(!p->sched_reset_on_fork)) - return; + struct uclamp_rq *uc_rq = rq->uclamp; for_each_clamp_id(clamp_id) { - uclamp_se_set(&p->uclamp_req[clamp_id], - uclamp_none(clamp_id), false); + uc_rq[clamp_id] = (struct uclamp_rq) { + .value = uclamp_none(clamp_id) + }; } + + rq->uclamp_flags = UCLAMP_FLAG_IDLE; } static void __init init_uclamp(void) @@ -1243,13 +2011,8 @@ static void __init init_uclamp(void) enum uclamp_id clamp_id; int cpu; - mutex_init(&uclamp_mutex); - - for_each_possible_cpu(cpu) { - memset(&cpu_rq(cpu)->uclamp, 0, - sizeof(struct uclamp_rq)*UCLAMP_CNT); - cpu_rq(cpu)->uclamp_flags = 0; - } + for_each_possible_cpu(cpu) + init_uclamp_rq(cpu_rq(cpu)); for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id], @@ -1267,108 +2030,115 @@ static void __init init_uclamp(void) } } -#else /* CONFIG_UCLAMP_TASK */ +#else /* !CONFIG_UCLAMP_TASK */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } -static inline int uclamp_validate(struct task_struct *p, - const struct sched_attr *attr) -{ - return -EOPNOTSUPP; -} -static void __setscheduler_uclamp(struct task_struct *p, - const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } +static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ -static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +bool sched_task_on_rq(struct task_struct *p) +{ + return task_on_rq_queued(p); +} + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long ip = 0; + unsigned int state; + + if (!p || p == current) + return 0; + + /* Only get wchan if task is blocked and we can keep it that way. */ + raw_spin_lock_irq(&p->pi_lock); + state = READ_ONCE(p->__state); + smp_rmb(); /* see try_to_wake_up() */ + if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq) + ip = __get_wchan(p); + raw_spin_unlock_irq(&p->pi_lock); + + return ip; +} + +void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) { - sched_info_queued(rq, p); - psi_enqueue(p, flags & ENQUEUE_WAKEUP); - } - - uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + /* + * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear + * ->sched_delayed. + */ + uclamp_rq_inc(rq, p); + + psi_enqueue(p, flags); + + if (!(flags & ENQUEUE_RESTORE)) + sched_info_enqueue(rq, p); + + if (sched_core_enabled(rq)) + sched_core_enqueue(rq, p); } -static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +/* + * Must only return false when DEQUEUE_SLEEP. + */ +inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) { + if (sched_core_enabled(rq)) + sched_core_dequeue(rq, p, flags); + if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) { - sched_info_dequeued(rq, p); - psi_dequeue(p, flags & DEQUEUE_SLEEP); - } + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeue(rq, p); + + psi_dequeue(p, flags); + /* + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' + * and mark the task ->sched_delayed. + */ uclamp_rq_dec(rq, p); - p->sched_class->dequeue_task(rq, p, flags); + return p->sched_class->dequeue_task(rq, p, flags); } void activate_task(struct rq *rq, struct task_struct *p, int flags) { + if (task_on_rq_migrating(p)) + flags |= ENQUEUE_MIGRATED; + if (flags & ENQUEUE_MIGRATED) + sched_mm_cid_migrate_to(rq, p); + enqueue_task(rq, p, flags); - p->on_rq = TASK_ON_RQ_QUEUED; + WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); } void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; + SCHED_WARN_ON(flags & DEQUEUE_SLEEP); - dequeue_task(rq, p, flags); -} - -/* - * __normal_prio - return the priority that is based on the static prio - */ -static inline int __normal_prio(struct task_struct *p) -{ - return p->static_prio; -} + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); -/* - * Calculate the expected normal priority: i.e. priority - * without taking RT-inheritance into account. Might be - * boosted by interactivity modifiers. Changes upon fork, - * setprio syscalls, and whenever the interactivity - * estimator recalculates. - */ -static inline int normal_prio(struct task_struct *p) -{ - int prio; + /* + * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before* + * dequeue_task() and cleared *after* enqueue_task(). + */ - if (task_has_dl_policy(p)) - prio = MAX_DL_PRIO-1; - else if (task_has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; - else - prio = __normal_prio(p); - return prio; + dequeue_task(rq, p, flags); } -/* - * Calculate the current priority, i.e. the priority - * taken into account by the scheduler. This value might - * be boosted by RT tasks, or might be boosted by - * interactivity modifiers. Will be RT if the task got - * RT-boosted. If not then it returns p->normal_prio. - */ -static int effective_prio(struct task_struct *p) +static void block_task(struct rq *rq, struct task_struct *p, int flags) { - p->normal_prio = normal_prio(p); - /* - * If we are RT tasks or we were boosted to RT priority, - * keep the priority unchanged. Otherwise, update priority - * to the normal priority: - */ - if (!rt_prio(p->prio)) - return p->normal_prio; - return p->prio; + if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) + __block_task(rq, p); } /** @@ -1383,15 +2153,26 @@ inline int task_curr(const struct task_struct *p) } /* + * ->switching_to() is called with the pi_lock and rq_lock held and must not + * mess with locking. + */ +void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class) +{ + if (prev_class != p->sched_class && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); +} + +/* * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, * use the balance_callback list if you want balancing. * * this means any call to check_class_changed() must be followed by a call to * balance_callback(). */ -static inline void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) +void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) @@ -1402,46 +2183,279 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio); } -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { - const struct sched_class *class; + struct task_struct *donor = rq->donor; - if (p->sched_class == rq->curr->sched_class) { - rq->curr->sched_class->check_preempt_curr(rq, p, flags); - } else { - for_each_class(class) { - if (class == rq->curr->sched_class) - break; - if (class == p->sched_class) { - resched_curr(rq); - break; - } - } - } + if (p->sched_class == donor->sched_class) + donor->sched_class->wakeup_preempt(rq, p, flags); + else if (sched_class_above(p->sched_class, donor->sched_class)) + resched_curr(rq); /* * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) + if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr)) rq_clock_skip_update(rq); } +static __always_inline +int __task_state_match(struct task_struct *p, unsigned int state) +{ + if (READ_ONCE(p->__state) & state) + return 1; + + if (READ_ONCE(p->saved_state) & state) + return -1; + + return 0; +} + +static __always_inline +int task_state_match(struct task_struct *p, unsigned int state) +{ + /* + * Serialize against current_save_and_set_rtlock_wait_state(), + * current_restore_rtlock_saved_state(), and __refrigerator(). + */ + guard(raw_spinlock_irq)(&p->pi_lock); + return __task_state_match(p, state); +} + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * Wait for the thread to block in any of the states set in @match_state. + * If it changes, i.e. @p might have woken up, then return zero. When we + * succeed in waiting for @p to be off its CPU, we return a positive number + * (its total switch count). If a second call a short while later returns the + * same number, the caller can be sure that @p has remained unscheduled the + * whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) +{ + int running, queued, match; + struct rq_flags rf; + unsigned long ncsw; + struct rq *rq; + + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); + + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_on_cpu()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_on_cpu(rq, p)) { + if (!task_state_match(p, match_state)) + return 0; + cpu_relax(); + } + + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &rf); + trace_sched_wait_task(p); + running = task_on_cpu(rq, p); + queued = task_on_rq_queued(p); + ncsw = 0; + if ((match = __task_state_match(p, match_state))) { + /* + * When matching on p->saved_state, consider this task + * still queued so it will wait. + */ + if (match < 0) + queued = 1; + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + } + task_rq_unlock(rq, p, &rf); + + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } + + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it was still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(queued)) { + ktime_t to = NSEC_PER_SEC / HZ; + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); + continue; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! + */ + break; + } + + return ncsw; +} + #ifdef CONFIG_SMP +static void +__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); + +static void migrate_disable_switch(struct rq *rq, struct task_struct *p) +{ + struct affinity_context ac = { + .new_mask = cpumask_of(rq->cpu), + .flags = SCA_MIGRATE_DISABLE, + }; + + if (likely(!p->migration_disabled)) + return; + + if (p->cpus_ptr != &p->cpus_mask) + return; + + /* + * Violates locking rules! See comment in __do_set_cpus_allowed(). + */ + __do_set_cpus_allowed(p, &ac); +} + +void migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif + p->migration_disabled++; + return; + } + + guard(preempt)(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; +} +EXPORT_SYMBOL_GPL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + struct affinity_context ac = { + .new_mask = &p->cpus_mask, + .flags = SCA_MIGRATE_ENABLE, + }; + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + guard(preempt)(); + if (p->cpus_ptr != &p->cpus_mask) + __set_cpus_allowed_ptr(p, &ac); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq()->nr_pinned--; +} +EXPORT_SYMBOL_GPL(migrate_enable); + +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return rq->nr_pinned; +} + /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). */ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + /* When not in the task's cpumask, no point in looking further. */ + if (!task_allowed_on_cpu(p, cpu)) return false; - if (is_per_cpu_kthread(p)) + /* migrate_disabled() must be allowed to finish. */ + if (is_migration_disabled(p)) + return cpu_online(cpu); + + /* Non kernel threads are not allowed during either online or offline. */ + if (!(p->flags & PF_KTHREAD)) + return cpu_active(cpu); + + /* KTHREAD_IS_PER_CPU is always allowed. */ + if (kthread_is_per_cpu(p)) return cpu_online(cpu); - return cpu_active(cpu); + /* Regular kernel threads don't get to stay during offline. */ + if (cpu_dying(cpu)) + return false; + + /* But are allowed during online. */ + return cpu_online(cpu); } /* @@ -1466,27 +2480,38 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int new_cpu) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); - WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); - dequeue_task(rq, p, DEQUEUE_NOCLOCK); + deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); rq = cpu_rq(new_cpu); rq_lock(rq, rf); - BUG_ON(task_cpu(p) != new_cpu); - enqueue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; - check_preempt_curr(rq, p, 0); + WARN_ON_ONCE(task_cpu(p) != new_cpu); + activate_task(rq, p, 0); + wakeup_preempt(rq, p, 0); return rq; } struct migration_arg { - struct task_struct *task; - int dest_cpu; + struct task_struct *task; + int dest_cpu; + struct set_affinity_pending *pending; +}; + +/* + * @refs: number of wait_for_completion() + * @stop_pending: is @stop_work in use + */ +struct set_affinity_pending { + refcount_t refs; + unsigned int stop_pending; + struct completion done; + struct cpu_stop_work stop_work; + struct migration_arg arg; }; /* @@ -1505,53 +2530,160 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, if (!is_cpu_allowed(p, dest_cpu)) return rq; - update_rq_clock(rq); rq = move_queued_task(rq, rf, p, dest_cpu); return rq; } /* - * migration_cpu_stop - this will be executed by a highprio stopper thread + * migration_cpu_stop - this will be executed by a high-prio stopper thread * and performs thread migration by bumping thread off CPU then * 'pushing' onto another runqueue. */ static int migration_cpu_stop(void *data) { struct migration_arg *arg = data; + struct set_affinity_pending *pending = arg->pending; struct task_struct *p = arg->task; struct rq *rq = this_rq(); + bool complete = false; struct rq_flags rf; /* * The original target CPU might have gone down and we might * be on another CPU but it doesn't matter. */ - local_irq_disable(); + local_irq_save(rf.flags); /* * We need to explicitly wake pending tasks before running * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - flush_smp_call_function_from_idle(); + flush_smp_call_function_queue(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); + + /* + * If we were passed a pending, then ->stop_pending was set, thus + * p->migration_pending must have remained stable. + */ + WARN_ON_ONCE(pending && pending != p->migration_pending); + /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ if (task_rq(p) == rq) { - if (task_on_rq_queued(p)) + if (is_migration_disabled(p)) + goto out; + + if (pending) { + p->migration_pending = NULL; + complete = true; + + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) + goto out; + } + + if (task_on_rq_queued(p)) { + update_rq_clock(rq); rq = __migrate_task(rq, &rf, p, arg->dest_cpu); - else + } else { p->wake_cpu = arg->dest_cpu; + } + + /* + * XXX __migrate_task() can fail, at which point we might end + * up running on a dodgy CPU, AFAICT this can only happen + * during CPU hotplug, at which point we'll get pushed out + * anyway, so it's probably not a big deal. + */ + + } else if (pending) { + /* + * This happens when we get migrated between migrate_enable()'s + * preempt_enable() and scheduling the stopper task. At that + * point we're a regular task again and not current anymore. + * + * A !PREEMPT kernel has a giant hole here, which makes it far + * more likely. + */ + + /* + * The task moved before the stopper got to run. We're holding + * ->pi_lock, so the allowed mask is stable - if it got + * somewhere allowed, we're done. + */ + if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { + p->migration_pending = NULL; + complete = true; + goto out; + } + + /* + * When migrate_enable() hits a rq mis-match we can't reliably + * determine is_migration_disabled() and so have to chase after + * it. + */ + WARN_ON_ONCE(!pending->stop_pending); + preempt_disable(); + task_rq_unlock(rq, p, &rf); + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, + &pending->arg, &pending->stop_work); + preempt_enable(); + return 0; } - rq_unlock(rq, &rf); - raw_spin_unlock(&p->pi_lock); +out: + if (pending) + pending->stop_pending = false; + task_rq_unlock(rq, p, &rf); + + if (complete) + complete_all(&pending->done); - local_irq_enable(); + return 0; +} + +int push_cpu_stop(void *arg) +{ + struct rq *lowest_rq = NULL, *rq = this_rq(); + struct task_struct *p = arg; + + raw_spin_lock_irq(&p->pi_lock); + raw_spin_rq_lock(rq); + + if (task_rq(p) != rq) + goto out_unlock; + + if (is_migration_disabled(p)) { + p->migration_flags |= MDF_PUSH; + goto out_unlock; + } + + p->migration_flags &= ~MDF_PUSH; + + if (p->sched_class->find_lock_rq) + lowest_rq = p->sched_class->find_lock_rq(p, rq); + + if (!lowest_rq) + goto out_unlock; + + // XXX validate p is still the highest prio task + if (task_rq(p) == rq) { + move_queued_task_locked(rq, lowest_rq, p); + resched_curr(lowest_rq); + } + + double_unlock_balance(rq, lowest_rq); + +out_unlock: + rq->push_busy = false; + raw_spin_rq_unlock(rq); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); return 0; } @@ -1559,34 +2691,62 @@ static int migration_cpu_stop(void *data) * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. */ -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx) { - cpumask_copy(&p->cpus_mask, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); + if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { + p->cpus_ptr = ctx->new_mask; + return; + } + + cpumask_copy(&p->cpus_mask, ctx->new_mask); + p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); + + /* + * Swap in a new user_cpus_ptr if SCA_USER flag set + */ + if (ctx->flags & SCA_USER) + swap(p->user_cpus_ptr, ctx->user_mask); } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +static void +__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { struct rq *rq = task_rq(p); bool queued, running; - lockdep_assert_held(&p->pi_lock); + /* + * This here violates the locking rules for affinity, since we're only + * supposed to change these variables while holding both rq->lock and + * p->pi_lock. + * + * HOWEVER, it magically works, because ttwu() is the only code that + * accesses these variables under p->pi_lock and only does so after + * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() + * before finish_task(). + * + * XXX do further audits, this smells like something putrid. + */ + if (ctx->flags & SCA_MIGRATE_DISABLE) + SCHED_WARN_ON(!p->on_cpu); + else + lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) { /* * Because __kthread_bind() calls this on blocked tasks without * holding rq->lock. */ - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); } if (running) put_prev_task(rq, p); - p->sched_class->set_cpus_allowed(p, new_mask); + p->sched_class->set_cpus_allowed(p, ctx); + mm_set_cpus_allowed(p->mm, ctx->new_mask); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -1595,113 +2755,558 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) } /* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. + * Used for kthread_bind() and select_fallback_rq(), in both cases the user + * affinity (if any) should be destroyed too. + */ +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + struct affinity_context ac = { + .new_mask = new_mask, + .user_mask = NULL, + .flags = SCA_USER, /* clear the user requested mask */ + }; + union cpumask_rcuhead { + cpumask_t cpumask; + struct rcu_head rcu; + }; + + __do_set_cpus_allowed(p, &ac); + + /* + * Because this is called with p->pi_lock held, it is not possible + * to use kfree() here (when PREEMPT_RT=y), therefore punt to using + * kfree_rcu(). + */ + kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu); +} + +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, + int node) +{ + cpumask_t *user_mask; + unsigned long flags; + + /* + * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's + * may differ by now due to racing. + */ + dst->user_cpus_ptr = NULL; + + /* + * This check is racy and losing the race is a valid situation. + * It is not worth the extra overhead of taking the pi_lock on + * every fork/clone. + */ + if (data_race(!src->user_cpus_ptr)) + return 0; + + user_mask = alloc_user_cpus_ptr(node); + if (!user_mask) + return -ENOMEM; + + /* + * Use pi_lock to protect content of user_cpus_ptr + * + * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent + * do_set_cpus_allowed(). + */ + raw_spin_lock_irqsave(&src->pi_lock, flags); + if (src->user_cpus_ptr) { + swap(dst->user_cpus_ptr, user_mask); + cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + } + raw_spin_unlock_irqrestore(&src->pi_lock, flags); + + if (unlikely(user_mask)) + kfree(user_mask); + + return 0; +} + +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = NULL; + + swap(p->user_cpus_ptr, user_mask); + + return user_mask; +} + +void release_user_cpus_ptr(struct task_struct *p) +{ + kfree(clear_user_cpus_ptr(p)); +} + +/* + * This function is wildly self concurrent; here be dragons. + * + * + * When given a valid mask, __set_cpus_allowed_ptr() must block until the + * designated task is enqueued on an allowed CPU. If that task is currently + * running, we have to kick it out using the CPU stopper. + * + * Migrate-Disable comes along and tramples all over our nice sandcastle. + * Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * + * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes + * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). + * This means we need the following scheme: + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * <resumes> + * migrate_enable(); + * __set_cpus_allowed_ptr(); + * <wakes local stopper> + * `--> <woken on migration completion> + * + * Now the fun stuff: there may be several P1-like tasks, i.e. multiple + * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any + * task p are serialized by p->pi_lock, which we can leverage: the one that + * should come into effect at the end of the Migrate-Disable region is the last + * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), + * but we still need to properly signal those waiting tasks at the appropriate + * moment. + * + * This is implemented using struct set_affinity_pending. The first + * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will + * setup an instance of that struct and install it on the targeted task_struct. + * Any and all further callers will reuse that instance. Those then wait for + * a completion signaled at the tail of the CPU stopper callback (1), triggered + * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). + * + * + * (1) In the cases covered above. There is one more where the completion is + * signaled within affine_move_task() itself: when a subsequent affinity request + * occurs after the stopper bailed out due to the targeted task still being + * Migrate-Disable. Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * CPU0 P1 P2 + * <P0> + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * <migration/0> + * migration_cpu_stop() + * is_migration_disabled() + * <bails> + * set_cpus_allowed_ptr(P0, [0, 1]); + * <signal completion> + * <awakes> + * + * Note that the above is safe vs a concurrent migrate_enable(), as any + * pending affinity completion is preceded by an uninstallation of + * p->migration_pending done with p->pi_lock held. + */ +static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, + int dest_cpu, unsigned int flags) + __releases(rq->lock) + __releases(p->pi_lock) +{ + struct set_affinity_pending my_pending = { }, *pending = NULL; + bool stop_pending, complete = false; + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + struct task_struct *push_task = NULL; + + if ((flags & SCA_MIGRATE_ENABLE) && + (p->migration_flags & MDF_PUSH) && !rq->push_busy) { + rq->push_busy = true; + push_task = get_task_struct(p); + } + + /* + * If there are pending waiters, but no pending stop_work, + * then complete now. + */ + pending = p->migration_pending; + if (pending && !pending->stop_pending) { + p->migration_pending = NULL; + complete = true; + } + + preempt_disable(); + task_rq_unlock(rq, p, rf); + if (push_task) { + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + p, &rq->push_work); + } + preempt_enable(); + + if (complete) + complete_all(&pending->done); + + return 0; + } + + if (!(flags & SCA_MIGRATE_ENABLE)) { + /* serialized by p->pi_lock */ + if (!p->migration_pending) { + /* Install the request */ + refcount_set(&my_pending.refs, 1); + init_completion(&my_pending.done); + my_pending.arg = (struct migration_arg) { + .task = p, + .dest_cpu = dest_cpu, + .pending = &my_pending, + }; + + p->migration_pending = &my_pending; + } else { + pending = p->migration_pending; + refcount_inc(&pending->refs); + /* + * Affinity has changed, but we've already installed a + * pending. migration_cpu_stop() *must* see this, else + * we risk a completion of the pending despite having a + * task on a disallowed CPU. + * + * Serialized by p->pi_lock, so this is safe. + */ + pending->arg.dest_cpu = dest_cpu; + } + } + pending = p->migration_pending; + /* + * - !MIGRATE_ENABLE: + * we'll have installed a pending if there wasn't one already. + * + * - MIGRATE_ENABLE: + * we're here because the current CPU isn't matching anymore, + * the only way that can happen is because of a concurrent + * set_cpus_allowed_ptr() call, which should then still be + * pending completion. + * + * Either way, we really should have a @pending here. + */ + if (WARN_ON_ONCE(!pending)) { + task_rq_unlock(rq, p, rf); + return -EINVAL; + } + + if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { + /* + * MIGRATE_ENABLE gets here because 'p == current', but for + * anything else we cannot do is_migration_disabled(), punt + * and have the stopper function handle it all race-free. + */ + stop_pending = pending->stop_pending; + if (!stop_pending) + pending->stop_pending = true; + + if (flags & SCA_MIGRATE_ENABLE) + p->migration_flags &= ~MDF_PUSH; + + preempt_disable(); + task_rq_unlock(rq, p, rf); + if (!stop_pending) { + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + } + preempt_enable(); + + if (flags & SCA_MIGRATE_ENABLE) + return 0; + } else { + + if (!is_migration_disabled(p)) { + if (task_on_rq_queued(p)) + rq = move_queued_task(rq, rf, p, dest_cpu); + + if (!pending->stop_pending) { + p->migration_pending = NULL; + complete = true; + } + } + task_rq_unlock(rq, p, rf); + + if (complete) + complete_all(&pending->done); + } + + wait_for_completion(&pending->done); + + if (refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); /* No UaF, just an address */ + + /* + * Block the original owner of &pending until all subsequent callers + * have seen the completion and decremented the refcount + */ + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + + /* ARGH */ + WARN_ON_ONCE(my_pending.stop_pending); + + return 0; +} + +/* + * Called with both p->pi_lock and rq->lock held; drops both before returning. */ -static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, + struct affinity_context *ctx, + struct rq *rq, + struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) { + const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; + bool kthread = p->flags & PF_KTHREAD; unsigned int dest_cpu; - struct rq_flags rf; - struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD) { + if (kthread || is_migration_disabled(p)) { /* - * Kernel threads are allowed on online && !active CPUs + * Kernel threads are allowed on online && !active CPUs, + * however, during cpu-hot-unplug, even these might get pushed + * away if not KTHREAD_IS_PER_CPU. + * + * Specifically, migration_disabled() tasks must not fail the + * cpumask_any_and_distribute() pick below, esp. so on + * SCA_MIGRATE_ENABLE, otherwise we'll not call + * set_cpus_allowed_common() and actually reset p->cpus_ptr. */ cpu_valid_mask = cpu_online_mask; } + if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) { + ret = -EINVAL; + goto out; + } + /* * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. */ - if (check && (p->flags & PF_NO_SETAFFINITY)) { + if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out; } - if (cpumask_equal(&p->cpus_mask, new_mask)) - goto out; + if (!(ctx->flags & SCA_MIGRATE_ENABLE)) { + if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) { + if (ctx->flags & SCA_USER) + swap(p->user_cpus_ptr, ctx->user_mask); + goto out; + } + + if (WARN_ON_ONCE(p == current && + is_migration_disabled(p) && + !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) { + ret = -EBUSY; + goto out; + } + } /* * Picking a ~random cpu helps in cases where we are changing affinity * for groups of tasks (ie. cpuset), so that load balancing is not * immediately required to distribute the tasks within their new mask. */ - dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); + dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask); if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } - do_set_cpus_allowed(p, new_mask); - - if (p->flags & PF_KTHREAD) { - /* - * For kernel threads that do indeed end up on online && - * !active we want to ensure they are strict per-CPU threads. - */ - WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && - !cpumask_intersects(new_mask, cpu_active_mask) && - p->nr_cpus_allowed != 1); - } + __do_set_cpus_allowed(p, ctx); - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; + return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &rf); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - return 0; - } else if (task_on_rq_queued(p)) { - /* - * OK, since we're going to drop the lock immediately - * afterwards anyway. - */ - rq = move_queued_task(rq, &rf, p, dest_cpu); - } out: - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, rf); return ret; } +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx) +{ + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + /* + * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_* + * flags are set. + */ + if (p->user_cpus_ptr && + !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) && + cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr)) + ctx->new_mask = rq->scratch_mask; + + return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf); +} + int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - return __set_cpus_allowed_ptr(p, new_mask, false); + struct affinity_context ac = { + .new_mask = new_mask, + .flags = 0, + }; + + return __set_cpus_allowed_ptr(p, &ac); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +/* + * Change a given task's CPU affinity to the intersection of its current + * affinity mask and @subset_mask, writing the resulting mask to @new_mask. + * If user_cpus_ptr is defined, use it as the basis for restricting CPU + * affinity or use cpu_online_mask instead. + * + * If the resulting mask is empty, leave the affinity unchanged and return + * -EINVAL. + */ +static int restrict_cpus_allowed_ptr(struct task_struct *p, + struct cpumask *new_mask, + const struct cpumask *subset_mask) +{ + struct affinity_context ac = { + .new_mask = new_mask, + .flags = 0, + }; + struct rq_flags rf; + struct rq *rq; + int err; + + rq = task_rq_lock(p, &rf); + + /* + * Forcefully restricting the affinity of a deadline task is + * likely to cause problems, so fail and noisily override the + * mask entirely. + */ + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + err = -EPERM; + goto err_unlock; + } + + if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) { + err = -EINVAL; + goto err_unlock; + } + + return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf); + +err_unlock: + task_rq_unlock(rq, p, &rf); + return err; +} + +/* + * Restrict the CPU affinity of task @p so that it is a subset of + * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the + * old affinity mask. If the resulting mask is empty, we warn and walk + * up the cpuset hierarchy until we find a suitable mask. + */ +void force_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + cpumask_var_t new_mask; + const struct cpumask *override_mask = task_cpu_possible_mask(p); + + alloc_cpumask_var(&new_mask, GFP_KERNEL); + + /* + * __migrate_task() can fail silently in the face of concurrent + * offlining of the chosen destination CPU, so take the hotplug + * lock to ensure that the migration succeeds. + */ + cpus_read_lock(); + if (!cpumask_available(new_mask)) + goto out_set_mask; + + if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) + goto out_free_mask; + + /* + * We failed to find a valid subset of the affinity mask for the + * task, so override it based on its cpuset hierarchy. + */ + cpuset_cpus_allowed(p, new_mask); + override_mask = new_mask; + +out_set_mask: + if (printk_ratelimit()) { + printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", + task_pid_nr(p), p->comm, + cpumask_pr_args(override_mask)); + } + + WARN_ON(set_cpus_allowed_ptr(p, override_mask)); +out_free_mask: + cpus_read_unlock(); + free_cpumask_var(new_mask); +} + +/* + * Restore the affinity of a task @p which was previously restricted by a + * call to force_compatible_cpus_allowed_ptr(). + * + * It is the caller's responsibility to serialise this with any calls to + * force_compatible_cpus_allowed_ptr(@p). + */ +void relax_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + struct affinity_context ac = { + .new_mask = task_user_cpus(p), + .flags = 0, + }; + int ret; + + /* + * Try to restore the old affinity mask with __sched_setaffinity(). + * Cpuset masking will be done there too. + */ + ret = __sched_setaffinity(p, &ac); + WARN_ON_ONCE(ret); +} + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG + unsigned int state = READ_ONCE(p->__state); + /* * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. */ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !p->on_rq); + WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq); /* * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, * because schedstat_wait_{start,end} rebase migrating task's wait_start * time relying on p->on_rq. */ - WARN_ON_ONCE(p->state == TASK_RUNNING && + WARN_ON_ONCE(state == TASK_RUNNING && p->sched_class == &fair_sched_class && (p->on_rq && !task_on_rq_migrating(p))); @@ -1717,12 +3322,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * task_rq_lock(). */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock))); + lockdep_is_held(__rq_lockp(task_rq(p))))); #endif /* * Clearly, migrating tasks to offline CPUs is a fairly daft thing. */ WARN_ON_ONCE(!cpu_online(new_cpu)); + + WARN_ON_ONCE(is_migration_disabled(p)); #endif trace_sched_migrate_task(p, new_cpu); @@ -1732,6 +3339,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; rseq_migrate(p); + sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); } @@ -1751,10 +3359,8 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - deactivate_task(src_rq, p, 0); - set_task_cpu(p, cpu); - activate_task(dst_rq, p, 0); - check_preempt_curr(dst_rq, p, 0); + move_queued_task_locked(src_rq, dst_rq, p); + wakeup_preempt(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); rq_unpin_lock(src_rq, &srf); @@ -1778,7 +3384,6 @@ static int migrate_swap_stop(void *data) { struct migration_swap_arg *arg = data; struct rq *src_rq, *dst_rq; - int ret = -EAGAIN; if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) return -EAGAIN; @@ -1786,33 +3391,25 @@ static int migrate_swap_stop(void *data) src_rq = cpu_rq(arg->src_cpu); dst_rq = cpu_rq(arg->dst_cpu); - double_raw_lock(&arg->src_task->pi_lock, - &arg->dst_task->pi_lock); - double_rq_lock(src_rq, dst_rq); + guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock); + guard(double_rq_lock)(src_rq, dst_rq); if (task_cpu(arg->dst_task) != arg->dst_cpu) - goto unlock; + return -EAGAIN; if (task_cpu(arg->src_task) != arg->src_cpu) - goto unlock; + return -EAGAIN; if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) - goto unlock; + return -EAGAIN; if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) - goto unlock; + return -EAGAIN; __migrate_swap_task(arg->src_task, arg->dst_cpu); __migrate_swap_task(arg->dst_task, arg->src_cpu); - ret = 0; - -unlock: - double_rq_unlock(src_rq, dst_rq); - raw_spin_unlock(&arg->dst_task->pi_lock); - raw_spin_unlock(&arg->src_task->pi_lock); - - return ret; + return 0; } /* @@ -1855,114 +3452,6 @@ out: } #endif /* CONFIG_NUMA_BALANCING */ -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. - * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. - */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) -{ - int running, queued; - struct rq_flags rf; - unsigned long ncsw; - struct rq *rq; - - for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! - */ - while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) - return 0; - cpu_relax(); - } - - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &rf); - trace_sched_wait_task(p); - running = task_running(rq, p); - queued = task_on_rq_queued(p); - ncsw = 0; - if (!match_state || p->state == match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &rf); - - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; - - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } - - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it was still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(queued)) { - ktime_t to = NSEC_PER_SEC / HZ; - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL); - continue; - } - - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! - */ - break; - } - - return ncsw; -} - /*** * kick_process - kick a running thread to enter/exit the kernel * @p: the to-be-kicked thread @@ -1978,13 +3467,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) */ void kick_process(struct task_struct *p) { - int cpu; + guard(preempt)(); + int cpu = task_cpu(p); - preempt_disable(); - cpu = task_cpu(p); if ((cpu != smp_processor_id()) && task_curr(p)) smp_send_reschedule(cpu); - preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); @@ -2027,9 +3514,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) + if (is_cpu_allowed(p, dest_cpu)) return dest_cpu; } } @@ -2046,17 +3531,21 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - if (IS_ENABLED(CONFIG_CPUSETS)) { - cpuset_cpus_allowed_fallback(p); + if (cpuset_cpus_allowed_fallback(p)) { state = possible; break; } - /* Fall-through */ + fallthrough; case possible: - do_set_cpus_allowed(p, cpu_possible_mask); + /* + * XXX When called from select_task_rq() we only + * hold p->pi_lock and again violate locking order. + * + * More yuck to audit. + */ + do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); state = fail; break; - case fail: BUG(); break; @@ -2083,14 +3572,16 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); - else + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) { + cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); + *wake_flags |= WF_RQ_SELECTED; + } else { cpu = cpumask_any(p->cpus_ptr); + } /* * In order not to call set_task_cpu() on a blocking task we need @@ -2110,6 +3601,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) void sched_set_stop_task(int cpu, struct task_struct *stop) { + static struct lock_class_key stop_pi_lock; struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; struct task_struct *old_stop = cpu_rq(cpu)->stop; @@ -2125,6 +3617,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); stop->sched_class = &stop_sched_class; + + /* + * The PI code calls rt_mutex_setprio() with ->pi_lock held to + * adjust the effective priority of a task. As a result, + * rt_mutex_setprio() can trigger (RT) balancing operations, + * which can then trigger wakeups of the stop thread to push + * around the current task. + * + * The stop task itself will never be part of the PI-chain, it + * never blocks, therefore that ->pi_lock recursion is safe. + * Tell lockdep about this by placing the stop->pi_lock in its + * own class. + */ + lockdep_set_class(&stop->pi_lock, &stop_pi_lock); } cpu_rq(cpu)->stop = stop; @@ -2138,15 +3644,16 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else +#else /* CONFIG_SMP */ + +static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } -static inline int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) +static inline bool rq_has_pinned_tasks(struct rq *rq) { - return set_cpus_allowed_ptr(p, new_mask); + return false; } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) @@ -2161,46 +3668,73 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); - __schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(p->stats.nr_wakeups_local); } else { struct sched_domain *sd; - __schedstat_inc(p->se.statistics.nr_wakeups_remote); - rcu_read_lock(); + __schedstat_inc(p->stats.nr_wakeups_remote); + + guard(rcu)(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { __schedstat_inc(sd->ttwu_wake_remote); break; } } - rcu_read_unlock(); } if (wake_flags & WF_MIGRATED) - __schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->stats.nr_wakeups_migrate); #endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); - __schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(p->stats.nr_wakeups); if (wake_flags & WF_SYNC) - __schedstat_inc(p->se.statistics.nr_wakeups_sync); + __schedstat_inc(p->stats.nr_wakeups_sync); } /* - * Mark the task runnable and perform wakeup-preemption. + * Mark the task runnable. */ -static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - struct rq_flags *rf) +static inline void ttwu_do_wakeup(struct task_struct *p) { - check_preempt_curr(rq, p, wake_flags); - p->state = TASK_RUNNING; + WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct rq_flags *rf) +{ + int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; + + lockdep_assert_rq_held(rq); + + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; + +#ifdef CONFIG_SMP + if (wake_flags & WF_RQ_SELECTED) + en_flags |= ENQUEUE_RQ_SELECTED; + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; + else +#endif + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + + activate_task(rq, p, en_flags); + wakeup_preempt(rq, p, wake_flags); + + ttwu_do_wakeup(p); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* - * Our task @p is fully woken up and running; so its safe to + * Our task @p is fully woken up and running; so it's safe to * drop the rq->lock, hereafter rq is only used for statistics. */ rq_unpin_lock(rq, rf); @@ -2222,33 +3756,32 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, #endif } -static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, - struct rq_flags *rf) -{ - int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; - - lockdep_assert_held(&rq->lock); - - if (p->sched_contributes_to_load) - rq->nr_uninterruptible--; - -#ifdef CONFIG_SMP - if (wake_flags & WF_MIGRATED) - en_flags |= ENQUEUE_MIGRATED; -#endif - - activate_task(rq, p, en_flags); - ttwu_do_wakeup(rq, p, wake_flags, rf); -} - /* - * Called in case the task @p isn't fully descheduled from its runqueue, - * in this case we must do a remote wakeup. Its a 'light' wakeup though, - * since all we need to do is flip p->state to TASK_RUNNING, since - * the task is still ->on_rq. + * Consider @p being inside a wait loop: + * + * for (;;) { + * set_current_state(TASK_UNINTERRUPTIBLE); + * + * if (CONDITION) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * between set_current_state() and schedule(). In this case @p is still + * runnable, so all that needs doing is change p->state back to TASK_RUNNING in + * an atomic manner. + * + * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq + * then schedule() must still happen and p->state can be changed to + * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we + * need to do a full wakeup with enqueue. + * + * Returns: %true when the wakeup is done, + * %false otherwise. */ -static int ttwu_remote(struct task_struct *p, int wake_flags) +static int ttwu_runnable(struct task_struct *p, int wake_flags) { struct rq_flags rf; struct rq *rq; @@ -2256,9 +3789,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { - /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags, &rf); + if (p->se.sched_delayed) + enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ + wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); ret = 1; } __task_rq_unlock(rq, &rf); @@ -2277,13 +3818,6 @@ void sched_ttwu_pending(void *arg) if (!llist) return; - /* - * rq::ttwu_pending racy indication of out-standing wakeups. - * Races such that false-negatives are possible, since they - * are shorter lived that false-positives would be. - */ - WRITE_ONCE(rq->ttwu_pending, 0); - rq_lock_irqsave(rq, &rf); update_rq_clock(rq); @@ -2297,17 +3831,34 @@ void sched_ttwu_pending(void *arg) ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); } + /* + * Must be after enqueueing at least once task such that + * idle_cpu() does not observe a false-negative -- if it does, + * it is possible for select_idle_siblings() to stack a number + * of tasks on this CPU during that window. + * + * It is OK to clear ttwu_pending when another task pending. + * We will receive IPI after local IRQ enabled and then enqueue it. + * Since now nr_running > 0, idle_cpu() will always get correct result. + */ + WRITE_ONCE(rq->ttwu_pending, 0); rq_unlock_irqrestore(rq, &rf); } -void send_call_function_single_ipi(int cpu) +/* + * Prepare the scene for sending an IPI for a remote smp_call + * + * Returns true if the caller can proceed with sending the IPI. + * Returns false otherwise. + */ +bool call_function_single_prep_ipi(int cpu) { - struct rq *rq = cpu_rq(cpu); - - if (!set_nr_if_polling(rq->idle)) - arch_send_call_function_single_ipi(cpu); - else + if (set_nr_if_polling(cpu_rq(cpu)->idle)) { trace_sched_wake_idle_without_ipi(cpu); + return false; + } + + return true; } /* @@ -2329,48 +3880,90 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags void wake_up_if_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; - rcu_read_lock(); - - if (!is_idle_task(rcu_dereference(rq->curr))) - goto out; - - if (set_nr_if_polling(rq->idle)) { - trace_sched_wake_idle_without_ipi(cpu); - } else { - rq_lock_irqsave(rq, &rf); + guard(rcu)(); + if (is_idle_task(rcu_dereference(rq->curr))) { + guard(rq_lock_irqsave)(rq); if (is_idle_task(rq->curr)) - smp_send_reschedule(cpu); - /* Else CPU is not idle, do nothing here: */ - rq_unlock_irqrestore(rq, &rf); + resched_curr(rq); } +} -out: - rcu_read_unlock(); +bool cpus_equal_capacity(int this_cpu, int that_cpu) +{ + if (!sched_asym_cpucap_active()) + return true; + + if (this_cpu == that_cpu) + return true; + + return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu); } bool cpus_share_cache(int this_cpu, int that_cpu) { + if (this_cpu == that_cpu) + return true; + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } -static inline bool ttwu_queue_cond(int cpu, int wake_flags) +/* + * Whether CPUs are share cache resources, which means LLC on non-cluster + * machines and LLC tag or L2 on machines with clusters. + */ +bool cpus_share_resources(int this_cpu, int that_cpu) +{ + if (this_cpu == that_cpu) + return true; + + return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); +} + +static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* + * The BPF scheduler may depend on select_task_rq() being invoked during + * wakeups. In addition, @p may end up executing on a different CPU + * regardless of what happens in the wakeup path making the ttwu_queue + * optimization less meaningful. Skip if on SCX. + */ + if (task_on_scx(p)) + return false; + + /* + * Do not complicate things with the async wake_list while the CPU is + * in hotplug state. + */ + if (!cpu_active(cpu)) + return false; + + /* Ensure the task will still be allowed to run on the CPU. */ + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + + /* * If the CPU does not share cache, then queue the task on the * remote rqs wakelist to avoid accessing remote data. */ if (!cpus_share_cache(smp_processor_id(), cpu)) return true; + if (cpu == smp_processor_id()) + return false; + /* - * If the task is descheduling and the only running task on the - * CPU then use the wakelist to offload the task activation to - * the soon-to-be-idle CPU as the current CPU is likely busy. - * nr_running is checked to avoid unnecessary task stacking. + * If the wakee cpu is idle, or the task is descheduling and the + * only running task on the CPU, then use the wakelist to offload + * the task activation to the idle (or soon-to-be-idle) CPU as + * the current CPU is likely busy. nr_running is checked to + * avoid unnecessary task stacking. + * + * Note that we can only get here with (wakee) p->on_rq=0, + * p->on_cpu can be whatever, we've done the dequeue, so + * the wakee has been accounted out of ->nr_running. */ - if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) + if (!cpu_rq(cpu)->nr_running) return true; return false; @@ -2378,10 +3971,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { - if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { - if (WARN_ON_ONCE(cpu == smp_processor_id())) - return false; - + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) { sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; @@ -2389,6 +3979,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } + +#else /* !CONFIG_SMP */ + +static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) +{ + return false; +} + #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) @@ -2396,10 +3994,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; -#if defined(CONFIG_SMP) if (ttwu_queue_wakelist(p, cpu, wake_flags)) return; -#endif rq_lock(rq, &rf); update_rq_clock(rq); @@ -2408,6 +4004,56 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) } /* + * Invoked from try_to_wake_up() to check whether the task can be woken up. + * + * The caller holds p::pi_lock if p != current or has preemption + * disabled when p == current. + * + * The rules of saved_state: + * + * The related locking code always holds p::pi_lock when updating + * p::saved_state, which means the code is fully serialized in both cases. + * + * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. + * No other bits set. This allows to distinguish all wakeup scenarios. + * + * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This + * allows us to prevent early wakeup of tasks before they can be run on + * asymmetric ISA architectures (eg ARMv9). + */ +static __always_inline +bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) +{ + int match; + + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { + WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && + state != TASK_RTLOCK_WAIT); + } + + *success = !!(match = __task_state_match(p, state)); + + /* + * Saved state preserves the task state across blocking on + * an RT lock or TASK_FREEZABLE tasks. If the state matches, + * set p::saved_state to TASK_RUNNING, but do not wake the task + * because it waits for a lock wakeup or __thaw_task(). Also + * indicate success because from the regular waker's point of + * view this has succeeded. + * + * After acquiring the lock the task will restore p::__state + * from p::saved_state which ensures that the regular + * wakeup is not lost. The restore will also set + * p::saved_state to TASK_RUNNING so any further tests will + * not result in false positives vs. @success + */ + if (match < 0) + p->saved_state = TASK_RUNNING; + + return match > 0; +} + +/* * Notes on Program-Order guarantees on SMP systems. * * MIGRATION @@ -2455,8 +4101,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * migration. However the means are completely different as there is no lock * chain to provide order. Instead we do: * - * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_load_acquire(!X->on_cpu) + * 1) smp_store_release(X->on_cpu, 0) -- finish_task() + * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * * Example: * @@ -2496,231 +4142,312 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * If (@state & @p->state) @p->state = TASK_RUNNING. + * Conceptually does: + * + * If (@state & @p->state) @p->state = TASK_RUNNING. * * If the task was not queued/runnable, also place it back on a runqueue. * - * Atomic against schedule() which would dequeue a task, also see - * set_current_state(). + * This function is atomic against schedule() which would dequeue the task. * - * This function executes a full memory barrier before accessing the task - * state; see set_current_state(). + * It issues a full memory barrier before accessing @p->state, see the comment + * with set_current_state(). + * + * Uses p->pi_lock to serialize against concurrent wake-ups. + * + * Relies on p->pi_lock stabilizing: + * - p->sched_class + * - p->cpus_ptr + * - p->sched_task_group + * in order to do migration, see its use of select_task_rq()/set_task_cpu(). + * + * Tries really hard to only take one task_rq(p)->lock for performance. + * Takes rq->lock in: + * - ttwu_runnable() -- old rq, unavoidable, see comment there; + * - ttwu_queue() -- new rq, for enqueue of the task; + * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. + * + * As a consequence we race really badly with just about everything. See the + * many memory barriers and their comments for details. * * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. */ -static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - unsigned long flags; + guard(preempt)(); int cpu, success = 0; - preempt_disable(); + wake_flags |= WF_TTWU; + if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) * == smp_processor_id()'. Together this means we can special - * case the whole 'p->on_rq && ttwu_remote()' case below + * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * + * Specifically, given current runs ttwu() we must be before + * schedule()'s block_task(), as such this must not observe + * sched_delayed. + * * In particular: * - we rely on Program-Order guarantees for all the ordering, * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - if (!(p->state & state)) + SCHED_WARN_ON(p->se.sched_delayed); + if (!ttwu_state_match(p, state, &success)) goto out; - success = 1; trace_sched_waking(p); - p->state = TASK_RUNNING; - trace_sched_wakeup(p); + ttwu_do_wakeup(p); goto out; } /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. + * reordered with p->state check below. This pairs with smp_store_mb() + * in set_current_state() that the waiting thread does. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - smp_mb__after_spinlock(); - if (!(p->state & state)) - goto unlock; + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { + smp_mb__after_spinlock(); + if (!ttwu_state_match(p, state, &success)) + break; - trace_sched_waking(p); + trace_sched_waking(p); - /* We're going to change ->state: */ - success = 1; + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * STORE p->on_rq = 1 LOAD p->state + * UNLOCK rq->lock + * + * __schedule() (switch to task 'p') + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * UNLOCK rq->lock + * + * [task p] + * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq + * + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smp_rmb() lives in __task_needs_rq_lock(). + */ + smp_rmb(); + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) + break; - /* - * Ensure we load p->on_rq _after_ p->state, otherwise it would - * be possible to, falsely, observe p->on_rq == 0 and get stuck - * in smp_cond_load_acquire() below. - * - * sched_ttwu_pending() try_to_wake_up() - * STORE p->on_rq = 1 LOAD p->state - * UNLOCK rq->lock - * - * __schedule() (switch to task 'p') - * LOCK rq->lock smp_rmb(); - * smp_mb__after_spinlock(); - * UNLOCK rq->lock - * - * [task p] - * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq - * - * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in - * __schedule(). See the comment for smp_mb__after_spinlock(). - * - * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). - */ - smp_rmb(); - if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags)) - goto unlock; +#ifdef CONFIG_SMP + /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * __schedule() (switch to task 'p') try_to_wake_up() + * STORE p->on_cpu = 1 LOAD p->on_rq + * UNLOCK rq->lock + * + * __schedule() (put 'p' to sleep) + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * STORE p->on_rq = 0 LOAD p->on_cpu + * + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). + */ + smp_acquire__after_ctrl_dep(); - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + WRITE_ONCE(p->__state, TASK_WAKING); + + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. + */ + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) + break; + + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, wait until it's done referencing the task. + * + * Pairs with the smp_store_release() in finish_task(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. + */ + smp_cond_load_acquire(&p->on_cpu, !VAL); + + cpu = select_task_rq(p, p->wake_cpu, &wake_flags); + if (task_cpu(p) != cpu) { + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + + wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); + set_task_cpu(p, cpu); + } +#else + cpu = task_cpu(p); +#endif /* CONFIG_SMP */ + + ttwu_queue(p, cpu, wake_flags); } +out: + if (success) + ttwu_stat(p, task_cpu(p), wake_flags); -#ifdef CONFIG_SMP - /* - * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be - * possible to, falsely, observe p->on_cpu == 0. - * - * One must be running (->on_cpu == 1) in order to remove oneself - * from the runqueue. - * - * __schedule() (switch to task 'p') try_to_wake_up() - * STORE p->on_cpu = 1 LOAD p->on_rq - * UNLOCK rq->lock - * - * __schedule() (put 'p' to sleep) - * LOCK rq->lock smp_rmb(); - * smp_mb__after_spinlock(); - * STORE p->on_rq = 0 LOAD p->on_cpu - * - * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in - * __schedule(). See the comment for smp_mb__after_spinlock(). - * - * Form a control-dep-acquire with p->on_rq == 0 above, to ensure - * schedule()'s deactivate_task() has 'happened' and p will no longer - * care about it's own p->state. See the comment in __schedule(). - */ - smp_acquire__after_ctrl_dep(); + return success; +} + +static bool __task_needs_rq_lock(struct task_struct *p) +{ + unsigned int state = READ_ONCE(p->__state); /* - * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq - * == 0), which means we need to do an enqueue, change p->state to - * TASK_WAKING such that we can unlock p->pi_lock before doing the - * enqueue, such as ttwu_queue_wakelist(). + * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when + * the task is blocked. Make sure to check @state since ttwu() can drop + * locks at the end, see ttwu_queue_wakelist(). */ - p->state = TASK_WAKING; + if (state == TASK_RUNNING || state == TASK_WAKING) + return true; /* - * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, considering queueing p on the remote CPUs wake_list - * which potentially sends an IPI instead of spinning on p->on_cpu to - * let the waker make forward progress. This is safe because IRQs are - * disabled and the IPI will deliver after on_cpu is cleared. - * - * Ensure we load task_cpu(p) after p->on_cpu: - * - * set_task_cpu(p, cpu); - * STORE p->cpu = @cpu - * __schedule() (switch to task 'p') - * LOCK rq->lock - * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) - * STORE p->on_cpu = 1 LOAD p->cpu + * Ensure we load p->on_rq after p->__state, otherwise it would be + * possible to, falsely, observe p->on_rq == 0. * - * to ensure we observe the correct CPU on which the task is currently - * scheduling. + * See try_to_wake_up() for a longer comment. */ - if (smp_load_acquire(&p->on_cpu) && - ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) - goto unlock; + smp_rmb(); + if (p->on_rq) + return true; +#ifdef CONFIG_SMP /* - * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, wait until its done referencing the task. - * - * Pairs with the smp_store_release() in finish_task(). - * - * This ensures that tasks getting woken will be fully ordered against - * their previous state and preserve Program Order. + * Ensure the task has finished __schedule() and will not be referenced + * anymore. Again, see try_to_wake_up() for a longer comment. */ + smp_rmb(); smp_cond_load_acquire(&p->on_cpu, !VAL); +#endif - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); - if (task_cpu(p) != cpu) { - wake_flags |= WF_MIGRATED; - psi_ttwu_dequeue(p); - set_task_cpu(p, cpu); - } -#else - cpu = task_cpu(p); -#endif /* CONFIG_SMP */ - - ttwu_queue(p, cpu, wake_flags); -unlock: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); -out: - if (success) - ttwu_stat(p, task_cpu(p), wake_flags); - preempt_enable(); - - return success; + return false; } /** - * try_invoke_on_locked_down_task - Invoke a function on task in fixed state - * @p: Process for which the function is to be invoked. + * task_call_func - Invoke a function on task in fixed state + * @p: Process for which the function is to be invoked, can be @current. * @func: Function to invoke. * @arg: Argument to function. * - * If the specified task can be quickly locked into a definite state - * (either sleeping or on a given runqueue), arrange to keep it in that - * state while invoking @func(@arg). This function can use ->on_rq and - * task_curr() to work out what the state is, if required. Given that - * @func can be invoked with a runqueue lock held, it had better be quite + * Fix the task in it's current state by avoiding wakeups and or rq operations + * and call @func(@arg) on it. This function can use task_is_runnable() and + * task_curr() to work out what the state is, if required. Given that @func + * can be invoked with a runqueue lock held, it had better be quite * lightweight. * * Returns: - * @false if the task slipped out from under the locks. - * @true if the task was locked onto a runqueue or is sleeping. - * However, @func can override this by returning @false. + * Whatever @func returns */ -bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) +int task_call_func(struct task_struct *p, task_call_f func, void *arg) { - bool ret = false; + struct rq *rq = NULL; struct rq_flags rf; - struct rq *rq; + int ret; - lockdep_assert_irqs_enabled(); - raw_spin_lock_irq(&p->pi_lock); - if (p->on_rq) { + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + + if (__task_needs_rq_lock(p)) rq = __task_rq_lock(p, &rf); - if (task_rq(p) == rq) - ret = func(p, arg); + + /* + * At this point the task is pinned; either: + * - blocked and we're holding off wakeups (pi->lock) + * - woken, and we're holding off enqueue (rq->lock) + * - queued, and we're holding off schedule (rq->lock) + * - running, and we're holding off de-schedule (rq->lock) + * + * The called function (@func) can use: task_curr(), p->on_rq and + * p->__state to differentiate between these states. + */ + ret = func(p, arg); + + if (rq) rq_unlock(rq, &rf); - } else { - switch (p->state) { - case TASK_RUNNING: - case TASK_WAKING: - break; - default: - smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). - if (!p->on_rq) - ret = func(p, arg); - } - } - raw_spin_unlock_irq(&p->pi_lock); + + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; } /** + * cpu_curr_snapshot - Return a snapshot of the currently running task + * @cpu: The CPU on which to snapshot the task. + * + * Returns the task_struct pointer of the task "currently" running on + * the specified CPU. + * + * If the specified CPU was offline, the return value is whatever it + * is, perhaps a pointer to the task_struct structure of that CPU's idle + * task, but there is no guarantee. Callers wishing a useful return + * value must take some action to ensure that the specified CPU remains + * online throughout. + * + * This function executes full memory barriers before and after fetching + * the pointer, which permits the caller to confine this function's fetch + * with respect to the caller's accesses to other shared variables. + */ +struct task_struct *cpu_curr_snapshot(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *t; + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */ + t = rcu_dereference(cpu_curr(cpu)); + rq_unlock_irqrestore(rq, &rf); + smp_mb(); /* Pairing determined by caller's synchronization design. */ + + return t; +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -2746,7 +4473,8 @@ int wake_up_state(struct task_struct *p, unsigned int state) * Perform scheduler related setup for a newly forked process p. * p is forked by current. * - * __sched_fork() is basic setup used by init_idle() too: + * __sched_fork() is basic setup which is also used by sched_init() to + * initialize the boot CPU's idle task. */ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { @@ -2758,21 +4486,22 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + p->se.vlag = 0; INIT_LIST_HEAD(&p->se.group_node); + /* A delayed task cannot be in clone(). */ + SCHED_WARN_ON(p->se.sched_delayed); + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif #ifdef CONFIG_SCHEDSTATS /* Even if schedstat is disabled, there should not be garbage */ - memset(&p->se.statistics, 0, sizeof(p->se.statistics)); + memset(&p->stats, 0, sizeof(p->stats)); #endif - RB_CLEAR_NODE(&p->dl.rb_node); - init_dl_task_timer(&p->dl); - init_dl_inactive_task_timer(&p->dl); - __dl_clear_params(p); + init_dl_entity(&p->dl); INIT_LIST_HEAD(&p->rt.run_list); p->rt.timeout = 0; @@ -2780,6 +4509,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->rt.on_rq = 0; p->rt.on_list = 0; +#ifdef CONFIG_SCHED_CLASS_EXT + init_scx_entity(&p->scx); +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif @@ -2790,14 +4523,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; #endif + init_sched_mm_cid(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); #ifdef CONFIG_NUMA_BALANCING -void set_numabalancing_state(bool enabled) +int sysctl_numa_balancing_mode; + +static void __set_numabalancing_state(bool enabled) { if (enabled) static_branch_enable(&sched_numa_balancing); @@ -2805,13 +4542,33 @@ void set_numabalancing_state(bool enabled) static_branch_disable(&sched_numa_balancing); } +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL; + else + sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED; + __set_numabalancing_state(enabled); +} + #ifdef CONFIG_PROC_SYSCTL -int sysctl_numa_balancing(struct ctl_table *table, int write, +static void reset_memory_tiering(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) { + pgdat->nbp_threshold = 0; + pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + pgdat->nbp_th_start = jiffies_to_msecs(jiffies); + } +} + +static int sysctl_numa_balancing(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; - int state = static_branch_likely(&sched_numa_balancing); + int state = sysctl_numa_balancing_mode; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2821,8 +4578,13 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; - if (write) - set_numabalancing_state(state); + if (write) { + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + (state & NUMA_BALANCING_MEMORY_TIERING)) + reset_memory_tiering(); + sysctl_numa_balancing_mode = state; + __set_numabalancing_state(state); + } return err; } #endif @@ -2831,7 +4593,6 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #ifdef CONFIG_SCHEDSTATS DEFINE_STATIC_KEY_FALSE(sched_schedstats); -static bool __initdata __sched_schedstats = false; static void set_schedstats(bool enabled) { @@ -2855,16 +4616,11 @@ static int __init setup_schedstats(char *str) if (!str) goto out; - /* - * This code is called before jump labels have been set up, so we can't - * change the static branch directly just yet. Instead set a temporary - * variable so init_schedstats() can do it later. - */ if (!strcmp(str, "enable")) { - __sched_schedstats = true; + set_schedstats(true); ret = 1; } else if (!strcmp(str, "disable")) { - __sched_schedstats = false; + set_schedstats(false); ret = 1; } out: @@ -2875,13 +4631,8 @@ out: } __setup("schedstats=", setup_schedstats); -static void __init init_schedstats(void) -{ - set_schedstats(__sched_schedstats); -} - #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, +static int sysctl_schedstats(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -2901,24 +4652,76 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, return err; } #endif /* CONFIG_PROC_SYSCTL */ -#else /* !CONFIG_SCHEDSTATS */ -static inline void init_schedstats(void) {} #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_SYSCTL +static const struct ctl_table sched_core_sysctls[] = { +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_min_rt_default", + .data = &sysctl_sched_uclamp_util_min_rt_default, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif /* CONFIG_UCLAMP_TASK */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_numa_balancing, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_FOUR, + }, +#endif /* CONFIG_NUMA_BALANCING */ +}; +static int __init sched_core_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_core_sysctls); + return 0; +} +late_initcall(sched_core_sysctl_init); +#endif /* CONFIG_SYSCTL */ + /* * fork()/clone()-time setup: */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { - unsigned long flags; - __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_NEW; + p->__state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2938,8 +4741,10 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); - p->prio = p->normal_prio = __normal_prio(p); + p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); + p->se.custom_slice = 0; + p->se.slice = sysctl_sched_base_slice; /* * We don't need the reset flag anymore after the fork. It has @@ -2950,30 +4755,21 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (dl_prio(p->prio)) return -EAGAIN; - else if (rt_prio(p->prio)) + + scx_pre_fork(p); + + if (rt_prio(p->prio)) { p->sched_class = &rt_sched_class; - else +#ifdef CONFIG_SCHED_CLASS_EXT + } else if (task_should_scx(p->policy)) { + p->sched_class = &ext_sched_class; +#endif + } else { p->sched_class = &fair_sched_class; + } init_entity_runnable_average(&p->se); - /* - * The child is not yet in the pid-hash so no cgroup attach races, - * and the cgroup is pinned to this child due to cgroup_fork() - * is ran before sched_fork(). - * - * Silence PROVE_RCU. - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - rseq_migrate(p); - /* - * We're setting the CPU for the first time, we don't migrate, - * so use __set_task_cpu(). - */ - __set_task_cpu(p, smp_processor_id()); - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) @@ -2990,6 +4786,48 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } +int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +{ + unsigned long flags; + + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. + */ + raw_spin_lock_irqsave(&p->pi_lock, flags); +#ifdef CONFIG_CGROUP_SCHED + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } +#endif + rseq_migrate(p); + /* + * We're setting the CPU for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, smp_processor_id()); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return scx_fork(p); +} + +void sched_cancel_fork(struct task_struct *p) +{ + scx_cancel_fork(p); +} + +void sched_post_fork(struct task_struct *p) +{ + uclamp_post_fork(p); + scx_post_fork(p); +} + unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) @@ -3017,9 +4855,10 @@ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; + int wake_flags = WF_FORK; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - p->state = TASK_RUNNING; + WRITE_ONCE(p->__state, TASK_RUNNING); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -3031,19 +4870,19 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); post_init_entity_util_avg(p); - activate_task(rq, p, ENQUEUE_NOCLOCK); + activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); - check_preempt_curr(rq, p, WF_FORK); + wakeup_preempt(rq, p, wake_flags); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* - * Nothing relies on rq->lock after this, so its fine to + * Nothing relies on rq->lock after this, so it's fine to * drop it. */ rq_unpin_lock(rq, &rf); @@ -3147,8 +4986,11 @@ static inline void prepare_task(struct task_struct *next) /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. + * + * See the smp_load_acquire(&p->on_cpu) case in ttwu() and + * its ordering comment. */ - next->on_cpu = 1; + WRITE_ONCE(next->on_cpu, 1); #endif } @@ -3156,8 +4998,9 @@ static inline void finish_task(struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely + * This must be the very last reference to @prev from this CPU. After + * p->on_cpu is cleared, the task can be moved to a different CPU. We + * must ensure this doesn't happen until the switch is completely * finished. * * In particular, the load of prev->state in finish_task_switch() must @@ -3169,6 +5012,97 @@ static inline void finish_task(struct task_struct *prev) #endif } +#ifdef CONFIG_SMP + +static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) +{ + void (*func)(struct rq *rq); + struct balance_callback *next; + + lockdep_assert_rq_held(rq); + + while (head) { + func = (void (*)(struct rq *))head->func; + next = head->next; + head->next = NULL; + head = next; + + func(rq); + } +} + +static void balance_push(struct rq *rq); + +/* + * balance_push_callback is a right abuse of the callback interface and plays + * by significantly different rules. + * + * Where the normal balance_callback's purpose is to be ran in the same context + * that queued it (only later, when it's safe to drop rq->lock again), + * balance_push_callback is specifically targeted at __schedule(). + * + * This abuse is tolerated because it places all the unlikely/odd cases behind + * a single test, namely: rq->balance_callback == NULL. + */ +struct balance_callback balance_push_callback = { + .next = NULL, + .func = balance_push, +}; + +static inline struct balance_callback * +__splice_balance_callbacks(struct rq *rq, bool split) +{ + struct balance_callback *head = rq->balance_callback; + + if (likely(!head)) + return NULL; + + lockdep_assert_rq_held(rq); + /* + * Must not take balance_push_callback off the list when + * splice_balance_callbacks() and balance_callbacks() are not + * in the same rq->lock section. + * + * In that case it would be possible for __schedule() to interleave + * and observe the list empty. + */ + if (split && head == &balance_push_callback) + head = NULL; + else + rq->balance_callback = NULL; + + return head; +} + +struct balance_callback *splice_balance_callbacks(struct rq *rq) +{ + return __splice_balance_callbacks(rq, true); +} + +static void __balance_callbacks(struct rq *rq) +{ + do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); +} + +void balance_callbacks(struct rq *rq, struct balance_callback *head) +{ + unsigned long flags; + + if (unlikely(head)) { + raw_spin_rq_lock_irqsave(rq, flags); + do_balance_callbacks(rq, head); + raw_spin_rq_unlock_irqrestore(rq, flags); + } +} + +#else + +static inline void __balance_callbacks(struct rq *rq) +{ +} + +#endif + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { @@ -3179,10 +5113,10 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf * do an early lockdep release here: */ rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, _THIS_IP_); + spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = next; + rq_lockp(rq)->owner = next; #endif } @@ -3193,8 +5127,9 @@ static inline void finish_lock_switch(struct rq *rq) * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - raw_spin_unlock_irq(&rq->lock); + spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); + __balance_callbacks(rq); + raw_spin_rq_unlock_irq(rq); } /* @@ -3209,6 +5144,22 @@ static inline void finish_lock_switch(struct rq *rq) # define finish_arch_post_lock_switch() do { } while (0) #endif +static inline void kmap_local_sched_out(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_out(); +#endif +} + +static inline void kmap_local_sched_in(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_in(); +#endif +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -3231,6 +5182,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, perf_event_task_sched_out(prev, next); rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); + kmap_local_sched_out(); prepare_task(next); prepare_arch_switch(next); } @@ -3251,7 +5203,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * * The context switch have flipped the stack from under us and restored the * local variables which were saved when this task called schedule() in the - * past. prev == current is still correct but we need to recalculate this_rq + * past. 'prev == current' is still correct but we need to recalculate this_rq * because prev may have moved to another CPU. */ static struct rq *finish_task_switch(struct task_struct *prev) @@ -3259,7 +5211,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; - long prev_state; + unsigned int prev_state; /* * The previous task will have left us with a preempt_count of 2 @@ -3290,13 +5242,22 @@ static struct rq *finish_task_switch(struct task_struct *prev) * running on another CPU and we could rave with its RUNNING -> DEAD * transition, resulting in a double drop. */ - prev_state = prev->state; + prev_state = READ_ONCE(prev->__state); vtime_task_switch(prev); perf_event_task_sched_in(prev, current); finish_task(prev); + tick_nohz_task_switch(); finish_lock_switch(rq); finish_arch_post_lock_switch(); kcov_finish_switch(current); + /* + * kmap_local_sched_out() is invoked with rq::lock held and + * interrupts disabled. There is no requirement for that, but the + * sched out code does not have an interrupt enabled section. + * Restoring the maps on sched in does not require interrupts being + * disabled either. + */ + kmap_local_sched_in(); fire_sched_in_preempt_notifiers(current); /* @@ -3308,70 +5269,27 @@ static struct rq *finish_task_switch(struct task_struct *prev) * rq->curr, before returning to userspace, so provide them here: * * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly - * provided by mmdrop(), + * provided by mmdrop_lazy_tlb(), * - a sync_core for SYNC_CORE. */ if (mm) { membarrier_mm_sync_core_before_usermode(mm); - mmdrop(mm); + mmdrop_lazy_tlb_sched(mm); } + if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - /* Task is done with its stack. */ put_task_stack(prev); put_task_struct_rcu_user(prev); } - tick_nohz_task_switch(); return rq; } -#ifdef CONFIG_SMP - -/* rq->lock is NOT held, but preemption is disabled */ -static void __balance_callback(struct rq *rq) -{ - struct callback_head *head, *next; - void (*func)(struct rq *rq); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - head = rq->balance_callback; - rq->balance_callback = NULL; - while (head) { - func = (void (*)(struct rq *))head->func; - next = head->next; - head->next = NULL; - head = next; - - func(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -static inline void balance_callback(struct rq *rq) -{ - if (unlikely(rq->balance_callback)) - __balance_callback(rq); -} - -#else - -static inline void balance_callback(struct rq *rq) -{ -} - -#endif - /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. @@ -3379,8 +5297,6 @@ static inline void balance_callback(struct rq *rq) asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq; - /* * New tasks start with FORK_PREEMPT_COUNT, see there and * finish_task_switch() for details. @@ -3390,8 +5306,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) * PREEMPT_COUNT kernels). */ - rq = finish_task_switch(prev); - balance_callback(rq); + finish_task_switch(prev); preempt_enable(); if (current->set_child_tid) @@ -3418,17 +5333,20 @@ context_switch(struct rq *rq, struct task_struct *prev, /* * kernel -> kernel lazy + transfer active - * user -> kernel lazy + mmgrab() active + * user -> kernel lazy + mmgrab_lazy_tlb() active * - * kernel -> user switch + mmdrop() active + * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch + * + * switch_mm_cid() needs to be updated if the barriers provided + * by context_switch() are modified. */ if (!next->mm) { // to kernel enter_lazy_tlb(prev->active_mm, next); next->active_mm = prev->active_mm; if (prev->mm) // from user - mmgrab(prev->active_mm); + mmgrab_lazy_tlb(prev->active_mm); else prev->active_mm = NULL; } else { // to user @@ -3442,15 +5360,17 @@ context_switch(struct rq *rq, struct task_struct *prev, * finish_task_switch()'s mmdrop(). */ switch_mm_irqs_off(prev->active_mm, next->mm, next); + lru_gen_use_mm(next->mm); if (!prev->mm) { // from kernel - /* will mmdrop() in finish_task_switch(). */ + /* will mmdrop_lazy_tlb() in finish_task_switch(). */ rq->prev_mm = prev->active_mm; prev->active_mm = NULL; } } - rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); + /* switch_mm_cid() requires the memory barriers above. */ + switch_mm_cid(rq, prev, next); prepare_lock_switch(rq, next, rf); @@ -3467,9 +5387,9 @@ context_switch(struct rq *rq, struct task_struct *prev, * externally visible scheduler statistics: current number of runnable * threads, total number of context switches performed since bootup. */ -unsigned long nr_running(void) +unsigned int nr_running(void) { - unsigned long i, sum = 0; + unsigned int i, sum = 0; for_each_online_cpu(i) sum += cpu_rq(i)->nr_running; @@ -3496,6 +5416,11 @@ bool single_task_running(void) } EXPORT_SYMBOL(single_task_running); +unsigned long long nr_context_switches_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_switches; +} + unsigned long long nr_context_switches(void) { int i; @@ -3514,13 +5439,13 @@ unsigned long long nr_context_switches(void) * it does become runnable. */ -unsigned long nr_iowait_cpu(int cpu) +unsigned int nr_iowait_cpu(int cpu) { return atomic_read(&cpu_rq(cpu)->nr_iowait); } /* - * IO-wait accounting, and how its mostly bollocks (on SMP). + * IO-wait accounting, and how it's mostly bollocks (on SMP). * * The idea behind IO-wait account is to account the idle time that we could * have spend running if it were not for IO. That is, if we were to improve the @@ -3549,9 +5474,9 @@ unsigned long nr_iowait_cpu(int cpu) * Task CPU affinities can make all that even more 'interesting'. */ -unsigned long nr_iowait(void) +unsigned int nr_iowait(void) { - unsigned long i, sum = 0; + unsigned int i, sum = 0; for_each_possible_cpu(i) sum += nr_iowait_cpu(i); @@ -3568,23 +5493,20 @@ unsigned long nr_iowait(void) void sched_exec(void) { struct task_struct *p = current; - unsigned long flags; + struct migration_arg arg; int dest_cpu; - raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); - if (dest_cpu == smp_processor_id()) - goto unlock; + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); + if (dest_cpu == smp_processor_id()) + return; - if (likely(cpu_active(dest_cpu))) { - struct migration_arg arg = { p, dest_cpu }; + if (unlikely(!cpu_active(dest_cpu))) + return; - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); - return; + arg = (struct migration_arg){ p, dest_cpu }; } -unlock: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } #endif @@ -3604,9 +5526,9 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat); static inline void prefetch_curr_exec_start(struct task_struct *p) { #ifdef CONFIG_FAIR_GROUP_SCHED - struct sched_entity *curr = (&p->se)->cfs_rq->curr; + struct sched_entity *curr = p->se.cfs_rq->curr; #else - struct sched_entity *curr = (&task_rq(p)->cfs)->curr; + struct sched_entity *curr = task_rq(p)->cfs.curr; #endif prefetch(curr); prefetch(&curr->exec_start); @@ -3627,7 +5549,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) /* * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. - * Reading ->on_cpu is racy, but this is ok. + * Reading ->on_cpu is racy, but this is OK. * * If we race with it leaving CPU, we'll take a lock. So we're correct. * If we race with it entering CPU, unaccounted time is 0. This is @@ -3645,7 +5567,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * project cycles that may never be accounted to this * thread, breaking clock_gettime(). */ - if (task_current(rq, p) && task_on_rq_queued(p)) { + if (task_current_donor(rq, p) && task_on_rq_queued(p)) { prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); @@ -3656,48 +5578,109 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -DEFINE_PER_CPU(unsigned long, thermal_pressure); +#ifdef CONFIG_SCHED_DEBUG +static u64 cpu_resched_latency(struct rq *rq) +{ + int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); + u64 resched_latency, now = rq_clock(rq); + static bool warned_once; + + if (sysctl_resched_latency_warn_once && warned_once) + return 0; + + if (!need_resched() || !latency_warn_ms) + return 0; + + if (system_state == SYSTEM_BOOTING) + return 0; + + if (!rq->last_seen_need_resched_ns) { + rq->last_seen_need_resched_ns = now; + rq->ticks_without_resched = 0; + return 0; + } -void arch_set_thermal_pressure(struct cpumask *cpus, - unsigned long th_pressure) + rq->ticks_without_resched++; + resched_latency = now - rq->last_seen_need_resched_ns; + if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC) + return 0; + + warned_once = true; + + return resched_latency; +} + +static int __init setup_resched_latency_warn_ms(char *str) { - int cpu; + long val; - for_each_cpu(cpu, cpus) - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); + if ((kstrtol(str, 0, &val))) { + pr_warn("Unable to set resched_latency_warn_ms\n"); + return 1; + } + + sysctl_resched_latency_warn_ms = val; + return 1; } +__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); +#else +static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } +#endif /* CONFIG_SCHED_DEBUG */ /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. */ -void scheduler_tick(void) +void sched_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - struct task_struct *curr = rq->curr; + /* accounting goes to the donor task */ + struct task_struct *donor; struct rq_flags rf; - unsigned long thermal_pressure; + unsigned long hw_pressure; + u64 resched_latency; + + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) + arch_scale_freq_tick(); - arch_scale_freq_tick(); sched_clock_tick(); rq_lock(rq, &rf); + donor = rq->donor; + + psi_account_irqtime(rq, donor, NULL); update_rq_clock(rq); - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); - curr->sched_class->task_tick(rq, curr, 0); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); + + if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY)) + resched_curr(rq); + + donor->sched_class->task_tick(rq, donor, 0); + if (sched_feat(LATENCY_WARN)) + resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); - psi_task_tick(rq); + sched_core_tick(rq); + task_tick_mm_cid(rq, donor); + scx_tick(rq); rq_unlock(rq, &rf); + if (sched_feat(LATENCY_WARN) && resched_latency) + resched_latency_warn(cpu, resched_latency); + perf_event_task_tick(); + if (donor->flags & PF_WQ_WORKER) + wq_worker_tick(donor); + #ifdef CONFIG_SMP - rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); + if (!scx_switched_all()) { + rq->idle_balance = idle_cpu(cpu); + sched_balance_trigger(rq); + } #endif } @@ -3744,9 +5727,6 @@ static void sched_tick_remote(struct work_struct *work) struct tick_work *twork = container_of(dwork, struct tick_work, work); int cpu = twork->cpu; struct rq *rq = cpu_rq(cpu); - struct task_struct *curr; - struct rq_flags rf; - u64 delta; int os; /* @@ -3756,30 +5736,32 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (!tick_nohz_tick_stopped_cpu(cpu)) - goto out_requeue; - - rq_lock_irq(rq, &rf); - curr = rq->curr; - if (cpu_is_offline(cpu)) - goto out_unlock; + if (tick_nohz_tick_stopped_cpu(cpu)) { + guard(rq_lock_irq)(rq); + struct task_struct *curr = rq->curr; - update_rq_clock(rq); + if (cpu_online(cpu)) { + /* + * Since this is a remote tick for full dynticks mode, + * we are always sure that there is no proxy (only a + * single task is running). + */ + SCHED_WARN_ON(rq->curr != rq->donor); + update_rq_clock(rq); + + if (!is_idle_task(curr)) { + /* + * Make sure the next tick runs within a + * reasonable amount of time. + */ + u64 delta = rq_clock_task(rq) - curr->se.exec_start; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } + curr->sched_class->task_tick(rq, curr, 0); - if (!is_idle_task(curr)) { - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - delta = rq_clock_task(rq) - curr->se.exec_start; - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + calc_load_nohz_remote(rq); + } } - curr->sched_class->task_tick(rq, curr, 0); - - calc_load_nohz_remote(rq); -out_unlock: - rq_unlock_irq(rq, &rf); -out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary @@ -3798,7 +5780,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -3819,7 +5801,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -3947,13 +5929,11 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); - if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) - && in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); } - if (panic_on_warn) - panic("scheduling while atomic\n"); + check_panic_on_warn("scheduling while atomic"); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); @@ -3973,7 +5953,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP - if (!preempt && prev->state && prev->non_block_count) { + if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) { printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", prev->comm, prev->pid, prev->non_block_count); dump_stack(); @@ -3986,17 +5966,32 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); + SCHED_WARN_ON(ct_state() == CT_STATE_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq()->sched_count); } -static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) +static void prev_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) { -#ifdef CONFIG_SMP + const struct sched_class *start_class = prev->sched_class; const struct sched_class *class; + +#ifdef CONFIG_SCHED_CLASS_EXT + /* + * SCX requires a balance() call before every pick_task() including when + * waking up from SCHED_IDLE. If @start_class is below SCX, start from + * SCX instead. Also, set a flag to detect missing balance() call. + */ + if (scx_enabled()) { + rq->scx.flags |= SCX_RQ_BAL_PENDING; + if (sched_class_above(&ext_sched_class, start_class)) + start_class = &ext_sched_class; + } +#endif + /* * We must do the balancing pass before put_prev_task(), such * that when we release the rq->lock the task is in the same @@ -4005,58 +6000,607 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * We can terminate the balance pass as soon as we know there is * a runnable task of @class priority or higher. */ - for_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) + for_active_class_range(class, start_class, &idle_sched_class) { + if (class->balance && class->balance(rq, prev, rf)) break; } -#endif - - put_prev_task(rq, prev); } /* * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; + rq->dl_server = NULL; + + if (scx_enabled()) + goto restart; + /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a - * higher scheduling class, because otherwise those loose the + * higher scheduling class, because otherwise those lose the * opportunity to pull in more work from other CPUs. */ - if (likely((prev->sched_class == &idle_sched_class || - prev->sched_class == &fair_sched_class) && - rq->nr_running == rq->cfs.h_nr_running)) { + if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && + rq->nr_running == rq->cfs.h_nr_queued)) { p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto restart; - /* Assumes fair_sched_class->next == idle_sched_class */ + /* Assume the next prioritized class is idle_sched_class */ if (!p) { - put_prev_task(rq, prev); - p = pick_next_task_idle(rq); + p = pick_task_idle(rq); + put_prev_set_next_task(rq, prev, p); } return p; } restart: - put_prev_task_balance(rq, prev, rf); + prev_balance(rq, prev, rf); + + for_each_active_class(class) { + if (class->pick_next_task) { + p = class->pick_next_task(rq, prev); + if (p) + return p; + } else { + p = class->pick_task(rq); + if (p) { + put_prev_set_next_task(rq, prev, p); + return p; + } + } + } + + BUG(); /* The idle class should always have a runnable task. */ +} + +#ifdef CONFIG_SCHED_CORE +static inline bool is_task_rq_idle(struct task_struct *t) +{ + return (task_rq(t)->idle == t); +} - for_each_class(class) { - p = class->pick_next_task(rq); +static inline bool cookie_equals(struct task_struct *a, unsigned long cookie) +{ + return is_task_rq_idle(a) || (a->core_cookie == cookie); +} + +static inline bool cookie_match(struct task_struct *a, struct task_struct *b) +{ + if (is_task_rq_idle(a) || is_task_rq_idle(b)) + return true; + + return a->core_cookie == b->core_cookie; +} + +static inline struct task_struct *pick_task(struct rq *rq) +{ + const struct sched_class *class; + struct task_struct *p; + + rq->dl_server = NULL; + + for_each_active_class(class) { + p = class->pick_task(rq); if (p) return p; } - /* The idle class should always have a runnable task: */ - BUG(); + BUG(); /* The idle class should always have a runnable task. */ +} + +extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); + +static void queue_core_balance(struct rq *rq); + +static struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + struct task_struct *next, *p, *max = NULL; + const struct cpumask *smt_mask; + bool fi_before = false; + bool core_clock_updated = (rq == rq->core); + unsigned long cookie; + int i, cpu, occ = 0; + struct rq *rq_i; + bool need_sync; + + if (!sched_core_enabled(rq)) + return __pick_next_task(rq, prev, rf); + + cpu = cpu_of(rq); + + /* Stopper task is switching into idle, no need core-wide selection. */ + if (cpu_is_offline(cpu)) { + /* + * Reset core_pick so that we don't enter the fastpath when + * coming online. core_pick would already be migrated to + * another cpu during offline. + */ + rq->core_pick = NULL; + rq->core_dl_server = NULL; + return __pick_next_task(rq, prev, rf); + } + + /* + * If there were no {en,de}queues since we picked (IOW, the task + * pointers are all still valid), and we haven't scheduled the last + * pick yet, do so now. + * + * rq->core_pick can be NULL if no selection was made for a CPU because + * it was either offline or went offline during a sibling's core-wide + * selection. In this case, do a core-wide selection. + */ + if (rq->core->core_pick_seq == rq->core->core_task_seq && + rq->core->core_pick_seq != rq->core_sched_seq && + rq->core_pick) { + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); + + next = rq->core_pick; + rq->dl_server = rq->core_dl_server; + rq->core_pick = NULL; + rq->core_dl_server = NULL; + goto out_set_next; + } + + prev_balance(rq, prev, rf); + + smt_mask = cpu_smt_mask(cpu); + need_sync = !!rq->core->core_cookie; + + /* reset state */ + rq->core->core_cookie = 0UL; + if (rq->core->core_forceidle_count) { + if (!core_clock_updated) { + update_rq_clock(rq->core); + core_clock_updated = true; + } + sched_core_account_forceidle(rq); + /* reset after accounting force idle */ + rq->core->core_forceidle_start = 0; + rq->core->core_forceidle_count = 0; + rq->core->core_forceidle_occupation = 0; + need_sync = true; + fi_before = true; + } + + /* + * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq + * + * @task_seq guards the task state ({en,de}queues) + * @pick_seq is the @task_seq we did a selection on + * @sched_seq is the @pick_seq we scheduled + * + * However, preemptions can cause multiple picks on the same task set. + * 'Fix' this by also increasing @task_seq for every pick. + */ + rq->core->core_task_seq++; + + /* + * Optimize for common case where this CPU has no cookies + * and there are no cookied tasks running on siblings. + */ + if (!need_sync) { + next = pick_task(rq); + if (!next->core_cookie) { + rq->core_pick = NULL; + rq->core_dl_server = NULL; + /* + * For robustness, update the min_vruntime_fi for + * unconstrained picks as well. + */ + WARN_ON_ONCE(fi_before); + task_vruntime_update(rq, next, false); + goto out_set_next; + } + } + + /* + * For each thread: do the regular task pick and find the max prio task + * amongst them. + * + * Tie-break prio towards the current CPU + */ + for_each_cpu_wrap(i, smt_mask, cpu) { + rq_i = cpu_rq(i); + + /* + * Current cpu always has its clock updated on entrance to + * pick_next_task(). If the current cpu is not the core, + * the core may also have been updated above. + */ + if (i != cpu && (rq_i != rq->core || !core_clock_updated)) + update_rq_clock(rq_i); + + rq_i->core_pick = p = pick_task(rq_i); + rq_i->core_dl_server = rq_i->dl_server; + + if (!max || prio_less(max, p, fi_before)) + max = p; + } + + cookie = rq->core->core_cookie = max->core_cookie; + + /* + * For each thread: try and find a runnable task that matches @max or + * force idle. + */ + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + p = rq_i->core_pick; + + if (!cookie_equals(p, cookie)) { + p = NULL; + if (cookie) + p = sched_core_find(rq_i, cookie); + if (!p) + p = idle_sched_class.pick_task(rq_i); + } + + rq_i->core_pick = p; + rq_i->core_dl_server = NULL; + + if (p == rq_i->idle) { + if (rq_i->nr_running) { + rq->core->core_forceidle_count++; + if (!fi_before) + rq->core->core_forceidle_seq++; + } + } else { + occ++; + } + } + + if (schedstat_enabled() && rq->core->core_forceidle_count) { + rq->core->core_forceidle_start = rq_clock(rq->core); + rq->core->core_forceidle_occupation = occ; + } + + rq->core->core_pick_seq = rq->core->core_task_seq; + next = rq->core_pick; + rq->core_sched_seq = rq->core->core_pick_seq; + + /* Something should have been selected for current CPU */ + WARN_ON_ONCE(!next); + + /* + * Reschedule siblings + * + * NOTE: L1TF -- at this point we're no longer running the old task and + * sending an IPI (below) ensures the sibling will no longer be running + * their task. This ensures there is no inter-sibling overlap between + * non-matching user state. + */ + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + + /* + * An online sibling might have gone offline before a task + * could be picked for it, or it might be offline but later + * happen to come online, but its too late and nothing was + * picked for it. That's Ok - it will pick tasks for itself, + * so ignore it. + */ + if (!rq_i->core_pick) + continue; + + /* + * Update for new !FI->FI transitions, or if continuing to be in !FI: + * fi_before fi update? + * 0 0 1 + * 0 1 1 + * 1 0 1 + * 1 1 0 + */ + if (!(fi_before && rq->core->core_forceidle_count)) + task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count); + + rq_i->core_pick->core_occupation = occ; + + if (i == cpu) { + rq_i->core_pick = NULL; + rq_i->core_dl_server = NULL; + continue; + } + + /* Did we break L1TF mitigation requirements? */ + WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick)); + + if (rq_i->curr == rq_i->core_pick) { + rq_i->core_pick = NULL; + rq_i->core_dl_server = NULL; + continue; + } + + resched_curr(rq_i); + } + +out_set_next: + put_prev_set_next_task(rq, prev, next); + if (rq->core->core_forceidle_count && next == rq->idle) + queue_core_balance(rq); + + return next; +} + +static bool try_steal_cookie(int this, int that) +{ + struct rq *dst = cpu_rq(this), *src = cpu_rq(that); + struct task_struct *p; + unsigned long cookie; + bool success = false; + + guard(irq)(); + guard(double_rq_lock)(dst, src); + + cookie = dst->core->core_cookie; + if (!cookie) + return false; + + if (dst->curr != dst->idle) + return false; + + p = sched_core_find(src, cookie); + if (!p) + return false; + + do { + if (p == src->core_pick || p == src->curr) + goto next; + + if (!is_cpu_allowed(p, this)) + goto next; + + if (p->core_occupation > dst->idle->core_occupation) + goto next; + /* + * sched_core_find() and sched_core_next() will ensure + * that task @p is not throttled now, we also need to + * check whether the runqueue of the destination CPU is + * being throttled. + */ + if (sched_task_is_throttled(p, this)) + goto next; + + move_queued_task_locked(src, dst, p); + resched_curr(dst); + + success = true; + break; + +next: + p = sched_core_next(p, cookie); + } while (p); + + return success; +} + +static bool steal_cookie_task(int cpu, struct sched_domain *sd) +{ + int i; + + for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) { + if (i == cpu) + continue; + + if (need_resched()) + break; + + if (try_steal_cookie(cpu, i)) + return true; + } + + return false; +} + +static void sched_core_balance(struct rq *rq) +{ + struct sched_domain *sd; + int cpu = cpu_of(rq); + + guard(preempt)(); + guard(rcu)(); + + raw_spin_rq_unlock_irq(rq); + for_each_domain(cpu, sd) { + if (need_resched()) + break; + + if (steal_cookie_task(cpu, sd)) + break; + } + raw_spin_rq_lock_irq(rq); +} + +static DEFINE_PER_CPU(struct balance_callback, core_balance_head); + +static void queue_core_balance(struct rq *rq) +{ + if (!sched_core_enabled(rq)) + return; + + if (!rq->core->core_cookie) + return; + + if (!rq->nr_running) /* not forced idle */ + return; + + queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); +} + +DEFINE_LOCK_GUARD_1(core_lock, int, + sched_core_lock(*_T->lock, &_T->flags), + sched_core_unlock(*_T->lock, &_T->flags), + unsigned long flags) + +static void sched_core_cpu_starting(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + int t; + + guard(core_lock)(&cpu); + + WARN_ON_ONCE(rq->core != rq); + + /* if we're the first, we'll be our own leader */ + if (cpumask_weight(smt_mask) == 1) + return; + + /* find the leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + rq = cpu_rq(t); + if (rq->core == rq) { + core_rq = rq; + break; + } + } + + if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ + return; + + /* install and validate core_rq */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); + + if (t == cpu) + rq->core = core_rq; + + WARN_ON_ONCE(rq->core != core_rq); + } +} + +static void sched_core_cpu_deactivate(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + int t; + + guard(core_lock)(&cpu); + + /* if we're the last man standing, nothing to do */ + if (cpumask_weight(smt_mask) == 1) { + WARN_ON_ONCE(rq->core != rq); + return; + } + + /* if we're not the leader, nothing to do */ + if (rq->core != rq) + return; + + /* find a new leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + core_rq = cpu_rq(t); + break; + } + + if (WARN_ON_ONCE(!core_rq)) /* impossible */ + return; + + /* copy the shared state to the new leader */ + core_rq->core_task_seq = rq->core_task_seq; + core_rq->core_pick_seq = rq->core_pick_seq; + core_rq->core_cookie = rq->core_cookie; + core_rq->core_forceidle_count = rq->core_forceidle_count; + core_rq->core_forceidle_seq = rq->core_forceidle_seq; + core_rq->core_forceidle_occupation = rq->core_forceidle_occupation; + + /* + * Accounting edge for forced idle is handled in pick_next_task(). + * Don't need another one here, since the hotplug thread shouldn't + * have a cookie. + */ + core_rq->core_forceidle_start = 0; + + /* install new leader */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); + rq->core = core_rq; + } +} + +static inline void sched_core_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->core != rq) + rq->core = rq; +} + +#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_cpu_starting(unsigned int cpu) {} +static inline void sched_core_cpu_deactivate(unsigned int cpu) {} +static inline void sched_core_cpu_dying(unsigned int cpu) {} + +static struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return __pick_next_task(rq, prev, rf); +} + +#endif /* CONFIG_SCHED_CORE */ + +/* + * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a + * preemption from blocking on an 'sleeping' spin/rwlock. + */ +#define SM_IDLE (-1) +#define SM_NONE 0 +#define SM_PREEMPT 1 +#define SM_RTLOCK_WAIT 2 + +/* + * Helper function for __schedule() + * + * If a task does not have signals pending, deactivate it + * Otherwise marks the task's __state as RUNNING + */ +static bool try_to_block_task(struct rq *rq, struct task_struct *p, + unsigned long task_state) +{ + int flags = DEQUEUE_NOCLOCK; + + if (signal_pending_state(task_state, p)) { + WRITE_ONCE(p->__state, TASK_RUNNING); + return false; + } + + p->sched_contributes_to_load = + (task_state & TASK_UNINTERRUPTIBLE) && + !(task_state & TASK_NOLOAD) && + !(task_state & TASK_FROZEN); + + if (unlikely(is_special_task_state(task_state))) + flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ + block_task(rq, p, flags); + return true; } /* @@ -4070,7 +6614,7 @@ restart: * paths. For example, see arch/x86/entry_64.S. * * To drive preemption between tasks, the scheduler sets the flag in timer - * interrupt handler scheduler_tick(). + * interrupt handler sched_tick(). * * 3. Wakeups don't really cause entry into schedule(). They add a * task to the run-queue and that's it. @@ -4098,9 +6642,14 @@ restart: * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(bool preempt) +static void __sched notrace __schedule(int sched_mode) { struct task_struct *prev, *next; + /* + * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted + * as a preemption by schedule_debug() and RCU. + */ + bool preempt = sched_mode > SM_NONE; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -4113,7 +6662,7 @@ static void __sched notrace __schedule(bool preempt) schedule_debug(prev, preempt); - if (sched_feat(HRTICK)) + if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); @@ -4132,7 +6681,9 @@ static void __sched notrace __schedule(bool preempt) * if (signal_pending_state()) if (p->state & @state) * * Also, the membarrier system call requires a full memory barrier - * after coming from user-space, before storing to rq->curr. + * after coming from user-space, before storing to rq->curr; this + * barrier matches a full barrier in the proximity of the membarrier + * system call exit. */ rq_lock(rq, &rf); smp_mb__after_spinlock(); @@ -4140,53 +6691,37 @@ static void __sched notrace __schedule(bool preempt) /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; update_rq_clock(rq); + rq->clock_update_flags = RQCF_UPDATED; switch_count = &prev->nivcsw; + /* Task state changes only considers SM_PREEMPT as preemption */ + preempt = sched_mode == SM_PREEMPT; + /* * We must load prev->state once (task_struct::state is volatile), such - * that: - * - * - we form a control dependency vs deactivate_task() below. - * - ptrace_{,un}freeze_traced() can change ->state underneath us. - */ - prev_state = prev->state; - if (!preempt && prev_state) { - if (signal_pending_state(prev_state, prev)) { - prev->state = TASK_RUNNING; - } else { - prev->sched_contributes_to_load = - (prev_state & TASK_UNINTERRUPTIBLE) && - !(prev_state & TASK_NOLOAD) && - !(prev->flags & PF_FROZEN); - - if (prev->sched_contributes_to_load) - rq->nr_uninterruptible++; - - /* - * __schedule() ttwu() - * prev_state = prev->state; if (p->on_rq && ...) - * if (prev_state) goto out; - * p->on_rq = 0; smp_acquire__after_ctrl_dep(); - * p->state = TASK_WAKING - * - * Where __schedule() and ttwu() have matching control dependencies. - * - * After this, schedule() must not care about p->state any more. - */ - deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); - - if (prev->in_iowait) { - atomic_inc(&rq->nr_iowait); - delayacct_blkio_start(); - } + * that we form a control dependency vs deactivate_task() below. + */ + prev_state = READ_ONCE(prev->__state); + if (sched_mode == SM_IDLE) { + /* SCX must consult the BPF scheduler to tell if rq is empty */ + if (!rq->nr_running && !scx_enabled()) { + next = prev; + goto picked; } + } else if (!preempt && prev_state) { + try_to_block_task(rq, prev, prev_state); switch_count = &prev->nvcsw; } next = pick_next_task(rq, prev, &rf); + rq_set_donor(rq, next); +picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); +#ifdef CONFIG_SCHED_DEBUG + rq->last_seen_need_resched_ns = 0; +#endif if (likely(prev != next)) { rq->nr_switches++; @@ -4202,27 +6737,37 @@ static void __sched notrace __schedule(bool preempt) * * Here are the schemes providing that barrier on the * various architectures: - * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. - * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC, + * RISC-V. switch_mm() relies on membarrier_arch_switch_mm() + * on PowerPC and on RISC-V. * - finish_lock_switch() for weakly-ordered * architectures where spin_unlock is a full barrier, * - switch_to() for arm64 (weakly-ordered, spin_unlock * is a RELEASE barrier), + * + * The barrier matches a full barrier in the proximity of + * the membarrier system call entry. + * + * On RISC-V, this barrier pairing is also needed for the + * SYNC_CORE command when switching between processes, cf. + * the inline comments in membarrier_arch_switch_mm(). */ ++*switch_count; - psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + migrate_disable_switch(rq, prev); + psi_account_irqtime(rq, prev, next); + psi_sched_switch(prev, next, !task_on_rq_queued(prev) || + prev->se.sched_delayed); - trace_sched_switch(preempt, prev, next); + trace_sched_switch(preempt, prev, next, prev_state); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { - rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unlock_irq(rq, &rf); + rq_unpin_lock(rq, &rf); + __balance_callbacks(rq); + raw_spin_rq_unlock_irq(rq); } - - balance_callback(rq); } void __noreturn do_task_dead(void) @@ -4233,7 +6778,7 @@ void __noreturn do_task_dead(void) /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; - __schedule(false); + __schedule(SM_NONE); BUG(); /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ @@ -4243,57 +6788,73 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state) - return; + static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG); + unsigned int task_flags; /* - * If a worker went to sleep, notify and ask workqueue whether - * it wants to wake up a task to maintain concurrency. - * As this function is called inside the schedule() context, - * we disable preemption to avoid it calling schedule() again - * in the possible wakeup of a kworker and because wq_worker_sleeping() - * requires it. + * Establish LD_WAIT_CONFIG context to ensure none of the code called + * will use a blocking primitive -- which would lead to recursion. */ - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { - preempt_disable(); - if (tsk->flags & PF_WQ_WORKER) - wq_worker_sleeping(tsk); - else - io_wq_worker_sleeping(tsk); - preempt_enable_no_resched(); - } + lock_map_acquire_try(&sched_map); - if (tsk_is_pi_blocked(tsk)) - return; + task_flags = tsk->flags; + /* + * If a worker goes to sleep, notify and ask workqueue whether it + * wants to wake up a task to maintain concurrency. + */ + if (task_flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else if (task_flags & PF_IO_WORKER) + io_wq_worker_sleeping(tsk); + + /* + * spinlock and rwlock must not flush block requests. This will + * deadlock if the callback attempts to acquire a lock which is + * already acquired. + */ + SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. */ - if (blk_needs_flush_plug(tsk)) - blk_schedule_flush_plug(tsk); + blk_flush_plug(tsk->plug, true); + + lock_map_release(&sched_map); } static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) { + if (tsk->flags & PF_BLOCK_TS) + blk_plug_invalidate_ts(tsk); if (tsk->flags & PF_WQ_WORKER) wq_worker_running(tsk); - else + else if (tsk->flags & PF_IO_WORKER) io_wq_worker_running(tsk); } } -asmlinkage __visible void __sched schedule(void) +static __always_inline void __schedule_loop(int sched_mode) { - struct task_struct *tsk = current; - - sched_submit_work(tsk); do { preempt_disable(); - __schedule(false); + __schedule(sched_mode); sched_preempt_enable_no_resched(); } while (need_resched()); +} + +asmlinkage __visible void __sched schedule(void) +{ + struct task_struct *tsk = current; + +#ifdef CONFIG_RT_MUTEXES + lockdep_assert(!tsk->sched_rt_mutex); +#endif + + if (!task_is_running(tsk)) + sched_submit_work(tsk); + __schedule_loop(SM_NONE); sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); @@ -4312,18 +6873,18 @@ void __sched schedule_idle(void) { /* * As this skips calling sched_submit_work(), which the idle task does - * regardless because that function is a nop when the task is in a + * regardless because that function is a NOP when the task is in a * TASK_RUNNING state, make sure this isn't used someplace that the * current task can be in any other state. Note, idle is always in the * TASK_RUNNING state. */ - WARN_ON_ONCE(current->state); + WARN_ON_ONCE(current->__state); do { - __schedule(false); + __schedule(SM_IDLE); } while (need_resched()); } -#ifdef CONFIG_CONTEXT_TRACKING +#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) asmlinkage __visible void __sched schedule_user(void) { /* @@ -4333,7 +6894,7 @@ asmlinkage __visible void __sched schedule_user(void) * we find a better solution. * * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != CONTEXT_USER, but that will trigger + * should warn if prev_state != CT_STATE_USER, but that will trigger * too frequently to make sense yet. */ enum ctx_state prev_state = exception_enter(); @@ -4354,6 +6915,14 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +#ifdef CONFIG_PREEMPT_RT +void __sched notrace schedule_rtlock(void) +{ + __schedule_loop(SM_RTLOCK_WAIT); +} +NOKPROBE_SYMBOL(schedule_rtlock); +#endif + static void __sched notrace preempt_schedule_common(void) { do { @@ -4372,7 +6941,7 @@ static void __sched notrace preempt_schedule_common(void) */ preempt_disable_notrace(); preempt_latency_start(1); - __schedule(true); + __schedule(SM_PREEMPT); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); @@ -4396,12 +6965,32 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) */ if (likely(!preemptible())) return; - preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); +#ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#ifndef preempt_schedule_dynamic_enabled +#define preempt_schedule_dynamic_enabled preempt_schedule +#define preempt_schedule_dynamic_disabled NULL +#endif +DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); +void __sched notrace dynamic_preempt_schedule(void) +{ + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule)) + return; + preempt_schedule(); +} +NOKPROBE_SYMBOL(dynamic_preempt_schedule); +EXPORT_SYMBOL(dynamic_preempt_schedule); +#endif +#endif + /** * preempt_schedule_notrace - preempt_schedule called by tracing * @@ -4445,7 +7034,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) * an infinite recursion. */ prev_ctx = exception_enter(); - __schedule(true); + __schedule(SM_PREEMPT); exception_exit(prev_ctx); preempt_latency_stop(1); @@ -4454,13 +7043,34 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); +#ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#ifndef preempt_schedule_notrace_dynamic_enabled +#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +#define preempt_schedule_notrace_dynamic_disabled NULL +#endif +DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); +void __sched notrace dynamic_preempt_schedule_notrace(void) +{ + if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace)) + return; + preempt_schedule_notrace(); +} +NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); +EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); +#endif +#endif + #endif /* CONFIG_PREEMPTION */ /* * This is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. + * off of IRQ context. + * Note, that this is called and return with IRQs disabled. This will + * protect us against recursive calling from IRQ contexts. */ asmlinkage __visible void __sched preempt_schedule_irq(void) { @@ -4474,7 +7084,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) do { preempt_disable(); local_irq_enable(); - __schedule(true); + __schedule(SM_PREEMPT); local_irq_disable(); sched_preempt_enable_no_resched(); } while (need_resched()); @@ -4485,26 +7095,53 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); +const struct sched_class *__setscheduler_class(int policy, int prio) +{ + if (dl_prio(prio)) + return &dl_sched_class; + + if (rt_prio(prio)) + return &rt_sched_class; + +#ifdef CONFIG_SCHED_CLASS_EXT + if (task_should_scx(policy)) + return &ext_sched_class; +#endif + + return &fair_sched_class; +} + #ifdef CONFIG_RT_MUTEXES -static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -{ - if (pi_task) - prio = min(prio, pi_task->prio); +/* + * Would be more useful with typeof()/auto_type but they don't mix with + * bit-fields. Since it's a local thing, use int. Keep the generic sounding + * name such that if someone were to implement this function we get to compare + * notes. + */ +#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; }) - return prio; +void rt_mutex_pre_schedule(void) +{ + lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1)); + sched_submit_work(current); } -static inline int rt_effective_prio(struct task_struct *p, int prio) +void rt_mutex_schedule(void) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); + lockdep_assert(current->sched_rt_mutex); + __schedule_loop(SM_NONE); +} - return __rt_effective_prio(pi_task, prio); +void rt_mutex_post_schedule(void) +{ + sched_update_worker(current); + lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0)); } /* @@ -4522,7 +7159,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - const struct sched_class *prev_class; + const struct sched_class *prev_class, *next_class; struct rq_flags rf; struct rq *rq; @@ -4545,7 +7182,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to * ensure a task is de-boosted (pi_task is set to NULL) before the * task is allowed to run again (and can exit). This ensures the pointer - * points to a blocked task -- which guaratees the task is present. + * points to a blocked task -- which guarantees the task is present. */ p->pi_top_task = pi_task; @@ -4556,7 +7193,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) goto out_unlock; /* - * Idle task boosting is a nono in general. There is one + * Idle task boosting is a no-no in general. There is one * exception, when PREEMPT_RT and NOHZ is active: * * The idle task calls get_next_timer_interrupt() and holds @@ -4580,8 +7217,13 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; + next_class = __setscheduler_class(p->policy, prio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, queue_flag); if (running) @@ -4600,27 +7242,28 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (!dl_prio(p->normal_prio) || (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.dl_boosted = 1; + p->dl.pi_se = pi_task->dl.pi_se; queue_flag |= ENQUEUE_REPLENISH; - } else - p->dl.dl_boosted = 0; - p->sched_class = &dl_sched_class; + } else { + p->dl.pi_se = &p->dl; + } } else if (rt_prio(prio)) { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; - p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) p->rt.timeout = 0; - p->sched_class = &fair_sched_class; } + p->sched_class = next_class; p->prio = prio; + check_class_changing(rq, p, prev_class); + if (queued) enqueue_task(rq, p, queue_flag); if (running) @@ -4630,1282 +7273,390 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) out_unlock: /* Avoid rq from going away on us: */ preempt_disable(); - __task_rq_unlock(rq, &rf); - balance_callback(rq); + rq_unpin_lock(rq, &rf); + __balance_callbacks(rq); + raw_spin_rq_unlock(rq); + preempt_enable(); } -#else -static inline int rt_effective_prio(struct task_struct *p, int prio) -{ - return prio; -} #endif -void set_user_nice(struct task_struct *p, long nice) +#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) +int __sched __cond_resched(void) { - bool queued, running; - int old_prio; - struct rq_flags rf; - struct rq *rq; - - if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) - return; - /* - * We have to be careful, if called from sys_setpriority(), - * the task might be in the middle of scheduling on another CPU. - */ - rq = task_rq_lock(p, &rf); - update_rq_clock(rq); - - /* - * The RT priorities are set via sched_setscheduler(), but we still - * allow the 'normal' nice value to be set - but as expected - * it wont have any effect on scheduling until the task is - * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: - */ - if (task_has_dl_policy(p) || task_has_rt_policy(p)) { - p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; + if (should_resched(0) && !irqs_disabled()) { + preempt_schedule_common(); + return 1; } - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - p->sched_class->prio_changed(rq, p, old_prio); - -out_unlock: - task_rq_unlock(rq, p, &rf); -} -EXPORT_SYMBOL(set_user_nice); - -/* - * can_nice - check if a task can reduce its nice value - * @p: task - * @nice: nice value - */ -int can_nice(const struct task_struct *p, const int nice) -{ - /* Convert nice value [19,-20] to rlimit style value [1,40]: */ - int nice_rlim = nice_to_rlimit(nice); - - return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || - capable(CAP_SYS_NICE)); -} - -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. - * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -SYSCALL_DEFINE1(nice, int, increment) -{ - long nice, retval; - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. + * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * whether the current CPU is in an RCU read-side critical section, + * so the tick can report quiescent states even for CPUs looping + * in kernel context. In contrast, in non-preemptible kernels, + * RCU readers leave no in-memory hints, which means that CPU-bound + * processes executing in kernel context might never report an + * RCU quiescent state. Therefore, the following code causes + * cond_resched() to report a quiescent state, but only when RCU + * is in urgent need of one. */ - increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); - nice = task_nice(current) + increment; - - nice = clamp_val(nice, MIN_NICE, MAX_NICE); - if (increment < 0 && !can_nice(current, nice)) - return -EPERM; - - retval = security_task_setnice(current, nice); - if (retval) - return retval; - - set_user_nice(current, nice); +#ifndef CONFIG_PREEMPT_RCU + rcu_all_qs(); +#endif return 0; } - +EXPORT_SYMBOL(__cond_resched); #endif -/** - * task_prio - return the priority value of a given task. - * @p: the task in question. - * - * Return: The priority value as seen by users in /proc. - * RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from -16 to +15. - */ -int task_prio(const struct task_struct *p) -{ - return p->prio - MAX_RT_PRIO; -} - -/** - * idle_cpu - is a given CPU idle currently? - * @cpu: the processor in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->curr != rq->idle) - return 0; - - if (rq->nr_running) +#ifdef CONFIG_PREEMPT_DYNAMIC +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define cond_resched_dynamic_enabled __cond_resched +#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) +DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); +EXPORT_STATIC_CALL_TRAMP(cond_resched); + +#define might_resched_dynamic_enabled __cond_resched +#define might_resched_dynamic_disabled ((void *)&__static_call_return0) +DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); +EXPORT_STATIC_CALL_TRAMP(might_resched); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); +int __sched dynamic_cond_resched(void) +{ + klp_sched_try_switch(); + if (!static_branch_unlikely(&sk_dynamic_cond_resched)) return 0; - -#ifdef CONFIG_SMP - if (rq->ttwu_pending) - return 0; -#endif - - return 1; + return __cond_resched(); } +EXPORT_SYMBOL(dynamic_cond_resched); -/** - * available_idle_cpu - is a given CPU idle for enqueuing work. - * @cpu: the CPU in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int available_idle_cpu(int cpu) +static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched); +int __sched dynamic_might_resched(void) { - if (!idle_cpu(cpu)) - return 0; - - if (vcpu_is_preempted(cpu)) + if (!static_branch_unlikely(&sk_dynamic_might_resched)) return 0; - - return 1; -} - -/** - * idle_task - return the idle task for a given CPU. - * @cpu: the processor in question. - * - * Return: The idle task for the CPU @cpu. - */ -struct task_struct *idle_task(int cpu) -{ - return cpu_rq(cpu)->idle; -} - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. - * - * The task of @pid, if found. %NULL otherwise. - */ -static struct task_struct *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_vpid(pid) : current; + return __cond_resched(); } - -/* - * sched_setparam() passes in -1 for its policy, to let the functions - * it calls know not to change it. - */ -#define SETPARAM_POLICY -1 - -static void __setscheduler_params(struct task_struct *p, - const struct sched_attr *attr) -{ - int policy = attr->sched_policy; - - if (policy == SETPARAM_POLICY) - policy = p->policy; - - p->policy = policy; - - if (dl_policy(policy)) - __setparam_dl(p, attr); - else if (fair_policy(policy)) - p->static_prio = NICE_TO_PRIO(attr->sched_nice); - - /* - * __sched_setscheduler() ensures attr->sched_priority == 0 when - * !rt_policy. Always setting this ensures that things like - * getparam()/getattr() don't report silly values for !rt tasks. - */ - p->rt_priority = attr->sched_priority; - p->normal_prio = normal_prio(p); - set_load_weight(p, true); -} - -/* Actually do priority change: must hold pi & rq lock. */ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr, bool keep_boost) -{ - /* - * If params can't change scheduling class changes aren't allowed - * either. - */ - if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) - return; - - __setscheduler_params(p, attr); - - /* - * Keep a potential priority boosting if called from - * sched_setscheduler(). - */ - p->prio = normal_prio(p); - if (keep_boost) - p->prio = rt_effective_prio(p, p->prio); - - if (dl_prio(p->prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(p->prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; -} - -/* - * Check the target process has a UID that matches the current process's: - */ -static bool check_same_owner(struct task_struct *p) -{ - const struct cred *cred = current_cred(), *pcred; - bool match; - - rcu_read_lock(); - pcred = __task_cred(p); - match = (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); - rcu_read_unlock(); - return match; -} - -static int __sched_setscheduler(struct task_struct *p, - const struct sched_attr *attr, - bool user, bool pi) -{ - int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : - MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, queued, running; - int new_effective_prio, policy = attr->sched_policy; - const struct sched_class *prev_class; - struct rq_flags rf; - int reset_on_fork; - int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - struct rq *rq; - - /* The pi code expects interrupts enabled */ - BUG_ON(pi && in_interrupt()); -recheck: - /* Double check policy once rq lock held: */ - if (policy < 0) { - reset_on_fork = p->sched_reset_on_fork; - policy = oldpolicy = p->policy; - } else { - reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - - if (!valid_policy(policy)) - return -EINVAL; - } - - if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) - return -EINVAL; - - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, - * SCHED_BATCH and SCHED_IDLE is 0. - */ - if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) - return -EINVAL; - if ((dl_policy(policy) && !__checkparam_dl(attr)) || - (rt_policy(policy) != (attr->sched_priority != 0))) - return -EINVAL; - - /* - * Allow unprivileged RT tasks to decrease priority: - */ - if (user && !capable(CAP_SYS_NICE)) { - if (fair_policy(policy)) { - if (attr->sched_nice < task_nice(p) && - !can_nice(p, attr->sched_nice)) - return -EPERM; - } - - if (rt_policy(policy)) { - unsigned long rlim_rtprio = - task_rlimit(p, RLIMIT_RTPRIO); - - /* Can't set/change the rt policy: */ - if (policy != p->policy && !rlim_rtprio) - return -EPERM; - - /* Can't increase priority: */ - if (attr->sched_priority > p->rt_priority && - attr->sched_priority > rlim_rtprio) - return -EPERM; - } - - /* - * Can't set/change SCHED_DEADLINE policy at all for now - * (safest behavior); in the future we would like to allow - * unprivileged DL tasks to increase their relative deadline - * or reduce their runtime (both ways reducing utilization) - */ - if (dl_policy(policy)) - return -EPERM; - - /* - * Treat SCHED_IDLE as nice 20. Only allow a switch to - * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. - */ - if (task_has_idle_policy(p) && !idle_policy(policy)) { - if (!can_nice(p, task_nice(p))) - return -EPERM; - } - - /* Can't change other user's priorities: */ - if (!check_same_owner(p)) - return -EPERM; - - /* Normal users shall not reset the sched_reset_on_fork flag: */ - if (p->sched_reset_on_fork && !reset_on_fork) - return -EPERM; - } - - if (user) { - if (attr->sched_flags & SCHED_FLAG_SUGOV) - return -EINVAL; - - retval = security_task_setscheduler(p); - if (retval) - return retval; - } - - /* Update task specific "requested" clamps */ - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { - retval = uclamp_validate(p, attr); - if (retval) - return retval; - } - - if (pi) - cpuset_read_lock(); - - /* - * Make sure no PI-waiters arrive (or leave) while we are - * changing the priority of the task: - * - * To be able to change p->policy safely, the appropriate - * runqueue lock must be held. - */ - rq = task_rq_lock(p, &rf); - update_rq_clock(rq); - - /* - * Changing the policy of the stop threads its a very bad idea: - */ - if (p == rq->stop) { - retval = -EINVAL; - goto unlock; - } - - /* - * If not changing anything there's no need to proceed further, - * but store a possible modification of reset_on_fork. - */ - if (unlikely(policy == p->policy)) { - if (fair_policy(policy) && attr->sched_nice != task_nice(p)) - goto change; - if (rt_policy(policy) && attr->sched_priority != p->rt_priority) - goto change; - if (dl_policy(policy) && dl_param_changed(p, attr)) - goto change; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) - goto change; - - p->sched_reset_on_fork = reset_on_fork; - retval = 0; - goto unlock; - } -change: - - if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0 && - !task_group_is_autogroup(task_group(p))) { - retval = -EPERM; - goto unlock; - } +EXPORT_SYMBOL(dynamic_might_resched); #endif -#ifdef CONFIG_SMP - if (dl_bandwidth_enabled() && dl_policy(policy) && - !(attr->sched_flags & SCHED_FLAG_SUGOV)) { - cpumask_t *span = rq->rd->span; - - /* - * Don't allow tasks with an affinity mask smaller than - * the entire root_domain to become SCHED_DEADLINE. We - * will also fail if there's no bandwidth available. - */ - if (!cpumask_subset(span, p->cpus_ptr) || - rq->rd->dl_bw.bw == 0) { - retval = -EPERM; - goto unlock; - } - } #endif - } - - /* Re-check policy now with rq lock held: */ - if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { - policy = oldpolicy = -1; - task_rq_unlock(rq, p, &rf); - if (pi) - cpuset_read_unlock(); - goto recheck; - } - - /* - * If setscheduling to SCHED_DEADLINE (or changing the parameters - * of a SCHED_DEADLINE task) we need to check if enough bandwidth - * is available. - */ - if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { - retval = -EBUSY; - goto unlock; - } - - p->sched_reset_on_fork = reset_on_fork; - oldprio = p->prio; - - if (pi) { - /* - * Take priority boosted tasks into account. If the new - * effective priority is unchanged, we just store the new - * normal parameters and do not touch the scheduler class and - * the runqueue. This will be done when the task deboost - * itself. - */ - new_effective_prio = rt_effective_prio(p, newprio); - if (new_effective_prio == oldprio) - queue_flags &= ~DEQUEUE_MOVE; - } - - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); - - prev_class = p->sched_class; - - __setscheduler(rq, p, attr, pi); - __setscheduler_uclamp(p, attr); - - if (queued) { - /* - * We enqueue to tail when the priority of a task is - * increased (user space view). - */ - if (oldprio < p->prio) - queue_flags |= ENQUEUE_HEAD; - - enqueue_task(rq, p, queue_flags); - } - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); - - /* Avoid rq from going away on us: */ - preempt_disable(); - task_rq_unlock(rq, p, &rf); - - if (pi) { - cpuset_read_unlock(); - rt_mutex_adjust_pi(p); - } - - /* Run balance callbacks after we've adjusted the PI chain: */ - balance_callback(rq); - preempt_enable(); - - return 0; - -unlock: - task_rq_unlock(rq, p, &rf); - if (pi) - cpuset_read_unlock(); - return retval; -} - -static int _sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool check) -{ - struct sched_attr attr = { - .sched_policy = policy, - .sched_priority = param->sched_priority, - .sched_nice = PRIO_TO_NICE(p->static_prio), - }; - - /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ - if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { - attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - policy &= ~SCHED_RESET_ON_FORK; - attr.sched_policy = policy; - } - - return __sched_setscheduler(p, &attr, check, true); -} -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - * - * NOTE that the task may be already dead. - */ -int sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return _sched_setscheduler(p, policy, param, true); -} -EXPORT_SYMBOL_GPL(sched_setscheduler); - -int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -{ - return __sched_setscheduler(p, attr, true, true); -} -EXPORT_SYMBOL_GPL(sched_setattr); - -int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -{ - return __sched_setscheduler(p, attr, false, true); -} - -/** - * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Just like sched_setscheduler, only don't bother checking if the - * current context has permission. For example, this is needed in - * stop_machine(): we create temporary high priority worker threads, - * but our caller might not have that capability. - * - * Return: 0 on success. An error code otherwise. - */ -int sched_setscheduler_nocheck(struct task_struct *p, int policy, - const struct sched_param *param) -{ - return _sched_setscheduler(p, policy, param, false); -} -EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); - -static int -do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -{ - struct sched_param lparam; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - if (copy_from_user(&lparam, param, sizeof(struct sched_param))) - return -EFAULT; - - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); - - if (likely(p)) { - retval = sched_setscheduler(p, policy, &lparam); - put_task_struct(p); - } - - return retval; -} /* - * Mimics kernel/events/core.c perf_copy_attr(). - */ -static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -{ - u32 size; - int ret; - - /* Zero the full structure, so that a short copy will be nice: */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - /* ABI compatibility quirk: */ - if (!size) - size = SCHED_ATTR_SIZE_VER0; - if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) - goto err_size; - - ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); - if (ret) { - if (ret == -E2BIG) - goto err_size; - return ret; - } - - if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && - size < SCHED_ATTR_SIZE_VER1) - return -EINVAL; - - /* - * XXX: Do we want to be lenient like existing syscalls; or do we want - * to be strict and return an error on out-of-bounds values? - */ - attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); - - return 0; - -err_size: - put_user(sizeof(*attr), &uattr->size); - return -E2BIG; -} - -/** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy. - * @param: structure containing the new RT priority. + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. * - * Return: 0 on success. An error code otherwise. + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). */ -SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) +int __cond_resched_lock(spinlock_t *lock) { - if (policy < 0) - return -EINVAL; + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; - return do_sched_setscheduler(pid, policy, param); -} + lockdep_assert_held(lock); -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -{ - return do_sched_setscheduler(pid, SETPARAM_POLICY, param); + if (spin_needbreak(lock) || resched) { + spin_unlock(lock); + if (!_cond_resched()) + cpu_relax(); + ret = 1; + spin_lock(lock); + } + return ret; } +EXPORT_SYMBOL(__cond_resched_lock); -/** - * sys_sched_setattr - same as above, but with extended sched_attr - * @pid: the pid in question. - * @uattr: structure containing the extended parameters. - * @flags: for future extension. - */ -SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, flags) +int __cond_resched_rwlock_read(rwlock_t *lock) { - struct sched_attr attr; - struct task_struct *p; - int retval; - - if (!uattr || pid < 0 || flags) - return -EINVAL; - - retval = sched_copy_attr(uattr, &attr); - if (retval) - return retval; + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; - if ((int)attr.sched_policy < 0) - return -EINVAL; - if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) - attr.sched_policy = SETPARAM_POLICY; + lockdep_assert_held_read(lock); - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); - - if (likely(p)) { - retval = sched_setattr(p, &attr); - put_task_struct(p); + if (rwlock_needbreak(lock) || resched) { + read_unlock(lock); + if (!_cond_resched()) + cpu_relax(); + ret = 1; + read_lock(lock); } - - return retval; + return ret; } +EXPORT_SYMBOL(__cond_resched_rwlock_read); -/** - * sys_sched_getscheduler - get the policy (scheduling class) of a thread - * @pid: the pid in question. - * - * Return: On success, the policy of the thread. Otherwise, a negative error - * code. - */ -SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) +int __cond_resched_rwlock_write(rwlock_t *lock) { - struct task_struct *p; - int retval; + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; - if (pid < 0) - return -EINVAL; + lockdep_assert_held_write(lock); - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy - | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); + if (rwlock_needbreak(lock) || resched) { + write_unlock(lock); + if (!_cond_resched()) + cpu_relax(); + ret = 1; + write_lock(lock); } - rcu_read_unlock(); - return retval; + return ret; } +EXPORT_SYMBOL(__cond_resched_rwlock_write); -/** - * sys_sched_getparam - get the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the RT priority. - * - * Return: On success, 0 and the RT priority is in @param. Otherwise, an error - * code. - */ -SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -{ - struct sched_param lp = { .sched_priority = 0 }; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - if (task_has_rt_policy(p)) - lp.sched_priority = p->rt_priority; - rcu_read_unlock(); - - /* - * This one might sleep, we cannot do it with a spinlock held ... - */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; +#ifdef CONFIG_PREEMPT_DYNAMIC - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; -} +#ifdef CONFIG_GENERIC_ENTRY +#include <linux/entry-common.h> +#endif /* - * Copy the kernel size attribute structure (which might be larger - * than what user-space knows about) to user-space. - * - * Note that all cases are valid: user-space buffer can be larger or - * smaller than the kernel-space buffer. The usual case is that both - * have the same size. - */ -static int -sched_attr_copy_to_user(struct sched_attr __user *uattr, - struct sched_attr *kattr, - unsigned int usize) -{ - unsigned int ksize = sizeof(*kattr); - - if (!access_ok(uattr, usize)) - return -EFAULT; - - /* - * sched_getattr() ABI forwards and backwards compatibility: - * - * If usize == ksize then we just copy everything to user-space and all is good. - * - * If usize < ksize then we only copy as much as user-space has space for, - * this keeps ABI compatibility as well. We skip the rest. - * - * If usize > ksize then user-space is using a newer version of the ABI, - * which part the kernel doesn't know about. Just ignore it - tooling can - * detect the kernel's knowledge of attributes from the attr->size value - * which is set to ksize in this case. - */ - kattr->size = min(usize, ksize); - - if (copy_to_user(uattr, kattr, kattr->size)) - return -EFAULT; + * SC:cond_resched + * SC:might_resched + * SC:preempt_schedule + * SC:preempt_schedule_notrace + * SC:irqentry_exit_cond_resched + * + * + * NONE: + * cond_resched <- __cond_resched + * might_resched <- RET0 + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false + * + * VOLUNTARY: + * cond_resched <- __cond_resched + * might_resched <- __cond_resched + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * dynamic_preempt_lazy <- false + * + * FULL: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- false + * + * LAZY: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + * dynamic_preempt_lazy <- true + */ + +enum { + preempt_dynamic_undefined = -1, + preempt_dynamic_none, + preempt_dynamic_voluntary, + preempt_dynamic_full, + preempt_dynamic_lazy, +}; - return 0; -} +int preempt_dynamic_mode = preempt_dynamic_undefined; -/** - * sys_sched_getattr - similar to sched_getparam, but with sched_attr - * @pid: the pid in question. - * @uattr: structure containing the extended parameters. - * @usize: sizeof(attr) for fwd/bwd comp. - * @flags: for future extension. - */ -SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, usize, unsigned int, flags) +int sched_dynamic_mode(const char *str) { - struct sched_attr kattr = { }; - struct task_struct *p; - int retval; +#ifndef CONFIG_PREEMPT_RT + if (!strcmp(str, "none")) + return preempt_dynamic_none; - if (!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags) - return -EINVAL; - - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - kattr.sched_policy = p->policy; - if (p->sched_reset_on_fork) - kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - if (task_has_dl_policy(p)) - __getparam_dl(p, &kattr); - else if (task_has_rt_policy(p)) - kattr.sched_priority = p->rt_priority; - else - kattr.sched_nice = task_nice(p); - -#ifdef CONFIG_UCLAMP_TASK - kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; - kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; + if (!strcmp(str, "voluntary")) + return preempt_dynamic_voluntary; #endif - rcu_read_unlock(); + if (!strcmp(str, "full")) + return preempt_dynamic_full; - return sched_attr_copy_to_user(uattr, &kattr, usize); +#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY + if (!strcmp(str, "lazy")) + return preempt_dynamic_lazy; +#endif -out_unlock: - rcu_read_unlock(); - return retval; + return -EINVAL; } -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -{ - cpumask_var_t cpus_allowed, new_mask; - struct task_struct *p; - int retval; +#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); - return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); - - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; - } - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; - } - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - retval = -EPERM; - if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - goto out_free_new_mask; - } - rcu_read_unlock(); - } +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) +#else +#error "Unsupported PREEMPT_DYNAMIC mechanism" +#endif - retval = security_task_setscheduler(p); - if (retval) - goto out_free_new_mask; +static DEFINE_MUTEX(sched_dynamic_mutex); +static bool klp_override; + +static void __sched_dynamic_update(int mode) +{ + /* + * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in + * the ZERO state, which is invalid. + */ + if (!klp_override) + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); + + switch (mode) { + case preempt_dynamic_none: + if (!klp_override) + preempt_dynamic_enable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: none\n"); + break; + case preempt_dynamic_voluntary: + if (!klp_override) + preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(might_resched); + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: voluntary\n"); + break; - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); + case preempt_dynamic_full: + if (!klp_override) + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_disable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: full\n"); + break; - /* - * Since bandwidth control happens on root_domain basis, - * if admission test is enabled, we only admit -deadline - * tasks allowed to run on all the CPUs in the task's - * root_domain. - */ -#ifdef CONFIG_SMP - if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { - rcu_read_lock(); - if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { - retval = -EBUSY; - rcu_read_unlock(); - goto out_free_new_mask; - } - rcu_read_unlock(); + case preempt_dynamic_lazy: + if (!klp_override) + preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(might_resched); + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); + preempt_dynamic_key_enable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: lazy\n"); + break; } -#endif -again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } - } -out_free_new_mask: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); -out_put_task: - put_task_struct(p); - return retval; + preempt_dynamic_mode = mode; } -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - struct cpumask *new_mask) +void sched_dynamic_update(int mode) { - if (len < cpumask_size()) - cpumask_clear(new_mask); - else if (len > cpumask_size()) - len = cpumask_size(); - - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; + mutex_lock(&sched_dynamic_mutex); + __sched_dynamic_update(mode); + mutex_unlock(&sched_dynamic_mutex); } -/** - * sys_sched_setaffinity - set the CPU affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new CPU mask - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) -{ - cpumask_var_t new_mask; - int retval; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; +#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL - retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); - if (retval == 0) - retval = sched_setaffinity(pid, new_mask); - free_cpumask_var(new_mask); - return retval; -} - -long sched_getaffinity(pid_t pid, struct cpumask *mask) +static int klp_cond_resched(void) { - struct task_struct *p; - unsigned long flags; - int retval; - - rcu_read_lock(); - - retval = -ESRCH; - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_mask, cpu_active_mask); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - -out_unlock: - rcu_read_unlock(); - - return retval; + __klp_sched_try_switch(); + return __cond_resched(); } -/** - * sys_sched_getaffinity - get the CPU affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current CPU mask - * - * Return: size of CPU mask copied to user_mask_ptr on success. An - * error code otherwise. - */ -SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) +void sched_dynamic_klp_enable(void) { - int ret; - cpumask_var_t mask; - - if ((len * BITS_PER_BYTE) < nr_cpu_ids) - return -EINVAL; - if (len & (sizeof(unsigned long)-1)) - return -EINVAL; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + mutex_lock(&sched_dynamic_mutex); - ret = sched_getaffinity(pid, mask); - if (ret == 0) { - unsigned int retlen = min(len, cpumask_size()); - - if (copy_to_user(user_mask_ptr, mask, retlen)) - ret = -EFAULT; - else - ret = retlen; - } - free_cpumask_var(mask); + klp_override = true; + static_call_update(cond_resched, klp_cond_resched); - return ret; + mutex_unlock(&sched_dynamic_mutex); } -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. - * - * Return: 0. - */ -static void do_sched_yield(void) +void sched_dynamic_klp_disable(void) { - struct rq_flags rf; - struct rq *rq; - - rq = this_rq_lock_irq(&rf); + mutex_lock(&sched_dynamic_mutex); - schedstat_inc(rq->yld_count); - current->sched_class->yield_task(rq); + klp_override = false; + __sched_dynamic_update(preempt_dynamic_mode); - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ - preempt_disable(); - rq_unlock(rq, &rf); - sched_preempt_enable_no_resched(); - - schedule(); + mutex_unlock(&sched_dynamic_mutex); } -SYSCALL_DEFINE0(sched_yield) -{ - do_sched_yield(); - return 0; -} +#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ -#ifndef CONFIG_PREEMPTION -int __sched _cond_resched(void) +static int __init setup_preempt_mode(char *str) { - if (should_resched(0)) { - preempt_schedule_common(); - return 1; - } - rcu_all_qs(); - return 0; -} -EXPORT_SYMBOL(_cond_resched); -#endif - -/* - * __cond_resched_lock() - if a reschedule is pending, drop the given lock, - * call schedule, and on return reacquire the lock. - * - * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level - * operations here to prevent schedule() from being called twice (once via - * spin_unlock(), once by hand). - */ -int __cond_resched_lock(spinlock_t *lock) -{ - int resched = should_resched(PREEMPT_LOCK_OFFSET); - int ret = 0; - - lockdep_assert_held(lock); - - if (spin_needbreak(lock) || resched) { - spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else - cpu_relax(); - ret = 1; - spin_lock(lock); + int mode = sched_dynamic_mode(str); + if (mode < 0) { + pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); + return 0; } - return ret; -} -EXPORT_SYMBOL(__cond_resched_lock); -/** - * yield - yield the current processor to other threads. - * - * Do not ever use this function, there's a 99% chance you're doing it wrong. - * - * The scheduler is at all times free to pick the calling task as the most - * eligible task to run, if removing the yield() call from your code breaks - * it, its already broken. - * - * Typical broken usage is: - * - * while (!event) - * yield(); - * - * where one assumes that yield() will let 'the other' process run that will - * make event true. If the current task is a SCHED_FIFO task that will never - * happen. Never use yield() as a progress guarantee!! - * - * If you want to use yield() to wait for something, use wait_event(). - * If you want to use yield() to be 'nice' for others, use cond_resched(). - * If you still want to use yield(), do not! - */ -void __sched yield(void) -{ - set_current_state(TASK_RUNNING); - do_sched_yield(); + sched_dynamic_update(mode); + return 1; } -EXPORT_SYMBOL(yield); +__setup("preempt=", setup_preempt_mode); -/** - * yield_to - yield the current processor to another thread in - * your thread group, or accelerate that thread toward the - * processor it's on. - * @p: target task - * @preempt: whether task preemption is allowed or not - * - * It's the caller's job to ensure that the target task struct - * can't go away on us before we can do any checks. - * - * Return: - * true (>0) if we indeed boosted the target task. - * false (0) if we failed to boost the target. - * -ESRCH if there's no task to yield to. - */ -int __sched yield_to(struct task_struct *p, bool preempt) +static void __init preempt_dynamic_init(void) { - struct task_struct *curr = current; - struct rq *rq, *p_rq; - unsigned long flags; - int yielded = 0; - - local_irq_save(flags); - rq = this_rq(); - -again: - p_rq = task_rq(p); - /* - * If we're the only runnable task on the rq and target rq also - * has only one task, there's absolutely no point in yielding. - */ - if (rq->nr_running == 1 && p_rq->nr_running == 1) { - yielded = -ESRCH; - goto out_irq; - } - - double_rq_lock(rq, p_rq); - if (task_rq(p) != p_rq) { - double_rq_unlock(rq, p_rq); - goto again; + if (preempt_dynamic_mode == preempt_dynamic_undefined) { + if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { + sched_dynamic_update(preempt_dynamic_none); + } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { + sched_dynamic_update(preempt_dynamic_voluntary); + } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + sched_dynamic_update(preempt_dynamic_lazy); + } else { + /* Default static call setting, nothing to do */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); + preempt_dynamic_mode = preempt_dynamic_full; + pr_info("Dynamic Preempt: full\n"); + } } +} - if (!curr->sched_class->yield_to_task) - goto out_unlock; - - if (curr->sched_class != p->sched_class) - goto out_unlock; - - if (task_running(p_rq, p) || p->state) - goto out_unlock; +#define PREEMPT_MODEL_ACCESSOR(mode) \ + bool preempt_model_##mode(void) \ + { \ + WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ + return preempt_dynamic_mode == preempt_dynamic_##mode; \ + } \ + EXPORT_SYMBOL_GPL(preempt_model_##mode) - yielded = curr->sched_class->yield_to_task(rq, p, preempt); - if (yielded) { - schedstat_inc(rq->yld_count); - /* - * Make p's CPU reschedule; pick_next_entity takes care of - * fairness. - */ - if (preempt && rq != p_rq) - resched_curr(p_rq); - } +PREEMPT_MODEL_ACCESSOR(none); +PREEMPT_MODEL_ACCESSOR(voluntary); +PREEMPT_MODEL_ACCESSOR(full); +PREEMPT_MODEL_ACCESSOR(lazy); -out_unlock: - double_rq_unlock(rq, p_rq); -out_irq: - local_irq_restore(flags); +#else /* !CONFIG_PREEMPT_DYNAMIC: */ - if (yielded > 0) - schedule(); +static inline void preempt_dynamic_init(void) { } - return yielded; -} -EXPORT_SYMBOL_GPL(yield_to); +#endif /* CONFIG_PREEMPT_DYNAMIC */ int io_schedule_prepare(void) { int old_iowait = current->in_iowait; current->in_iowait = 1; - blk_schedule_flush_plug(current); - + blk_flush_plug(current->plug, true); return old_iowait; } @@ -5941,156 +7692,31 @@ void __sched io_schedule(void) } EXPORT_SYMBOL(io_schedule); -/** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the maximum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; - break; - case SCHED_DEADLINE: - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - break; - } - return ret; -} - -/** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the minimum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -{ - int ret = -EINVAL; - - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_DEADLINE: - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - } - return ret; -} - -static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -{ - struct task_struct *p; - unsigned int time_slice; - struct rq_flags rf; - struct rq *rq; - int retval; - - if (pid < 0) - return -EINVAL; - - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - rq = task_rq_lock(p, &rf); - time_slice = 0; - if (p->sched_class->get_rr_interval) - time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &rf); - - rcu_read_unlock(); - jiffies_to_timespec64(time_slice, t); - return 0; - -out_unlock: - rcu_read_unlock(); - return retval; -} - -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct __kernel_timespec __user *, interval) -{ - struct timespec64 t; - int retval = sched_rr_get_interval(pid, &t); - - if (retval == 0) - retval = put_timespec64(&t, interval); - - return retval; -} - -#ifdef CONFIG_COMPAT_32BIT_TIME -SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, - struct old_timespec32 __user *, interval) -{ - struct timespec64 t; - int retval = sched_rr_get_interval(pid, &t); - - if (retval == 0) - retval = put_old_timespec32(&t, interval); - return retval; -} -#endif - void sched_show_task(struct task_struct *p) { - unsigned long free = 0; + unsigned long free; int ppid; if (!try_get_task_stack(p)) return; - printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); - if (p->state == TASK_RUNNING) - printk(KERN_CONT " running task "); -#ifdef CONFIG_DEBUG_STACK_USAGE + if (task_is_running(p)) + pr_cont(" running task "); free = stack_not_used(p); -#endif ppid = 0; rcu_read_lock(); if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), ppid, - (unsigned long)task_thread_info(p)->flags); + pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d task_flags:0x%04x flags:0x%08lx\n", + free, task_pid_nr(p), task_tgid_nr(p), + ppid, p->flags, read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); + print_stop_info(KERN_INFO, p); + print_scx_info(KERN_INFO, p); show_stack(p, NULL, KERN_INFO); put_task_stack(p); } @@ -6099,36 +7725,31 @@ EXPORT_SYMBOL_GPL(sched_show_task); static inline bool state_filter_match(unsigned long state_filter, struct task_struct *p) { + unsigned int state = READ_ONCE(p->__state); + /* no filter, everything matches */ if (!state_filter) return true; /* filter, but doesn't match */ - if (!(p->state & state_filter)) + if (!(state & state_filter)) return false; /* * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows * TASK_KILLABLE). */ - if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) + if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD)) return false; return true; } -void show_state_filter(unsigned long state_filter) +void show_state_filter(unsigned int state_filter) { struct task_struct *g, *p; -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif rcu_read_lock(); for_each_process_thread(g, p) { /* @@ -6164,31 +7785,35 @@ void show_state_filter(unsigned long state_filter) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void init_idle(struct task_struct *idle, int cpu) +void __init init_idle(struct task_struct *idle, int cpu) { +#ifdef CONFIG_SMP + struct affinity_context ac = (struct affinity_context) { + .new_mask = cpumask_of(cpu), + .flags = 0, + }; +#endif struct rq *rq = cpu_rq(cpu); unsigned long flags; - __sched_fork(0, idle); - raw_spin_lock_irqsave(&idle->pi_lock, flags); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); - idle->state = TASK_RUNNING; + idle->__state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - idle->flags |= PF_IDLE; - - scs_task_reset(idle); - kasan_unpoison_task_stack(idle); + /* + * PF_KTHREAD should already be set at this point; regardless, make it + * look like a proper per-CPU kthread. + */ + idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; + kthread_set_per_cpu(idle, cpu); #ifdef CONFIG_SMP /* - * Its possible that init_idle() gets called multiple times on a task, - * in that case do_set_cpus_allowed() will not do the right thing. - * - * And since this is boot we can forgo the serialization. + * No validation and serialization required at boot time and for + * setting up the idle tasks of not yet online CPUs. */ - set_cpus_allowed_common(idle, cpumask_of(cpu)); + set_cpus_allowed_common(idle, &ac); #endif /* * We're having a chicken and egg problem, even though we are @@ -6205,12 +7830,13 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->idle = idle; + rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP idle->on_cpu = 1; #endif - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -6234,7 +7860,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, { int ret = 1; - if (!cpumask_weight(cur)) + if (cpumask_empty(cur)) return ret; ret = dl_cpuset_cpumask_can_shrink(cur, trial); @@ -6242,8 +7868,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -int task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed) +int task_can_attach(struct task_struct *p) { int ret = 0; @@ -6256,16 +7881,9 @@ int task_can_attach(struct task_struct *p, * success of set_cpus_allowed_ptr() on all attached tasks * before cpus_mask may be changed. */ - if (p->flags & PF_NO_SETAFFINITY) { + if (p->flags & PF_NO_SETAFFINITY) ret = -EINVAL; - goto out; - } - - if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) - ret = dl_task_can_attach(p, cs_cpus_allowed); -out: return ret; } @@ -6302,7 +7920,7 @@ void sched_setnuma(struct task_struct *p, int nid) rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); - running = task_current(rq, p); + running = task_current_donor(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); @@ -6321,137 +7939,171 @@ void sched_setnuma(struct task_struct *p, int nid) #ifdef CONFIG_HOTPLUG_CPU /* - * Ensure that the idle task is using init_mm right before its CPU goes - * offline. + * Invoked on the outgoing CPU in context of the CPU hotplug thread + * after ensuring that there are no user space tasks left on the CPU. + * + * If there is a lazy mm in use on the hotplug thread, drop it and + * switch to init_mm. + * + * The reference count on init_mm is dropped in finish_cpu(). */ -void idle_task_exit(void) +static void sched_force_init_mm(void) { struct mm_struct *mm = current->active_mm; - BUG_ON(cpu_online(smp_processor_id())); - BUG_ON(current != this_rq()->idle); - if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + mmgrab_lazy_tlb(&init_mm); + local_irq_disable(); + current->active_mm = &init_mm; + switch_mm_irqs_off(mm, &init_mm, current); + local_irq_enable(); finish_arch_post_lock_switch(); + mmdrop_lazy_tlb(mm); } /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } -/* - * Since this CPU is going 'away' for a while, fold any nr_active delta - * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. We need to take the teardown thread which - * is calling this into account, so we hand in adjust = 1 to the load - * calculation. - * - * Also see the comment "Global load-average calculations". - */ -static void calc_load_migrate(struct rq *rq) +static int __balance_push_cpu_stop(void *arg) { - long delta = calc_load_fold_active(rq, 1); - if (delta) - atomic_long_add(delta, &calc_load_tasks); -} + struct task_struct *p = arg; + struct rq *rq = this_rq(); + struct rq_flags rf; + int cpu; -static struct task_struct *__pick_migrate_task(struct rq *rq) -{ - const struct sched_class *class; - struct task_struct *next; + raw_spin_lock_irq(&p->pi_lock); + rq_lock(rq, &rf); - for_each_class(class) { - next = class->pick_next_task(rq); - if (next) { - next->sched_class->put_prev_task(rq, next); - return next; - } + update_rq_clock(rq); + + if (task_rq(p) == rq && task_on_rq_queued(p)) { + cpu = select_fallback_rq(rq->cpu, p); + rq = __migrate_task(rq, &rf, p, cpu); } - /* The idle class should always have a runnable task */ - BUG(); + rq_unlock(rq, &rf); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + return 0; } +static DEFINE_PER_CPU(struct cpu_stop_work, push_work); + /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). + * Ensure we only run per-cpu kthreads once the CPU goes !active. * - * Called with rq->lock held even though we'er in stop_machine() and - * there's no concurrency possible, we hold the required locks anyway - * because of lock validation efforts. + * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only + * effective when the hotplug motion is down. */ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) +static void balance_push(struct rq *rq) { - struct rq *rq = dead_rq; - struct task_struct *next, *stop = rq->stop; - struct rq_flags orf = *rf; - int dest_cpu; + struct task_struct *push_task = rq->curr; + + lockdep_assert_rq_held(rq); /* - * Fudge the rq selection such that the below task selection loop - * doesn't get stuck on the currently eligible stop task. - * - * We're currently inside stop_machine() and the rq is either stuck - * in the stop_machine_cpu_stop() loop, or we're executing this code, - * either way we should never end up calling schedule() until we're - * done here. + * Ensure the thing is persistent until balance_push_set(.on = false); */ - rq->stop = NULL; + rq->balance_callback = &balance_push_callback; /* - * put_prev_task() and pick_next_task() sched - * class method both need to have an up-to-date - * value of rq->clock[_task] + * Only active while going offline and when invoked on the outgoing + * CPU. */ - update_rq_clock(rq); - - for (;;) { - /* - * There's this thread running, bail when that's the only - * remaining thread: - */ - if (rq->nr_running == 1) - break; + if (!cpu_dying(rq->cpu) || rq != this_rq()) + return; - next = __pick_migrate_task(rq); + /* + * Both the cpu-hotplug and stop task are in this case and are + * required to complete the hotplug process. + */ + if (kthread_is_per_cpu(push_task) || + is_migration_disabled(push_task)) { /* - * Rules for changing task_struct::cpus_mask are holding - * both pi_lock and rq->lock, such that holding either - * stabilizes the mask. + * If this is the idle task on the outgoing CPU try to wake + * up the hotplug control thread which might wait for the + * last task to vanish. The rcuwait_active() check is + * accurate here because the waiter is pinned on this CPU + * and can't obviously be running in parallel. * - * Drop rq->lock is not quite as disastrous as it usually is - * because !cpu_active at this point, which means load-balance - * will not interfere. Also, stop-machine. - */ - rq_unlock(rq, rf); - raw_spin_lock(&next->pi_lock); - rq_relock(rq, rf); - - /* - * Since we're inside stop-machine, _nothing_ should have - * changed the task, WARN if weird stuff happened, because in - * that case the above rq->lock drop is a fail too. + * On RT kernels this also has to check whether there are + * pinned and scheduled out tasks on the runqueue. They + * need to leave the migrate disabled section first. */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { - raw_spin_unlock(&next->pi_lock); - continue; + if (!rq->nr_running && !rq_has_pinned_tasks(rq) && + rcuwait_active(&rq->hotplug_wait)) { + raw_spin_rq_unlock(rq); + rcuwait_wake_up(&rq->hotplug_wait); + raw_spin_rq_lock(rq); } + return; + } - /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); - rq = __migrate_task(rq, rf, next, dest_cpu); - if (rq != dead_rq) { - rq_unlock(rq, rf); - rq = dead_rq; - *rf = orf; - rq_relock(rq, rf); - } - raw_spin_unlock(&next->pi_lock); + get_task_struct(push_task); + /* + * Temporarily drop rq->lock such that we can wake-up the stop task. + * Both preemption and IRQs are still disabled. + */ + preempt_disable(); + raw_spin_rq_unlock(rq); + stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, + this_cpu_ptr(&push_work)); + preempt_enable(); + /* + * At this point need_resched() is true and we'll take the loop in + * schedule(). The next pick is obviously going to be the stop task + * which kthread_is_per_cpu() and will push this task away. + */ + raw_spin_rq_lock(rq); +} + +static void balance_push_set(int cpu, bool on) +{ + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (on) { + WARN_ON_ONCE(rq->balance_callback); + rq->balance_callback = &balance_push_callback; + } else if (rq->balance_callback == &balance_push_callback) { + rq->balance_callback = NULL; } + rq_unlock_irqrestore(rq, &rf); +} - rq->stop = stop; +/* + * Invoked from a CPUs hotplug control thread after the CPU has been marked + * inactive. All tasks which are not per CPU kernel threads are either + * pushed off this CPU now via balance_push() or placed on a different CPU + * during wakeup. Wait until the CPU is quiescent. + */ +static void balance_hotplug_wait(void) +{ + struct rq *rq = this_rq(); + + rcuwait_wait_event(&rq->hotplug_wait, + rq->nr_running == 1 && !rq_has_pinned_tasks(rq), + TASK_UNINTERRUPTIBLE); } + +#else + +static inline void balance_push(struct rq *rq) +{ +} + +static inline void balance_push_set(int cpu, bool on) +{ +} + +static inline void balance_hotplug_wait(void) +{ +} + #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) @@ -6474,6 +8126,7 @@ void set_rq_offline(struct rq *rq) if (rq->online) { const struct sched_class *class; + update_rq_clock(rq); for_each_class(class) { if (class->rq_offline) class->rq_offline(rq); @@ -6484,6 +8137,30 @@ void set_rq_offline(struct rq *rq) } } +static inline void sched_set_rq_online(struct rq *rq, int cpu) +{ + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + rq_unlock_irqrestore(rq, &rf); +} + +static inline void sched_set_rq_offline(struct rq *rq, int cpu) +{ + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + rq_unlock_irqrestore(rq, &rf); +} + /* * used to mark begin/end of suspend/resume: */ @@ -6519,38 +8196,56 @@ static void cpuset_cpu_active(void) cpuset_update_active_cpus(); } -static int cpuset_cpu_inactive(unsigned int cpu) +static void cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - if (dl_cpu_busy(cpu)) - return -EBUSY; cpuset_update_active_cpus(); } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); } - return 0; +} + +static inline void sched_smt_present_inc(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_inc_cpuslocked(&sched_smt_present); +#endif +} + +static inline void sched_smt_present_dec(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_dec_cpuslocked(&sched_smt_present); +#endif } int sched_cpu_activate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; -#ifdef CONFIG_SCHED_SMT + /* + * Clear the balance_push callback and prepare to schedule + * regular tasks. + */ + balance_push_set(cpu, false); + /* * When going up, increment the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) == 2) - static_branch_inc_cpuslocked(&sched_smt_present); -#endif + sched_smt_present_inc(cpu); set_cpu_active(cpu, true); if (sched_smp_initialized) { + sched_update_numa(cpu, true); sched_domains_numa_masks_set(cpu); cpuset_cpu_active(); } + scx_rq_activate(rq); + /* * Put the rq online, if not already. This happens: * @@ -6560,46 +8255,67 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); - } - rq_unlock_irqrestore(rq, &rf); + sched_set_rq_online(rq, cpu); return 0; } int sched_cpu_deactivate(unsigned int cpu) { + struct rq *rq = cpu_rq(cpu); int ret; + ret = dl_bw_deactivate(cpu); + + if (ret) + return ret; + + /* + * Remove CPU from nohz.idle_cpus_mask to prevent participating in + * load balancing when not active + */ + nohz_balance_exit_idle(rq); + set_cpu_active(cpu, false); + + /* + * From this point forward, this CPU will refuse to run any task that + * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively + * push those tasks away until this gets cleared, see + * sched_cpu_dying(). + */ + balance_push_set(cpu, true); + /* - * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU - * users of this state to go away such that all new such users will - * observe it. + * We've cleared cpu_active_mask / set balance_push, wait for all + * preempt-disabled and RCU users of this state to go away such that + * all new such users will observe it. + * + * Specifically, we rely on ttwu to no longer target this CPU, see + * ttwu_queue_cond() and is_cpu_allowed(). * - * Do sync before park smpboot threads to take care the rcu boost case. + * Do sync before park smpboot threads to take care the RCU boost case. */ synchronize_rcu(); -#ifdef CONFIG_SCHED_SMT + sched_set_rq_offline(rq, cpu); + + scx_rq_deactivate(rq); + /* * When going down, decrement the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) == 2) - static_branch_dec_cpuslocked(&sched_smt_present); + sched_smt_present_dec(cpu); + +#ifdef CONFIG_SCHED_SMT + sched_core_cpu_deactivate(cpu); #endif if (!sched_smp_initialized) return 0; - ret = cpuset_cpu_inactive(cpu); - if (ret) { - set_cpu_active(cpu, true); - return ret; - } + sched_update_numa(cpu, false); + cpuset_cpu_inactive(cpu); sched_domains_numa_masks_clear(cpu); return 0; } @@ -6614,12 +8330,68 @@ static void sched_rq_cpu_starting(unsigned int cpu) int sched_cpu_starting(unsigned int cpu) { + sched_core_cpu_starting(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); return 0; } #ifdef CONFIG_HOTPLUG_CPU + +/* + * Invoked immediately before the stopper thread is invoked to bring the + * CPU down completely. At this point all per CPU kthreads except the + * hotplug thread (current) and the stopper thread (inactive) have been + * either parked or have been unbound from the outgoing CPU. Ensure that + * any of those which might be on the way out are gone. + * + * If after this point a bound task is being woken on this CPU then the + * responsible hotplug callback has failed to do it's job. + * sched_cpu_dying() will catch it with the appropriate fireworks. + */ +int sched_cpu_wait_empty(unsigned int cpu) +{ + balance_hotplug_wait(); + sched_force_init_mm(); + return 0; +} + +/* + * Since this CPU is going 'away' for a while, fold any nr_active delta we + * might have. Called from the CPU stopper task after ensuring that the + * stopper is the last running task on the CPU, so nr_active count is + * stable. We need to take the tear-down thread which is calling this into + * account, so we hand in adjust = 1 to the load calculation. + * + * Also see the comment "Global load-average calculations". + */ +static void calc_load_migrate(struct rq *rq) +{ + long delta = calc_load_fold_active(rq, 1); + + if (delta) + atomic_long_add(delta, &calc_load_tasks); +} + +static void dump_rq_tasks(struct rq *rq, const char *loglvl) +{ + struct task_struct *g, *p; + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); + for_each_process_thread(g, p) { + if (task_cpu(p) != cpu) + continue; + + if (!task_on_rq_queued(p)) + continue; + + printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm); + } +} + int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6629,25 +8401,23 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); + if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { + WARN(true, "Dying CPU not properly vacated!"); + dump_rq_tasks(rq, KERN_WARNING); } - migrate_tasks(rq, &rf); - BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); update_max_interval(); - nohz_balance_exit_idle(rq); hrtick_clear(rq); + sched_core_cpu_dying(cpu); return 0; } #endif void __init sched_init_smp(void) { - sched_init_numa(); + sched_init_numa(NUMA_NO_NODE); /* * There's no userspace yet to cause hotplug operations; hence all the @@ -6659,8 +8429,9 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) BUG(); + current->flags &= ~PF_NO_SETAFFINITY; sched_init_granularity(); init_sched_rt_class(); @@ -6699,17 +8470,26 @@ struct task_group root_task_group; LIST_HEAD(task_groups); /* Cacheline aligned slab cache for task_group */ -static struct kmem_cache *task_group_cache __read_mostly; +static struct kmem_cache *task_group_cache __ro_after_init; #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); -DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); - void __init sched_init(void) { unsigned long ptr = 0; int i; + /* Make sure the linker didn't screw up */ +#ifdef CONFIG_SMP + BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); +#endif + BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); + BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); + BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); +#ifdef CONFIG_SCHED_CLASS_EXT + BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); +#endif + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6729,8 +8509,11 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); + init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_EXT_GROUP_SCHED + root_task_group.scx_weight = CGROUP_WEIGHT_DFL; +#endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -6740,17 +8523,6 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } -#ifdef CONFIG_CPUMASK_OFFSTACK - for_each_possible_cpu(i) { - per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - } -#endif /* CONFIG_CPUMASK_OFFSTACK */ - - init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); - init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); #ifdef CONFIG_SMP init_defrootdomain(); @@ -6774,7 +8546,7 @@ void __init sched_init(void) struct rq *rq; rq = cpu_rq(i); - raw_spin_lock_init(&rq->lock); + raw_spin_lock_init(&rq->__lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; @@ -6787,7 +8559,7 @@ void __init sched_init(void) /* * How much CPU bandwidth does root_task_group get? * - * In case of task-groups formed thr' the cgroup filesystem, it + * In case of task-groups formed through the cgroup filesystem, it * gets 100% of the CPU resources in the system. This overall * system CPU resource is divided among the tasks of * root_task_group and its child task-groups in a fair manner, @@ -6806,15 +8578,20 @@ void __init sched_init(void) init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ - rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED + /* + * This is required for init cpu because rt.c:__enable_runtime() + * starts working after scheduler_running, which is not the case + * yet. + */ + rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; - rq->balance_callback = NULL; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->balance_callback = &balance_push_callback; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -6831,72 +8608,117 @@ void __init sched_init(void) rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); - rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); + INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); +#endif +#ifdef CONFIG_HOTPLUG_CPU + rcuwait_init(&rq->hotplug_wait); #endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); + fair_server_init(rq); + +#ifdef CONFIG_SCHED_CORE + rq->core = rq; + rq->core_pick = NULL; + rq->core_dl_server = NULL; + rq->core_enabled = 0; + rq->core_tree = RB_ROOT; + rq->core_forceidle_count = 0; + rq->core_forceidle_occupation = 0; + rq->core_forceidle_start = 0; + + rq->core_cookie = 0UL; +#endif + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); } set_load_weight(&init_task, false); + init_task.se.slice = sysctl_sched_base_slice, /* * The boot idle thread does lazy MMU switching as well: */ - mmgrab(&init_mm); + mmgrab_lazy_tlb(&init_mm); enter_lazy_tlb(&init_mm, current); /* + * The idle task doesn't need the kthread struct to function, but it + * is dressed up as a per-CPU kthread and thus needs to play the part + * if we want to avoid special-casing it in code that deals with per-CPU + * kthreads. + */ + WARN_ON(!set_kthread_struct(current)); + + /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again * when this runqueue becomes "idle". */ + __sched_fork(0, current); init_idle(current, smp_processor_id()); calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); + balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); - - init_schedstats(); + init_sched_ext_class(); psi_init(); init_uclamp(); + preempt_dynamic_init(); + scheduler_running = 1; } #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -static inline int preempt_count_equals(int preempt_offset) -{ - int nested = preempt_count() + rcu_preempt_depth(); - return (nested == preempt_offset); -} - -void __might_sleep(const char *file, int line, int preempt_offset) +void __might_sleep(const char *file, int line) { + unsigned int state = get_current_state(); /* * Blocking primitives will set (and therefore destroy) current->state, * since we will exit with TASK_RUNNING make sure we enter with it, * otherwise we will destroy state. */ - WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, + WARN_ONCE(state != TASK_RUNNING && current->task_state_change, "do not call blocking ops when !TASK_RUNNING; " - "state=%lx set at [<%p>] %pS\n", - current->state, + "state=%x set at [<%p>] %pS\n", state, (void *)current->task_state_change, (void *)current->task_state_change); - ___might_sleep(file, line, preempt_offset); + __might_resched(file, line, 0); } EXPORT_SYMBOL(__might_sleep); -void ___might_sleep(const char *file, int line, int preempt_offset) +static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) +{ + if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) + return; + + if (preempt_count() == preempt_offset) + return; + + pr_err("Preemption disabled at:"); + print_ip_sym(KERN_ERR, ip); +} + +static inline bool resched_offsets_ok(unsigned int offsets) +{ + unsigned int nested = preempt_count(); + + nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; + + return nested == offsets; +} + +void __might_resched(const char *file, int line, unsigned int offsets) { /* Ratelimiting timestamp: */ static unsigned long prev_jiffy; @@ -6906,7 +8728,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) /* WARN_ON_ONCE() by default, no rate limit required: */ rcu_sleep_check(); - if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + if ((resched_offsets_ok(offsets) && !irqs_disabled() && !is_idle_task(current) && !current->non_block_count) || system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || oops_in_progress) @@ -6919,29 +8741,33 @@ void ___might_sleep(const char *file, int line, int preempt_offset) /* Save this before calling printk(), since that will clobber it: */ preempt_disable_ip = get_preempt_disable_ip(current); - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), current->non_block_count, - current->pid, current->comm); + pr_err("BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->non_block_count, + current->pid, current->comm); + pr_err("preempt_count: %x, expected: %x\n", preempt_count(), + offsets & MIGHT_RESCHED_PREEMPT_MASK); + + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { + pr_err("RCU nest depth: %d, expected: %u\n", + rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); + } if (task_stack_end_corrupted(current)) - printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + pr_emerg("Thread overran stack, or stack corrupted\n"); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); - if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) - && !preempt_count_equals(preempt_offset)) { - pr_err("Preemption disabled at:"); - print_ip_sym(KERN_ERR, preempt_disable_ip); - } + + print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, + preempt_disable_ip); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } -EXPORT_SYMBOL(___might_sleep); +EXPORT_SYMBOL(__might_resched); void __cant_sleep(const char *file, int line, int preempt_offset) { @@ -6970,6 +8796,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_sleep); + +#ifdef CONFIG_SMP +void __cant_migrate(const char *file, int line) +{ + static unsigned long prev_jiffy; + + if (irqs_disabled()) + return; + + if (is_migration_disabled(current)) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + if (preempt_count() > 0) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), is_migration_disabled(current), + current->pid, current->comm); + + debug_show_held_locks(current); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_migrate); +#endif #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -6989,11 +8848,11 @@ void normalize_rt_tasks(void) continue; p->se.exec_start = 0; - schedstat_set(p->se.statistics.wait_start, 0); - schedstat_set(p->se.statistics.sleep_start, 0); - schedstat_set(p->se.statistics.block_start, 0); + schedstat_set(p->stats.wait_start, 0); + schedstat_set(p->stats.sleep_start, 0); + schedstat_set(p->stats.block_start, 0); - if (!dl_task(p) && !rt_task(p)) { + if (!rt_or_dl_task(p)) { /* * Renice negative nice level userspace * tasks back to 0: @@ -7010,9 +8869,9 @@ void normalize_rt_tasks(void) #endif /* CONFIG_MAGIC_SYSRQ */ -#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) +#if defined(CONFIG_KGDB_KDB) /* - * These functions are only useful for the IA64 MCA handling, or kdb. + * These functions are only useful for KDB. * * They can only be called when the whole system has been * stopped - every CPU needs to be quiescent, and no scheduling @@ -7034,30 +8893,7 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } -#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ - -#ifdef CONFIG_IA64 -/** - * ia64_set_curr_task - set the current task for a given CPU. - * @cpu: the processor in question. - * @p: the task pointer to set. - * - * Description: This function must only be used when non-maskable interrupts - * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a CPU in a non-blocking manner. This function - * must be called with all CPU's synchronized, and interrupts disabled, the - * and caller must save the original value of the current task (see - * curr_task() above) and restore that value before reenabling interrupts and - * re-starting the system. - * - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! - */ -void ia64_set_curr_task(int cpu, struct task_struct *p) -{ - cpu_curr(cpu) = p; -} - -#endif +#endif /* defined(CONFIG_KGDB_KDB) */ #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ @@ -7085,6 +8921,22 @@ static void sched_free_group(struct task_group *tg) kmem_cache_free(task_group_cache, tg); } +static void sched_free_group_rcu(struct rcu_head *rcu) +{ + sched_free_group(container_of(rcu, struct task_group, rcu)); +} + +static void sched_unregister_group(struct task_group *tg) +{ + unregister_fair_sched_group(tg); + unregister_rt_sched_group(tg); + /* + * We have to wait for yet another RCU grace period to expire, as + * print_cfs_stats() might run concurrently. + */ + call_rcu(&tg->rcu, sched_free_group_rcu); +} + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(struct task_group *parent) { @@ -7100,6 +8952,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); alloc_uclamp_sched_group(tg, parent); return tg; @@ -7127,33 +8980,43 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) online_fair_sched_group(tg); } -/* rcu callback to free various structures associated with a task group */ -static void sched_free_group_rcu(struct rcu_head *rhp) +/* RCU callback to free various structures associated with a task group */ +static void sched_unregister_group_rcu(struct rcu_head *rhp) { /* Now it should be safe to free those cfs_rqs: */ - sched_free_group(container_of(rhp, struct task_group, rcu)); + sched_unregister_group(container_of(rhp, struct task_group, rcu)); } void sched_destroy_group(struct task_group *tg) { /* Wait for possible concurrent references to cfs_rqs complete: */ - call_rcu(&tg->rcu, sched_free_group_rcu); + call_rcu(&tg->rcu, sched_unregister_group_rcu); } -void sched_offline_group(struct task_group *tg) +void sched_release_group(struct task_group *tg) { unsigned long flags; - /* End participation in shares distribution: */ - unregister_fair_sched_group(tg); - + /* + * Unlink first, to avoid walk_tg_tree_from() from finding us (via + * sched_cfs_period_timer()). + * + * For this to be effective, we have to wait for all pending users of + * this task group to leave their RCU critical section to ensure no new + * user will see our dying task group any more. Specifically ensure + * that tg_unthrottle_up() won't add decayed cfs_rq's to it. + * + * We therefore defer calling unregister_fair_sched_group() to + * sched_unregister_group() which is guarantied to get called only after the + * current RCU grace period has expired. + */ spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); } -static void sched_change_group(struct task_struct *tsk, int type) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -7169,7 +9032,7 @@ static void sched_change_group(struct task_struct *tsk, int type) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) - tsk->sched_class->task_change_group(tsk, type); + tsk->sched_class->task_change_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); @@ -7182,17 +9045,18 @@ static void sched_change_group(struct task_struct *tsk, int type) * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect * its new group. */ -void sched_move_task(struct task_struct *tsk) +void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &rf); + CLASS(task_rq_lock, rq_guard)(tsk); + rq = rq_guard.rq; + update_rq_clock(rq); - running = task_current(rq, tsk); + running = task_current_donor(rq, tsk); queued = task_on_rq_queued(tsk); if (queued) @@ -7200,7 +9064,9 @@ void sched_move_task(struct task_struct *tsk) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, TASK_MOVE_GROUP); + sched_change_group(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -7213,13 +9079,6 @@ void sched_move_task(struct task_struct *tsk) */ resched_curr(rq); } - - task_rq_unlock(rq, tsk, &rf); -} - -static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct task_group, css) : NULL; } static struct cgroup_subsys_state * @@ -7245,23 +9104,37 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); + int ret; + + ret = scx_tg_online(tg); + if (ret) + return ret; if (parent) sched_online_group(tg, parent); #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new group */ + guard(mutex)(&uclamp_mutex); + guard(rcu)(); cpu_util_update_eff(css); #endif return 0; } +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + scx_tg_offline(tg); +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_offline_group(tg); + sched_release_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) @@ -7271,55 +9144,21 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) /* * Relies on the RCU grace period between css_released() and this. */ - sched_free_group(tg); -} - -/* - * This is called before wake_up_new_task(), therefore we really only - * have to set its group bits, all the other stuff does not apply. - */ -static void cpu_cgroup_fork(struct task_struct *task) -{ - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(task, &rf); - - update_rq_clock(rq); - sched_change_group(task, TASK_SET_GROUP); - - task_rq_unlock(rq, task, &rf); + sched_unregister_group(tg); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_RT_GROUP_SCHED struct task_struct *task; struct cgroup_subsys_state *css; - int ret = 0; cgroup_taskset_for_each(task, css, tset) { -#ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; -#endif - /* - * Serialize against wake_up_new_task() such that if its - * running, we're sure to observe its full state. - */ - raw_spin_lock_irq(&task->pi_lock); - /* - * Avoid calling sched_move_task() before wake_up_new_task() - * has happened. This would lead to problems with PELT, due to - * move wanting to detach+attach while we're not attached yet. - */ - if (task->state == TASK_NEW) - ret = -EINVAL; - raw_spin_unlock_irq(&task->pi_lock); - - if (ret) - break; } - return ret; +#endif + return scx_cgroup_can_attach(tset); } static void cpu_cgroup_attach(struct cgroup_taskset *tset) @@ -7328,7 +9167,14 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); + sched_move_task(task, false); + + scx_cgroup_finish_attach(); +} + +static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + scx_cgroup_cancel_attach(tset); } #ifdef CONFIG_UCLAMP_TASK_GROUP @@ -7341,6 +9187,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) enum uclamp_id clamp_id; unsigned int clamps; + lockdep_assert_held(&uclamp_mutex); + SCHED_WARN_ON(!rcu_read_lock_held()); + css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL; @@ -7373,7 +9222,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) } /* Immediately update descendants RUNNABLE tasks */ - uclamp_update_active_tasks(css, clamps); + uclamp_update_active_tasks(css); } } @@ -7431,8 +9280,10 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; - mutex_lock(&uclamp_mutex); - rcu_read_lock(); + static_branch_enable(&sched_uclamp_used); + + guard(mutex)(&uclamp_mutex); + guard(rcu)(); tg = css_tg(of_css(of)); if (tg->uclamp_req[clamp_id].value != req.util) @@ -7447,9 +9298,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, /* Update effective clamps to track the most restrictive value */ cpu_util_update_eff(of_css(of)); - rcu_read_unlock(); - mutex_unlock(&uclamp_mutex); - return nbytes; } @@ -7475,10 +9323,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf, u64 percent; u32 rem; - rcu_read_lock(); - tg = css_tg(seq_css(sf)); - util_clamp = tg->uclamp_req[clamp_id].value; - rcu_read_unlock(); + scoped_guard (rcu) { + tg = css_tg(seq_css(sf)); + util_clamp = tg->uclamp_req[clamp_id].value; + } if (util_clamp == SCHED_CAPACITY_SCALE) { seq_puts(sf, "max\n"); @@ -7503,22 +9351,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) } #endif /* CONFIG_UCLAMP_TASK_GROUP */ +#ifdef CONFIG_GROUP_SCHED_WEIGHT +static unsigned long tg_weight(struct task_group *tg) +{ #ifdef CONFIG_FAIR_GROUP_SCHED + return scale_load_down(tg->shares); +#else + return sched_weight_from_cgroup(tg->scx_weight); +#endif +} + static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + int ret; + if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; - return sched_group_set_shares(css_tg(css), scale_load(shareval)); + ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(shareval)); + return ret; } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct task_group *tg = css_tg(css); - - return (u64) scale_load_down(tg->shares); + return tg_weight(css_tg(css)); } +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ #ifdef CONFIG_CFS_BANDWIDTH static DEFINE_MUTEX(cfs_constraints_mutex); @@ -7530,7 +9392,8 @@ static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); -static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, + u64 burst) { int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; @@ -7547,7 +9410,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) return -EINVAL; /* - * Likewise, bound things on the otherside by preventing insane quota + * Likewise, bound things on the other side by preventing insane quota * periods. This also allows us to normalize in computing quota * feasibility. */ @@ -7560,15 +9423,20 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (quota != RUNTIME_INF && quota > max_cfs_runtime) return -EINVAL; + if (quota != RUNTIME_INF && (burst > quota || + burst + quota > max_cfs_runtime)) + return -EINVAL; + /* * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ - get_online_cpus(); - mutex_lock(&cfs_constraints_mutex); + guard(cpus_read_lock)(); + guard(mutex)(&cfs_constraints_mutex); + ret = __cfs_schedulable(tg, period, quota); if (ret) - goto out_unlock; + return ret; runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; @@ -7578,45 +9446,46 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) */ if (runtime_enabled && !runtime_was_enabled) cfs_bandwidth_usage_inc(); - raw_spin_lock_irq(&cfs_b->lock); - cfs_b->period = ns_to_ktime(period); - cfs_b->quota = quota; - __refill_cfs_bandwidth_runtime(cfs_b); + scoped_guard (raw_spinlock_irq, &cfs_b->lock) { + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; + cfs_b->burst = burst; - /* Restart the period timer (if active) to handle new period expiry: */ - if (runtime_enabled) - start_cfs_bandwidth(cfs_b); + __refill_cfs_bandwidth_runtime(cfs_b); - raw_spin_unlock_irq(&cfs_b->lock); + /* + * Restart the period timer (if active) to handle new + * period expiry: + */ + if (runtime_enabled) + start_cfs_bandwidth(cfs_b); + } for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; - struct rq_flags rf; - rq_lock_irq(rq, &rf); + guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - rq_unlock_irq(rq, &rf); } + if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); -out_unlock: - mutex_unlock(&cfs_constraints_mutex); - put_online_cpus(); - return ret; + return 0; } static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { - u64 quota, period; + u64 quota, period, burst; period = ktime_to_ns(tg->cfs_bandwidth.period); + burst = tg->cfs_bandwidth.burst; if (cfs_quota_us < 0) quota = RUNTIME_INF; else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) @@ -7624,7 +9493,7 @@ static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) else return -EINVAL; - return tg_set_cfs_bandwidth(tg, period, quota); + return tg_set_cfs_bandwidth(tg, period, quota, burst); } static long tg_get_cfs_quota(struct task_group *tg) @@ -7642,15 +9511,16 @@ static long tg_get_cfs_quota(struct task_group *tg) static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { - u64 quota, period; + u64 quota, period, burst; if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) return -EINVAL; period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; + burst = tg->cfs_bandwidth.burst; - return tg_set_cfs_bandwidth(tg, period, quota); + return tg_set_cfs_bandwidth(tg, period, quota, burst); } static long tg_get_cfs_period(struct task_group *tg) @@ -7663,6 +9533,30 @@ static long tg_get_cfs_period(struct task_group *tg) return cfs_period_us; } +static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) +{ + u64 quota, period, burst; + + if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + + burst = (u64)cfs_burst_us * NSEC_PER_USEC; + period = ktime_to_ns(tg->cfs_bandwidth.period); + quota = tg->cfs_bandwidth.quota; + + return tg_set_cfs_bandwidth(tg, period, quota, burst); +} + +static long tg_get_cfs_burst(struct task_group *tg) +{ + u64 burst_us; + + burst_us = tg->cfs_bandwidth.burst; + do_div(burst_us, NSEC_PER_USEC); + + return burst_us; +} + static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -7687,6 +9581,18 @@ static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, return tg_set_cfs_period(css_tg(css), cfs_period_us); } +static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_cfs_burst(css_tg(css)); +} + +static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_burst_us) +{ + return tg_set_cfs_burst(css_tg(css), cfs_burst_us); +} + struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -7732,11 +9638,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) /* * Ensure max(child_quota) <= parent_quota. On cgroup2, - * always take the min. On cgroup1, only inherit when no - * limit is set: + * always take the non-RUNTIME_INF min. On cgroup1, only + * inherit when no limit is set. In both cases this is used + * by the scheduler to determine if a given CFS task has a + * bandwidth constraint at some higher level. */ if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { - quota = min(quota, parent_quota); + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF) + quota = min(quota, parent_quota); } else { if (quota == RUNTIME_INF) quota = parent_quota; @@ -7751,7 +9662,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) { - int ret; struct cfs_schedulable_data data = { .tg = tg, .period = period, @@ -7763,11 +9673,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) do_div(data.quota, NSEC_PER_USEC); } - rcu_read_lock(); - ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); } static int cpu_cfs_stat_show(struct seq_file *sf, void *v) @@ -7780,19 +9687,45 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); if (schedstat_enabled() && tg != &root_task_group) { + struct sched_statistics *stats; u64 ws = 0; int i; - for_each_possible_cpu(i) - ws += schedstat_val(tg->se[i]->statistics.wait_sum); + for_each_possible_cpu(i) { + stats = __schedstats_from_se(tg->se[i]); + ws += schedstat_val(stats->wait_sum); + } seq_printf(sf, "wait_sum %llu\n", ws); } + seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst); + seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time); + + return 0; +} + +static u64 throttled_time_self(struct task_group *tg) +{ + int i; + u64 total = 0; + + for_each_possible_cpu(i) { + total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); + } + + return total; +} + +static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + + seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg)); + return 0; } #endif /* CONFIG_CFS_BANDWIDTH */ -#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, @@ -7820,13 +9753,37 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_GROUP_SCHED_WEIGHT +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->idle; +} + +static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 idle) +{ + int ret; + + ret = sched_group_set_idle(css_tg(css), idle); + if (!ret) + scx_group_set_idle(css_tg(css), idle); + return ret; +} +#endif + static struct cftype cpu_legacy_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT { .name = "shares", .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, + { + .name = "idle", + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -7840,9 +9797,18 @@ static struct cftype cpu_legacy_files[] = { .write_u64 = cpu_cfs_period_write_u64, }, { + .name = "cfs_burst_us", + .read_u64 = cpu_cfs_burst_read_u64, + .write_u64 = cpu_cfs_burst_write_u64, + }, + { .name = "stat", .seq_show = cpu_cfs_stat_show, }, + { + .name = "stat.local", + .seq_show = cpu_cfs_local_stat_show, + }, #endif #ifdef CONFIG_RT_GROUP_SCHED { @@ -7880,53 +9846,72 @@ static int cpu_extra_stat_show(struct seq_file *sf, { struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - u64 throttled_usec; + u64 throttled_usec, burst_usec; throttled_usec = cfs_b->throttled_time; do_div(throttled_usec, NSEC_PER_USEC); + burst_usec = cfs_b->burst_time; + do_div(burst_usec, NSEC_PER_USEC); seq_printf(sf, "nr_periods %d\n" "nr_throttled %d\n" - "throttled_usec %llu\n", + "throttled_usec %llu\n" + "nr_bursts %d\n" + "burst_usec %llu\n", cfs_b->nr_periods, cfs_b->nr_throttled, - throttled_usec); + throttled_usec, cfs_b->nr_burst, burst_usec); } #endif return 0; } -#ifdef CONFIG_FAIR_GROUP_SCHED +static int cpu_local_stat_show(struct seq_file *sf, + struct cgroup_subsys_state *css) +{ +#ifdef CONFIG_CFS_BANDWIDTH + { + struct task_group *tg = css_tg(css); + u64 throttled_self_usec; + + throttled_self_usec = throttled_time_self(tg); + do_div(throttled_self_usec, NSEC_PER_USEC); + + seq_printf(sf, "throttled_usec %llu\n", + throttled_self_usec); + } +#endif + return 0; +} + +#ifdef CONFIG_GROUP_SCHED_WEIGHT + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct task_group *tg = css_tg(css); - u64 weight = scale_load_down(tg->shares); - - return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); + return sched_weight_to_cgroup(tg_weight(css_tg(css))); } static int cpu_weight_write_u64(struct cgroup_subsys_state *css, - struct cftype *cft, u64 weight) + struct cftype *cft, u64 cgrp_weight) { - /* - * cgroup weight knobs should use the common MIN, DFL and MAX - * values which are 1, 100 and 10000 respectively. While it loses - * a bit of range on both ends, it maps pretty well onto the shares - * value used by scheduler and the round-trip conversions preserve - * the original value over the entire range. - */ - if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + unsigned long weight; + int ret; + + if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) return -ERANGE; - weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + weight = sched_weight_from_cgroup(cgrp_weight); - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), cgrp_weight); + return ret; } static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - unsigned long weight = scale_load_down(css_tg(css)->shares); + unsigned long weight = tg_weight(css_tg(css)); int last_delta = INT_MAX; int prio, delta; @@ -7945,7 +9930,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; - int idx; + int idx, ret; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; @@ -7954,9 +9939,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, idx = array_index_nospec(idx, 40); weight = sched_prio_to_weight[idx]; - return sched_group_set_shares(css_tg(css), scale_load(weight)); + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); + if (!ret) + scx_group_set_weight(css_tg(css), + sched_weight_to_cgroup(weight)); + return ret; } -#endif +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, long period, long quota) @@ -8004,18 +9993,19 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, { struct task_group *tg = css_tg(of_css(of)); u64 period = tg_get_cfs_period(tg); + u64 burst = tg->cfs_bandwidth.burst; u64 quota; int ret; ret = cpu_period_quota_parse(buf, &period, "a); if (!ret) - ret = tg_set_cfs_bandwidth(tg, period, quota); + ret = tg_set_cfs_bandwidth(tg, period, quota, burst); return ret ?: nbytes; } #endif static struct cftype cpu_files[] = { -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, @@ -8028,6 +10018,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, + { + .name = "idle", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -8036,6 +10032,12 @@ static struct cftype cpu_files[] = { .seq_show = cpu_max_show, .write = cpu_max_write, }, + { + .name = "max.burst", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_cfs_burst_read_u64, + .write_u64 = cpu_cfs_burst_write_u64, + }, #endif #ifdef CONFIG_UCLAMP_TASK_GROUP { @@ -8057,12 +10059,14 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, - .fork = cpu_cgroup_fork, + .css_local_stat_show = cpu_local_stat_show, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, + .cancel_attach = cpu_cgroup_cancel_attach, .legacy_cftypes = cpu_legacy_files, .dfl_cftypes = cpu_files, .early_init = true, @@ -8073,6 +10077,19 @@ struct cgroup_subsys cpu_cgrp_subsys = { void dump_cpu_task(int cpu) { + if (in_hardirq() && cpu == smp_processor_id()) { + struct pt_regs *regs; + + regs = get_irq_regs(); + if (regs) { + show_regs(regs); + return; + } + } + + if (trigger_single_cpu_backtrace(cpu)) + return; + pr_info("Task dump for CPU %d:\n", cpu); sched_show_task(cpu_curr(cpu)); } @@ -8101,10 +10118,10 @@ const int sched_prio_to_weight[40] = { }; /* - * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, pre-calculated. * * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions + * pre-calculated inverse to speed up arithmetics by turning divisions * into multiplications: */ const u32 sched_prio_to_wmult[40] = { @@ -8118,4 +10135,564 @@ const u32 sched_prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -#undef CREATE_TRACE_POINTS +void call_trace_sched_update_nr_running(struct rq *rq, int count) +{ + trace_sched_update_nr_running_tp(rq, count); +} + +#ifdef CONFIG_SCHED_MM_CID + +/* + * @cid_lock: Guarantee forward-progress of cid allocation. + * + * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock + * is only used when contention is detected by the lock-free allocation so + * forward progress can be guaranteed. + */ +DEFINE_RAW_SPINLOCK(cid_lock); + +/* + * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. + * + * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is + * detected, it is set to 1 to ensure that all newly coming allocations are + * serialized by @cid_lock until the allocation which detected contention + * completes and sets @use_cid_lock back to 0. This guarantees forward progress + * of a cid allocation. + */ +int use_cid_lock; + +/* + * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid + * concurrently with respect to the execution of the source runqueue context + * switch. + * + * There is one basic properties we want to guarantee here: + * + * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively + * used by a task. That would lead to concurrent allocation of the cid and + * userspace corruption. + * + * Provide this guarantee by introducing a Dekker memory ordering to guarantee + * that a pair of loads observe at least one of a pair of stores, which can be + * shown as: + * + * X = Y = 0 + * + * w[X]=1 w[Y]=1 + * MB MB + * r[Y]=y r[X]=x + * + * Which guarantees that x==0 && y==0 is impossible. But rather than using + * values 0 and 1, this algorithm cares about specific state transitions of the + * runqueue current task (as updated by the scheduler context switch), and the + * per-mm/cpu cid value. + * + * Let's introduce task (Y) which has task->mm == mm and task (N) which has + * task->mm != mm for the rest of the discussion. There are two scheduler state + * transitions on context switch we care about: + * + * (TSA) Store to rq->curr with transition from (N) to (Y) + * + * (TSB) Store to rq->curr with transition from (Y) to (N) + * + * On the remote-clear side, there is one transition we care about: + * + * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag + * + * There is also a transition to UNSET state which can be performed from all + * sides (scheduler, remote-clear). It is always performed with a cmpxchg which + * guarantees that only a single thread will succeed: + * + * (TMB) cmpxchg to *pcpu_cid to mark UNSET + * + * Just to be clear, what we do _not_ want to happen is a transition to UNSET + * when a thread is actively using the cid (property (1)). + * + * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. + * + * Scenario A) (TSA)+(TMA) (from next task perspective) + * + * CPU0 CPU1 + * + * Context switch CS-1 Remote-clear + * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) + * (implied barrier after cmpxchg) + * - switch_mm_cid() + * - memory barrier (see switch_mm_cid() + * comment explaining how this barrier + * is combined with other scheduler + * barriers) + * - mm_cid_get (next) + * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) + * + * This Dekker ensures that either task (Y) is observed by the + * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are + * observed. + * + * If task (Y) store is observed by rcu_dereference(), it means that there is + * still an active task on the cpu. Remote-clear will therefore not transition + * to UNSET, which fulfills property (1). + * + * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), + * it will move its state to UNSET, which clears the percpu cid perhaps + * uselessly (which is not an issue for correctness). Because task (Y) is not + * observed, CPU1 can move ahead to set the state to UNSET. Because moving + * state to UNSET is done with a cmpxchg expecting that the old state has the + * LAZY flag set, only one thread will successfully UNSET. + * + * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 + * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and + * CPU1 will observe task (Y) and do nothing more, which is fine. + * + * What we are effectively preventing with this Dekker is a scenario where + * neither LAZY flag nor store (Y) are observed, which would fail property (1) + * because this would UNSET a cid which is actively used. + */ + +void sched_mm_cid_migrate_from(struct task_struct *t) +{ + t->migrate_from_cpu = task_cpu(t); +} + +static +int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, + struct task_struct *t, + struct mm_cid *src_pcpu_cid) +{ + struct mm_struct *mm = t->mm; + struct task_struct *src_task; + int src_cid, last_mm_cid; + + if (!mm) + return -1; + + last_mm_cid = t->last_mm_cid; + /* + * If the migrated task has no last cid, or if the current + * task on src rq uses the cid, it means the source cid does not need + * to be moved to the destination cpu. + */ + if (last_mm_cid == -1) + return -1; + src_cid = READ_ONCE(src_pcpu_cid->cid); + if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) + return -1; + + /* + * If we observe an active task using the mm on this rq, it means we + * are not the last task to be migrated from this cpu for this mm, so + * there is no need to move src_cid to the destination cpu. + */ + guard(rcu)(); + src_task = rcu_dereference(src_rq->curr); + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { + t->last_mm_cid = -1; + return -1; + } + + return src_cid; +} + +static +int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, + struct task_struct *t, + struct mm_cid *src_pcpu_cid, + int src_cid) +{ + struct task_struct *src_task; + struct mm_struct *mm = t->mm; + int lazy_cid; + + if (src_cid == -1) + return -1; + + /* + * Attempt to clear the source cpu cid to move it to the destination + * cpu. + */ + lazy_cid = mm_cid_set_lazy_put(src_cid); + if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) + return -1; + + /* + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm matches the scheduler barrier in context_switch() + * between store to rq->curr and load of prev and next task's + * per-mm/cpu cid. + * + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm_cid_active matches the barrier in + * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and + * sched_mm_cid_after_execve() between store to t->mm_cid_active and + * load of per-mm/cpu cid. + */ + + /* + * If we observe an active task using the mm on this rq after setting + * the lazy-put flag, this task will be responsible for transitioning + * from lazy-put flag set to MM_CID_UNSET. + */ + scoped_guard (rcu) { + src_task = rcu_dereference(src_rq->curr); + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { + /* + * We observed an active task for this mm, there is therefore + * no point in moving this cid to the destination cpu. + */ + t->last_mm_cid = -1; + return -1; + } + } + + /* + * The src_cid is unused, so it can be unset. + */ + if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) + return -1; + WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); + return src_cid; +} + +/* + * Migration to dst cpu. Called with dst_rq lock held. + * Interrupts are disabled, which keeps the window of cid ownership without the + * source rq lock held small. + */ +void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) +{ + struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; + struct mm_struct *mm = t->mm; + int src_cid, src_cpu; + bool dst_cid_is_set; + struct rq *src_rq; + + lockdep_assert_rq_held(dst_rq); + + if (!mm) + return; + src_cpu = t->migrate_from_cpu; + if (src_cpu == -1) { + t->last_mm_cid = -1; + return; + } + /* + * Move the src cid if the dst cid is unset. This keeps id + * allocation closest to 0 in cases where few threads migrate around + * many CPUs. + * + * If destination cid or recent cid is already set, we may have + * to just clear the src cid to ensure compactness in frequent + * migrations scenarios. + * + * It is not useful to clear the src cid when the number of threads is + * greater or equal to the number of allowed CPUs, because user-space + * can expect that the number of allowed cids can reach the number of + * allowed CPUs. + */ + dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); + dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || + !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); + if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) + return; + src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); + src_rq = cpu_rq(src_cpu); + src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); + if (src_cid == -1) + return; + src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, + src_cid); + if (src_cid == -1) + return; + if (dst_cid_is_set) { + __mm_cid_put(mm, src_cid); + return; + } + /* Move src_cid to dst cpu. */ + mm_cid_snapshot_time(dst_rq, mm); + WRITE_ONCE(dst_pcpu_cid->cid, src_cid); + WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); +} + +static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, + int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *t; + int cid, lazy_cid; + + cid = READ_ONCE(pcpu_cid->cid); + if (!mm_cid_is_valid(cid)) + return; + + /* + * Clear the cpu cid if it is set to keep cid allocation compact. If + * there happens to be other tasks left on the source cpu using this + * mm, the next task using this mm will reallocate its cid on context + * switch. + */ + lazy_cid = mm_cid_set_lazy_put(cid); + if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) + return; + + /* + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm matches the scheduler barrier in context_switch() + * between store to rq->curr and load of prev and next task's + * per-mm/cpu cid. + * + * The implicit barrier after cmpxchg per-mm/cpu cid before loading + * rq->curr->mm_cid_active matches the barrier in + * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and + * sched_mm_cid_after_execve() between store to t->mm_cid_active and + * load of per-mm/cpu cid. + */ + + /* + * If we observe an active task using the mm on this rq after setting + * the lazy-put flag, that task will be responsible for transitioning + * from lazy-put flag set to MM_CID_UNSET. + */ + scoped_guard (rcu) { + t = rcu_dereference(rq->curr); + if (READ_ONCE(t->mm_cid_active) && t->mm == mm) + return; + } + + /* + * The cid is unused, so it can be unset. + * Disable interrupts to keep the window of cid ownership without rq + * lock small. + */ + scoped_guard (irqsave) { + if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) + __mm_cid_put(mm, cid); + } +} + +static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct mm_cid *pcpu_cid; + struct task_struct *curr; + u64 rq_clock; + + /* + * rq->clock load is racy on 32-bit but one spurious clear once in a + * while is irrelevant. + */ + rq_clock = READ_ONCE(rq->clock); + pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); + + /* + * In order to take care of infrequently scheduled tasks, bump the time + * snapshot associated with this cid if an active task using the mm is + * observed on this rq. + */ + scoped_guard (rcu) { + curr = rcu_dereference(rq->curr); + if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { + WRITE_ONCE(pcpu_cid->time, rq_clock); + return; + } + } + + if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) + return; + sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); +} + +static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, + int weight) +{ + struct mm_cid *pcpu_cid; + int cid; + + pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); + cid = READ_ONCE(pcpu_cid->cid); + if (!mm_cid_is_valid(cid) || cid < weight) + return; + sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); +} + +static void task_mm_cid_work(struct callback_head *work) +{ + unsigned long now = jiffies, old_scan, next_scan; + struct task_struct *t = current; + struct cpumask *cidmask; + struct mm_struct *mm; + int weight, cpu; + + SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); + + work->next = work; /* Prevent double-add */ + if (t->flags & PF_EXITING) + return; + mm = t->mm; + if (!mm) + return; + old_scan = READ_ONCE(mm->mm_cid_next_scan); + next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); + if (!old_scan) { + unsigned long res; + + res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); + if (res != old_scan) + old_scan = res; + else + old_scan = next_scan; + } + if (time_before(now, old_scan)) + return; + if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) + return; + cidmask = mm_cidmask(mm); + /* Clear cids that were not recently used. */ + for_each_possible_cpu(cpu) + sched_mm_cid_remote_clear_old(mm, cpu); + weight = cpumask_weight(cidmask); + /* + * Clear cids that are greater or equal to the cidmask weight to + * recompact it. + */ + for_each_possible_cpu(cpu) + sched_mm_cid_remote_clear_weight(mm, cpu, weight); +} + +void init_sched_mm_cid(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + int mm_users = 0; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) + mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); + } + t->cid_work.next = &t->cid_work; /* Protect against double add */ + init_task_work(&t->cid_work, task_mm_cid_work); +} + +void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->cid_work; + unsigned long now = jiffies; + + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || + work->next != work) + return; + if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) + return; + + /* No page allocation under rq lock */ + task_work_add(curr, work, TWA_RESUME); +} + +void sched_mm_cid_exit_signals(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + struct rq *rq; + + if (!mm) + return; + + preempt_disable(); + rq = this_rq(); + guard(rq_lock_irqsave)(rq); + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 0); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + mm_cid_put(mm); + t->last_mm_cid = t->mm_cid = -1; +} + +void sched_mm_cid_before_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + struct rq *rq; + + if (!mm) + return; + + preempt_disable(); + rq = this_rq(); + guard(rq_lock_irqsave)(rq); + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 0); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + mm_cid_put(mm); + t->last_mm_cid = t->mm_cid = -1; +} + +void sched_mm_cid_after_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + struct rq *rq; + + if (!mm) + return; + + preempt_disable(); + rq = this_rq(); + scoped_guard (rq_lock_irqsave, rq) { + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 1); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); + } + rseq_set_notify_resume(t); +} + +void sched_mm_cid_fork(struct task_struct *t) +{ + WARN_ON_ONCE(!t->mm || t->mm_cid != -1); + t->mm_cid_active = 1; +} +#endif + +#ifdef CONFIG_SCHED_CLASS_EXT +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); + + *ctx = (struct sched_enq_and_set_ctx){ + .p = p, + .queue_flags = queue_flags, + .queued = task_on_rq_queued(p), + .running = task_current(rq, p), + }; + + update_rq_clock(rq); + if (ctx->queued) + dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + if (ctx->running) + put_prev_task(rq, p); +} + +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +{ + struct rq *rq = task_rq(ctx->p); + + lockdep_assert_rq_held(rq); + + if (ctx->queued) + enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + if (ctx->running) + set_next_task(rq, ctx->p); +} +#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c new file mode 100644 index 000000000000..1ef98a93eb1d --- /dev/null +++ b/kernel/sched/core_sched.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * A simple wrapper around refcount. An allocated sched_core_cookie's + * address is used to compute the cookie of the task. + */ +struct sched_core_cookie { + refcount_t refcnt; +}; + +static unsigned long sched_core_alloc_cookie(void) +{ + struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL); + if (!ck) + return 0; + + refcount_set(&ck->refcnt, 1); + sched_core_get(); + + return (unsigned long)ck; +} + +static void sched_core_put_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr && refcount_dec_and_test(&ptr->refcnt)) { + kfree(ptr); + sched_core_put(); + } +} + +static unsigned long sched_core_get_cookie(unsigned long cookie) +{ + struct sched_core_cookie *ptr = (void *)cookie; + + if (ptr) + refcount_inc(&ptr->refcnt); + + return cookie; +} + +/* + * sched_core_update_cookie - replace the cookie on a task + * @p: the task to update + * @cookie: the new cookie + * + * Effectively exchange the task cookie; caller is responsible for lifetimes on + * both ends. + * + * Returns: the old cookie + */ +static unsigned long sched_core_update_cookie(struct task_struct *p, + unsigned long cookie) +{ + unsigned long old_cookie; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + + /* + * Since creating a cookie implies sched_core_get(), and we cannot set + * a cookie until after we've created it, similarly, we cannot destroy + * a cookie until after we've removed it, we must have core scheduling + * enabled here. + */ + SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); + + if (sched_core_enqueued(p)) + sched_core_dequeue(rq, p, DEQUEUE_SAVE); + + old_cookie = p->core_cookie; + p->core_cookie = cookie; + + /* + * Consider the cases: !prev_cookie and !cookie. + */ + if (cookie && task_on_rq_queued(p)) + sched_core_enqueue(rq, p); + + /* + * If task is currently running, it may not be compatible anymore after + * the cookie change, so enter the scheduler on its CPU to schedule it + * away. + * + * Note that it is possible that as a result of this cookie change, the + * core has now entered/left forced idle state. Defer accounting to the + * next scheduling edge, rather than always forcing a reschedule here. + */ + if (task_on_cpu(rq, p)) + resched_curr(rq); + + task_rq_unlock(rq, p, &rf); + + return old_cookie; +} + +static unsigned long sched_core_clone_cookie(struct task_struct *p) +{ + unsigned long cookie, flags; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + cookie = sched_core_get_cookie(p->core_cookie); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return cookie; +} + +void sched_core_fork(struct task_struct *p) +{ + RB_CLEAR_NODE(&p->core_node); + p->core_cookie = sched_core_clone_cookie(current); +} + +void sched_core_free(struct task_struct *p) +{ + sched_core_put_cookie(p->core_cookie); +} + +static void __sched_core_set(struct task_struct *p, unsigned long cookie) +{ + cookie = sched_core_get_cookie(cookie); + cookie = sched_core_update_cookie(p, cookie); + sched_core_put_cookie(cookie); +} + +/* Called from prctl interface: PR_SCHED_CORE */ +int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr) +{ + unsigned long cookie = 0, id = 0; + struct task_struct *task, *p; + struct pid *grp; + int err = 0; + + if (!static_branch_likely(&sched_smt_present)) + return -ENODEV; + + BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID); + BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD_GROUP != PIDTYPE_TGID); + BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_PROCESS_GROUP != PIDTYPE_PGID); + + if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 || + (cmd != PR_SCHED_CORE_GET && uaddr)) + return -EINVAL; + + rcu_read_lock(); + if (pid == 0) { + task = current; + } else { + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + return -ESRCH; + } + } + get_task_struct(task); + rcu_read_unlock(); + + /* + * Check if this process has the right to modify the specified + * process. Use the regular "ptrace_may_access()" checks. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out; + } + + switch (cmd) { + case PR_SCHED_CORE_GET: + if (type != PIDTYPE_PID || uaddr & 7) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + if (cookie) { + /* XXX improve ? */ + ptr_to_hashval((void *)cookie, &id); + } + err = put_user(id, (u64 __user *)uaddr); + goto out; + + case PR_SCHED_CORE_CREATE: + cookie = sched_core_alloc_cookie(); + if (!cookie) { + err = -ENOMEM; + goto out; + } + break; + + case PR_SCHED_CORE_SHARE_TO: + cookie = sched_core_clone_cookie(current); + break; + + case PR_SCHED_CORE_SHARE_FROM: + if (type != PIDTYPE_PID) { + err = -EINVAL; + goto out; + } + cookie = sched_core_clone_cookie(task); + __sched_core_set(current, cookie); + goto out; + + default: + err = -EINVAL; + goto out; + } + + if (type == PIDTYPE_PID) { + __sched_core_set(task, cookie); + goto out; + } + + read_lock(&tasklist_lock); + grp = task_pid_type(task, type); + + do_each_pid_thread(grp, type, p) { + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) { + err = -EPERM; + goto out_tasklist; + } + } while_each_pid_thread(grp, type, p); + + do_each_pid_thread(grp, type, p) { + __sched_core_set(p, cookie); + } while_each_pid_thread(grp, type, p); +out_tasklist: + read_unlock(&tasklist_lock); + +out: + sched_core_put_cookie(cookie); + put_task_struct(task); + return err; +} + +#ifdef CONFIG_SCHEDSTATS + +/* REQUIRES: rq->core's clock recently updated. */ +void __sched_core_account_forceidle(struct rq *rq) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); + u64 delta, now = rq_clock(rq->core); + struct rq *rq_i; + struct task_struct *p; + int i; + + lockdep_assert_rq_held(rq); + + WARN_ON_ONCE(!rq->core->core_forceidle_count); + + if (rq->core->core_forceidle_start == 0) + return; + + delta = now - rq->core->core_forceidle_start; + if (unlikely((s64)delta <= 0)) + return; + + rq->core->core_forceidle_start = now; + + if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) { + /* can't be forced idle without a running task */ + } else if (rq->core->core_forceidle_count > 1 || + rq->core->core_forceidle_occupation > 1) { + /* + * For larger SMT configurations, we need to scale the charged + * forced idle amount since there can be more than one forced + * idle sibling and more than one running cookied task. + */ + delta *= rq->core->core_forceidle_count; + delta = div_u64(delta, rq->core->core_forceidle_occupation); + } + + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + p = rq_i->core_pick ?: rq_i->curr; + + if (p == rq_i->idle) + continue; + + /* + * Note: this will account forceidle to the current CPU, even + * if it comes from our SMT sibling. + */ + __account_forceidle_time(p, delta); + } +} + +void __sched_core_tick(struct rq *rq) +{ + if (!rq->core->core_forceidle_count) + return; + + if (rq != rq->core) + update_rq_clock(rq->core); + + __sched_core_account_forceidle(rq); +} + +#endif /* CONFIG_SCHEDSTATS */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 941c28cf9738..0de9dda09949 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 + /* * CPU accounting code for task groups. * * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ -#include <asm/irq_regs.h> -#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { @@ -21,15 +20,11 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -struct cpuacct_usage { - u64 usages[CPUACCT_STAT_NSTATS]; -}; - /* track CPU usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every CPU */ - struct cpuacct_usage __percpu *cpuusage; + u64 __percpu *cpuusage; struct kernel_cpustat __percpu *cpustat; }; @@ -49,7 +44,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return css_ca(ca->css.parent); } -static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct = { .cpustat = &kernel_cpustat, .cpuusage = &root_cpuacct_cpuusage, @@ -68,7 +63,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) if (!ca) goto out; - ca->cpuusage = alloc_percpu(struct cpuacct_usage); + ca->cpuusage = alloc_percpu(u64); if (!ca->cpuusage) goto out_free_ca; @@ -99,56 +94,66 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, enum cpuacct_stat_index index) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; u64 data; /* * We allow index == CPUACCT_STAT_NSTATS here to read - * the sum of suages. + * the sum of usages. */ - BUG_ON(index > CPUACCT_STAT_NSTATS); + if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS)) + return 0; #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif - if (index == CPUACCT_STAT_NSTATS) { - int i = 0; - - data = 0; - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - data += cpuusage->usages[i]; - } else { - data = cpuusage->usages[index]; + switch (index) { + case CPUACCT_STAT_USER: + data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE]; + break; + case CPUACCT_STAT_SYSTEM: + data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] + + cpustat[CPUTIME_SOFTIRQ]; + break; + case CPUACCT_STAT_NSTATS: + data = *cpuusage; + break; } #ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_unlock_irq(cpu_rq(cpu)); #endif return data; } -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - int i; + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; + + /* Don't allow to reset global kernel_cpustat */ + if (ca == &root_cpuacct) + return; #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif - - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - cpuusage->usages[i] = val; + *cpuusage = 0; + cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0; + cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0; + cpustat[CPUTIME_SOFTIRQ] = 0; #ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_rq_unlock_irq(cpu_rq(cpu)); #endif } @@ -196,7 +201,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, return -EINVAL; for_each_possible_cpu(cpu) - cpuacct_cpuusage_write(ca, cpu, 0); + cpuacct_cpuusage_write(ca, cpu); return 0; } @@ -243,25 +248,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) seq_puts(m, "\n"); for_each_possible_cpu(cpu) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - seq_printf(m, "%d", cpu); - - for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit - * platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); -#endif - - seq_printf(m, " %llu", cpuusage->usages[index]); - -#ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#endif - } + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %llu", + cpuacct_cpuusage_read(ca, cpu, index)); seq_puts(m, "\n"); } return 0; @@ -270,25 +260,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); - s64 val[CPUACCT_STAT_NSTATS]; + struct task_cputime cputime; + u64 val[CPUACCT_STAT_NSTATS]; int cpu; int stat; - memset(val, 0, sizeof(val)); + memset(&cputime, 0, sizeof(cputime)); for_each_possible_cpu(cpu) { u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; - val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; - val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; + cputime.utime += cpustat[CPUTIME_USER]; + cputime.utime += cpustat[CPUTIME_NICE]; + cputime.stime += cpustat[CPUTIME_SYSTEM]; + cputime.stime += cpustat[CPUTIME_IRQ]; + cputime.stime += cpustat[CPUTIME_SOFTIRQ]; + + cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu); } + cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime, + &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]); + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { - seq_printf(sf, "%s %lld\n", - cpuacct_stat_desc[stat], - (long long)nsec_to_clock_t(val[stat])); + seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat], + nsec_to_clock_t(val[stat])); } return 0; @@ -338,19 +333,13 @@ static struct cftype files[] = { */ void cpuacct_charge(struct task_struct *tsk, u64 cputime) { + unsigned int cpu = task_cpu(tsk); struct cpuacct *ca; - int index = CPUACCT_STAT_SYSTEM; - struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk); - if (regs && user_mode(regs)) - index = CPUACCT_STAT_USER; - - rcu_read_lock(); + lockdep_assert_rq_held(cpu_rq(cpu)); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - __this_cpu_add(ca->cpuusage->usages[index], cputime); - - rcu_read_unlock(); + *per_cpu_ptr(ca->cpuusage, cpu) += cputime; } /* @@ -362,10 +351,8 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { struct cpuacct *ca; - rcu_read_lock(); for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) __this_cpu_add(ca->cpustat->cpustat[index], val); - rcu_read_unlock(); } struct cgroup_subsys cpuacct_cgrp_subsys = { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5cc4012572ec..95baa12a1029 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -1,12 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * kernel/sched/cpudl.c + * kernel/sched/cpudeadline.c * * Global CPU deadline management * * Author: Juri Lelli <j.lelli@sssup.it> */ -#include "sched.h" static inline int parent(int i) { @@ -120,14 +119,38 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { + cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) { + unsigned long cap, max_cap = 0; + int cpu, max_cpu = -1; + + if (!sched_asym_cpucap_active()) + return 1; + + /* Ensure the capacity of the CPUs fits the task. */ + for_each_cpu(cpu, later_mask) { + if (!dl_task_fits_capacity(p, cpu)) { + cpumask_clear_cpu(cpu, later_mask); + + cap = arch_scale_cpu_capacity(cpu); + + if (cap > max_cap || + (cpu == task_cpu(p) && cap == max_cap)) { + max_cap = cap; + max_cpu = cpu; + } + } + } + + if (cpumask_empty(later_mask)) + cpumask_set_cpu(max_cpu, later_mask); + return 1; } else { int best_cpu = cpudl_maximum(cp); WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); - if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && + if (cpumask_test_cpu(best_cpu, &p->cpus_mask) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { if (later_mask) cpumask_set_cpu(best_cpu, later_mask); diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 7c2fe50fd76d..5252fb191fae 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,9 +5,6 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ -#include <linux/cpufreq.h> - -#include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7fbaee24c824..1a19d69b91ed 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -6,13 +6,6 @@ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include "sched.h" - -#include <linux/sched/cpufreq.h> -#include <trace/events/power.h> - #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) struct sugov_tunables { @@ -26,7 +19,7 @@ struct sugov_policy { struct sugov_tunables *tunables; struct list_head tunables_hook; - raw_spinlock_t update_lock; /* For shared policies */ + raw_spinlock_t update_lock; u64 last_freq_update_time; s64 freq_update_delay_ns; unsigned int next_freq; @@ -53,8 +46,8 @@ struct sugov_cpu { unsigned int iowait_boost; u64 last_update; - unsigned long bw_dl; - unsigned long max; + unsigned long util; + unsigned long bw_min; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -90,7 +83,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (unlikely(sg_policy->limits_changed)) { sg_policy->limits_changed = false; - sg_policy->need_freq_update = true; + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); return true; } @@ -102,7 +95,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->next_freq == next_freq) + if (sg_policy->need_freq_update) + sg_policy->need_freq_update = false; + else if (sg_policy->next_freq == next_freq) return false; sg_policy->next_freq = next_freq; @@ -111,37 +106,38 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, return true; } -static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) +static void sugov_deferred_update(struct sugov_policy *sg_policy) { - struct cpufreq_policy *policy = sg_policy->policy; - int cpu; - - if (!sugov_update_next_freq(sg_policy, time, next_freq)) - return; - - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; - - policy->cur = next_freq; - - if (trace_cpu_frequency_enabled()) { - for_each_cpu(cpu, policy->cpus) - trace_cpu_frequency(next_freq, cpu); + if (!sg_policy->work_in_progress) { + sg_policy->work_in_progress = true; + irq_work_queue(&sg_policy->irq_work); } } -static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) +/** + * get_capacity_ref_freq - get the reference frequency that has been used to + * correlate frequency and compute capacity for a given cpufreq policy. We use + * the CPU managing it for the arch_scale_freq_ref() call in the function. + * @policy: the cpufreq policy of the CPU in question. + * + * Return: the reference CPU frequency to compute a capacity. + */ +static __always_inline +unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy) { - if (!sugov_update_next_freq(sg_policy, time, next_freq)) - return; + unsigned int freq = arch_scale_freq_ref(policy->cpu); - if (!sg_policy->work_in_progress) { - sg_policy->work_in_progress = true; - irq_work_queue(&sg_policy->irq_work); - } + if (freq) + return freq; + + if (arch_scale_freq_invariant()) + return policy->cpuinfo.max_freq; + + /* + * Apply a 25% margin so that we select a higher frequency than + * the current one before the CPU is fully busy: + */ + return policy->cur + (policy->cur >> 2); } /** @@ -170,135 +166,45 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned long util, unsigned long max) { struct cpufreq_policy *policy = sg_policy->policy; - unsigned int freq = arch_scale_freq_invariant() ? - policy->cpuinfo.max_freq : policy->cur; + unsigned int freq; + freq = get_capacity_ref_freq(policy); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; - sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } -/* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. - * - * The scheduler tracks the following metrics: - * - * cpu_util_{cfs,rt,dl,irq}() - * cpu_bw_dl() - * - * Where the cfs,rt and dl util numbers are tracked with the same metric and - * synchronized windows and are thus directly comparable. - * - * The cfs,rt,dl utilization are the running times measured with rq->clock_task - * which excludes things like IRQ and steal-time. These latter are then accrued - * in the irq utilization. - * - * The DL bandwidth number otoh is not a measured metric but a value computed - * based on the task model parameters and gives the minimal utilization - * required to meet deadlines. - */ -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long max) { - unsigned long dl_util, util, irq; - struct rq *rq = cpu_rq(cpu); - - if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && - type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { - return max; - } + /* Add dvfs headroom to actual utilization */ + actual = map_util_perf(actual); + /* Actually we don't need to target the max performance */ + if (actual < max) + max = actual; /* - * Early check to see if IRQ/steal time saturates the CPU, can be - * because of inaccuracies in how we track these -- see - * update_irq_load_avg(). + * Ensure at least minimum performance while providing more compute + * capacity when possible. */ - irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) - return max; - - /* - * Because the time spend on RT/DL tasks is visible as 'lost' time to - * CFS tasks and we use the same metric to track the effective - * utilization (PELT windows are synchronized) we can directly add them - * to obtain the CPU's actual utilization. - * - * CFS and RT utilization can be boosted or capped, depending on - * utilization clamp constraints requested by currently RUNNABLE - * tasks. - * When there are no CFS RUNNABLE tasks, clamps are released and - * frequency will be gracefully reduced with the utilization decay. - */ - util = util_cfs + cpu_util_rt(rq); - if (type == FREQUENCY_UTIL) - util = uclamp_rq_util_with(rq, util, p); - - dl_util = cpu_util_dl(rq); - - /* - * For frequency selection we do not make cpu_util_dl() a permanent part - * of this sum because we want to use cpu_bw_dl() later on, but we need - * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such - * that we select f_max when there is no idle time. - * - * NOTE: numerical errors or stop class might cause us to not quite hit - * saturation when we should -- something for later. - */ - if (util + dl_util >= max) - return max; - - /* - * OTOH, for energy computation we need the estimated running time, so - * include util_dl and ignore dl_bw. - */ - if (type == ENERGY_UTIL) - util += dl_util; - - /* - * There is still idle time; further improve the number by using the - * irq metric. Because IRQ/steal time is hidden from the task clock we - * need to scale the task numbers: - * - * max - irq - * U' = irq + --------- * U - * max - */ - util = scale_irq_capacity(util, irq, max); - util += irq; - - /* - * Bandwidth required by DEADLINE must always be granted while, for - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism - * to gracefully reduce the frequency when no tasks show up for longer - * periods of time. - * - * Ideally we would like to set bw_dl as min/guaranteed freq and util + - * bw_dl as requested freq. However, cpufreq is not yet ready for such - * an interface. So, we only do the latter for now. - */ - if (type == FREQUENCY_UTIL) - util += cpu_bw_dl(rq); - - return min(max, util); + return max(min, max); } -static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) { - struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util = cpu_util_cfs(rq); - unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); + unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu); - sg_cpu->max = max; - sg_cpu->bw_dl = cpu_bw_dl(rq); - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + if (!scx_switched_all()) + util += cpu_util_cfs_boost(sg_cpu->cpu); + util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); + util = max(util, boost); + sg_cpu->bw_min = min; + sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max); } /** @@ -375,8 +281,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * sugov_iowait_apply() - Apply the IO boost to a CPU. * @sg_cpu: the sugov data for the cpu to boost * @time: the update time from the caller - * @util: the utilization to (eventually) boost - * @max: the maximum value the utilization can be boosted to + * @max_cap: the max CPU capacity * * A CPU running a task which woken up after an IO operation can have its * utilization boosted to speed up the completion of those IO operations. @@ -391,17 +296,15 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * being more conservative on tasks which does sporadic IO operations. */ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - unsigned long util, unsigned long max) + unsigned long max_cap) { - unsigned long boost; - /* No boost currently required */ if (!sg_cpu->iowait_boost) - return util; + return 0; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return util; + return 0; if (!sg_cpu->iowait_boost_pending) { /* @@ -410,114 +313,173 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return util; + return 0; } } sg_cpu->iowait_boost_pending = false; /* - * @util is already in capacity scale; convert iowait_boost + * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT; - return max(boost, util); + return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; } #ifdef CONFIG_NO_HZ_COMMON -static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); - bool ret = idle_calls == sg_cpu->saved_idle_calls; + unsigned long idle_calls; + bool ret; + + /* + * The heuristics in this function is for the fair class. For SCX, the + * performance target comes directly from the BPF scheduler. Let's just + * follow it. + */ + if (scx_switched_all()) + return false; + + /* if capped by uclamp_max, always update to be in compliance */ + if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) + return false; + + /* + * Maintain the frequency if the CPU has not been idle recently, as + * reduction is likely to be premature. + */ + idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); + ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; return ret; } #else -static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } #endif /* CONFIG_NO_HZ_COMMON */ /* * Make sugov_should_update_freq() ignore the rate limit when DL * has increased the utilization. */ -static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) +static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) - sg_policy->limits_changed = true; + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) + sg_cpu->sg_policy->limits_changed = true; } -static void sugov_update_single(struct update_util_data *hook, u64 time, - unsigned int flags) +static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, + u64 time, unsigned long max_cap, + unsigned int flags) { - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); - struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned long util, max; - unsigned int next_f; - bool busy; + unsigned long boost; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; - ignore_dl_rate_limit(sg_cpu, sg_policy); + ignore_dl_rate_limit(sg_cpu); + + if (!sugov_should_update_freq(sg_cpu->sg_policy, time)) + return false; + + boost = sugov_iowait_apply(sg_cpu, time, max_cap); + sugov_get_util(sg_cpu, boost); + + return true; +} + +static void sugov_update_single_freq(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned long max_cap; + unsigned int next_f; + + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); - if (!sugov_should_update_freq(sg_policy, time)) + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; - /* Limits may have changed, don't skip frequency update */ - busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu); + next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); - util = sugov_get_util(sg_cpu); - max = sg_cpu->max; - util = sugov_iowait_apply(sg_cpu, time, util, max); - next_f = get_next_freq(sg_policy, util, max); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. - */ - if (busy && next_f < sg_policy->next_freq) { + if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq && + !sg_policy->need_freq_update) { next_f = sg_policy->next_freq; - /* Reset cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = 0; + /* Restore cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = cached_freq; } + if (!sugov_update_next_freq(sg_policy, time, next_f)) + return; + /* * This code runs under rq->lock for the target CPU, so it won't run * concurrently on two different CPUs for the same target and it is not * necessary to acquire the lock in the fast switch case. */ if (sg_policy->policy->fast_switch_enabled) { - sugov_fast_switch(sg_policy, time, next_f); + cpufreq_driver_fast_switch(sg_policy->policy, next_f); } else { raw_spin_lock(&sg_policy->update_lock); - sugov_deferred_update(sg_policy, time, next_f); + sugov_deferred_update(sg_policy); raw_spin_unlock(&sg_policy->update_lock); } } +static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + unsigned long prev_util = sg_cpu->util; + unsigned long max_cap; + + /* + * Fall back to the "frequency" path if frequency invariance is not + * supported, because the direct mapping between the utilization and + * the performance levels depends on the frequency invariance. + */ + if (!arch_scale_freq_invariant()) { + sugov_update_single_freq(hook, time, flags); + return; + } + + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) + return; + + if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) + sg_cpu->util = prev_util; + + cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, + sg_cpu->util, max_cap); + + sg_cpu->sg_policy->last_freq_update_time = time; +} + static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned long util = 0, max = 1; + unsigned long util = 0, max_cap; unsigned int j; + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long j_util, j_max; + unsigned long boost; - j_util = sugov_get_util(j_sg_cpu); - j_max = j_sg_cpu->max; - j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max); + boost = sugov_iowait_apply(j_sg_cpu, time, max_cap); + sugov_get_util(j_sg_cpu, boost); - if (j_util * max > j_max * util) { - util = j_util; - max = j_max; - } + util = max(j_sg_cpu->util, util); } - return get_next_freq(sg_policy, util, max); + return get_next_freq(sg_policy, util, max_cap); } static void @@ -532,17 +494,20 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; - ignore_dl_rate_limit(sg_cpu, sg_policy); + ignore_dl_rate_limit(sg_cpu); if (sugov_should_update_freq(sg_policy, time)) { next_f = sugov_next_freq_shared(sg_cpu, time); + if (!sugov_update_next_freq(sg_policy, time, next_f)) + goto unlock; + if (sg_policy->policy->fast_switch_enabled) - sugov_fast_switch(sg_policy, time, next_f); + cpufreq_driver_fast_switch(sg_policy->policy, next_f); else - sugov_deferred_update(sg_policy, time, next_f); + sugov_deferred_update(sg_policy); } - +unlock: raw_spin_unlock(&sg_policy->update_lock); } @@ -554,7 +519,7 @@ static void sugov_work(struct kthread_work *work) /* * Hold sg_policy->update_lock shortly to handle the case where: - * incase sg_policy->next_freq is read here, and then updated by + * in case sg_policy->next_freq is read here, and then updated by * sugov_deferred_update() just before work_in_progress is set to false * here, we may miss queueing the new update. * @@ -624,9 +589,17 @@ static struct attribute *sugov_attrs[] = { }; ATTRIBUTE_GROUPS(sugov); -static struct kobj_type sugov_tunables_ktype = { +static void sugov_tunables_free(struct kobject *kobj) +{ + struct gov_attr_set *attr_set = to_gov_attr_set(kobj); + + kfree(to_sugov_tunables(attr_set)); +} + +static const struct kobj_type sugov_tunables_ktype = { .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, + .release = &sugov_tunables_free, }; /********************** cpufreq governor interface *********************/ @@ -664,9 +637,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) * Fake (unused) bandwidth; workaround to "fix" * priority inheritance. */ - .sched_runtime = 1000000, - .sched_deadline = 10000000, - .sched_period = 10000000, + .sched_runtime = NSEC_PER_MSEC, + .sched_deadline = 10 * NSEC_PER_MSEC, + .sched_period = 10 * NSEC_PER_MSEC, }; struct cpufreq_policy *policy = sg_policy->policy; int ret; @@ -693,7 +666,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + if (policy->dvfs_possible_from_any_cpu) + set_cpus_allowed_ptr(thread, policy->related_cpus); + else + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -726,12 +703,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_free(struct sugov_tunables *tunables) +static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; - - kfree(tunables); } static int sugov_init(struct cpufreq_policy *policy) @@ -788,13 +763,18 @@ static int sugov_init(struct cpufreq_policy *policy) goto fail; out: + /* + * Schedutil is the preferred governor for EAS, so rebuild sched domains + * on governor changes to make sure the scheduler knows about them. + */ + em_rebuild_sched_domains(); mutex_unlock(&global_tunables_lock); return 0; fail: kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); stop_kthread: sugov_kthread_stop(sg_policy); @@ -821,18 +801,21 @@ static void sugov_exit(struct cpufreq_policy *policy) count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; if (!count) - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); mutex_unlock(&global_tunables_lock); sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); cpufreq_disable_fast_switch(policy); + + em_rebuild_sched_domains(); } static int sugov_start(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; + void (*uu)(struct update_util_data *data, u64 time, unsigned int flags); unsigned int cpu; sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; @@ -840,24 +823,24 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->next_freq = 0; sg_policy->work_in_progress = false; sg_policy->limits_changed = false; - sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; - for_each_cpu(cpu, policy->cpus) { - struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); - memset(sg_cpu, 0, sizeof(*sg_cpu)); - sg_cpu->cpu = cpu; - sg_cpu->sg_policy = sg_policy; - } + if (policy_is_shared(policy)) + uu = sugov_update_shared; + else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf()) + uu = sugov_update_single_perf; + else + uu = sugov_update_single_freq; for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - policy_is_shared(policy) ? - sugov_update_shared : - sugov_update_single); + memset(sg_cpu, 0, sizeof(*sg_cpu)); + sg_cpu->cpu = cpu; + sg_cpu->sg_policy = sg_policy; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu); } return 0; } @@ -894,7 +877,7 @@ static void sugov_limits(struct cpufreq_policy *policy) struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, - .dynamic_switching = true, + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, @@ -909,41 +892,4 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif -static int __init sugov_register(void) -{ - return cpufreq_register_governor(&schedutil_gov); -} -core_initcall(sugov_register); - -#ifdef CONFIG_ENERGY_MODEL -extern bool sched_energy_update; -extern struct mutex sched_energy_mutex; - -static void rebuild_sd_workfn(struct work_struct *work) -{ - mutex_lock(&sched_energy_mutex); - sched_energy_update = true; - rebuild_sched_domains(); - sched_energy_update = false; - mutex_unlock(&sched_energy_mutex); -} -static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); - -/* - * EAS shouldn't be attempted without sugov, so rebuild the sched_domains - * on governor changes to make sure the scheduler knows about it. - */ -void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - struct cpufreq_governor *old_gov) -{ - if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) { - /* - * When called from the cpufreq_register_driver() path, the - * cpu_hotplug_lock is already held, so use a work item to - * avoid nested locking in rebuild_sched_domains(). - */ - schedule_work(&rebuild_sd_work); - } - -} -#endif +cpufreq_governor_init(schedutil_gov); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 0033731a0797..42c40cfdf836 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -11,7 +11,7 @@ * This code tracks the priority of each CPU so that global migration * decisions are easy to calculate. Each CPU can be in a state as follows: * - * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * (INVALID), NORMAL, RT1, ... RT99, HIGHER * * going from the lowest priority to the highest. CPUs in the INVALID state * are not eligible for routing. The system maintains this state with @@ -19,24 +19,47 @@ * in that class). Therefore a typical application without affinity * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit * searches). For tasks with affinity restrictions, the algorithm has a - * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. */ -#include "sched.h" -/* Convert between a 140 based task->prio, and our 102 based cpupri */ +/* + * p->rt_priority p->prio newpri cpupri + * + * -1 -1 (CPUPRI_INVALID) + * + * 99 0 (CPUPRI_NORMAL) + * + * 1 98 98 1 + * ... + * 49 50 50 49 + * 50 49 49 50 + * ... + * 99 0 0 99 + * + * 100 100 (CPUPRI_HIGHER) + */ static int convert_prio(int prio) { int cpupri; - if (prio == CPUPRI_INVALID) - cpupri = CPUPRI_INVALID; - else if (prio == MAX_PRIO) - cpupri = CPUPRI_IDLE; - else if (prio >= MAX_RT_PRIO) - cpupri = CPUPRI_NORMAL; - else - cpupri = MAX_RT_PRIO - prio + 1; + switch (prio) { + case CPUPRI_INVALID: + cpupri = CPUPRI_INVALID; /* -1 */ + break; + + case 0 ... 98: + cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 99 */ + break; + + case MAX_RT_PRIO-1: + cpupri = CPUPRI_NORMAL; /* 0 */ + break; + + case MAX_RT_PRIO: + cpupri = CPUPRI_HIGHER; /* 100 */ + break; + } return cpupri; } @@ -53,7 +76,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, * When looking at the vector, we need to read the counter, * do a memory barrier, then read the mask. * - * Note: This is still all racey, but we can deal with it. + * Note: This is still all racy, but we can deal with it. * Ideally, we only want to look at masks that are set. * * If a mask is not set, then the only thing wrong is that we @@ -73,11 +96,12 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) return 0; - if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids) return 0; if (lowest_mask) { - cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); + cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); + cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); /* * We have to ensure that we have at least one bit @@ -124,7 +148,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, int task_pri = convert_prio(p->prio); int idx, cpu; - BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); + WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { @@ -162,7 +186,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, * The cost of this trade-off is not entirely clear and will probably * be good for some workloads and bad for others. * - * The main idea here is that if some CPUs were overcommitted, we try + * The main idea here is that if some CPUs were over-committed, we try * to spread which is what the scheduler traditionally did. Sys admins * must do proper RT planning to avoid overloading the system if they * really care. @@ -177,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, * cpupri_set - update the CPU priority setting * @cp: The cpupri context * @cpu: The target CPU - * @newpri: The priority (INVALID-RT99) to assign to this CPU + * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked * diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index efbb492bb94c..d6cba0020064 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,11 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) #define CPUPRI_INVALID -1 -#define CPUPRI_IDLE 0 -#define CPUPRI_NORMAL 1 -/* values 2-101 are RT priorities 0-99 */ +#define CPUPRI_NORMAL 0 +/* values 1-99 are for RT1-RT99 priorities */ +#define CPUPRI_HIGHER 100 struct cpupri_vec { atomic_t count; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ff9435dee1df..6dab4854c6c0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,7 +2,10 @@ /* * Simple CPU accounting cgroup controller */ -#include "sched.h" + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + #include <asm/cputime.h> +#endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -11,15 +14,15 @@ * They are only modified in vtime_account, on corresponding CPU * with interrupts disabled. So, writes are safe. * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can + * This may result in other CPU reading this CPU's IRQ time and can * race with irq/vtime_account on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. + * or new value with a side effect of accounting a slice of IRQ time to wrong + * task when IRQ is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each IRQ in account_system_time. */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); -static int sched_clock_irqtime; +int sched_clock_irqtime; void enable_sched_clock_irqtime(void) { @@ -44,21 +47,23 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, } /* - * Called before incrementing preempt_count on {soft,}irq_enter + * Called after incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void irqtime_account_irq(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr, unsigned int offset) { struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); + unsigned int pc; s64 delta; int cpu; - if (!sched_clock_irqtime) + if (!irqtime_enabled()) return; cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; + pc = irq_count() - offset; /* * We do not account for softirq time from ksoftirqd here. @@ -66,12 +71,11 @@ void irqtime_account_irq(struct task_struct *curr) * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (hardirq_count()) + if (pc & HARDIRQ_MASK) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } -EXPORT_SYMBOL_GPL(irqtime_account_irq); static u64 irqtime_tick_accounted(u64 maxtime) { @@ -86,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime) #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -#define sched_clock_irqtime (0) - static u64 irqtime_tick_accounted(u64 dummy) { return 0; @@ -147,10 +149,10 @@ void account_guest_time(struct task_struct *p, u64 cputime) /* Add guest time to cpustat. */ if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += cputime; + task_group_account_field(p, CPUTIME_NICE, cputime); cpustat[CPUTIME_GUEST_NICE] += cputime; } else { - cpustat[CPUTIME_USER] += cputime; + task_group_account_field(p, CPUTIME_USER, cputime); cpustat[CPUTIME_GUEST] += cputime; } } @@ -226,6 +228,21 @@ void account_idle_time(u64 cputime) cpustat[CPUTIME_IDLE] += cputime; } + +#ifdef CONFIG_SCHED_CORE +/* + * Account for forceidle time due to core scheduling. + * + * REQUIRES: schedstat is enabled. + */ +void __account_forceidle_time(struct task_struct *p, u64 delta) +{ + __schedstat_add(p->stats.core_forceidle_sum, delta); + + task_group_account_field(p, CPUTIME_FORCEIDLE, delta); +} +#endif + /* * When a guest is interrupted for a longer amount of time, missed clock * ticks are not redelivered later. Due to that, this function may on @@ -250,7 +267,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime) } /* - * Account how much elapsed time was spent in steal, irq, or softirq time. + * Account how much elapsed time was spent in steal, IRQ, or softirq time. */ static inline u64 account_other_time(u64 max) { @@ -351,7 +368,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) * Check for hardirq is done both for system and user time as there is * no timer going off while we are on hardirq and hence we may never get an * opportunity to update it solely in system time. - * p->stime and friends are only updated on system time and not on irq + * p->stime and friends are only updated on system time and not on IRQ * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, @@ -361,7 +378,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, /* * When returning from idle, many ticks can get accounted at - * once, including some ticks of steal, irq, and softirq time. + * once, including some ticks of steal, IRQ, and softirq time. * Subtract those ticks from the amount of time accounted to * idle, or potentially user or system time. Due to rounding, * other time can exceed ticks occasionally. @@ -405,37 +422,21 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -# ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) +void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_kernel(prev); - - vtime_flush(prev); - arch_vtime_task_switch(prev); -} -# endif - -/* - * Archs that account the whole time spent in the idle task - * (outside irq) as idle time can rely on this and just implement - * vtime_account_kernel() and vtime_account_idle(). Archs that - * have other meaning of the idle time (s390 only includes the - * time spent by the CPU when it's in low power mode) must override - * vtime_account(). - */ -#ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account_irq_enter(struct task_struct *tsk) -{ - if (!in_interrupt() && is_idle_task(tsk)) + unsigned int pc = irq_count() - offset; + + if (pc & HARDIRQ_OFFSET) { + vtime_account_hardirq(tsk); + } else if (pc & SOFTIRQ_OFFSET) { + vtime_account_softirq(tsk); + } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && + is_idle_task(tsk)) { vtime_account_idle(tsk); - else + } else { vtime_account_kernel(tsk); + } } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, u64 *ut, u64 *st) @@ -475,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (vtime_accounting_enabled_this_cpu()) return; - if (sched_clock_irqtime) { + if (irqtime_enabled()) { irqtime_account_process_tick(p, user_tick, 1); return; } @@ -504,7 +505,7 @@ void account_idle_ticks(unsigned long ticks) { u64 cputime, steal; - if (sched_clock_irqtime) { + if (irqtime_enabled()) { irqtime_account_idle_ticks(ticks); return; } @@ -520,50 +521,6 @@ void account_idle_ticks(unsigned long ticks) } /* - * Perform (stime * rtime) / total, but avoid multiplication overflow by - * losing precision when the numbers are big. - */ -static u64 scale_stime(u64 stime, u64 rtime, u64 total) -{ - u64 scaled; - - for (;;) { - /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) - swap(rtime, stime); - - /* Make sure 'total' fits in 32 bits */ - if (total >> 32) - goto drop_precision; - - /* Does rtime (and thus stime) fit in 32 bits? */ - if (!(rtime >> 32)) - break; - - /* Can we just balance rtime/stime rather than dropping bits? */ - if (stime >> 31) - goto drop_precision; - - /* We can grow stime and shrink rtime and try to make them both fit */ - stime <<= 1; - rtime >>= 1; - continue; - -drop_precision: - /* We drop from rtime, it has more bits than stime */ - rtime >>= 1; - total >>= 1; - } - - /* - * Make sure gcc understands that this is a 32x32->64 multiply, - * followed by a 64/32->64 divide. - */ - scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return scaled; -} - -/* * Adjust tick based cputime random precision against scheduler runtime * accounting. * @@ -609,7 +566,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, /* * If either stime or utime are 0, assume all runtime is userspace. - * Once a task gets some ticks, the monotonicy code at 'update:' + * Once a task gets some ticks, the monotonicity code at 'update:' * will ensure things converge to the observed ratio. */ if (stime == 0) { @@ -622,7 +579,13 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, goto update; } - stime = scale_stime(stime, rtime, stime + utime); + stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); + /* + * Because mul_u64_u64_div_u64() can approximate on some + * achitectures; enforce the constraint that: a*b/(b+c) <= a. + */ + if (unlikely(stime > rtime)) + stime = rtime; update: /* @@ -661,7 +624,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) .sum_exec_runtime = p->se.sum_exec_runtime, }; - task_cputime(p, &cputime.utime, &cputime.stime); + if (task_cputime(p, &cputime.utime, &cputime.stime)) + cputime.sum_exec_runtime = task_sched_runtime(p); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } EXPORT_SYMBOL_GPL(task_cputime_adjusted); @@ -874,19 +838,21 @@ u64 task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) +bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { struct vtime *vtime = &t->vtime; unsigned int seq; u64 delta; + int ret; if (!vtime_accounting_enabled()) { *utime = t->utime; *stime = t->stime; - return; + return false; } do { + ret = false; seq = read_seqcount_begin(&vtime->seqcount); *utime = t->utime; @@ -896,6 +862,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) if (vtime->state < VTIME_SYS) continue; + ret = true; delta = vtime_delta(vtime); /* @@ -907,6 +874,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) else *utime += vtime->utime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return ret; } static int vtime_state_fetch(struct vtime *vtime, int cpu) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f63f337c7147..ff4df16b5186 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -15,13 +15,52 @@ * Michael Trimarchi <michael@amarulasolutions.com>, * Fabio Checconi <fchecconi@gmail.com> */ -#include "sched.h" -#include "pelt.h" -struct dl_bandwidth def_dl_bandwidth; +#include <linux/cpuset.h> + +/* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting ridiculously long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ +#ifdef CONFIG_SYSCTL +static const struct ctl_table sched_dl_sysctls[] = { + { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)&sysctl_sched_dl_period_min, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra2 = (void *)&sysctl_sched_dl_period_max, + }, +}; + +static int __init sched_dl_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_dl_sysctls); + return 0; +} +late_initcall(sched_dl_sysctl_init); +#endif + +static bool dl_server(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server; +} static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { + BUG_ON(dl_server(dl_se)); return container_of(dl_se, struct task_struct, dl); } @@ -30,12 +69,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) return container_of(dl_rq, struct rq, dl); } -static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = task_rq(p); + struct rq *rq = dl_se->rq; + + if (!dl_server(dl_se)) + rq = task_rq(dl_task_of(dl_se)); - return &rq->dl; + return rq; +} + +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ + return &rq_of_dl_se(dl_se)->dl; } static inline int on_dl_rq(struct sched_dl_entity *dl_se) @@ -43,6 +89,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_RT_MUTEXES +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se->pi_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return pi_of(dl_se) != dl_se; +} +#else +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return false; +} +#endif + #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { @@ -54,15 +122,75 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; + int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); + + if (cpumask_subset(rd->span, cpu_active_mask)) + return cpumask_weight(rd->span); + + cpus = 0; + for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; return cpus; } + +static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) +{ + unsigned long cap = 0; + int i; + + for_each_cpu_and(i, mask, cpu_active_mask) + cap += arch_scale_cpu_capacity(i); + + return cap; +} + +/* + * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity + * of the CPU the task is running on rather rd's \Sum CPU capacity. + */ +static inline unsigned long dl_bw_capacity(int i) +{ + if (!sched_asym_cpucap_active() && + arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) { + return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; + } else { + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + return __dl_bw_capacity(cpu_rq(i)->rd->span); + } +} + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + struct root_domain *rd = cpu_rq(cpu)->rd; + + if (rd->visit_gen == gen) + return true; + + rd->visit_gen = gen; + return false; +} + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -73,14 +201,53 @@ static inline int dl_bw_cpus(int i) { return 1; } + +static inline unsigned long dl_bw_capacity(int i) +{ + return SCHED_CAPACITY_SCALE; +} + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + return false; +} + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} #endif static inline +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); +} + +static inline bool +__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; +} + +static inline void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); @@ -93,7 +260,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) @@ -107,7 +274,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ } @@ -117,7 +284,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) @@ -153,33 +320,63 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) __sub_running_bw(dl_se->dl_bw, dl_rq); } -static void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) { - struct rq *rq; - - BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); - - if (task_on_rq_queued(p)) - return; + if (dl_se->dl_non_contending) { + sub_running_bw(dl_se, &rq->dl); + dl_se->dl_non_contending = 0; - rq = task_rq(p); - if (p->dl.dl_non_contending) { - sub_running_bw(&p->dl, &rq->dl); - p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } - __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __sub_rq_bw(dl_se->dl_bw, &rq->dl); __add_rq_bw(new_bw, &rq->dl); } +static __always_inline +void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer) +{ + /* + * If the timer callback was running (hrtimer_try_to_cancel == -1), + * it will eventually call put_task_struct(). + */ + if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); +} + +static __always_inline +void cancel_replenish_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->dl_timer); +} + +static __always_inline +void cancel_inactive_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->inactive_timer); +} + +static void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); + + if (task_on_rq_queued(p)) + return; + + dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); +} + +static void __dl_clear_params(struct sched_dl_entity *dl_se); + /* * The utilization of a task cannot be immediately removed from * the rq active utilization (running_bw) when the task blocks. @@ -190,7 +387,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) * fires. * * If the task wakes up again before the inactive timer fires, - * the timer is cancelled, whereas if the task wakes up after the + * the timer is canceled, whereas if the task wakes up after the * inactive timer fired (and running_bw has been decreased) the * task's utilization has to be added to running_bw again. * A flag in the deadline scheduling entity (dl_non_contending) @@ -234,12 +431,11 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). */ -static void task_non_contending(struct task_struct *p) +static void task_non_contending(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->inactive_timer; - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct rq *rq = rq_of_dl_se(dl_se); + struct dl_rq *dl_rq = &rq->dl; s64 zerolag_time; /* @@ -269,24 +465,33 @@ static void task_non_contending(struct task_struct *p) * utilization now, instead of starting a timer */ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { - if (dl_task(p)) + if (dl_server(dl_se)) { sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || p->state == TASK_DEAD) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - - if (p->state == TASK_DEAD) - sub_rq_bw(&p->dl, &rq->dl); - raw_spin_lock(&dl_b->lock); - __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); - __dl_clear_params(p); - raw_spin_unlock(&dl_b->lock); + } else { + struct task_struct *p = dl_task_of(dl_se); + + if (dl_task(p)) + sub_running_bw(dl_se, dl_rq); + + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (READ_ONCE(p->__state) == TASK_DEAD) + sub_rq_bw(dl_se, &rq->dl); + raw_spin_lock(&dl_b->lock); + __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); + raw_spin_unlock(&dl_b->lock); + __dl_clear_params(dl_se); + } } return; } dl_se->dl_non_contending = 1; - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD); } @@ -308,13 +513,12 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) dl_se->dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) - put_task_struct(dl_task_of(dl_se)); + cancel_inactive_timer(dl_se); } else { /* * Since "dl_non_contending" is not set, the @@ -327,31 +531,20 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) } } -static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - struct sched_dl_entity *dl_se = &p->dl; - - return dl_rq->root.rb_leftmost == &dl_se->rb_node; + return rb_first_cached(&dl_rq->root) == &dl_se->rb_node; } static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); -void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) -{ - raw_spin_lock_init(&dl_b->dl_runtime_lock); - dl_b->dl_period = period; - dl_b->dl_runtime = runtime; -} - void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); - raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); if (global_rt_runtime() == RUNTIME_INF) dl_b->bw = -1; else dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); - raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); dl_b->total_bw = 0; } @@ -363,7 +556,6 @@ void init_dl_rq(struct dl_rq *dl_rq) /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; - dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else @@ -407,37 +599,17 @@ static inline void dl_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); } -static void update_dl_migration(struct dl_rq *dl_rq) -{ - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { - if (!dl_rq->overloaded) { - dl_set_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 1; - } - } else if (dl_rq->overloaded) { - dl_clear_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 0; - } -} +#define __node_2_pdl(node) \ + rb_entry((node), struct task_struct, pushable_dl_tasks) -static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b) { - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory++; - - update_dl_migration(dl_rq); + return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); } -static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +static inline int has_pushable_dl_tasks(struct rq *rq) { - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory--; - - update_dl_migration(dl_rq); + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); } /* @@ -446,69 +618,52 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) */ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { - struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct task_struct *entry; - bool leftmost = true; - - BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct task_struct, - pushable_dl_tasks); - if (dl_entity_preempt(&p->dl, &entry->dl)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = false; - } - } + struct rb_node *leftmost; + WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); + + leftmost = rb_add_cached(&p->pushable_dl_tasks, + &rq->dl.pushable_dl_tasks_root, + __pushable_less); if (leftmost) - dl_rq->earliest_dl.next = p->dl.deadline; + rq->dl.earliest_dl.next = p->dl.deadline; - rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color_cached(&p->pushable_dl_tasks, - &dl_rq->pushable_dl_tasks_root, leftmost); + if (!rq->dl.overloaded) { + dl_set_overload(rq); + rq->dl.overloaded = 1; + } } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; + struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root; + struct rb_node *leftmost; if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { - struct rb_node *next_node; - - next_node = rb_next(&p->pushable_dl_tasks); - if (next_node) { - dl_rq->earliest_dl.next = rb_entry(next_node, - struct task_struct, pushable_dl_tasks)->dl.deadline; - } - } + leftmost = rb_erase_cached(&p->pushable_dl_tasks, root); + if (leftmost) + dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; - rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); -} -static inline int has_pushable_dl_tasks(struct rq *rq) -{ - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); + if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) { + dl_clear_overload(rq); + rq->dl.overloaded = 0; + } } static int push_dl_task(struct rq *rq); static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) { - return dl_task(prev); + return rq->online && dl_task(prev); } -static DEFINE_PER_CPU(struct callback_head, dl_push_head); -static DEFINE_PER_CPU(struct callback_head, dl_pull_head); +static DEFINE_PER_CPU(struct balance_callback, dl_push_head); +static DEFINE_PER_CPU(struct balance_callback, dl_pull_head); static void push_dl_tasks(struct rq *); static void pull_dl_task(struct rq *); @@ -547,7 +702,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * Failed to find any suitable CPU. * The task will never come back! */ - BUG_ON(dl_bandwidth_enabled()); + WARN_ON_ONCE(dl_bandwidth_enabled()); /* * If admission control is disabled we @@ -578,7 +733,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p } /* - * And we finally need to fixup root_domain(s) bandwidth accounting, + * And we finally need to fix up root_domain(s) bandwidth accounting, * since p is still hanging out in the old (now moved to default) root * domain. */ @@ -620,15 +775,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { } -static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) -{ - return false; -} - -static inline void pull_dl_task(struct rq *rq) -{ -} - static inline void deadline_queue_push_tasks(struct rq *rq) { } @@ -638,9 +784,28 @@ static inline void deadline_queue_pull_task(struct rq *rq) } #endif /* CONFIG_SMP */ +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags); +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags); + +static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + struct rq *rq) +{ + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; + + /* + * If it is a deferred reservation, and the server + * is not handling an starvation case, defer it. + */ + if (dl_se->dl_defer && !dl_se->dl_defer_running) { + dl_se->dl_throttled = 1; + dl_se->dl_defer_armed = 1; + } +} /* * We are being explicitly informed that a new instance is starting, @@ -659,7 +824,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(dl_se->dl_boosted); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -675,10 +840,12 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; - dl_se->runtime = dl_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } +static int start_dl_timer(struct sched_dl_entity *dl_se); +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); + /* * Pure Earliest Deadline First (EDF) scheduling does not deal with the * possibility of a entity lasting more than what it declared, and thus @@ -697,21 +864,27 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * could happen are, typically, a entity voluntarily trying to overcome its * runtime, or it just underestimated it during sched_setattr(). */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_se->dl_runtime <= 0); + WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. + * + * Or, it could be the case of a deferred reservation that + * was not able to consume its runtime in background and + * reached this point with current u > U. + * + * In both cases, set a new period. */ - if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + if (dl_se->dl_deadline == 0 || + (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded && dl_se->runtime > 0) @@ -724,8 +897,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * arbitrary large. */ while (dl_se->runtime <= 0) { - dl_se->deadline += pi_se->dl_period; - dl_se->runtime += pi_se->dl_runtime; + dl_se->deadline += pi_of(dl_se)->dl_period; + dl_se->runtime += pi_of(dl_se)->dl_runtime; } /* @@ -739,14 +912,51 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } if (dl_se->dl_yielded) dl_se->dl_yielded = 0; if (dl_se->dl_throttled) dl_se->dl_throttled = 0; + + /* + * If this is the replenishment of a deferred reservation, + * clear the flag and return. + */ + if (dl_se->dl_defer_armed) { + dl_se->dl_defer_armed = 0; + return; + } + + /* + * A this point, if the deferred server is not armed, and the deadline + * is in the future, if it is not running already, throttle the server + * and arm the defer timer. + */ + if (dl_se->dl_defer && !dl_se->dl_defer_running && + dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { + if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + + /* + * Set dl_se->dl_defer_armed and dl_throttled variables to + * inform the start_dl_timer() that this is a deferred + * activation. + */ + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + if (!start_dl_timer(dl_se)) { + /* + * If for whatever reason (delays), a previous timer was + * queued but not serviced, cancel it and clean the + * deferrable server variables intended for start_dl_timer(). + */ + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } + } + } } /* @@ -773,8 +983,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * task with deadline equal to period this is the same of using * dl_period instead of dl_deadline in the equation above. */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) { u64 left, right; @@ -796,9 +1005,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * - (pi_se->dl_runtime >> DL_SCALE); + (pi_of(dl_se)->dl_runtime >> DL_SCALE); return dl_time_before(right, left); } @@ -867,7 +1076,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * is detected, the runtime and deadline need to be updated. * * If the task has an implicit deadline, i.e., deadline == period, the Original - * CBS is applied. the runtime is replenished and a new absolute deadline is + * CBS is applied. The runtime is replenished and a new absolute deadline is * set, as in the previous cases. * * However, the Original CBS does not work properly for tasks with @@ -883,24 +1092,30 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * Please refer to the comments update_dl_revised_wakeup() function to find * more about the Revised CBS rule. */ -static void update_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void update_dl_entity(struct sched_dl_entity *dl_se) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_entity_overflow(dl_se, rq_clock(rq))) { if (unlikely(!dl_is_implicit(dl_se) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && - !dl_se->dl_boosted)){ + !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); return; } - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); + } else if (dl_server(dl_se) && dl_se->dl_defer) { + /* + * The server can still use its previous deadline, so check if + * it left the dl_defer_running state. + */ + if (!dl_se->dl_defer_running) { + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + } } } @@ -919,22 +1134,35 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se) * actually started or not (i.e., the replenishment instant is in * the future or in the past). */ -static int start_dl_timer(struct task_struct *p) +static int start_dl_timer(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->dl_timer; - struct rq *rq = task_rq(p); + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); ktime_t now, act; s64 delta; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); /* * We want the timer to fire at the deadline, but considering * that it is actually coming from rq->clock and not from * hrtimer's time base reading. + * + * The deferred reservation will have its timer set to + * (deadline - runtime). At that point, the CBS rule will decide + * if the current deadline can be used, or if a replenishment is + * required to avoid add too much pressure on the system + * (current u > U). */ - act = ns_to_ktime(dl_next_period(dl_se)); + if (dl_se->dl_defer_armed) { + WARN_ON_ONCE(!dl_se->dl_throttled); + act = ns_to_ktime(dl_se->deadline - dl_se->runtime); + } else { + /* act = deadline - rel-deadline + period */ + act = ns_to_ktime(dl_next_period(dl_se)); + } + now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -957,13 +1185,89 @@ static int start_dl_timer(struct task_struct *p) * and observe our state. */ if (!hrtimer_is_queued(timer)) { - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD); } return 1; } +static void __push_dl_task(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, check if we need + * to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + rq_unpin_lock(rq, rf); + push_dl_task(rq); + rq_repin_lock(rq, rf); + } +#endif +} + +/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ +static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; + +static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) +{ + struct rq *rq = rq_of_dl_se(dl_se); + u64 fw; + + scoped_guard (rq_lock, rq) { + struct rq_flags *rf = &scope.rf; + + if (!dl_se->dl_throttled || !dl_se->dl_runtime) + return HRTIMER_NORESTART; + + sched_clock_tick(); + update_rq_clock(rq); + + if (!dl_se->dl_runtime) + return HRTIMER_NORESTART; + + if (!dl_se->server_has_tasks(dl_se)) { + replenish_dl_entity(dl_se); + return HRTIMER_NORESTART; + } + + if (dl_se->dl_defer_armed) { + /* + * First check if the server could consume runtime in background. + * If so, it is possible to push the defer timer for this amount + * of time. The dl_server_min_res serves as a limit to avoid + * forwarding the timer for a too small amount of time. + */ + if (dl_time_before(rq_clock(dl_se->rq), + (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { + + /* reset the defer timer */ + fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; + + hrtimer_forward_now(timer, ns_to_ktime(fw)); + return HRTIMER_RESTART; + } + + dl_se->dl_defer_running = 1; + } + + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) + resched_curr(rq); + + __push_dl_task(rq, rf); + } + + return HRTIMER_NORESTART; +} + /* * This is the bandwidth enforcement timer callback. If here, we know * a task is not on its dl_rq, since the fact that the timer was running @@ -982,10 +1286,14 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, dl_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p; struct rq_flags rf; struct rq *rq; + if (dl_server(dl_se)) + return dl_server_timer(timer, dl_se); + + p = dl_task_of(dl_se); rq = task_rq_lock(p, &rf); /* @@ -999,7 +1307,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ - if (dl_se->dl_boosted) + if (is_dl_boosted(dl_se)) goto unlock; /* @@ -1027,7 +1335,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * but do not enqueue -- wait for our wakeup to do that. */ if (!task_on_rq_queued(p)) { - replenish_dl_entity(dl_se, dl_se); + replenish_dl_entity(dl_se); goto unlock; } @@ -1037,9 +1345,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * If the runqueue is no longer available, migrate the * task elsewhere. This necessarily changes rq. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + lockdep_unpin_lock(__rq_lockp(rq), rf.cookie); rq = dl_task_offline_migration(rq, p); - rf.cookie = lockdep_pin_lock(&rq->lock); + rf.cookie = lockdep_pin_lock(__rq_lockp(rq)); update_rq_clock(rq); /* @@ -1051,26 +1359,12 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) #endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (dl_task(rq->donor)) + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); -#ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, check if we need - * to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) { - /* - * Nothing relies on rq->lock after this, so its safe to drop - * rq->lock. - */ - rq_unpin_lock(rq, &rf); - push_dl_task(rq); - rq_repin_lock(rq, &rf); - } -#endif + __push_dl_task(rq, &rf); unlock: task_rq_unlock(rq, p, &rf); @@ -1084,7 +1378,7 @@ unlock: return HRTIMER_NORESTART; } -void init_dl_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; @@ -1098,7 +1392,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) * cannot use the runtime, and so it replenishes the task. This rule * works fine for implicit deadline tasks (deadline == period), and the * CBS was designed for implicit deadline tasks. However, a task with - * constrained deadline (deadine < period) might be awakened after the + * constrained deadline (deadline < period) might be awakened after the * deadline, but before the next period. In this case, replenishing the * task would allow it to run for runtime / deadline. As in this case * deadline < period, CBS enables a task to run for more than the @@ -1112,12 +1406,11 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) */ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1131,91 +1424,46 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) return (dl_se->runtime <= 0); } -extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); - /* - * This function implements the GRUB accounting rule: - * according to the GRUB reclaiming algorithm, the runtime is - * not decreased as "dq = -dt", but as - * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt", + * This function implements the GRUB accounting rule. According to the + * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt", + * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt", * where u is the utilization of the task, Umax is the maximum reclaimable * utilization, Uinact is the (per-runqueue) inactive utilization, computed * as the difference between the "total runqueue utilization" and the - * runqueue active utilization, and Uextra is the (per runqueue) extra + * "runqueue active utilization", and Uextra is the (per runqueue) extra * reclaimable utilization. - * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations - * multiplied by 2^BW_SHIFT, the result has to be shifted right by - * BW_SHIFT. - * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT, - * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. - * Since delta is a 64 bit variable, to have an overflow its value - * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. - * So, overflow is not an issue here. + * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied + * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. + * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw + * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * Since delta is a 64 bit variable, to have an overflow its value should be + * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is + * not an issue here. */ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) { - u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ u64 u_act; - u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT; + u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ /* - * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)}, - * we compare u_inact + rq->dl.extra_bw with - * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because - * u_inact + rq->dl.extra_bw can be larger than - * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative - * leading to wrong results) + * Instead of computing max{u, (u_max - u_inact - u_extra)}, we + * compare u_inact + u_extra with u_max - u, because u_inact + u_extra + * can be larger than u_max. So, u_max - u_inact - u_extra would be + * negative leading to wrong results. */ - if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min) - u_act = u_act_min; + if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw) + u_act = dl_se->dl_bw; else - u_act = BW_UNIT - u_inact - rq->dl.extra_bw; + u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw; + u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT; return (delta * u_act) >> BW_SHIFT; } -/* - * Update the current task's runtime statistics (provided it is still - * a -deadline task and has not been removed from the dl_rq). - */ -static void update_curr_dl(struct rq *rq) +s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; - u64 delta_exec, scaled_delta_exec; - int cpu = cpu_of(rq); - u64 now; - - if (!dl_task(curr) || !on_dl_rq(dl_se)) - return; - - /* - * Consumed budget is computed considering the time as - * observed by schedulable tasks (excluding time spent - * in hardirq context, etc.). Deadlines are instead - * computed using hard walltime. This seems to be the more - * natural solution, but the full ramifications of this - * approach need further study. - */ - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) { - if (unlikely(dl_se->dl_yielded)) - goto throttle; - return; - } - - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); - - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); - - if (dl_entity_is_special(dl_se)) - return; + s64 scaled_delta_exec; /* * For tasks that participate in GRUB, we implement GRUB-PA: the @@ -1225,10 +1473,9 @@ static void update_curr_dl(struct rq *rq) * according to current frequency and CPU maximum capacity. */ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { - scaled_delta_exec = grub_reclaim(delta_exec, - rq, - &curr->dl); + scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se); } else { + int cpu = cpu_of(rq); unsigned long scale_freq = arch_scale_freq_capacity(cpu); unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); @@ -1236,8 +1483,64 @@ static void update_curr_dl(struct rq *rq) scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); } + return scaled_delta_exec; +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags); +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) +{ + s64 scaled_delta_exec; + + if (unlikely(delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; + return; + } + + if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) + return; + + if (dl_entity_is_special(dl_se)) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + dl_se->runtime -= scaled_delta_exec; + /* + * The fair server can consume its runtime while throttled (not queued/ + * running as regular CFS). + * + * If the server consumes its entire runtime in this state. The server + * is not required for the current period. Thus, reset the server by + * starting a new period, pushing the activation. + */ + if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { + /* + * If the server was previously activated - the starving condition + * took place, it this point it went away because the fair scheduler + * was able to get runtime in background. So return to the initial + * state. + */ + dl_se->dl_defer_running = 0; + + hrtimer_try_to_cancel(&dl_se->dl_timer); + + replenish_dl_new_period(dl_se, dl_se->rq); + + /* + * Not being able to start the timer seems problematic. If it could not + * be started for whatever reason, we need to "unthrottle" the DL server + * and queue right away. Otherwise nothing might queue it. That's similar + * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. + */ + WARN_ON_ONCE(!start_dl_timer(dl_se)); + + return; + } + throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; @@ -1247,15 +1550,32 @@ throttle: (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) dl_se->dl_overrun = 1; - __dequeue_task_dl(rq, curr, 0); - if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) - enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + dequeue_dl_entity(dl_se, 0); + if (!dl_server(dl_se)) { + update_stats_dequeue_dl(&rq->dl, dl_se, 0); + dequeue_pushable_dl_task(rq, dl_task_of(dl_se)); + } + + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { + if (dl_server(dl_se)) + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + else + enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH); + } - if (!is_leftmost(curr, &rq->dl)) + if (!is_leftmost(dl_se, &rq->dl)) resched_curr(rq); } /* + * The fair server (sole dl_server) does not account for real-time + * workload because it is running fair work. + */ + if (dl_se == &rq->fair_server) + return; + +#ifdef CONFIG_RT_GROUP_SCHED + /* * Because -- for now -- we share the rt bandwidth, we need to * account our runtime there too, otherwise actual rt tasks * would be able to exceed the shared quota. @@ -1279,6 +1599,182 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } +#endif +} + +/* + * In the non-defer mode, the idle time is not accounted, as the + * server provides a guarantee. + * + * If the dl_server is in defer mode, the idle time is also considered + * as time available for the fair server, avoiding a penalty for the + * rt scheduler that did not consumed that time. + */ +void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) +{ + s64 delta_exec, scaled_delta_exec; + + if (!rq->fair_server.dl_defer) + return; + + /* no need to discount more */ + if (rq->fair_server.runtime < 0) + return; + + delta_exec = rq_clock_task(rq) - p->se.exec_start; + if (delta_exec < 0) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); + + rq->fair_server.runtime -= scaled_delta_exec; + + if (rq->fair_server.runtime < 0) { + rq->fair_server.dl_defer_running = 0; + rq->fair_server.runtime = 0; + } + + p->se.exec_start = rq_clock_task(rq); +} + +void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) +{ + /* 0 runtime = fair server disabled */ + if (dl_se->dl_runtime) + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); +} + +void dl_server_start(struct sched_dl_entity *dl_se) +{ + struct rq *rq = dl_se->rq; + + /* + * XXX: the apply do not work fine at the init phase for the + * fair server because things are not yet set. We need to improve + * this before getting generic. + */ + if (!dl_server(dl_se)) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + dl_server_apply_params(dl_se, runtime, period, 1); + + dl_se->dl_server = 1; + dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); + } + + if (!dl_se->dl_runtime) + return; + + dl_se->dl_server_active = 1; + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) + resched_curr(dl_se->rq); +} + +void dl_server_stop(struct sched_dl_entity *dl_se) +{ + if (!dl_se->dl_runtime) + return; + + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + dl_se->dl_server_active = 0; +} + +void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick_task) +{ + dl_se->rq = rq; + dl_se->server_has_tasks = has_tasks; + dl_se->server_pick_task = pick_task; +} + +void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 new_bw = dl_se->dl_bw; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + + dl_b = dl_bw_of(cpu_of(rq)); + guard(raw_spinlock)(&dl_b->lock); + + if (!dl_bw_cpus(cpu)) + return; + + __dl_add(dl_b, new_bw, dl_bw_cpus(cpu)); +} + +int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) +{ + u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); + u64 new_bw = to_ratio(period, runtime); + struct rq *rq = dl_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + unsigned long cap; + int retval = 0; + int cpus; + + dl_b = dl_bw_of(cpu); + guard(raw_spinlock)(&dl_b->lock); + + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + + if (__dl_overflow(dl_b, cap, old_bw, new_bw)) + return -EBUSY; + + if (init) { + __add_rq_bw(new_bw, &rq->dl); + __dl_add(dl_b, new_bw, cpus); + } else { + __dl_sub(dl_b, dl_se->dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + + dl_rq_change_utilization(rq, dl_se, new_bw); + } + + dl_se->dl_runtime = runtime; + dl_se->dl_deadline = period; + dl_se->dl_period = period; + + dl_se->runtime = 0; + dl_se->deadline = 0; + + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); + + return retval; +} + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *donor = rq->donor; + struct sched_dl_entity *dl_se = &donor->dl; + s64 delta_exec; + + if (!dl_task(donor) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. + */ + delta_exec = update_curr_common(rq); + update_curr_dl_se(rq, dl_se, delta_exec); } static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) @@ -1286,19 +1782,28 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, inactive_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p = NULL; struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(p, &rf); + if (!dl_server(dl_se)) { + p = dl_task_of(dl_se); + rq = task_rq_lock(p, &rf); + } else { + rq = dl_se->rq; + rq_lock(rq, &rf); + } sched_clock_tick(); update_rq_clock(rq); - if (!dl_task(p) || p->state == TASK_DEAD) { + if (dl_server(dl_se)) + goto no_task; + + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - if (p->state == TASK_DEAD && dl_se->dl_non_contending) { + if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) { sub_running_bw(&p->dl, dl_rq_of_se(&p->dl)); sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; @@ -1307,23 +1812,30 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); - __dl_clear_params(p); + __dl_clear_params(dl_se); goto unlock; } + +no_task: if (dl_se->dl_non_contending == 0) goto unlock; sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: - task_rq_unlock(rq, p, &rf); - put_task_struct(p); + + if (!dl_server(dl_se)) { + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + } else { + rq_unlock(rq, &rf); + } return HRTIMER_NORESTART; } -void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; @@ -1331,6 +1843,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) timer->function = inactive_task_timer; } +#define __node_2_dle(node) \ + rb_entry((node), struct sched_dl_entity, rb_node) + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -1339,6 +1854,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + if (dl_rq->earliest_dl.curr == 0) + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER); dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } @@ -1356,11 +1873,11 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { - struct rb_node *leftmost = dl_rq->root.rb_leftmost; - struct sched_dl_entity *entry; + struct rb_node *leftmost = rb_first_cached(&dl_rq->root); + struct sched_dl_entity *entry = __node_2_dle(leftmost); - entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); } @@ -1376,54 +1893,106 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; u64 deadline = dl_se->deadline; - WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); - inc_dl_migration(dl_se, dl_rq); } static inline void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; - - WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); - dec_dl_migration(dl_se, dl_rq); +} + +static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); +} + +static __always_inline struct sched_statistics * +__schedstats_from_dl_se(struct sched_dl_entity *dl_se) +{ + if (!schedstat_enabled()) + return NULL; + + if (dl_server(dl_se)) + return NULL; + + return &dl_task_of(dl_se)->stats; +} + +static inline void +update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags) +{ + if (!schedstat_enabled()) + return; + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper_dl(dl_rq, dl_se); +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags) +{ + struct task_struct *p = dl_task_of(dl_se); + + if (!schedstat_enabled()) + return; + + if ((flags & DEQUEUE_SLEEP)) { + unsigned int state; + + state = READ_ONCE(p->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(p->stats.sleep_start, + rq_clock(rq_of_dl_rq(dl_rq))); + + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(p->stats.block_start, + rq_clock(rq_of_dl_rq(dl_rq))); + } } static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_dl_entity *entry; - int leftmost = 1; - - BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_dl_entity, rb_node); - if (dl_time_before(dl_se->deadline, entry->deadline)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = 0; - } - } - rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); + WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node)); + + rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less); inc_dl_tasks(dl_se, dl_rq); } @@ -1436,67 +2005,18 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) return; rb_erase_cached(&dl_se->rb_node, &dl_rq->root); + RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); } static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { - BUG_ON(on_dl_rq(dl_se)); + WARN_ON_ONCE(on_dl_rq(dl_se)); - /* - * If this is a wakeup or a new instance, the scheduling - * parameters of the task might need updating. Otherwise, - * we want a replenishment of its runtime. - */ - if (flags & ENQUEUE_WAKEUP) { - task_contending(dl_se, flags); - update_dl_entity(dl_se, pi_se); - } else if (flags & ENQUEUE_REPLENISH) { - replenish_dl_entity(dl_se, pi_se); - } else if ((flags & ENQUEUE_RESTORE) && - dl_time_before(dl_se->deadline, - rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { - setup_new_dl_entity(dl_se); - } - - __enqueue_dl_entity(dl_se); -} - -static void dequeue_dl_entity(struct sched_dl_entity *dl_se) -{ - __dequeue_dl_entity(dl_se); -} - -static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) -{ - struct task_struct *pi_task = rt_mutex_get_top_task(p); - struct sched_dl_entity *pi_se = &p->dl; - - /* - * Use the scheduling parameters of the top pi-waiter task if: - * - we have a top pi-waiter which is a SCHED_DEADLINE task AND - * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is - * smaller than our deadline OR we are a !SCHED_DEADLINE task getting - * boosted due to a SCHED_DEADLINE pi-waiter). - * Otherwise we keep our runtime and deadline. - */ - if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { - pi_se = &pi_task->dl; - } else if (!dl_prio(p->normal_prio)) { - /* - * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceeds its - * runtime while doing so. No point in replenishing - * it, as it's going to return back to its original - * scheduling class after this. - */ - BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); - return; - } + update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); /* * Check if a constrained deadline task was activated @@ -1504,12 +2024,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * If that is the case, the task will be throttled and * the replenishment timer will be set to the next period. */ - if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) - dl_check_constrained_dl(&p->dl); + if (!dl_se->dl_throttled && !dl_is_implicit(dl_se)) + dl_check_constrained_dl(dl_se); - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(&p->dl, &rq->dl); - add_running_bw(&p->dl, &rq->dl); + if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + add_rq_bw(dl_se, dl_rq); + add_running_bw(dl_se, dl_rq); } /* @@ -1524,33 +2046,60 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * be counted in the active utilization; hence, we need to call * add_running_bw(). */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { if (flags & ENQUEUE_WAKEUP) - task_contending(&p->dl, flags); + task_contending(dl_se, flags); return; } - enqueue_dl_entity(&p->dl, pi_se, flags); + /* + * If this is a wakeup or a new instance, the scheduling + * parameters of the task might need updating. Otherwise, + * we want a replenishment of its runtime. + */ + if (flags & ENQUEUE_WAKEUP) { + task_contending(dl_se, flags); + update_dl_entity(dl_se); + } else if (flags & ENQUEUE_REPLENISH) { + replenish_dl_entity(dl_se); + } else if ((flags & ENQUEUE_RESTORE) && + !is_dl_boosted(dl_se) && + dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { + setup_new_dl_entity(dl_se); + } + + /* + * If the reservation is still throttled, e.g., it got replenished but is a + * deferred task and still got to wait, don't enqueue. + */ + if (dl_se->dl_throttled && start_dl_timer(dl_se)) + return; - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) - enqueue_pushable_dl_task(rq, p); -} + /* + * We're about to enqueue, make sure we're not ->dl_throttled! + * In case the timer was not started, say because the defer time + * has passed, mark as not throttled and mark unarmed. + * Also cancel earlier timers, since letting those run is pointless. + */ + if (dl_se->dl_throttled) { + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) -{ - dequeue_dl_entity(&p->dl); - dequeue_pushable_dl_task(rq, p); + __enqueue_dl_entity(dl_se); } -static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) { - update_curr_dl(rq); - __dequeue_task_dl(rq, p, flags); + __dequeue_dl_entity(dl_se); - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(&p->dl, &rq->dl); - sub_rq_bw(&p->dl, &rq->dl); + if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + sub_running_bw(dl_se, dl_rq); + sub_rq_bw(dl_se, dl_rq); } /* @@ -1563,7 +2112,78 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) * or "inactive") */ if (flags & DEQUEUE_SLEEP) - task_non_contending(p); + task_non_contending(dl_se); +} + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + if (is_dl_boosted(&p->dl)) { + /* + * Because of delays in the detection of the overrun of a + * thread's runtime, it might be the case that a thread + * goes to sleep in a rt mutex with negative runtime. As + * a consequence, the thread will be throttled. + * + * While waiting for the mutex, this thread can also be + * boosted via PI, resulting in a thread that is throttled + * and boosted at the same time. + * + * In this case, the boost overrides the throttle. + */ + if (p->dl.dl_throttled) { + /* + * The replenish timer needs to be canceled. No + * problem if it fires concurrently: boosted threads + * are ignored in dl_task_timer(). + */ + cancel_replenish_timer(&p->dl); + p->dl.dl_throttled = 0; + } + } else if (!dl_prio(p->normal_prio)) { + /* + * Special case in which we have a !SCHED_DEADLINE task that is going + * to be deboosted, but exceeds its runtime while doing so. No point in + * replenishing it, as it's going to return back to its original + * scheduling class after this. If it has been throttled, we need to + * clear the flag, otherwise the task may wake up as throttled after + * being boosted again with no means to replenish the runtime and clear + * the throttle. + */ + p->dl.dl_throttled = 0; + if (!(flags & ENQUEUE_REPLENISH)) + printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n", + task_pid_nr(p)); + + return; + } + + check_schedstat_required(); + update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl); + + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= ENQUEUE_MIGRATING; + + enqueue_dl_entity(&p->dl, flags); + + if (dl_server(&p->dl)) + return; + + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); +} + +static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + update_curr_dl(rq); + + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= DEQUEUE_MIGRATING; + + dequeue_dl_entity(&p->dl, flags); + if (!p->dl.dl_throttled && !dl_server(&p->dl)) + dequeue_pushable_dl_task(rq, p); + + return true; } /* @@ -1598,21 +2218,31 @@ static void yield_task_dl(struct rq *rq) #ifdef CONFIG_SMP +static inline bool dl_task_is_earliest_deadline(struct task_struct *p, + struct rq *rq) +{ + return (!rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + rq->dl.earliest_dl.curr)); +} + static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; + bool select_rq; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE) + if (!(flags & WF_TTWU)) goto out; rq = cpu_rq(cpu); rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If we are dealing with a -deadline task, we must @@ -1623,16 +2253,23 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + select_rq = unlikely(dl_task(donor)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &donor->dl)) && + p->nr_cpus_allowed > 1; + + /* + * Take the capacity of the CPU into account to + * ensure it fits the requirement of the task. + */ + if (sched_asym_cpucap_active()) + select_rq |= !dl_task_fits_capacity(p, cpu); + + if (select_rq) { int target = find_later_rq(p); if (target != -1 && - (dl_time_before(p->dl.deadline, - cpu_rq(target)->dl.earliest_dl.curr) || - (cpu_rq(target)->dl.dl_nr_running == 0))) + dl_task_is_earliest_deadline(p, cpu_rq(target))) cpu = target; } rcu_read_unlock(); @@ -1643,9 +2280,10 @@ out: static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { + struct rq_flags rf; struct rq *rq; - if (p->state != TASK_WAKING) + if (READ_ONCE(p->__state) != TASK_WAKING) return; rq = task_rq(p); @@ -1654,22 +2292,22 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * from try_to_wake_up(). Hence, p->pi_lock is locked, but * rq->lock is not... So, lock it */ - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); if (p->dl.dl_non_contending) { + update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); } sub_rq_bw(&p->dl, &rq->dl); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) @@ -1679,7 +2317,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) + !cpudl_find(&rq->rd->cpudl, rq->donor, NULL)) return; /* @@ -1715,10 +2353,10 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * Only called when both the current and waking task are -deadline * tasks. */ -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { - if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; } @@ -1728,26 +2366,31 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... */ - if ((p->dl.deadline == rq->curr->dl.deadline) && + if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); #endif /* CONFIG_SMP */ } #ifdef CONFIG_SCHED_HRTICK -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, dl_se->runtime); } #else /* !CONFIG_SCHED_HRTICK */ -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } #endif static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + p->se.exec_start = rq_clock_task(rq); + if (on_dl_rq(&p->dl)) + update_stats_wait_end_dl(dl_rq, dl_se); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); @@ -1755,44 +2398,72 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled(rq)) - start_hrtick_dl(rq, p); - - if (rq->curr->sched_class != &dl_sched_class) + if (rq->donor->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); deadline_queue_push_tasks(rq); + + if (hrtick_enabled_dl(rq)) + start_hrtick_dl(rq, &p->dl); } -static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, - struct dl_rq *dl_rq) +static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) { struct rb_node *left = rb_first_cached(&dl_rq->root); if (!left) return NULL; - return rb_entry(left, struct sched_dl_entity, rb_node); + return __node_2_dle(left); } -static struct task_struct *pick_next_task_dl(struct rq *rq) +/* + * __pick_next_task_dl - Helper to pick the next -deadline task to run. + * @rq: The runqueue to pick the next task from. + */ +static struct task_struct *__pick_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; +again: if (!sched_dl_runnable(rq)) return NULL; - dl_se = pick_next_dl_entity(rq, dl_rq); - BUG_ON(!dl_se); - p = dl_task_of(dl_se); - set_next_task_dl(rq, p, true); + dl_se = pick_next_dl_entity(dl_rq); + WARN_ON_ONCE(!dl_se); + + if (dl_server(dl_se)) { + p = dl_se->server_pick_task(dl_se); + if (!p) { + if (dl_server_active(dl_se)) { + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + } + goto again; + } + rq->dl_server = dl_se; + } else { + p = dl_task_of(dl_se); + } + return p; } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +static struct task_struct *pick_task_dl(struct rq *rq) +{ + return __pick_task_dl(rq); +} + +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + + if (on_dl_rq(&p->dl)) + update_stats_wait_start_dl(dl_rq, dl_se); + update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); @@ -1818,9 +2489,9 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * not being the leftmost task anymore. In that case NEED_RESCHED will * be set and schedule() will start a new hrtick for the next task. */ - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && - is_leftmost(p, &rq->dl)) - start_hrtick_dl(rq, p); + if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && + is_leftmost(&p->dl, &rq->dl)) + start_hrtick_dl(rq, &p->dl); } static void task_fork_dl(struct task_struct *p) @@ -1836,35 +2507,26 @@ static void task_fork_dl(struct task_struct *p) /* Only try algorithms three times */ #define DL_MAX_TRIES 3 -static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) - return 1; - return 0; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: */ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) { - struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost; struct task_struct *p = NULL; + struct rb_node *next_node; if (!has_pushable_dl_tasks(rq)) return NULL; -next_node: - if (next_node) { - p = rb_entry(next_node, struct task_struct, pushable_dl_tasks); + next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); + while (next_node) { + p = __node_2_pdl(next_node); - if (pick_dl_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); - goto next_node; } return NULL; @@ -1929,8 +2591,8 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(later_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(later_mask, + sched_domain_span(sd)); /* * Last chance: if a CPU being in both later_mask * and current sd span is valid, that becomes our @@ -1952,7 +2614,7 @@ static int find_later_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(later_mask); + cpu = cpumask_any_distribute(later_mask); if (cpu < nr_cpu_ids) return cpu; @@ -1974,9 +2636,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); - if (later_rq->dl.dl_nr_running && - !dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) { + if (!dl_task_is_earliest_deadline(task, later_rq)) { /* * Target rq has tasks of equal or earlier deadline, * retrying does not release any lock and is unlikely @@ -1989,9 +2649,10 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || - task_running(rq, task) || + !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || + task_on_cpu(rq, task) || !dl_task(task) || + is_migration_disabled(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); later_rq = NULL; @@ -2004,9 +2665,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) * its earliest one has a later deadline than our * task, the rq is a good one. */ - if (!later_rq->dl.dl_nr_running || - dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) + if (dl_task_is_earliest_deadline(task, later_rq)) break; /* Otherwise we try again. */ @@ -2024,15 +2683,14 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) if (!has_pushable_dl_tasks(rq)) return NULL; - p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, - struct task_struct, pushable_dl_tasks); + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); return p; } @@ -2048,29 +2706,29 @@ static int push_dl_task(struct rq *rq) struct rq *later_rq; int ret = 0; - if (!rq->dl.overloaded) - return 0; - next_task = pick_next_pushable_dl_task(rq); if (!next_task) return 0; retry: - if (WARN_ON(next_task == rq->curr)) - return 0; - /* * If next_task preempts rq->curr, and rq->curr * can move away, it makes sense to just reschedule * without going further in pushing next_task. */ - if (dl_task(rq->curr) && - dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + if (dl_task(rq->donor) && + dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { resched_curr(rq); return 0; } + if (is_migration_disabled(next_task)) + return 0; + + if (WARN_ON(next_task == rq->curr)) + return 0; + /* We might release rq lock */ get_task_struct(next_task); @@ -2102,15 +2760,7 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, later_rq->cpu); - - /* - * Update the later_rq clock here, because the clock is used - * by the cpufreq_update_util() inside __add_running_bw(). - */ - update_rq_clock(later_rq); - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); + move_queued_task_locked(rq, later_rq, next_task); ret = 1; resched_curr(later_rq); @@ -2133,7 +2783,7 @@ static void push_dl_tasks(struct rq *rq) static void pull_dl_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; - struct task_struct *p; + struct task_struct *p, *push_task; bool resched = false; struct rq *src_rq; u64 dmin = LONG_MAX; @@ -2154,7 +2804,7 @@ static void pull_dl_task(struct rq *this_rq) src_rq = cpu_rq(cpu); /* - * It looks racy, abd it is! However, as in sched_rt.c, + * It looks racy, and it is! However, as in sched_rt.c, * we are fine with this. */ if (this_rq->dl.dl_nr_running && @@ -2163,6 +2813,7 @@ static void pull_dl_task(struct rq *this_rq) continue; /* Might drop this_rq->lock */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2180,9 +2831,7 @@ static void pull_dl_task(struct rq *this_rq) * - it will preempt the last one we pulled (if any). */ if (p && dl_time_before(p->dl.deadline, dmin) && - (!this_rq->dl.dl_nr_running || - dl_time_before(p->dl.deadline, - this_rq->dl.earliest_dl.curr))) { + dl_task_is_earliest_deadline(p, this_rq)) { WARN_ON(p == src_rq->curr); WARN_ON(!task_on_rq_queued(p)); @@ -2191,20 +2840,30 @@ static void pull_dl_task(struct rq *this_rq) * deadline than the current task of its runqueue. */ if (dl_time_before(p->dl.deadline, - src_rq->curr->dl.deadline)) + src_rq->donor->dl.deadline)) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - dmin = p->dl.deadline; + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + move_queued_task_locked(src_rq, this_rq, p); + dmin = p->dl.deadline; + resched = true; + } /* Is there any other task even earlier? */ } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + preempt_disable(); + raw_spin_rq_unlock(this_rq); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + preempt_enable(); + raw_spin_rq_lock(this_rq); + } } if (resched) @@ -2217,23 +2876,23 @@ skip: */ static void task_woken_dl(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - dl_task(rq->curr) && + dl_task(rq->donor) && (rq->curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &rq->curr->dl))) { + !dl_entity_preempt(&p->dl, &rq->donor->dl))) { push_dl_tasks(rq); } } static void set_cpus_allowed_dl(struct task_struct *p, - const struct cpumask *new_mask) + struct affinity_context *ctx) { struct root_domain *src_rd; struct rq *rq; - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); src_rd = rq->rd; @@ -2243,7 +2902,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * update. We already made space for us in the destination * domain (see cpuset_can_attach()). */ - if (!cpumask_intersects(src_rd->span, new_mask)) { + if (!cpumask_intersects(src_rd->span, ctx->new_mask)) { struct dl_bw *src_dl_b; src_dl_b = dl_bw_of(cpu_of(rq)); @@ -2257,7 +2916,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - set_cpus_allowed_common(p, new_mask); + set_cpus_allowed_common(p, ctx); } /* Assumes rq->lock is held */ @@ -2296,9 +2955,13 @@ void dl_add_task_root_domain(struct task_struct *p) struct rq *rq; struct dl_bw *dl_b; - rq = task_rq_lock(p, &rf); - if (!dl_task(p)) - goto unlock; + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + if (!dl_task(p)) { + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); + return; + } + + rq = __task_rq_lock(p, &rf); dl_b = &rq->rd->dl_bw; raw_spin_lock(&dl_b->lock); @@ -2307,17 +2970,27 @@ void dl_add_task_root_domain(struct task_struct *p) raw_spin_unlock(&dl_b->lock); -unlock: task_rq_unlock(rq, p, &rf); } void dl_clear_root_domain(struct root_domain *rd) { - unsigned long flags; + int i; - raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); rd->dl_bw.total_bw = 0; - raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); + + /* + * dl_server bandwidth is only restored when CPUs are attached to root + * domains (after domains are created or CPUs moved back to the + * default root doamin). + */ + for_each_cpu(i, rd->span) { + struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; + + if (dl_server(dl_se) && cpu_active(i)) + rd->dl_bw.total_bw += dl_se->dl_bw; + } } #endif /* CONFIG_SMP */ @@ -2333,7 +3006,13 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(p); + task_non_contending(&p->dl); + + /* + * In case a task is setscheduled out from SCHED_DEADLINE we need to + * keep track of that on its cpuset (for correct bandwidth tracking). + */ + dec_dl_tasks_cs(p); if (!task_on_rq_queued(p)) { /* @@ -2372,8 +3051,13 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); + + /* + * In case a task is setscheduled to SCHED_DEADLINE we need to keep + * track of that on its cpuset (for correct bandwidth tracking). + */ + inc_dl_tasks_cs(p); /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) { @@ -2382,15 +3066,17 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - if (rq->curr != p) { + if (rq->donor != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); #endif - if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (dl_task(rq->donor)) + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); + } else { + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); } } @@ -2401,17 +3087,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (task_on_rq_queued(p) || rq->curr == p) { + if (!task_on_rq_queued(p)) + return; + #ifdef CONFIG_SMP - /* - * This might be too much, but unfortunately - * we don't have the old deadline value, and - * we can't argue if the task is increasing - * or lowering its prio, so... - */ - if (!rq->dl.overloaded) - deadline_queue_pull_task(rq); + /* + * This might be too much, but unfortunately + * we don't have the old deadline value, and + * we can't argue if the task is increasing + * or lowering its prio, so... + */ + if (!rq->dl.overloaded) + deadline_queue_pull_task(rq); + if (task_current_donor(rq, p)) { /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this @@ -2419,26 +3108,42 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline)) resched_curr(rq); -#else + } else { /* - * Again, we don't know if p has a earlier - * or later deadline, so let's blindly set a - * (maybe not needed) rescheduling point. + * Current may not be deadline in case p was throttled but we + * have just replenished it (e.g. rt_mutex_setprio()). + * + * Otherwise, if p was given an earlier deadline, reschedule. */ - resched_curr(rq); -#endif /* CONFIG_SMP */ + if (!dl_task(rq->curr) || + dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) + resched_curr(rq); } +#else + /* + * We don't know if p has a earlier or later deadline, so let's blindly + * set a (maybe not needed) rescheduling point. + */ + resched_curr(rq); +#endif +} + +#ifdef CONFIG_SCHED_CORE +static int task_is_throttled_dl(struct task_struct *p, int cpu) +{ + return p->dl.dl_throttled; } +#endif + +DEFINE_SCHED_CLASS(dl) = { -const struct sched_class dl_sched_class = { - .next = &rt_sched_class, .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, - .check_preempt_curr = check_preempt_curr_dl, + .wakeup_preempt = wakeup_preempt_dl, - .pick_next_task = pick_next_task_dl, + .pick_task = pick_task_dl, .put_prev_task = put_prev_task_dl, .set_next_task = set_next_task_dl, @@ -2450,6 +3155,7 @@ const struct sched_class dl_sched_class = { .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, + .find_lock_rq = find_lock_later_rq, #endif .task_tick = task_tick_dl, @@ -2460,35 +3166,44 @@ const struct sched_class dl_sched_class = { .switched_to = switched_to_dl, .update_curr = update_curr_dl, +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_dl, +#endif }; +/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ +static u64 dl_generation; + int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + u64 gen = ++dl_generation; struct dl_bw *dl_b; - int cpu, ret = 0; + int cpu, cpus, ret = 0; unsigned long flags; /* * Here we want to check the bandwidth not being set to some * value smaller than the currently allocated bandwidth in * any of the root_domains. - * - * FIXME: Cycling on all the CPUs is overdoing, but simpler than - * cycling on root_domains... Discussion on different/better - * solutions is welcome! */ - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) + goto next; + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (new_bw < dl_b->total_bw) + if (new_bw * cpus < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); +next: rcu_read_unlock_sched(); if (ret) @@ -2502,33 +3217,34 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) { if (global_rt_runtime() == RUNTIME_INF) { dl_rq->bw_ratio = 1 << RATIO_SHIFT; - dl_rq->extra_bw = 1 << BW_SHIFT; + dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT; } else { dl_rq->bw_ratio = to_ratio(global_rt_runtime(), global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); - dl_rq->extra_bw = to_ratio(global_rt_period(), - global_rt_runtime()); + dl_rq->max_bw = dl_rq->extra_bw = + to_ratio(global_rt_period(), global_rt_runtime()); } } void sched_dl_do_global(void) { u64 new_bw = -1; + u64 gen = ++dl_generation; struct dl_bw *dl_b; int cpu; unsigned long flags; - def_dl_bandwidth.dl_period = global_rt_period(); - def_dl_bandwidth.dl_runtime = global_rt_runtime(); - if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - /* - * FIXME: As above... - */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) { + rcu_read_unlock_sched(); + continue; + } + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); @@ -2551,11 +3267,12 @@ void sched_dl_do_global(void) int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus, err = -1; + int cpus, err = -1, cpu = task_cpu(p); + struct dl_bw *dl_b = dl_bw_of(cpu); + unsigned long cap; if (attr->sched_flags & SCHED_FLAG_SUGOV) return 0; @@ -2570,15 +3287,17 @@ int sched_dl_overflow(struct task_struct *p, int policy, * allocated bandwidth of the container. */ raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { + !__dl_overflow(dl_b, cap, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) { /* * XXX this is slightly incorrect: when the task * utilization decreases, we should delay the total @@ -2618,7 +3337,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; + dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } @@ -2631,7 +3350,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_runtime = dl_se->dl_runtime; attr->sched_deadline = dl_se->dl_deadline; attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; + attr->sched_flags &= ~SCHED_DL_FLAGS; + attr->sched_flags |= dl_se->flags; } /* @@ -2646,6 +3366,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + u64 period, max, min; + /* special dl tasks don't actually use any parameter */ if (attr->sched_flags & SCHED_FLAG_SUGOV) return true; @@ -2669,22 +3391,29 @@ bool __checkparam_dl(const struct sched_attr *attr) attr->sched_period & (1ULL << 63)) return false; + period = attr->sched_period; + if (!period) + period = attr->sched_deadline; + /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || + if (period < attr->sched_deadline || attr->sched_deadline < attr->sched_runtime) return false; + max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC; + min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC; + + if (period < min || period > max) + return false; + return true; } /* * This function clears the sched_dl_entity static params. */ -void __dl_clear_params(struct task_struct *p) +static void __dl_clear_params(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; - dl_se->dl_runtime = 0; dl_se->dl_deadline = 0; dl_se->dl_period = 0; @@ -2692,11 +3421,23 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; - dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + dl_se->dl_server = 0; + +#ifdef CONFIG_RT_MUTEXES + dl_se->pi_se = dl_se; +#endif +} + +void init_dl_entity(struct sched_dl_entity *dl_se) +{ + RB_CLEAR_NODE(&dl_se->rb_node); + init_dl_task_timer(dl_se); + init_dl_inactive_task_timer(dl_se); + __dl_clear_params(dl_se); } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) @@ -2706,60 +3447,25 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) if (dl_se->dl_runtime != attr->sched_runtime || dl_se->dl_deadline != attr->sched_deadline || dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) + dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS)) return true; return false; } #ifdef CONFIG_SMP -int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) -{ - unsigned int dest_cpu; - struct dl_bw *dl_b; - bool overflow; - int cpus, ret; - unsigned long flags; - - dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); - - rcu_read_lock_sched(); - dl_b = dl_bw_of(dest_cpu); - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); - if (overflow) { - ret = -EBUSY; - } else { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). - */ - __dl_add(dl_b, p->dl.dl_bw, cpus); - ret = 0; - } - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - rcu_read_unlock_sched(); - - return ret; -} - int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { - int ret = 1, trial_cpus; + unsigned long flags, cap; struct dl_bw *cur_dl_b; - unsigned long flags; + int ret = 1; rcu_read_lock_sched(); cur_dl_b = dl_bw_of(cpumask_any(cur)); - trial_cpus = cpumask_weight(trial); - + cap = __dl_bw_capacity(trial); raw_spin_lock_irqsave(&cur_dl_b->lock, flags); - if (cur_dl_b->bw != -1 && - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + if (__dl_overflow(cur_dl_b, cap, 0, 0)) ret = 0; raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); rcu_read_unlock_sched(); @@ -2767,22 +3473,97 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -bool dl_cpu_busy(unsigned int cpu) +enum dl_bw_request { + dl_bw_req_deactivate = 0, + dl_bw_req_alloc, + dl_bw_req_free +}; + +static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; - bool overflow; - int cpus; + bool overflow = 0; + u64 fair_server_bw = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); + + cap = dl_bw_capacity(cpu); + switch (req) { + case dl_bw_req_free: + __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); + break; + case dl_bw_req_alloc: + overflow = __dl_overflow(dl_b, cap, 0, dl_bw); + + if (!overflow) { + /* + * We reserve space in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); + } + break; + case dl_bw_req_deactivate: + /* + * cpu is not off yet, but we need to do the math by + * considering it off already (i.e., what would happen if we + * turn cpu off?). + */ + cap -= arch_scale_cpu_capacity(cpu); + + /* + * cpu is going offline and NORMAL tasks will be moved away + * from it. We can thus discount dl_server bandwidth + * contribution as it won't need to be servicing tasks after + * the cpu is off. + */ + if (cpu_rq(cpu)->fair_server.dl_server) + fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw; + + /* + * Not much to check if no DEADLINE bandwidth is present. + * dl_servers we can discount, as tasks will be moved out the + * offlined CPUs anyway. + */ + if (dl_b->total_bw - fair_server_bw > 0) { + /* + * Leaving at least one CPU for DEADLINE tasks seems a + * wise thing to do. As said above, cpu is not offline + * yet, so account for that. + */ + if (dl_bw_cpus(cpu) - 1) + overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); + else + overflow = 1; + } + + break; + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - return overflow; + return overflow ? -EBUSY : 0; +} + +int dl_bw_deactivate(int cpu) +{ + return dl_bw_manage(dl_bw_req_deactivate, cpu, 0); +} + +int dl_bw_alloc(int cpu, u64 dl_bw) +{ + return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw); +} + +void dl_bw_free(int cpu, u64 dl_bw) +{ + dl_bw_manage(dl_bw_req_free, cpu, dl_bw); } #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 36c54265bb2b..ef047add7f9e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,12 +6,9 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ -#include "sched.h" - -static DEFINE_SPINLOCK(sched_debug_lock); /* - * This allows printing both to /proc/sched_debug and + * This allows printing both to /sys/kernel/debug/sched/debug and * to the console */ #define SEQ_printf(m, x...) \ @@ -169,191 +166,478 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; -__read_mostly bool sched_debug_enabled; +#ifdef CONFIG_SMP -static __init int sched_init_debug(void) +static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); + char buf[16]; + unsigned int scaling; - debugfs_create_bool("sched_debug", 0644, NULL, - &sched_debug_enabled); + if (cnt > 15) + cnt = 15; - return 0; + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + buf[cnt] = '\0'; + + if (kstrtouint(buf, 10, &scaling)) + return -EINVAL; + + if (scaling >= SCHED_TUNABLESCALING_END) + return -EINVAL; + + sysctl_sched_tunable_scaling = scaling; + if (sched_update_scaling()) + return -EINVAL; + + *ppos += cnt; + return cnt; } -late_initcall(sched_init_debug); -#ifdef CONFIG_SMP +static int sched_scaling_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", sysctl_sched_tunable_scaling); + return 0; +} -#ifdef CONFIG_SYSCTL +static int sched_scaling_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_scaling_show, NULL); +} -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} +static const struct file_operations sched_scaling_fops = { + .open = sched_scaling_open, + .write = sched_scaling_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} +#endif /* SMP */ + +#ifdef CONFIG_PREEMPT_DYNAMIC + +static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[16]; + int mode; + + if (cnt > 15) + cnt = 15; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + mode = sched_dynamic_mode(strstrip(buf)); + if (mode < 0) + return mode; + + sched_dynamic_update(mode); + + *ppos += cnt; + + return cnt; +} + +static int sched_dynamic_show(struct seq_file *m, void *v) +{ + static const char * preempt_modes[] = { + "none", "voluntary", "full", "lazy", + }; + int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); + int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + + for (; i < j; i++) { + if (preempt_dynamic_mode == i) + seq_puts(m, "("); + seq_puts(m, preempt_modes[i]); + if (preempt_dynamic_mode == i) + seq_puts(m, ")"); + + seq_puts(m, " "); + } + + seq_puts(m, "\n"); + return 0; +} + +static int sched_dynamic_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_dynamic_show, NULL); +} + +static const struct file_operations sched_dynamic_fops = { + .open = sched_dynamic_open, + .write = sched_dynamic_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; -static struct ctl_table *sd_alloc_ctl_entry(int n) +#endif /* CONFIG_PREEMPT_DYNAMIC */ + +__read_mostly bool sched_debug_verbose; + +#ifdef CONFIG_SMP +static struct dentry *sd_dentry; + + +static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + ssize_t result; + bool orig; - return entry; + cpus_read_lock(); + mutex_lock(&sched_domains_mutex); + + orig = sched_debug_verbose; + result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); + + if (sched_debug_verbose && !orig) + update_sched_domain_debugfs(); + else if (!sched_debug_verbose && orig) { + debugfs_remove(sd_dentry); + sd_dentry = NULL; + } + + mutex_unlock(&sched_domains_mutex); + cpus_read_unlock(); + + return result; } +#else +#define sched_verbose_write debugfs_write_file_bool +#endif + +static const struct file_operations sched_verbose_fops = { + .read = debugfs_read_file_bool, + .write = sched_verbose_write, + .open = simple_open, + .llseek = default_llseek, +}; + +static const struct seq_operations sched_debug_sops; -static void sd_free_ctl_entry(struct ctl_table **tablep) +static int sched_debug_open(struct inode *inode, struct file *filp) { - struct ctl_table *entry; + return seq_open(filp, &sched_debug_sops); +} - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); +static const struct file_operations sched_debug_fops = { + .open = sched_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +enum dl_param { + DL_RUNTIME = 0, + DL_PERIOD, +}; + +static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ +static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ + +static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, enum dl_param param) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + u64 runtime, period; + size_t err; + int retval; + u64 value; + + err = kstrtoull_from_user(ubuf, cnt, 10, &value); + if (err) + return err; + + scoped_guard (rq_lock_irqsave, rq) { + runtime = rq->fair_server.dl_runtime; + period = rq->fair_server.dl_period; + + switch (param) { + case DL_RUNTIME: + if (runtime == value) + break; + runtime = value; + break; + case DL_PERIOD: + if (value == period) + break; + period = value; + break; + } + + if (runtime > period || + period > fair_server_period_max || + period < fair_server_period_min) { + return -EINVAL; + } + + if (rq->cfs.h_nr_queued) { + update_rq_clock(rq); + dl_server_stop(&rq->fair_server); + } + + retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); + if (retval) + cnt = retval; + + if (!runtime) + printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", + cpu_of(rq)); + + if (rq->cfs.h_nr_queued) + dl_server_start(&rq->fair_server); } - kfree(*tablep); - *tablep = NULL; + *ppos += cnt; + return cnt; } -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(9); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); - set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); - /* &table[8] is terminator */ - - return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; +static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + u64 value; + + switch (param) { + case DL_RUNTIME: + value = rq->fair_server.dl_runtime; + break; + case DL_PERIOD: + value = rq->fair_server.dl_period; + break; } - return table; + + seq_printf(m, "%llu\n", value); + return 0; + } -static cpumask_var_t sd_sysctl_cpus; -static struct ctl_table_header *sd_sysctl_header; +static ssize_t +sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); +} -void register_sched_domain_sysctl(void) +static int sched_fair_server_runtime_show(struct seq_file *m, void *v) { - static struct ctl_table *cpu_entries; - static struct ctl_table **cpu_idx; - static bool init_done = false; - char buf[32]; - int i; + return sched_fair_server_show(m, v, DL_RUNTIME); +} - if (!cpu_entries) { - cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); - if (!cpu_entries) - return; +static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_fair_server_runtime_show, inode->i_private); +} + +static const struct file_operations fair_server_runtime_fops = { + .open = sched_fair_server_runtime_open, + .write = sched_fair_server_runtime_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static ssize_t +sched_fair_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); +} - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = cpu_entries; +static int sched_fair_server_period_show(struct seq_file *m, void *v) +{ + return sched_fair_server_show(m, v, DL_PERIOD); +} + +static int sched_fair_server_period_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_fair_server_period_show, inode->i_private); +} + +static const struct file_operations fair_server_period_fops = { + .open = sched_fair_server_period_open, + .write = sched_fair_server_period_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *debugfs_sched; + +static void debugfs_fair_server_init(void) +{ + struct dentry *d_fair; + unsigned long cpu; + + d_fair = debugfs_create_dir("fair_server", debugfs_sched); + if (!d_fair) + return; + + for_each_possible_cpu(cpu) { + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%lu", cpu); + d_cpu = debugfs_create_dir(buf, d_fair); + + debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops); + debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops); } +} - if (!cpu_idx) { - struct ctl_table *e = cpu_entries; +static __init int sched_init_debug(void) +{ + struct dentry __maybe_unused *numa; - cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); - if (!cpu_idx) - return; + debugfs_sched = debugfs_create_dir("sched", NULL); - /* deal with sparse possible map */ - for_each_possible_cpu(i) { - cpu_idx[i] = e; - e++; - } + debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops); + debugfs_create_file_unsafe("verbose", 0644, debugfs_sched, &sched_debug_verbose, &sched_verbose_fops); +#ifdef CONFIG_PREEMPT_DYNAMIC + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); +#endif + + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); + +#ifdef CONFIG_SMP + debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); + debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); + debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + + mutex_lock(&sched_domains_mutex); + update_sched_domain_debugfs(); + mutex_unlock(&sched_domains_mutex); +#endif + +#ifdef CONFIG_NUMA_BALANCING + numa = debugfs_create_dir("numa_balancing", debugfs_sched); + + debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay); + debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min); + debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); + debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); +#endif + + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + debugfs_fair_server_init(); + + return 0; +} +late_initcall(sched_init_debug); + +#ifdef CONFIG_SMP + +static cpumask_var_t sd_sysctl_cpus; + +static int sd_flags_show(struct seq_file *m, void *v) +{ + unsigned long flags = *(unsigned int *)m->private; + int idx; + + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + seq_puts(m, sd_flag_debug[idx].name); + seq_puts(m, " "); } + seq_puts(m, "\n"); + + return 0; +} + +static int sd_flags_open(struct inode *inode, struct file *file) +{ + return single_open(file, sd_flags_show, inode->i_private); +} + +static const struct file_operations sd_flags_fops = { + .open = sd_flags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void register_sd(struct sched_domain *sd, struct dentry *parent) +{ +#define SDM(type, mode, member) \ + debugfs_create_##type(#member, mode, parent, &sd->member) + + SDM(ulong, 0644, min_interval); + SDM(ulong, 0644, max_interval); + SDM(u64, 0644, max_newidle_lb_cost); + SDM(u32, 0644, busy_factor); + SDM(u32, 0644, imbalance_pct); + SDM(u32, 0644, cache_nice_tries); + SDM(str, 0444, name); + +#undef SDM + + debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); + debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); + debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level); +} + +void update_sched_domain_debugfs(void) +{ + int cpu, i; + + /* + * This can unfortunately be invoked before sched_debug_init() creates + * the debug directory. Don't touch sd_sysctl_cpus until then. + */ + if (!debugfs_sched) + return; + + if (!sched_debug_verbose) + return; if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; + cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); } - if (!init_done) { - init_done = true; - /* init to possible to not have holes in @cpu_entries */ - cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); + if (!sd_dentry) { + sd_dentry = debugfs_create_dir("domains", debugfs_sched); + + /* rebuild sd_sysctl_cpus if empty since it gets cleared below */ + if (cpumask_empty(sd_sysctl_cpus)) + cpumask_copy(sd_sysctl_cpus, cpu_online_mask); } - for_each_cpu(i, sd_sysctl_cpus) { - struct ctl_table *e = cpu_idx[i]; + for_each_cpu(cpu, sd_sysctl_cpus) { + struct sched_domain *sd; + struct dentry *d_cpu; + char buf[32]; - if (e->child) - sd_free_ctl_entry(&e->child); + snprintf(buf, sizeof(buf), "cpu%d", cpu); + debugfs_lookup_and_remove(buf, sd_dentry); + d_cpu = debugfs_create_dir(buf, sd_dentry); - if (!e->procname) { - snprintf(buf, 32, "cpu%d", i); - e->procname = kstrdup(buf, GFP_KERNEL); + i = 0; + for_each_domain(cpu, sd) { + struct dentry *d_sd; + + snprintf(buf, sizeof(buf), "domain%d", i); + d_sd = debugfs_create_dir(buf, d_cpu); + + register_sd(sd, d_sd); + i++; } - e->mode = 0555; - e->child = sd_alloc_ctl_cpu_table(i); - __cpumask_clear_cpu(i, sd_sysctl_cpus); + __cpumask_clear_cpu(cpu, sd_sysctl_cpus); } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); } void dirty_sched_domain_sysctl(int cpu) @@ -362,13 +646,6 @@ void dirty_sched_domain_sysctl(int cpu) __cpumask_set_cpu(cpu, sd_sysctl_cpus); } -/* may be called multiple times per register */ -void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; -} -#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -377,9 +654,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group struct sched_entity *se = tg->se[cpu]; #define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) -#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) +#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \ + #F, (long long)schedstat_val(stats->F)) #define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) +#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", \ + #F, SPLIT_NS((long long)schedstat_val(stats->F))) if (!se) return; @@ -389,16 +668,19 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->sum_exec_runtime); if (schedstat_enabled()) { - PN_SCHEDSTAT(se->statistics.wait_start); - PN_SCHEDSTAT(se->statistics.sleep_start); - PN_SCHEDSTAT(se->statistics.block_start); - PN_SCHEDSTAT(se->statistics.sleep_max); - PN_SCHEDSTAT(se->statistics.block_max); - PN_SCHEDSTAT(se->statistics.exec_max); - PN_SCHEDSTAT(se->statistics.slice_max); - PN_SCHEDSTAT(se->statistics.wait_max); - PN_SCHEDSTAT(se->statistics.wait_sum); - P_SCHEDSTAT(se->statistics.wait_count); + struct sched_statistics *stats; + stats = __schedstats_from_se(se); + + PN_SCHEDSTAT(wait_start); + PN_SCHEDSTAT(sleep_start); + PN_SCHEDSTAT(block_start); + PN_SCHEDSTAT(sleep_max); + PN_SCHEDSTAT(block_max); + PN_SCHEDSTAT(exec_max); + PN_SCHEDSTAT(slice_max); + PN_SCHEDSTAT(wait_max); + PN_SCHEDSTAT(wait_sum); + P_SCHEDSTAT(wait_count); } P(se->load.weight); @@ -416,43 +698,69 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #endif #ifdef CONFIG_CGROUP_SCHED +static DEFINE_SPINLOCK(sched_debug_lock); static char group_path[PATH_MAX]; -static char *task_group_path(struct task_group *tg) +static void task_group_path(struct task_group *tg, char *path, int plen) { - if (autogroup_path(tg, group_path, PATH_MAX)) - return group_path; + if (autogroup_path(tg, path, plen)) + return; - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + cgroup_path(tg->css.cgroup, path, plen); +} - return group_path; +/* + * Only 1 SEQ_printf_task_group_path() caller can use the full length + * group_path[] for cgroup path. Other simultaneous callers will have + * to use a shorter stack buffer. A "..." suffix is appended at the end + * of the stack buffer so that it will show up in case the output length + * matches the given buffer size to indicate possible path name truncation. + */ +#define SEQ_printf_task_group_path(m, tg, fmt...) \ +{ \ + if (spin_trylock(&sched_debug_lock)) { \ + task_group_path(tg, group_path, sizeof(group_path)); \ + SEQ_printf(m, fmt, group_path); \ + spin_unlock(&sched_debug_lock); \ + } else { \ + char buf[128]; \ + char *bufend = buf + sizeof(buf) - 3; \ + task_group_path(tg, buf, bufend - buf); \ + strcpy(bufend - 1, "..."); \ + SEQ_printf(m, fmt, buf); \ + } \ } #endif static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { - if (rq->curr == p) + if (task_current(rq, p)) SEQ_printf(m, ">R"); else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), + entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', + SPLIT_NS(p->se.deadline), + p->se.custom_slice ? 'S' : ' ', + SPLIT_NS(p->se.slice), + SPLIT_NS(p->se.sum_exec_runtime), (long long)(p->nvcsw + p->nivcsw), p->prio); - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); + SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld", + SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); #ifdef CONFIG_NUMA_BALANCING - SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif #ifdef CONFIG_CGROUP_SCHED - SEQ_printf(m, " %s", task_group_path(task_group(p))); + SEQ_printf_task_group_path(m, task_group(p), " %s") #endif SEQ_printf(m, "\n"); @@ -464,10 +772,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" - " wait-time sum-exec sum-sleep\n"); + SEQ_printf(m, " S task PID vruntime eligible " + "deadline slice sum-exec switches " + "prio wait-time sum-sleep sum-block" +#ifdef CONFIG_NUMA_BALANCING + " node group-id" +#endif +#ifdef CONFIG_CGROUP_SCHED + " group-path" +#endif + "\n"); SEQ_printf(m, "-------------------------------------------------------" - "------------------------------------------------------\n"); + "------------------------------------------------------" + "------------------------------------------------------" +#ifdef CONFIG_NUMA_BALANCING + "--------------" +#endif +#ifdef CONFIG_CGROUP_SCHED + "--------------" +#endif + "\n"); rcu_read_lock(); for_each_process_thread(g, p) { @@ -481,46 +805,48 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, - spread, rq0_min_vruntime, spread0; + s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread; + struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); - struct sched_entity *last; unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, "\n"); - SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); + SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu); #else SEQ_printf(m, "\n"); SEQ_printf(m, "cfs_rq[%d]:\n", cpu); #endif - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", - SPLIT_NS(cfs_rq->exec_clock)); - raw_spin_lock_irqsave(&rq->lock, flags); - if (rb_first_cached(&cfs_rq->tasks_timeline)) - MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; + raw_spin_rq_lock_irqsave(rq, flags); + root = __pick_root_entity(cfs_rq); + if (root) + left_vruntime = root->min_vruntime; + first = __pick_first_entity(cfs_rq); + if (first) + left_deadline = first->deadline; last = __pick_last_entity(cfs_rq); if (last) - max_vruntime = last->vruntime; + right_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; - rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; - raw_spin_unlock_irqrestore(&rq->lock, flags); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", - SPLIT_NS(MIN_vruntime)); + raw_spin_rq_unlock_irqrestore(rq, flags); + + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", + SPLIT_NS(left_deadline)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", + SPLIT_NS(left_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", SPLIT_NS(min_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", - SPLIT_NS(max_vruntime)); - spread = max_vruntime - MIN_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", - SPLIT_NS(spread)); - spread0 = min_vruntime - rq0_min_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", - SPLIT_NS(spread0)); - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", - cfs_rq->nr_spread_over); - SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", + SPLIT_NS(avg_vruntime(cfs_rq))); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", + SPLIT_NS(right_vruntime)); + spread = right_vruntime - left_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); + SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", @@ -529,8 +855,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->avg.runnable_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); - SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", - cfs_rq->avg.util_est.enqueued); + SEQ_printf(m, " .%-30s: %u\n", "util_est", + cfs_rq->avg.util_est); SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", cfs_rq->removed.load_avg); SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", @@ -560,7 +886,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #ifdef CONFIG_RT_GROUP_SCHED SEQ_printf(m, "\n"); - SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); + SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu); #else SEQ_printf(m, "\n"); SEQ_printf(m, "rt_rq[%d]:\n", cpu); @@ -574,12 +900,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) PU(rt_nr_running); -#ifdef CONFIG_SMP - PU(rt_nr_migratory); -#endif + +#ifdef CONFIG_RT_GROUP_SCHED P(rt_throttled); PN(rt_time); PN(rt_runtime); +#endif #undef PN #undef PU @@ -598,7 +924,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) PU(dl_nr_running); #ifdef CONFIG_SMP - PU(dl_nr_migratory); dl_bw = &cpu_rq(cpu)->rd->dl_bw; #else dl_bw = &dl_rq->dl_bw; @@ -612,7 +937,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; #ifdef CONFIG_X86 { @@ -628,7 +952,7 @@ static void print_cpu(struct seq_file *m, int cpu) #define P(x) \ do { \ if (sizeof(rq->x) == 4) \ - SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ + SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \ else \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ } while (0) @@ -663,13 +987,11 @@ do { \ } #undef P - spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); print_dl_stats(m, cpu); print_rq(m, rq, cpu); - spin_unlock_irqrestore(&sched_debug_lock, flags); SEQ_printf(m, "\n"); } @@ -716,10 +1038,7 @@ static void sched_debug_header(struct seq_file *m) SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) - PN(sysctl_sched_latency); - PN(sysctl_sched_min_granularity); - PN(sysctl_sched_wakeup_granularity); - P(sysctl_sched_child_runs_first); + PN(sysctl_sched_base_slice); P(sysctl_sched_features); #undef PN #undef P @@ -761,7 +1080,7 @@ void sysrq_sched_debug_show(void) } /* - * This itererator needs some explanation. + * This iterator needs some explanation. * It returns 1 for the header position. * This means 2 is CPU 0. * In a hotplugged system some CPUs, including CPU 0, may be missing so we have @@ -806,18 +1125,10 @@ static const struct seq_operations sched_debug_sops = { .show = sched_debug_show, }; -static int __init init_sched_debug_procfs(void) -{ - if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops)) - return -ENOMEM; - return 0; -} - -__initcall(init_sched_debug_procfs); - #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F)) #define __P(F) __PS(#F, F) #define P(F) __PS(#F, p->F) +#define PM(F, M) __PS(#F, p->F & (M)) #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F))) #define __PN(F) __PSN(#F, F) #define PN(F) __PSN(#F, p->F) @@ -837,25 +1148,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, static void sched_show_numa(struct task_struct *p, struct seq_file *m) { #ifdef CONFIG_NUMA_BALANCING - struct mempolicy *pol; - if (p->mm) P(mm->numa_scan_seq); - task_lock(p); - pol = p->mempolicy; - if (pol && !(pol->flags & MPOL_F_MORON)) - pol = NULL; - mpol_get(pol); - task_unlock(p); - P(numa_pages_migrated); P(numa_preferred_nid); P(total_numa_faults); SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); - mpol_put(pol); #endif } @@ -870,8 +1171,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "---------------------------------------------------------" "----------\n"); -#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F)) -#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F)) +#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F)) +#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F)) PN(se.exec_start); PN(se.vruntime); @@ -884,33 +1185,34 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; - PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); - PN_SCHEDSTAT(se.statistics.wait_start); - PN_SCHEDSTAT(se.statistics.sleep_start); - PN_SCHEDSTAT(se.statistics.block_start); - PN_SCHEDSTAT(se.statistics.sleep_max); - PN_SCHEDSTAT(se.statistics.block_max); - PN_SCHEDSTAT(se.statistics.exec_max); - PN_SCHEDSTAT(se.statistics.slice_max); - PN_SCHEDSTAT(se.statistics.wait_max); - PN_SCHEDSTAT(se.statistics.wait_sum); - P_SCHEDSTAT(se.statistics.wait_count); - PN_SCHEDSTAT(se.statistics.iowait_sum); - P_SCHEDSTAT(se.statistics.iowait_count); - P_SCHEDSTAT(se.statistics.nr_migrations_cold); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); - P_SCHEDSTAT(se.statistics.nr_forced_migrations); - P_SCHEDSTAT(se.statistics.nr_wakeups); - P_SCHEDSTAT(se.statistics.nr_wakeups_sync); - P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); - P_SCHEDSTAT(se.statistics.nr_wakeups_local); - P_SCHEDSTAT(se.statistics.nr_wakeups_remote); - P_SCHEDSTAT(se.statistics.nr_wakeups_affine); - P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); - P_SCHEDSTAT(se.statistics.nr_wakeups_passive); - P_SCHEDSTAT(se.statistics.nr_wakeups_idle); + PN_SCHEDSTAT(sum_sleep_runtime); + PN_SCHEDSTAT(sum_block_runtime); + PN_SCHEDSTAT(wait_start); + PN_SCHEDSTAT(sleep_start); + PN_SCHEDSTAT(block_start); + PN_SCHEDSTAT(sleep_max); + PN_SCHEDSTAT(block_max); + PN_SCHEDSTAT(exec_max); + PN_SCHEDSTAT(slice_max); + PN_SCHEDSTAT(wait_max); + PN_SCHEDSTAT(wait_sum); + P_SCHEDSTAT(wait_count); + PN_SCHEDSTAT(iowait_sum); + P_SCHEDSTAT(iowait_count); + P_SCHEDSTAT(nr_migrations_cold); + P_SCHEDSTAT(nr_failed_migrations_affine); + P_SCHEDSTAT(nr_failed_migrations_running); + P_SCHEDSTAT(nr_failed_migrations_hot); + P_SCHEDSTAT(nr_forced_migrations); + P_SCHEDSTAT(nr_wakeups); + P_SCHEDSTAT(nr_wakeups_sync); + P_SCHEDSTAT(nr_wakeups_migrate); + P_SCHEDSTAT(nr_wakeups_local); + P_SCHEDSTAT(nr_wakeups_remote); + P_SCHEDSTAT(nr_wakeups_affine); + P_SCHEDSTAT(nr_wakeups_affine_attempts); + P_SCHEDSTAT(nr_wakeups_passive); + P_SCHEDSTAT(nr_wakeups_idle); avg_atom = p->se.sum_exec_runtime; if (nr_switches) @@ -928,6 +1230,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PN(avg_atom); __PN(avg_per_cpu); + +#ifdef CONFIG_SCHED_CORE + PN_SCHEDSTAT(core_forceidle_sum); +#endif } __P(nr_switches); @@ -943,8 +1249,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.runnable_avg); P(se.avg.util_avg); P(se.avg.last_update_time); - P(se.avg.util_est.ewma); - P(se.avg.util_est.enqueued); + PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED); #endif #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); @@ -957,7 +1262,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, if (task_has_dl_policy(p)) { P(dl.runtime); P(dl.deadline); + } else if (fair_policy(p->policy)) { + P(se.slice); } +#ifdef CONFIG_SCHED_CLASS_EXT + __PS("ext.enabled", task_on_scx(p)); +#endif #undef PN_SCHEDSTAT #undef P_SCHEDSTAT @@ -976,6 +1286,18 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - memset(&p->se.statistics, 0, sizeof(p->se.statistics)); + memset(&p->stats, 0, sizeof(p->stats)); #endif } + +void resched_latency_warn(int cpu, u64 latency) +{ + static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1); + + if (likely(!__ratelimit(&latency_check_ratelimit))) + return; + + pr_err("sched: CPU %d need_resched set for > %llu ns (%d ticks) without schedule\n", + cpu, latency, cpu_rq(cpu)->ticks_without_resched); + dump_stack(); +} diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c new file mode 100644 index 000000000000..7b9dfee858e7 --- /dev/null +++ b/kernel/sched/ext.c @@ -0,0 +1,7870 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) + +enum scx_consts { + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_DFL_LEN = 32768, + + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_OPS_TASK_ITER_BATCH = 32, +}; + +enum scx_exit_kind { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ + SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ + SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +}; + +/* + * An exit code can be specified when exiting with scx_bpf_exit() or + * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN + * respectively. The codes are 64bit of the format: + * + * Bits: [63 .. 48 47 .. 32 31 .. 0] + * [ SYS ACT ] [ SYS RSN ] [ USR ] + * + * SYS ACT: System-defined exit actions + * SYS RSN: System-defined exit reasons + * USR : User-defined exit codes and reasons + * + * Using the above, users may communicate intention and context by ORing system + * actions and/or system reasons with a user-defined exit code. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +/* + * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is + * being disabled. + */ +struct scx_exit_info { + /* %SCX_EXIT_* - broad category of the exit reason */ + enum scx_exit_kind kind; + + /* exit code if gracefully exiting */ + s64 exit_code; + + /* textual representation of the above */ + const char *reason; + + /* backtrace if exiting due to an error */ + unsigned long *bt; + u32 bt_len; + + /* informational message */ + char *msg; + + /* debug dump */ + char *dump; +}; + +/* sched_ext_ops.flags */ +enum scx_ops_flags { + /* + * Keep built-in idle tracking even if ops.update_idle() is implemented. + */ + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + + /* + * By default, if there are no other task to run on the CPU, ext core + * keeps running the current task even after its slice expires. If this + * flag is specified, such tasks are passed to ops.enqueue() with + * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. + */ + SCX_OPS_ENQ_LAST = 1LLU << 1, + + /* + * An exiting task may schedule after PF_EXITING is set. In such cases, + * bpf_task_from_pid() may not be able to find the task and if the BPF + * scheduler depends on pid lookup for dispatching, the task will be + * lost leading to various issues including RCU grace period stalls. + * + * To mask this problem, by default, unhashed tasks are automatically + * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't + * depend on pid lookups and wants to handle these tasks directly, the + * following flag can be used. + */ + SCX_OPS_ENQ_EXITING = 1LLU << 2, + + /* + * If set, only tasks with policy set to SCHED_EXT are attached to + * sched_ext. If clear, SCHED_NORMAL tasks are also included. + */ + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + + /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue. If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). + */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * CPU cgroup support flags + */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ + + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_HAS_CGROUP_WEIGHT, +}; + +/* argument container for ops.init_task() */ +struct scx_init_task_args { + /* + * Set if ops.init_task() is being invoked on the fork path, as opposed + * to the scheduler transition path. + */ + bool fork; +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif +}; + +/* argument container for ops.exit_task() */ +struct scx_exit_task_args { + /* Whether the task exited before running on sched_ext. */ + bool cancelled; +}; + +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; +}; + +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future. + */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + +/* + * Informational context provided to dump operations. + */ +struct scx_dump_ctx { + enum scx_exit_kind kind; + s64 exit_code; + const char *reason; + u64 at_ns; + u64 at_jiffies; +}; + +/** + * struct sched_ext_ops - Operation table for BPF scheduler implementation + * + * A BPF scheduler can implement an arbitrary scheduling policy by + * implementing and loading operations in this table. Note that a userland + * scheduling policy can also be implemented using the BPF scheduler + * as a shim layer. + */ +struct sched_ext_ops { + /** + * @select_cpu: Pick the target CPU for a task which is being woken up + * @p: task being woken up + * @prev_cpu: the cpu @p was on before sleeping + * @wake_flags: SCX_WAKE_* + * + * Decision made here isn't final. @p may be moved to any CPU while it + * is getting dispatched for execution later. However, as @p is not on + * the rq at this point, getting the eventual execution CPU right here + * saves a small bit of overhead down the line. + * + * If an idle CPU is returned, the CPU is kicked and will try to + * dispatch. While an explicit custom mechanism can be added, + * select_cpu() serves as the default way to wake up idle CPUs. + * + * @p may be inserted into a DSQ directly by calling + * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. + * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ + * of the CPU returned by this operation. + * + * Note that select_cpu() is never called for tasks that can only run + * on a single CPU or tasks with migration disabled, as they don't have + * the option to select a different CPU. See select_task_rq() for + * details. + */ + s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); + + /** + * @enqueue: Enqueue a task on the BPF scheduler + * @p: task being enqueued + * @enq_flags: %SCX_ENQ_* + * + * @p is ready to run. Insert directly into a DSQ by calling + * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly + * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, + * the task will stall. + * + * If @p was inserted into a DSQ from ops.select_cpu(), this callback is + * skipped. + */ + void (*enqueue)(struct task_struct *p, u64 enq_flags); + + /** + * @dequeue: Remove a task from the BPF scheduler + * @p: task being dequeued + * @deq_flags: %SCX_DEQ_* + * + * Remove @p from the BPF scheduler. This is usually called to isolate + * the task while updating its scheduling properties (e.g. priority). + * + * The ext core keeps track of whether the BPF side owns a given task or + * not and can gracefully ignore spurious dispatches from BPF side, + * which makes it safe to not implement this method. However, depending + * on the scheduling logic, this can lead to confusing behaviors - e.g. + * scheduling position not being updated across a priority change. + */ + void (*dequeue)(struct task_struct *p, u64 deq_flags); + + /** + * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs + * @cpu: CPU to dispatch tasks for + * @prev: previous task being switched out + * + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ + * using scx_bpf_dsq_move_to_local(). + * + * The maximum number of times scx_bpf_dsq_insert() can be called + * without an intervening scx_bpf_dsq_move_to_local() is specified by + * ops.dispatch_max_batch. See the comments on top of the two functions + * for more details. + * + * When not %NULL, @prev is an SCX task with its slice depleted. If + * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in + * @prev->scx.flags, it is not enqueued yet and will be enqueued after + * ops.dispatch() returns. To keep executing @prev, return without + * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. + */ + void (*dispatch)(s32 cpu, struct task_struct *prev); + + /** + * @tick: Periodic tick + * @p: task running currently + * + * This operation is called every 1/HZ seconds on CPUs which are + * executing an SCX task. Setting @p->scx.slice to 0 will trigger an + * immediate dispatch cycle on the CPU. + */ + void (*tick)(struct task_struct *p); + + /** + * @runnable: A task is becoming runnable on its associated CPU + * @p: task becoming runnable + * @enq_flags: %SCX_ENQ_* + * + * This and the following three functions can be used to track a task's + * execution state transitions. A task becomes ->runnable() on a CPU, + * and then goes through one or more ->running() and ->stopping() pairs + * as it runs on the CPU, and eventually becomes ->quiescent() when it's + * done running on the CPU. + * + * @p is becoming runnable on the CPU because it's + * + * - waking up (%SCX_ENQ_WAKEUP) + * - being moved from another CPU + * - being restored after temporarily taken off the queue for an + * attribute change. + * + * This and ->enqueue() are related but not coupled. This operation + * notifies @p's state transition and may not be followed by ->enqueue() + * e.g. when @p is being dispatched to a remote CPU, or when @p is + * being enqueued on a CPU experiencing a hotplug event. Likewise, a + * task may be ->enqueue()'d without being preceded by this operation + * e.g. after exhausting its slice. + */ + void (*runnable)(struct task_struct *p, u64 enq_flags); + + /** + * @running: A task is starting to run on its associated CPU + * @p: task starting to run + * + * See ->runnable() for explanation on the task state notifiers. + */ + void (*running)(struct task_struct *p); + + /** + * @stopping: A task is stopping execution + * @p: task stopping to run + * @runnable: is task @p still runnable? + * + * See ->runnable() for explanation on the task state notifiers. If + * !@runnable, ->quiescent() will be invoked after this operation + * returns. + */ + void (*stopping)(struct task_struct *p, bool runnable); + + /** + * @quiescent: A task is becoming not runnable on its associated CPU + * @p: task becoming not runnable + * @deq_flags: %SCX_DEQ_* + * + * See ->runnable() for explanation on the task state notifiers. + * + * @p is becoming quiescent on the CPU because it's + * + * - sleeping (%SCX_DEQ_SLEEP) + * - being moved to another CPU + * - being temporarily taken off the queue for an attribute change + * (%SCX_DEQ_SAVE) + * + * This and ->dequeue() are related but not coupled. This operation + * notifies @p's state transition and may not be preceded by ->dequeue() + * e.g. when @p is being dispatched to a remote CPU. + */ + void (*quiescent)(struct task_struct *p, u64 deq_flags); + + /** + * @yield: Yield CPU + * @from: yielding task + * @to: optional yield target task + * + * If @to is NULL, @from is yielding the CPU to other runnable tasks. + * The BPF scheduler should ensure that other available tasks are + * dispatched before the yielding task. Return value is ignored in this + * case. + * + * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf + * scheduler can implement the request, return %true; otherwise, %false. + */ + bool (*yield)(struct task_struct *from, struct task_struct *to); + + /** + * @core_sched_before: Task ordering for core-sched + * @a: task A + * @b: task B + * + * Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. + * + * If not specified, the default is ordering them according to when they + * became runnable. + */ + bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); + + /** + * @set_weight: Set task weight + * @p: task to set weight for + * @weight: new weight [1..10000] + * + * Update @p's weight to @weight. + */ + void (*set_weight)(struct task_struct *p, u32 weight); + + /** + * @set_cpumask: Set CPU affinity + * @p: task to set CPU affinity for + * @cpumask: cpumask of cpus that @p can run on + * + * Update @p's CPU affinity to @cpumask. + */ + void (*set_cpumask)(struct task_struct *p, + const struct cpumask *cpumask); + + /** + * @update_idle: Update the idle state of a CPU + * @cpu: CPU to update the idle state for + * @idle: whether entering or exiting the idle state + * + * This operation is called when @rq's CPU goes or leaves the idle + * state. By default, implementing this operation disables the built-in + * idle CPU tracking and the following helpers become unavailable: + * + * - scx_bpf_select_cpu_dfl() + * - scx_bpf_test_and_clear_cpu_idle() + * - scx_bpf_pick_idle_cpu() + * + * The user also must implement ops.select_cpu() as the default + * implementation relies on scx_bpf_select_cpu_dfl(). + * + * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle + * tracking. + */ + void (*update_idle)(s32 cpu, bool idle); + + /** + * @cpu_acquire: A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * @cpu_release: A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + + /** + * @init_task: Initialize a task to run in a BPF scheduler + * @p: task to initialize for BPF scheduling + * @args: init arguments, see the struct definition + * + * Either we're loading a BPF scheduler or a new task is being forked. + * Initialize @p for BPF scheduling. This operation may block and can + * be used for allocations, and is called exactly once for a task. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During a fork, it + * will abort that specific fork. + */ + s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); + + /** + * @exit_task: Exit a previously-running task from the system + * @p: task to exit + * @args: exit arguments, see the struct definition + * + * @p is exiting or the BPF scheduler is being unloaded. Perform any + * necessary cleanup for @p. + */ + void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); + + /** + * @enable: Enable BPF scheduling for a task + * @p: task to enable BPF scheduling for + * + * Enable @p for BPF scheduling. enable() is called on @p any time it + * enters SCX, and is always paired with a matching disable(). + */ + void (*enable)(struct task_struct *p); + + /** + * @disable: Disable BPF scheduling for a task + * @p: task to disable BPF scheduling for + * + * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. + * Disable BPF scheduling for @p. A disable() call is always matched + * with a prior enable() call. + */ + void (*disable)(struct task_struct *p); + + /** + * @dump: Dump BPF scheduler state on error + * @ctx: debug dump context + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. + */ + void (*dump)(struct scx_dump_ctx *ctx); + + /** + * @dump_cpu: Dump BPF scheduler state for a CPU on error + * @ctx: debug dump context + * @cpu: CPU to generate debug dump for + * @idle: @cpu is currently idle without any runnable tasks + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @cpu. If @idle is %true and this operation doesn't produce any + * output, @cpu is skipped for dump. + */ + void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); + + /** + * @dump_task: Dump BPF scheduler state for a runnable task on error + * @ctx: debug dump context + * @p: runnable task to generate debug dump for + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @p. + */ + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); + +#ifdef CONFIG_EXT_GROUP_SCHED + /** + * @cgroup_init: Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During cgroup + * creation, it will abort the specific cgroup creation. + */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * @cgroup_exit: Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. This operation my block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * @cgroup_prep_move: Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. + */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_move: Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. + */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_cancel_move: Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation. + */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_set_weight: A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @tg's weight to @weight. + */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); +#endif /* CONFIG_EXT_GROUP_SCHED */ + + /* + * All online ops must come before ops.cpu_online(). + */ + + /** + * @cpu_online: A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * @cpu_offline: A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). + */ + + /** + * @init: Initialize the BPF scheduler + */ + s32 (*init)(void); + + /** + * @exit: Clean up after the BPF scheduler + * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed. + */ + void (*exit)(struct scx_exit_info *info); + + /** + * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch + */ + u32 dispatch_max_batch; + + /** + * @flags: %SCX_OPS_* flags + */ + u64 flags; + + /** + * @timeout_ms: The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. + */ + u32 timeout_ms; + + /** + * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default + * value of 32768 is used. + */ + u32 exit_dump_len; + + /** + * @hotplug_seq: A sequence number that may be set by the scheduler to + * detect when a hotplug event has occurred during the loading process. + * If 0, no detection occurs. Otherwise, the scheduler will fail to + * load if the sequence number does not match @scx_hotplug_seq on the + * enable path. + */ + u64 hotplug_seq; + + /** + * @name: BPF scheduler's name + * + * Must be a non-zero valid BPF object name including only isalnum(), + * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the + * BPF scheduler is enabled. + */ + char name[SCX_OPS_NAME_LEN]; +}; + +enum scx_opi { + SCX_OPI_BEGIN = 0, + SCX_OPI_NORMAL_BEGIN = 0, + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), + SCX_OPI_END = SCX_OP_IDX(init), +}; + +enum scx_wake_flags { + /* expose select WF_* flags as enums */ + SCX_WAKE_FORK = WF_FORK, + SCX_WAKE_TTWU = WF_TTWU, + SCX_WAKE_SYNC = WF_SYNC, +}; + +enum scx_enq_flags { + /* expose select ENQUEUE_* flags as enums */ + SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, + SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, + + /* high 32bits are SCX specific */ + + /* + * Set the following to trigger preemption when calling + * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + + /* + * The task being enqueued is the only task available for the cpu. By + * default, ext core keeps executing such tasks but when + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the + * %SCX_ENQ_LAST flag set. + * + * The BPF scheduler is responsible for triggering a follow-up + * scheduling event. Otherwise, Execution may stall. + */ + SCX_ENQ_LAST = 1LLU << 41, + + /* high 8 bits are internal */ + __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, + + SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, +}; + +enum scx_deq_flags { + /* expose select DEQUEUE_* flags as enums */ + SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. + */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, +}; + +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ +}; + +enum scx_kick_flags { + /* + * Kick the target CPU if idle. Guarantees that the target CPU goes + * through at least one full scheduling cycle before going idle. If the + * target CPU can be determined to be currently not idle and going to go + * through a scheduling cycle before going idle, noop. + */ + SCX_KICK_IDLE = 1LLU << 0, + + /* + * Preempt the current task and execute the dispatch path. If the + * current task of the target CPU is an SCX task, its ->scx.slice is + * cleared to zero before the scheduling path is invoked so that the + * task expires and the dispatch path is invoked. + */ + SCX_KICK_PREEMPT = 1LLU << 1, + + /* + * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will + * return after the target CPU finishes picking the next task. + */ + SCX_KICK_WAIT = 1LLU << 2, +}; + +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + +enum scx_ops_enable_state { + SCX_OPS_ENABLING, + SCX_OPS_ENABLED, + SCX_OPS_DISABLING, + SCX_OPS_DISABLED, +}; + +static const char *scx_ops_enable_state_str[] = { + [SCX_OPS_ENABLING] = "enabling", + [SCX_OPS_ENABLED] = "enabled", + [SCX_OPS_DISABLING] = "disabling", + [SCX_OPS_DISABLED] = "disabled", +}; + +/* + * sched_ext_entity->ops_state + * + * Used to track the task ownership between the SCX core and the BPF scheduler. + * State transitions look as follows: + * + * NONE -> QUEUEING -> QUEUED -> DISPATCHING + * ^ | | + * | v v + * \-------------------------------/ + * + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call + * sites for explanations on the conditions being waited upon and why they are + * safe. Transitions out of them into NONE or QUEUED must store_release and the + * waiters should load_acquire. + * + * Tracking scx_ops_state enables sched_ext core to reliably determine whether + * any given task can be dispatched by the BPF scheduler at all times and thus + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler + * to try to dispatch any task anytime regardless of its state as the SCX core + * can safely reject invalid dispatches. + */ +enum scx_ops_state { + SCX_OPSS_NONE, /* owned by the SCX core */ + SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ + SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ + SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ + + /* + * QSEQ brands each QUEUED instance so that, when dispatch races + * dequeue/requeue, the dispatcher can tell whether it still has a claim + * on the task being dispatched. + * + * As some 32bit archs can't do 64bit store_release/load_acquire, + * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on + * 32bit machines. The dispatch race window QSEQ protects is very narrow + * and runs with IRQ disabled. 30 bits should be sufficient. + */ + SCX_OPSS_QSEQ_SHIFT = 2, +}; + +/* Use macros to ensure that the type is unsigned long for the masks */ +#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) + +/* + * During exit, a task may schedule after losing its PIDs. When disabling the + * BPF scheduler, we need to be able to iterate tasks in every state to + * guarantee system safety. Maintain a dedicated task list which contains every + * task between its fork and eventual free. + */ +static DEFINE_SPINLOCK(scx_tasks_lock); +static LIST_HEAD(scx_tasks); + +/* ops enable/disable */ +static struct kthread_worker *scx_ops_helper; +static DEFINE_MUTEX(scx_ops_enable_mutex); +DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); +DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); +static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static unsigned long scx_in_softlockup; +static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0); +static int scx_ops_bypass_depth; +static bool scx_ops_init_task_enabled; +static bool scx_switching_all; +DEFINE_STATIC_KEY_FALSE(__scx_switched_all); + +static struct sched_ext_ops scx_ops; +static bool scx_warned_zero_slice; + +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); +static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +#ifdef CONFIG_SMP +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); +#endif + +static struct static_key_false scx_has_op[SCX_OPI_END] = + { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; + +static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); +static struct scx_exit_info *scx_exit_info; + +static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); +static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); + +/* + * A monotically increasing sequence number that is incremented every time a + * scheduler is enabled. This can be used by to check if any custom sched_ext + * scheduler has ever been used in the system. + */ +static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); + +/* + * The maximum amount of time in jiffies that a task may be runnable without + * being scheduled on a CPU. If this timeout is exceeded, it will trigger + * scx_ops_error(). + */ +static unsigned long scx_watchdog_timeout; + +/* + * The last time the delayed work was run. This delayed work relies on + * ksoftirqd being able to run to service timer interrupts, so it's possible + * that this work itself could get wedged. To account for this, we check that + * it's not stalled in the timer tick, and trigger an error if it is. + */ +static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; + +static struct delayed_work scx_watchdog_work; + +/* idle tracking */ +#ifdef CONFIG_SMP +#ifdef CONFIG_CPUMASK_OFFSTACK +#define CL_ALIGNED_IF_ONSTACK +#else +#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp +#endif + +static struct { + cpumask_var_t cpu; + cpumask_var_t smt; +} idle_masks CL_ALIGNED_IF_ONSTACK; + +#endif /* CONFIG_SMP */ + +/* for %SCX_KICK_WAIT */ +static unsigned long __percpu *scx_kick_cpus_pnt_seqs; + +/* + * Direct dispatch marker. + * + * Non-NULL values are used for direct dispatch from enqueue path. A valid + * pointer points to the task currently being enqueued. An ERR_PTR value is used + * to indicate that direct dispatch has already happened. + */ +static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); + +/* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is + * to avoid live-locking in bypass mode where all tasks are dispatched to + * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't + * sufficient, it can be further split. + */ +static struct scx_dispatch_q **global_dsqs; + +static const struct rhashtable_params dsq_hash_params = { + .key_len = sizeof_field(struct scx_dispatch_q, id), + .key_offset = offsetof(struct scx_dispatch_q, id), + .head_offset = offsetof(struct scx_dispatch_q, hash_node), +}; + +static struct rhashtable dsq_hash; +static LLIST_HEAD(dsqs_to_free); + +/* dispatch buf */ +struct scx_dsp_buf_ent { + struct task_struct *task; + unsigned long qseq; + u64 dsq_id; + u64 enq_flags; +}; + +static u32 scx_dsp_max_batch; + +struct scx_dsp_ctx { + struct rq *rq; + u32 cursor; + u32 nr_tasks; + struct scx_dsp_buf_ent buf[]; +}; + +static struct scx_dsp_ctx __percpu *scx_dsp_ctx; + +/* string formatting from BPF */ +struct scx_bstr_buf { + u64 data[MAX_BPRINTF_VARARGS]; + char line[SCX_EXIT_MSG_LEN]; +}; + +static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); +static struct scx_bstr_buf scx_exit_bstr_buf; + +/* ops debug dump */ +struct scx_dump_data { + s32 cpu; + bool first; + s32 cursor; + struct seq_buf *s; + const char *prefix; + struct scx_bstr_buf buf; +}; + +static struct scx_dump_data scx_dump_data = { + .cpu = -1, +}; + +/* /sys/kernel/sched_ext interface */ +static struct kset *scx_kset; +static struct kobject *scx_root_kobj; + +#define CREATE_TRACE_POINTS +#include <trace/events/sched_ext.h> + +static void process_ddsp_deferred_locals(struct rq *rq); +static void scx_bpf_kick_cpu(s32 cpu, u64 flags); +static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, + s64 exit_code, + const char *fmt, ...); + +#define scx_ops_error_kind(err, fmt, args...) \ + scx_ops_exit_kind((err), 0, fmt, ##args) + +#define scx_ops_exit(code, fmt, args...) \ + scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args) + +#define scx_ops_error(fmt, args...) \ + scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) + +#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) + +static long jiffies_delta_msecs(unsigned long at, unsigned long now) +{ + if (time_after(at, now)) + return jiffies_to_msecs(at - now); + else + return -(long)jiffies_to_msecs(now - at); +} + +/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ +static u32 higher_bits(u32 flags) +{ + return ~((1 << fls(flags)) - 1); +} + +/* return the mask with only the highest bit set */ +static u32 highest_bit(u32 flags) +{ + int bit = fls(flags); + return ((u64)1 << bit) >> 1; +} + +static bool u32_before(u32 a, u32 b) +{ + return (s32)(a - b) < 0; +} + +static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +{ + return global_dsqs[cpu_to_node(task_cpu(p))]; +} + +static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) +{ + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); +} + +/* + * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX + * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate + * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check + * whether it's running from an allowed context. + * + * @mask is constant, always inline to cull the mask calculations. + */ +static __always_inline void scx_kf_allow(u32 mask) +{ + /* nesting is allowed only in increasing scx_kf_mask order */ + WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, + "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", + current->scx.kf_mask, mask); + current->scx.kf_mask |= mask; + barrier(); +} + +static void scx_kf_disallow(u32 mask) +{ + barrier(); + current->scx.kf_mask &= ~mask; +} + +#define SCX_CALL_OP(mask, op, args...) \ +do { \ + if (mask) { \ + scx_kf_allow(mask); \ + scx_ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + scx_ops.op(args); \ + } \ +} while (0) + +#define SCX_CALL_OP_RET(mask, op, args...) \ +({ \ + __typeof__(scx_ops.op(args)) __ret; \ + if (mask) { \ + scx_kf_allow(mask); \ + __ret = scx_ops.op(args); \ + scx_kf_disallow(mask); \ + } else { \ + __ret = scx_ops.op(args); \ + } \ + __ret; \ +}) + +/* + * Some kfuncs are allowed only on the tasks that are subjects of the + * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such + * restrictions, the following SCX_CALL_OP_*() variants should be used when + * invoking scx_ops operations that take task arguments. These can only be used + * for non-nesting operations due to the way the tasks are tracked. + * + * kfuncs which can only operate on such tasks can in turn use + * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on + * the specific task. + */ +#define SCX_CALL_OP_TASK(mask, op, task, args...) \ +do { \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + SCX_CALL_OP(mask, op, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ +} while (0) + +#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ +({ \ + __typeof__(scx_ops.op(task, ##args)) __ret; \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task; \ + __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + __ret; \ +}) + +#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ +({ \ + __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ + BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + current->scx.kf_tasks[0] = task0; \ + current->scx.kf_tasks[1] = task1; \ + __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ + current->scx.kf_tasks[0] = NULL; \ + current->scx.kf_tasks[1] = NULL; \ + __ret; \ +}) + +/* @mask is constant, always inline to cull unnecessary branches */ +static __always_inline bool scx_kf_allowed(u32 mask) +{ + if (unlikely(!(current->scx.kf_mask & mask))) { + scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); + return false; + } + + /* + * Enforce nesting boundaries. e.g. A kfunc which can be called from + * DISPATCH must not be called if we're running DEQUEUE which is nested + * inside ops.dispatch(). We don't need to check boundaries for any + * blocking kfuncs as the verifier ensures they're only called from + * sleepable progs. + */ + if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && + (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { + scx_ops_error("cpu_release kfunc called from a nested operation"); + return false; + } + + if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && + (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { + scx_ops_error("dispatch kfunc called from a nested operation"); + return false; + } + + return true; +} + +/* see SCX_CALL_OP_TASK() */ +static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, + struct task_struct *p) +{ + if (!scx_kf_allowed(mask)) + return false; + + if (unlikely((p != current->scx.kf_tasks[0] && + p != current->scx.kf_tasks[1]))) { + scx_ops_error("called on a task not being operated on"); + return false; + } + + return true; +} + +static bool scx_kf_allowed_if_unlocked(void) +{ + return !current->scx.kf_mask; +} + +/** + * nldsq_next_task - Iterate to the next task in a non-local DSQ + * @dsq: user dsq being iterated + * @cur: current position, %NULL to start iteration + * @rev: walk backwards + * + * Returns %NULL when iteration is finished. + */ +static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, + struct task_struct *cur, bool rev) +{ + struct list_head *list_node; + struct scx_dsq_list_node *dsq_lnode; + + lockdep_assert_held(&dsq->lock); + + if (cur) + list_node = &cur->scx.dsq_list.node; + else + list_node = &dsq->list; + + /* find the next task, need to skip BPF iteration cursors */ + do { + if (rev) + list_node = list_node->prev; + else + list_node = list_node->next; + + if (list_node == &dsq->list) + return NULL; + + dsq_lnode = container_of(list_node, struct scx_dsq_list_node, + node); + } while (dsq_lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR); + + return container_of(dsq_lnode, struct task_struct, scx.dsq_list); +} + +#define nldsq_for_each_task(p, dsq) \ + for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ + (p) = nldsq_next_task((dsq), (p), false)) + + +/* + * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] + * dispatch order. BPF-visible iterator is opaque and larger to allow future + * changes without breaking backward compatibility. Can be used with + * bpf_for_each(). See bpf_iter_scx_dsq_*(). + */ +enum scx_dsq_iter_flags { + /* iterate in the reverse dispatch order */ + SCX_DSQ_ITER_REV = 1U << 16, + + __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, + __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, + + __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, + __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | + __SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME, +}; + +struct bpf_iter_scx_dsq_kern { + struct scx_dsq_list_node cursor; + struct scx_dispatch_q *dsq; + u64 slice; + u64 vtime; +} __attribute__((aligned(8))); + +struct bpf_iter_scx_dsq { + u64 __opaque[6]; +} __attribute__((aligned(8))); + + +/* + * SCX task iterator. + */ +struct scx_task_iter { + struct sched_ext_entity cursor; + struct task_struct *locked; + struct rq *rq; + struct rq_flags rf; + u32 cnt; +}; + +/** + * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration + * @iter: iterator to init + * + * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter + * must eventually be stopped with scx_task_iter_stop(). + * + * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() + * between this and the first next() call or between any two next() calls. If + * the locks are released between two next() calls, the caller is responsible + * for ensuring that the task being iterated remains accessible either through + * RCU read lock or obtaining a reference count. + * + * All tasks which existed when the iteration started are guaranteed to be + * visited as long as they still exist. + */ +static void scx_task_iter_start(struct scx_task_iter *iter) +{ + BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & + ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + + spin_lock_irq(&scx_tasks_lock); + + iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; + list_add(&iter->cursor.tasks_node, &scx_tasks); + iter->locked = NULL; + iter->cnt = 0; +} + +static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) +{ + if (iter->locked) { + task_rq_unlock(iter->rq, iter->locked, &iter->rf); + iter->locked = NULL; + } +} + +/** + * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator + * @iter: iterator to unlock + * + * If @iter is in the middle of a locked iteration, it may be locking the rq of + * the task currently being visited in addition to scx_tasks_lock. Unlock both. + * This function can be safely called anytime during an iteration. + */ +static void scx_task_iter_unlock(struct scx_task_iter *iter) +{ + __scx_task_iter_rq_unlock(iter); + spin_unlock_irq(&scx_tasks_lock); +} + +/** + * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() + * @iter: iterator to re-lock + * + * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it + * doesn't re-lock the rq lock. Must be called before other iterator operations. + */ +static void scx_task_iter_relock(struct scx_task_iter *iter) +{ + spin_lock_irq(&scx_tasks_lock); +} + +/** + * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock + * @iter: iterator to exit + * + * Exit a previously initialized @iter. Must be called with scx_tasks_lock held + * which is released on return. If the iterator holds a task's rq lock, that rq + * lock is also released. See scx_task_iter_start() for details. + */ +static void scx_task_iter_stop(struct scx_task_iter *iter) +{ + list_del_init(&iter->cursor.tasks_node); + scx_task_iter_unlock(iter); +} + +/** + * scx_task_iter_next - Next task + * @iter: iterator to walk + * + * Visit the next task. See scx_task_iter_start() for details. Locks are dropped + * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing + * stalls by holding scx_tasks_lock for too long. + */ +static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) +{ + struct list_head *cursor = &iter->cursor.tasks_node; + struct sched_ext_entity *pos; + + if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { + scx_task_iter_unlock(iter); + cond_resched(); + scx_task_iter_relock(iter); + } + + list_for_each_entry(pos, cursor, tasks_node) { + if (&pos->tasks_node == &scx_tasks) + return NULL; + if (!(pos->flags & SCX_TASK_CURSOR)) { + list_move(cursor, &pos->tasks_node); + return container_of(pos, struct task_struct, scx); + } + } + + /* can't happen, should always terminate at scx_tasks above */ + BUG(); +} + +/** + * scx_task_iter_next_locked - Next non-idle task with its rq locked + * @iter: iterator to walk + * + * Visit the non-idle task with its rq lock held. Allows callers to specify + * whether they would like to filter out dead tasks. See scx_task_iter_start() + * for details. + */ +static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) +{ + struct task_struct *p; + + __scx_task_iter_rq_unlock(iter); + + while ((p = scx_task_iter_next(iter))) { + /* + * scx_task_iter is used to prepare and move tasks into SCX + * while loading the BPF scheduler and vice-versa while + * unloading. The init_tasks ("swappers") should be excluded + * from the iteration because: + * + * - It's unsafe to use __setschduler_prio() on an init_task to + * determine the sched_class to use as it won't preserve its + * idle_sched_class. + * + * - ops.init/exit_task() can easily be confused if called with + * init_tasks as they, e.g., share PID 0. + * + * As init_tasks are never scheduled through SCX, they can be + * skipped safely. Note that is_idle_task() which tests %PF_IDLE + * doesn't work here: + * + * - %PF_IDLE may not be set for an init_task whose CPU hasn't + * yet been onlined. + * + * - %PF_IDLE can be set on tasks that are not init_tasks. See + * play_idle_precise() used by CONFIG_IDLE_INJECT. + * + * Test for idle_sched_class as only init_tasks are on it. + */ + if (p->sched_class != &idle_sched_class) + break; + } + if (!p) + return NULL; + + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked = p; + + return p; +} + +static enum scx_ops_enable_state scx_ops_enable_state(void) +{ + return atomic_read(&scx_ops_enable_state_var); +} + +static enum scx_ops_enable_state +scx_ops_set_enable_state(enum scx_ops_enable_state to) +{ + return atomic_xchg(&scx_ops_enable_state_var, to); +} + +static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, + enum scx_ops_enable_state from) +{ + int from_v = from; + + return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); +} + +static bool scx_rq_bypassing(struct rq *rq) +{ + return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +} + +/** + * wait_ops_state - Busy-wait the specified ops state to end + * @p: target task + * @opss: state to wait the end of + * + * Busy-wait for @p to transition out of @opss. This can only be used when the + * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also + * has load_acquire semantics to ensure that the caller can see the updates made + * in the enqueueing and dispatching paths. + */ +static void wait_ops_state(struct task_struct *p, unsigned long opss) +{ + do { + cpu_relax(); + } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); +} + +/** + * ops_cpu_valid - Verify a cpu number + * @cpu: cpu number which came from a BPF ops + * @where: extra information reported on error + * + * @cpu is a cpu number which came from the BPF scheduler and can be any value. + * Verify that it is in range and one of the possible cpus. If invalid, trigger + * an ops error. + */ +static bool ops_cpu_valid(s32 cpu, const char *where) +{ + if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { + return true; + } else { + scx_ops_error("invalid CPU %d%s%s", cpu, + where ? " " : "", where ?: ""); + return false; + } +} + +/** + * ops_sanitize_err - Sanitize a -errno value + * @ops_name: operation to blame on failure + * @err: -errno value to sanitize + * + * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return + * -%EPROTO. This is necessary because returning a rogue -errno up the chain can + * cause misbehaviors. For an example, a large negative return from + * ops.init_task() triggers an oops when passed up the call chain because the + * value fails IS_ERR() test after being encoded with ERR_PTR() and then is + * handled as a pointer. + */ +static int ops_sanitize_err(const char *ops_name, s32 err) +{ + if (err < 0 && err >= -MAX_ERRNO) + return err; + + scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); + return -EPROTO; +} + +static void run_deferred(struct rq *rq) +{ + process_ddsp_deferred_locals(rq); +} + +#ifdef CONFIG_SMP +static void deferred_bal_cb_workfn(struct rq *rq) +{ + run_deferred(rq); +} +#endif + +static void deferred_irq_workfn(struct irq_work *irq_work) +{ + struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work); + + raw_spin_rq_lock(rq); + run_deferred(rq); + raw_spin_rq_unlock(rq); +} + +/** + * schedule_deferred - Schedule execution of deferred actions on an rq + * @rq: target rq + * + * Schedule execution of deferred actions on @rq. Must be called with @rq + * locked. Deferred actions are executed with @rq locked but unpinned, and thus + * can unlock @rq to e.g. migrate tasks to other rqs. + */ +static void schedule_deferred(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + +#ifdef CONFIG_SMP + /* + * If in the middle of waking up a task, task_woken_scx() will be called + * afterwards which will then run the deferred actions, no need to + * schedule anything. + */ + if (rq->scx.flags & SCX_RQ_IN_WAKEUP) + return; + + /* + * If in balance, the balance callbacks will be called before rq lock is + * released. Schedule one. + */ + if (rq->scx.flags & SCX_RQ_IN_BALANCE) { + queue_balance_callback(rq, &rq->scx.deferred_bal_cb, + deferred_bal_cb_workfn); + return; + } +#endif + /* + * No scheduler hooks available. Queue an irq work. They are executed on + * IRQ re-enable which may take a bit longer than the scheduler hooks. + * The above WAKEUP and BALANCE paths should cover most of the cases and + * the time to IRQ re-enable shouldn't be long. + */ + irq_work_queue(&rq->scx.deferred_irq_work); +} + +/** + * touch_core_sched - Update timestamp used for core-sched task ordering + * @rq: rq to read clock from, must be locked + * @p: task to update the timestamp for + * + * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to + * implement global or local-DSQ FIFO ordering for core-sched. Should be called + * when a task becomes runnable and its turn on the CPU ends (e.g. slice + * exhaustion). + */ +static void touch_core_sched(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + +#ifdef CONFIG_SCHED_CORE + /* + * It's okay to update the timestamp spuriously. Use + * sched_core_disabled() which is cheaper than enabled(). + * + * As this is used to determine ordering between tasks of sibling CPUs, + * it may be better to use per-core dispatch sequence instead. + */ + if (!sched_core_disabled()) + p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq)); +#endif +} + +/** + * touch_core_sched_dispatch - Update core-sched timestamp on dispatch + * @rq: rq to read clock from, must be locked + * @p: task being dispatched + * + * If the BPF scheduler implements custom core-sched ordering via + * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO + * ordering within each local DSQ. This function is called from dispatch paths + * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. + */ +static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + +#ifdef CONFIG_SCHED_CORE + if (SCX_HAS_OP(core_sched_before)) + touch_core_sched(rq, p); +#endif +} + +static void update_curr_scx(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + s64 delta_exec; + + delta_exec = update_curr_common(rq); + if (unlikely(delta_exec <= 0)) + return; + + if (curr->scx.slice != SCX_SLICE_INF) { + curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec); + if (!curr->scx.slice) + touch_core_sched(rq, curr); + } +} + +static bool scx_dsq_priq_less(struct rb_node *node_a, + const struct rb_node *node_b) +{ + const struct task_struct *a = + container_of(node_a, struct task_struct, scx.dsq_priq); + const struct task_struct *b = + container_of(node_b, struct task_struct, scx.dsq_priq); + + return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); +} + +static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) +{ + /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ + WRITE_ONCE(dsq->nr, dsq->nr + delta); +} + +static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, + u64 enq_flags) +{ + bool is_local = dsq->id == SCX_DSQ_LOCAL; + + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); + WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || + !RB_EMPTY_NODE(&p->scx.dsq_priq)); + + if (!is_local) { + raw_spin_lock(&dsq->lock); + if (unlikely(dsq->id == SCX_DSQ_INVALID)) { + scx_ops_error("attempting to dispatch to a destroyed dsq"); + /* fall back to the global dsq */ + raw_spin_unlock(&dsq->lock); + dsq = find_global_dsq(p); + raw_spin_lock(&dsq->lock); + } + } + + if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && + (enq_flags & SCX_ENQ_DSQ_PRIQ))) { + /* + * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from + * their FIFO queues. To avoid confusion and accidentally + * starving vtime-dispatched tasks by FIFO-dispatched tasks, we + * disallow any internal DSQ from doing vtime ordering of + * tasks. + */ + scx_ops_error("cannot use vtime ordering for built-in DSQs"); + enq_flags &= ~SCX_ENQ_DSQ_PRIQ; + } + + if (enq_flags & SCX_ENQ_DSQ_PRIQ) { + struct rb_node *rbp; + + /* + * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are + * linked to both the rbtree and list on PRIQs, this can only be + * tested easily when adding the first task. + */ + if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && + nldsq_next_task(dsq, NULL, false))) + scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); + + p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; + rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); + + /* + * Find the previous task and insert after it on the list so + * that @dsq->list is vtime ordered. + */ + rbp = rb_prev(&p->scx.dsq_priq); + if (rbp) { + struct task_struct *prev = + container_of(rbp, struct task_struct, + scx.dsq_priq); + list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); + } else { + list_add(&p->scx.dsq_list.node, &dsq->list); + } + } else { + /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ + if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) + scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); + + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + list_add(&p->scx.dsq_list.node, &dsq->list); + else + list_add_tail(&p->scx.dsq_list.node, &dsq->list); + } + + /* seq records the order tasks are queued, used by BPF DSQ iterator */ + dsq->seq++; + p->scx.dsq_seq = dsq->seq; + + dsq_mod_nr(dsq, 1); + p->scx.dsq = dsq; + + /* + * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the + * direct dispatch path, but we clear them here because the direct + * dispatch verdict may be overridden on the enqueue path during e.g. + * bypass. + */ + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; + + /* + * We're transitioning out of QUEUEING or DISPATCHING. store_release to + * match waiters' load_acquire. + */ + if (enq_flags & SCX_ENQ_CLEAR_OPSS) + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + if (is_local) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; + + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } + + if (preempt || sched_class_above(&ext_sched_class, + rq->curr->sched_class)) + resched_curr(rq); + } else { + raw_spin_unlock(&dsq->lock); + } +} + +static void task_unlink_from_dsq(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node)); + + if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { + rb_erase(&p->scx.dsq_priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_priq); + p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; + } + + list_del_init(&p->scx.dsq_list.node); + dsq_mod_nr(dsq, -1); +} + +static void dispatch_dequeue(struct rq *rq, struct task_struct *p) +{ + struct scx_dispatch_q *dsq = p->scx.dsq; + bool is_local = dsq == &rq->scx.local_dsq; + + if (!dsq) { + /* + * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. + * Unlinking is all that's needed to cancel. + */ + if (unlikely(!list_empty(&p->scx.dsq_list.node))) + list_del_init(&p->scx.dsq_list.node); + + /* + * When dispatching directly from the BPF scheduler to a local + * DSQ, the task isn't associated with any DSQ but + * @p->scx.holding_cpu may be set under the protection of + * %SCX_OPSS_DISPATCHING. + */ + if (p->scx.holding_cpu >= 0) + p->scx.holding_cpu = -1; + + return; + } + + if (!is_local) + raw_spin_lock(&dsq->lock); + + /* + * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_* can't + * change underneath us. + */ + if (p->scx.holding_cpu < 0) { + /* @p must still be on @dsq, dequeue */ + task_unlink_from_dsq(p, dsq); + } else { + /* + * We're racing against dispatch_to_local_dsq() which already + * removed @p from @dsq and set @p->scx.holding_cpu. Clear the + * holding_cpu which tells dispatch_to_local_dsq() that it lost + * the race. + */ + WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node)); + p->scx.holding_cpu = -1; + } + p->scx.dsq = NULL; + + if (!is_local) + raw_spin_unlock(&dsq->lock); +} + +static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, + struct task_struct *p) +{ + struct scx_dispatch_q *dsq; + + if (dsq_id == SCX_DSQ_LOCAL) + return &rq->scx.local_dsq; + + if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) + return find_global_dsq(p); + + return &cpu_rq(cpu)->scx.local_dsq; + } + + if (dsq_id == SCX_DSQ_GLOBAL) + dsq = find_global_dsq(p); + else + dsq = find_user_dsq(dsq_id); + + if (unlikely(!dsq)) { + scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", + dsq_id, p->comm, p->pid); + return find_global_dsq(p); + } + + return dsq; +} + +static void mark_direct_dispatch(struct task_struct *ddsp_task, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + /* + * Mark that dispatch already happened from ops.select_cpu() or + * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value + * which can never match a valid task pointer. + */ + __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); + + /* @p must match the task on the enqueue path */ + if (unlikely(p != ddsp_task)) { + if (IS_ERR(ddsp_task)) + scx_ops_error("%s[%d] already direct-dispatched", + p->comm, p->pid); + else + scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + ddsp_task->comm, ddsp_task->pid, + p->comm, p->pid); + return; + } + + WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); + WARN_ON_ONCE(p->scx.ddsp_enq_flags); + + p->scx.ddsp_dsq_id = dsq_id; + p->scx.ddsp_enq_flags = enq_flags; +} + +static void direct_dispatch(struct task_struct *p, u64 enq_flags) +{ + struct rq *rq = task_rq(p); + struct scx_dispatch_q *dsq = + find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); + + touch_core_sched_dispatch(rq, p); + + p->scx.ddsp_enq_flags |= enq_flags; + + /* + * We are in the enqueue path with @rq locked and pinned, and thus can't + * double lock a remote rq and enqueue to its local DSQ. For + * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer + * the enqueue so that it's executed when @rq can be unlocked. + */ + if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) { + unsigned long opss; + + opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK; + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_NONE: + break; + case SCX_OPSS_QUEUEING: + /* + * As @p was never passed to the BPF side, _release is + * not strictly necessary. Still do it for consistency. + */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + break; + default: + WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()", + p->comm, p->pid, opss); + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + break; + } + + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); + list_add_tail(&p->scx.dsq_list.node, + &rq->scx.ddsp_deferred_locals); + schedule_deferred(rq); + return; + } + + dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); +} + +static bool scx_rq_online(struct rq *rq) +{ + /* + * Test both cpu_active() and %SCX_RQ_ONLINE. %SCX_RQ_ONLINE indicates + * the online state as seen from the BPF scheduler. cpu_active() test + * guarantees that, if this function returns %true, %SCX_RQ_ONLINE will + * stay set until the current scheduling operation is complete even if + * we aren't locking @rq. + */ + return likely((rq->scx.flags & SCX_RQ_ONLINE) && cpu_active(cpu_of(rq))); +} + +static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, + int sticky_cpu) +{ + struct task_struct **ddsp_taskp; + unsigned long qseq; + + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); + + /* rq migration */ + if (sticky_cpu == cpu_of(rq)) + goto local_norefill; + + /* + * If !scx_rq_online(), we already told the BPF scheduler that the CPU + * is offline and are just running the hotplug path. Don't bother the + * BPF scheduler. + */ + if (!scx_rq_online(rq)) + goto local; + + if (scx_rq_bypassing(rq)) + goto global; + + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) + goto direct; + + /* see %SCX_OPS_ENQ_EXITING */ + if (!static_branch_unlikely(&scx_ops_enq_exiting) && + unlikely(p->flags & PF_EXITING)) + goto local; + + /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ + if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && + is_migration_disabled(p)) + goto local; + + if (!SCX_HAS_OP(enqueue)) + goto global; + + /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ + qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; + + WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; + + SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + + *ddsp_taskp = NULL; + if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) + goto direct; + + /* + * If not directly dispatched, QUEUEING isn't clear yet and dispatch or + * dequeue may be waiting. The store_release matches their load_acquire. + */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); + return; + +direct: + direct_dispatch(p, enq_flags); + return; + +local: + /* + * For task-ordering, slice refill must be treated as implying the end + * of the current slice. Otherwise, the longer @p stays on the CPU, the + * higher priority it becomes from scx_prio_less()'s POV. + */ + touch_core_sched(rq, p); + p->scx.slice = SCX_SLICE_DFL; +local_norefill: + dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); + return; + +global: + touch_core_sched(rq, p); /* see the comment in local: */ + p->scx.slice = SCX_SLICE_DFL; + dispatch_enqueue(find_global_dsq(p), p, enq_flags); +} + +static bool task_runnable(const struct task_struct *p) +{ + return !list_empty(&p->scx.runnable_node); +} + +static void set_task_runnable(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + + if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { + p->scx.runnable_at = jiffies; + p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; + } + + /* + * list_add_tail() must be used. scx_ops_bypass() depends on tasks being + * appended to the runnable_list. + */ + list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); +} + +static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) +{ + list_del_init(&p->scx.runnable_node); + if (reset_runnable_at) + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; +} + +static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) +{ + int sticky_cpu = p->scx.sticky_cpu; + + if (enq_flags & ENQUEUE_WAKEUP) + rq->scx.flags |= SCX_RQ_IN_WAKEUP; + + enq_flags |= rq->scx.extra_enq_flags; + + if (sticky_cpu >= 0) + p->scx.sticky_cpu = -1; + + /* + * Restoring a running task will be immediately followed by + * set_next_task_scx() which expects the task to not be on the BPF + * scheduler as tasks can only start running through local DSQs. Force + * direct-dispatch into the local DSQ by setting the sticky_cpu. + */ + if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) + sticky_cpu = cpu_of(rq); + + if (p->scx.flags & SCX_TASK_QUEUED) { + WARN_ON_ONCE(!task_runnable(p)); + goto out; + } + + set_task_runnable(rq, p); + p->scx.flags |= SCX_TASK_QUEUED; + rq->scx.nr_running++; + add_nr_running(rq, 1); + + if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + + if (enq_flags & SCX_ENQ_WAKEUP) + touch_core_sched(rq, p); + + do_enqueue_task(rq, p, enq_flags, sticky_cpu); +out: + rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; +} + +static void ops_dequeue(struct task_struct *p, u64 deq_flags) +{ + unsigned long opss; + + /* dequeue is always temporary, don't reset runnable_at */ + clr_task_runnable(p, false); + + /* acquire ensures that we see the preceding updates on QUEUED */ + opss = atomic_long_read_acquire(&p->scx.ops_state); + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_NONE: + break; + case SCX_OPSS_QUEUEING: + /* + * QUEUEING is started and finished while holding @p's rq lock. + * As we're holding the rq lock now, we shouldn't see QUEUEING. + */ + BUG(); + case SCX_OPSS_QUEUED: + if (SCX_HAS_OP(dequeue)) + SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); + + if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_NONE)) + break; + fallthrough; + case SCX_OPSS_DISPATCHING: + /* + * If @p is being dispatched from the BPF scheduler to a DSQ, + * wait for the transfer to complete so that @p doesn't get + * added to its DSQ after dequeueing is complete. + * + * As we're waiting on DISPATCHING with the rq locked, the + * dispatching side shouldn't try to lock the rq while + * DISPATCHING is set. See dispatch_to_local_dsq(). + * + * DISPATCHING shouldn't have qseq set and control can reach + * here with NONE @opss from the above QUEUED case block. + * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. + */ + wait_ops_state(p, SCX_OPSS_DISPATCHING); + BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); + break; + } +} + +static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) +{ + if (!(p->scx.flags & SCX_TASK_QUEUED)) { + WARN_ON_ONCE(task_runnable(p)); + return true; + } + + ops_dequeue(p, deq_flags); + + /* + * A currently running task which is going off @rq first gets dequeued + * and then stops running. As we want running <-> stopping transitions + * to be contained within runnable <-> quiescent transitions, trigger + * ->stopping() early here instead of in put_prev_task_scx(). + * + * @p may go through multiple stopping <-> running transitions between + * here and put_prev_task_scx() if task attribute changes occur while + * balance_scx() leaves @rq unlocked. However, they don't contain any + * information meaningful to the BPF scheduler and can be suppressed by + * skipping the callbacks if the task is !QUEUED. + */ + if (SCX_HAS_OP(stopping) && task_current(rq, p)) { + update_curr_scx(rq); + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); + } + + if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); + + if (deq_flags & SCX_DEQ_SLEEP) + p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; + else + p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; + + p->scx.flags &= ~SCX_TASK_QUEUED; + rq->scx.nr_running--; + sub_nr_running(rq, 1); + + dispatch_dequeue(rq, p); + return true; +} + +static void yield_task_scx(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (SCX_HAS_OP(yield)) + SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); + else + p->scx.slice = 0; +} + +static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) +{ + struct task_struct *from = rq->curr; + + if (SCX_HAS_OP(yield)) + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); + else + return false; +} + +static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, + struct scx_dispatch_q *src_dsq, + struct rq *dst_rq) +{ + struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq; + + /* @dsq is locked and @p is on @dst_rq */ + lockdep_assert_held(&src_dsq->lock); + lockdep_assert_rq_held(dst_rq); + + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + list_add(&p->scx.dsq_list.node, &dst_dsq->list); + else + list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); + + dsq_mod_nr(dst_dsq, 1); + p->scx.dsq = dst_dsq; +} + +#ifdef CONFIG_SMP +/** + * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ + * @p: task to move + * @enq_flags: %SCX_ENQ_* + * @src_rq: rq to move the task from, locked on entry, released on return + * @dst_rq: rq to move the task into, locked on return + * + * Move @p which is currently on @src_rq to @dst_rq's local DSQ. + */ +static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, + struct rq *src_rq, struct rq *dst_rq) +{ + lockdep_assert_rq_held(src_rq); + + /* the following marks @p MIGRATING which excludes dequeue */ + deactivate_task(src_rq, p, 0); + set_task_cpu(p, cpu_of(dst_rq)); + p->scx.sticky_cpu = cpu_of(dst_rq); + + raw_spin_rq_unlock(src_rq); + raw_spin_rq_lock(dst_rq); + + /* + * We want to pass scx-specific enq_flags but activate_task() will + * truncate the upper 32 bit. As we own @rq, we can pass them through + * @rq->scx.extra_enq_flags instead. + */ + WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)); + WARN_ON_ONCE(dst_rq->scx.extra_enq_flags); + dst_rq->scx.extra_enq_flags = enq_flags; + activate_task(dst_rq, p, 0); + dst_rq->scx.extra_enq_flags = 0; +} + +/* + * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two + * differences: + * + * - is_cpu_allowed() asks "Can this task run on this CPU?" while + * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to + * this CPU?". + * + * While migration is disabled, is_cpu_allowed() has to say "yes" as the task + * must be allowed to finish on the CPU that it's currently on regardless of + * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the + * BPF scheduler shouldn't attempt to migrate a task which has migration + * disabled. + * + * - The BPF scheduler is bypassed while the rq is offline and we can always say + * no to the BPF scheduler initiated migrations while offline. + * + * The caller must ensure that @p and @rq are on different CPUs. + */ +static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, + bool trigger_error) +{ + int cpu = cpu_of(rq); + + SCHED_WARN_ON(task_cpu(p) == cpu); + + /* + * If @p has migration disabled, @p->cpus_ptr is updated to contain only + * the pinned CPU in migrate_disable_switch() while @p is being switched + * out. However, put_prev_task_scx() is called before @p->cpus_ptr is + * updated and thus another CPU may see @p on a DSQ inbetween leading to + * @p passing the below task_allowed_on_cpu() check while migration is + * disabled. + * + * Test the migration disabled state first as the race window is narrow + * and the BPF scheduler failing to check migration disabled state can + * easily be masked if task_allowed_on_cpu() is done first. + */ + if (unlikely(is_migration_disabled(p))) { + if (trigger_error) + scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", + p->comm, p->pid, task_cpu(p), cpu); + return false; + } + + /* + * We don't require the BPF scheduler to avoid dispatching to offline + * CPUs mostly for convenience but also because CPUs can go offline + * between scx_bpf_dsq_insert() calls and here. Trigger error iff the + * picked CPU is outside the allowed mask. + */ + if (!task_allowed_on_cpu(p, cpu)) { + if (trigger_error) + scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", + cpu, p->comm, p->pid); + return false; + } + + if (!scx_rq_online(rq)) + return false; + + return true; +} + +/** + * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq + * @p: target task + * @dsq: locked DSQ @p is currently on + * @src_rq: rq @p is currently on, stable with @dsq locked + * + * Called with @dsq locked but no rq's locked. We want to move @p to a different + * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is + * required when transferring into a local DSQ. Even when transferring into a + * non-local DSQ, it's better to use the same mechanism to protect against + * dequeues and maintain the invariant that @p->scx.dsq can only change while + * @src_rq is locked, which e.g. scx_dump_task() depends on. + * + * We want to grab @src_rq but that can deadlock if we try while locking @dsq, + * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As + * this may race with dequeue, which can't drop the rq lock or fail, do a little + * dancing from our side. + * + * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets + * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu + * would be cleared to -1. While other cpus may have updated it to different + * values afterwards, as this operation can't be preempted or recurse, the + * holding_cpu can never become this CPU again before we're done. Thus, we can + * tell whether we lost to dequeue by testing whether the holding_cpu still + * points to this CPU. See dispatch_dequeue() for the counterpart. + * + * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is + * still valid. %false if lost to dequeue. + */ +static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, + struct scx_dispatch_q *dsq, + struct rq *src_rq) +{ + s32 cpu = raw_smp_processor_id(); + + lockdep_assert_held(&dsq->lock); + + WARN_ON_ONCE(p->scx.holding_cpu >= 0); + task_unlink_from_dsq(p, dsq); + p->scx.holding_cpu = cpu; + + raw_spin_unlock(&dsq->lock); + raw_spin_rq_lock(src_rq); + + /* task_rq couldn't have changed if we're still the holding cpu */ + return likely(p->scx.holding_cpu == cpu) && + !WARN_ON_ONCE(src_rq != task_rq(p)); +} + +static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, + struct scx_dispatch_q *dsq, struct rq *src_rq) +{ + raw_spin_rq_unlock(this_rq); + + if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { + move_remote_task_to_local_dsq(p, 0, src_rq, this_rq); + return true; + } else { + raw_spin_rq_unlock(src_rq); + raw_spin_rq_lock(this_rq); + return false; + } +} +#else /* CONFIG_SMP */ +static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } +static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } +static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } +#endif /* CONFIG_SMP */ + +/** + * move_task_between_dsqs() - Move a task from one DSQ to another + * @p: target task + * @enq_flags: %SCX_ENQ_* + * @src_dsq: DSQ @p is currently on, must not be a local DSQ + * @dst_dsq: DSQ @p is being moved to, can be any DSQ + * + * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local + * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq + * will change. As @p's task_rq is locked, this function doesn't need to use the + * holding_cpu mechanism. + * + * On return, @src_dsq is unlocked and only @p's new task_rq, which is the + * return value, is locked. + */ +static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, + struct scx_dispatch_q *src_dsq, + struct scx_dispatch_q *dst_dsq) +{ + struct rq *src_rq = task_rq(p), *dst_rq; + + BUG_ON(src_dsq->id == SCX_DSQ_LOCAL); + lockdep_assert_held(&src_dsq->lock); + lockdep_assert_rq_held(src_rq); + + if (dst_dsq->id == SCX_DSQ_LOCAL) { + dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { + dst_dsq = find_global_dsq(p); + dst_rq = src_rq; + } + } else { + /* no need to migrate if destination is a non-local DSQ */ + dst_rq = src_rq; + } + + /* + * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different + * CPU, @p will be migrated. + */ + if (dst_dsq->id == SCX_DSQ_LOCAL) { + /* @p is going from a non-local DSQ to a local DSQ */ + if (src_rq == dst_rq) { + task_unlink_from_dsq(p, src_dsq); + move_local_task_to_local_dsq(p, enq_flags, + src_dsq, dst_rq); + raw_spin_unlock(&src_dsq->lock); + } else { + raw_spin_unlock(&src_dsq->lock); + move_remote_task_to_local_dsq(p, enq_flags, + src_rq, dst_rq); + } + } else { + /* + * @p is going from a non-local DSQ to a non-local DSQ. As + * $src_dsq is already locked, do an abbreviated dequeue. + */ + task_unlink_from_dsq(p, src_dsq); + p->scx.dsq = NULL; + raw_spin_unlock(&src_dsq->lock); + + dispatch_enqueue(dst_dsq, p, enq_flags); + } + + return dst_rq; +} + +/* + * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly + * banging on the same DSQ on a large NUMA system to the point where switching + * to the bypass mode can take a long time. Inject artificial delays while the + * bypass mode is switching to guarantee timely completion. + */ +static void scx_ops_breather(struct rq *rq) +{ + u64 until; + + lockdep_assert_rq_held(rq); + + if (likely(!atomic_read(&scx_ops_breather_depth))) + return; + + raw_spin_rq_unlock(rq); + + until = ktime_get_ns() + NSEC_PER_MSEC; + + do { + int cnt = 1024; + while (atomic_read(&scx_ops_breather_depth) && --cnt) + cpu_relax(); + } while (atomic_read(&scx_ops_breather_depth) && + time_before64(ktime_get_ns(), until)); + + raw_spin_rq_lock(rq); +} + +static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) +{ + struct task_struct *p; +retry: + /* + * This retry loop can repeatedly race against scx_ops_bypass() + * dequeueing tasks from @dsq trying to put the system into the bypass + * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can + * live-lock the machine into soft lockups. Give a breather. + */ + scx_ops_breather(rq); + + /* + * The caller can't expect to successfully consume a task if the task's + * addition to @dsq isn't guaranteed to be visible somehow. Test + * @dsq->list without locking and skip if it seems empty. + */ + if (list_empty(&dsq->list)) + return false; + + raw_spin_lock(&dsq->lock); + + nldsq_for_each_task(p, dsq) { + struct rq *task_rq = task_rq(p); + + if (rq == task_rq) { + task_unlink_from_dsq(p, dsq); + move_local_task_to_local_dsq(p, 0, dsq, rq); + raw_spin_unlock(&dsq->lock); + return true; + } + + if (task_can_run_on_remote_rq(p, rq, false)) { + if (likely(consume_remote_task(rq, p, dsq, task_rq))) + return true; + goto retry; + } + } + + raw_spin_unlock(&dsq->lock); + return false; +} + +static bool consume_global_dsq(struct rq *rq) +{ + int node = cpu_to_node(cpu_of(rq)); + + return consume_dispatch_q(rq, global_dsqs[node]); +} + +/** + * dispatch_to_local_dsq - Dispatch a task to a local dsq + * @rq: current rq which is locked + * @dst_dsq: destination DSQ + * @p: task to dispatch + * @enq_flags: %SCX_ENQ_* + * + * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local + * DSQ. This function performs all the synchronization dancing needed because + * local DSQs are protected with rq locks. + * + * The caller must have exclusive ownership of @p (e.g. through + * %SCX_OPSS_DISPATCHING). + */ +static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, + struct task_struct *p, u64 enq_flags) +{ + struct rq *src_rq = task_rq(p); + struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); +#ifdef CONFIG_SMP + struct rq *locked_rq = rq; +#endif + + /* + * We're synchronized against dequeue through DISPATCHING. As @p can't + * be dequeued, its task_rq and cpus_allowed are stable too. + * + * If dispatching to @rq that @p is already on, no lock dancing needed. + */ + if (rq == src_rq && rq == dst_rq) { + dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + return; + } + +#ifdef CONFIG_SMP + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { + dispatch_enqueue(find_global_dsq(p), p, + enq_flags | SCX_ENQ_CLEAR_OPSS); + return; + } + + /* + * @p is on a possibly remote @src_rq which we need to lock to move the + * task. If dequeue is in progress, it'd be locking @src_rq and waiting + * on DISPATCHING, so we can't grab @src_rq lock while holding + * DISPATCHING. + * + * As DISPATCHING guarantees that @p is wholly ours, we can pretend that + * we're moving from a DSQ and use the same mechanism - mark the task + * under transfer with holding_cpu, release DISPATCHING and then follow + * the same protocol. See unlink_dsq_and_lock_src_rq(). + */ + p->scx.holding_cpu = raw_smp_processor_id(); + + /* store_release ensures that dequeue sees the above */ + atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); + + /* switch to @src_rq lock */ + if (locked_rq != src_rq) { + raw_spin_rq_unlock(locked_rq); + locked_rq = src_rq; + raw_spin_rq_lock(src_rq); + } + + /* task_rq couldn't have changed if we're still the holding cpu */ + if (likely(p->scx.holding_cpu == raw_smp_processor_id()) && + !WARN_ON_ONCE(src_rq != task_rq(p))) { + /* + * If @p is staying on the same rq, there's no need to go + * through the full deactivate/activate cycle. Optimize by + * abbreviating move_remote_task_to_local_dsq(). + */ + if (src_rq == dst_rq) { + p->scx.holding_cpu = -1; + dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags); + } else { + move_remote_task_to_local_dsq(p, enq_flags, + src_rq, dst_rq); + /* task has been moved to dst_rq, which is now locked */ + locked_rq = dst_rq; + } + + /* if the destination CPU is idle, wake it up */ + if (sched_class_above(p->sched_class, dst_rq->curr->sched_class)) + resched_curr(dst_rq); + } + + /* switch back to @rq lock */ + if (locked_rq != rq) { + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(rq); + } +#else /* CONFIG_SMP */ + BUG(); /* control can not reach here on UP */ +#endif /* CONFIG_SMP */ +} + +/** + * finish_dispatch - Asynchronously finish dispatching a task + * @rq: current rq which is locked + * @p: task to finish dispatching + * @qseq_at_dispatch: qseq when @p started getting dispatched + * @dsq_id: destination DSQ ID + * @enq_flags: %SCX_ENQ_* + * + * Dispatching to local DSQs may need to wait for queueing to complete or + * require rq lock dancing. As we don't wanna do either while inside + * ops.dispatch() to avoid locking order inversion, we split dispatching into + * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the + * task and its qseq. Once ops.dispatch() returns, this function is called to + * finish up. + * + * There is no guarantee that @p is still valid for dispatching or even that it + * was valid in the first place. Make sure that the task is still owned by the + * BPF scheduler and claim the ownership before dispatching. + */ +static void finish_dispatch(struct rq *rq, struct task_struct *p, + unsigned long qseq_at_dispatch, + u64 dsq_id, u64 enq_flags) +{ + struct scx_dispatch_q *dsq; + unsigned long opss; + + touch_core_sched_dispatch(rq, p); +retry: + /* + * No need for _acquire here. @p is accessed only after a successful + * try_cmpxchg to DISPATCHING. + */ + opss = atomic_long_read(&p->scx.ops_state); + + switch (opss & SCX_OPSS_STATE_MASK) { + case SCX_OPSS_DISPATCHING: + case SCX_OPSS_NONE: + /* someone else already got to it */ + return; + case SCX_OPSS_QUEUED: + /* + * If qseq doesn't match, @p has gone through at least one + * dispatch/dequeue and re-enqueue cycle between + * scx_bpf_dsq_insert() and here and we have no claim on it. + */ + if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) + return; + + /* + * While we know @p is accessible, we don't yet have a claim on + * it - the BPF scheduler is allowed to dispatch tasks + * spuriously and there can be a racing dequeue attempt. Let's + * claim @p by atomically transitioning it from QUEUED to + * DISPATCHING. + */ + if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, + SCX_OPSS_DISPATCHING))) + break; + goto retry; + case SCX_OPSS_QUEUEING: + /* + * do_enqueue_task() is in the process of transferring the task + * to the BPF scheduler while holding @p's rq lock. As we aren't + * holding any kernel or BPF resource that the enqueue path may + * depend upon, it's safe to wait. + */ + wait_ops_state(p, opss); + goto retry; + } + + BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); + + dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p); + + if (dsq->id == SCX_DSQ_LOCAL) + dispatch_to_local_dsq(rq, dsq, p, enq_flags); + else + dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); +} + +static void flush_dispatch_buf(struct rq *rq) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + u32 u; + + for (u = 0; u < dspc->cursor; u++) { + struct scx_dsp_buf_ent *ent = &dspc->buf[u]; + + finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id, + ent->enq_flags); + } + + dspc->nr_tasks += dspc->cursor; + dspc->cursor = 0; +} + +static int balance_one(struct rq *rq, struct task_struct *prev) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + bool prev_on_scx = prev->sched_class == &ext_sched_class; + bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; + int nr_loops = SCX_DSP_MAX_LOOPS; + + lockdep_assert_rq_held(rq); + rq->scx.flags |= SCX_RQ_IN_BALANCE; + rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); + + if (static_branch_unlikely(&scx_ops_cpu_preempt) && + unlikely(rq->scx.cpu_released)) { + /* + * If the previous sched_class for the current CPU was not SCX, + * notify the BPF scheduler that it again has control of the + * core. This callback complements ->cpu_release(), which is + * emitted in switch_class(). + */ + if (SCX_HAS_OP(cpu_acquire)) + SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL); + rq->scx.cpu_released = false; + } + + if (prev_on_scx) { + update_curr_scx(rq); + + /* + * If @prev is runnable & has slice left, it has priority and + * fetching more just increases latency for the fetched tasks. + * Tell pick_task_scx() to keep running @prev. If the BPF + * scheduler wants to handle this explicitly, it should + * implement ->cpu_release(). + * + * See scx_ops_disable_workfn() for the explanation on the + * bypassing test. + */ + if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { + rq->scx.flags |= SCX_RQ_BAL_KEEP; + goto has_tasks; + } + } + + /* if there already are tasks to run, nothing to do */ + if (rq->scx.local_dsq.nr) + goto has_tasks; + + if (consume_global_dsq(rq)) + goto has_tasks; + + if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) + goto no_tasks; + + dspc->rq = rq; + + /* + * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, + * the local DSQ might still end up empty after a successful + * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() + * produced some tasks, retry. The BPF scheduler may depend on this + * looping behavior to simplify its implementation. + */ + do { + dspc->nr_tasks = 0; + + SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), + prev_on_scx ? prev : NULL); + + flush_dispatch_buf(rq); + + if (prev_on_rq && prev->scx.slice) { + rq->scx.flags |= SCX_RQ_BAL_KEEP; + goto has_tasks; + } + if (rq->scx.local_dsq.nr) + goto has_tasks; + if (consume_global_dsq(rq)) + goto has_tasks; + + /* + * ops.dispatch() can trap us in this loop by repeatedly + * dispatching ineligible tasks. Break out once in a while to + * allow the watchdog to run. As IRQ can't be enabled in + * balance(), we want to complete this scheduling cycle and then + * start a new one. IOW, we want to call resched_curr() on the + * next, most likely idle, task, not the current one. Use + * scx_bpf_kick_cpu() for deferred kicking. + */ + if (unlikely(!--nr_loops)) { + scx_bpf_kick_cpu(cpu_of(rq), 0); + break; + } + } while (dspc->nr_tasks); + +no_tasks: + /* + * Didn't find another task to run. Keep running @prev unless + * %SCX_OPS_ENQ_LAST is in effect. + */ + if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || + scx_rq_bypassing(rq))) { + rq->scx.flags |= SCX_RQ_BAL_KEEP; + goto has_tasks; + } + rq->scx.flags &= ~SCX_RQ_IN_BALANCE; + return false; + +has_tasks: + rq->scx.flags &= ~SCX_RQ_IN_BALANCE; + return true; +} + +static int balance_scx(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ + int ret; + + rq_unpin_lock(rq, rf); + + ret = balance_one(rq, prev); + +#ifdef CONFIG_SCHED_SMT + /* + * When core-sched is enabled, this ops.balance() call will be followed + * by pick_task_scx() on this CPU and the SMT siblings. Balance the + * siblings too. + */ + if (sched_core_enabled(rq)) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); + int scpu; + + for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { + struct rq *srq = cpu_rq(scpu); + struct task_struct *sprev = srq->curr; + + WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); + update_rq_clock(srq); + balance_one(srq, sprev); + } + } +#endif + rq_repin_lock(rq, rf); + + return ret; +} + +static void process_ddsp_deferred_locals(struct rq *rq) +{ + struct task_struct *p; + + lockdep_assert_rq_held(rq); + + /* + * Now that @rq can be unlocked, execute the deferred enqueueing of + * tasks directly dispatched to the local DSQs of other CPUs. See + * direct_dispatch(). Keep popping from the head instead of using + * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq + * temporarily. + */ + while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, + struct task_struct, scx.dsq_list.node))) { + struct scx_dispatch_q *dsq; + + list_del_init(&p->scx.dsq_list.node); + + dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); + if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) + dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags); + } +} + +static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) +{ + if (p->scx.flags & SCX_TASK_QUEUED) { + /* + * Core-sched might decide to execute @p before it is + * dispatched. Call ops_dequeue() to notify the BPF scheduler. + */ + ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); + dispatch_dequeue(rq, p); + } + + p->se.exec_start = rq_clock_task(rq); + + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(SCX_KF_REST, running, p); + + clr_task_runnable(p, true); + + /* + * @p is getting newly scheduled or got kicked after someone updated its + * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). + */ + if ((p->scx.slice == SCX_SLICE_INF) != + (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { + if (p->scx.slice == SCX_SLICE_INF) + rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; + else + rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; + + sched_update_tick_dependency(rq); + + /* + * For now, let's refresh the load_avgs just when transitioning + * in and out of nohz. In the future, we might want to add a + * mechanism which calls the following periodically on + * tick-stopped CPUs. + */ + update_other_load_avgs(rq); + } +} + +static enum scx_cpu_preempt_reason +preempt_reason_from_class(const struct sched_class *class) +{ +#ifdef CONFIG_SMP + if (class == &stop_sched_class) + return SCX_CPU_PREEMPT_STOP; +#endif + if (class == &dl_sched_class) + return SCX_CPU_PREEMPT_DL; + if (class == &rt_sched_class) + return SCX_CPU_PREEMPT_RT; + return SCX_CPU_PREEMPT_UNKNOWN; +} + +static void switch_class(struct rq *rq, struct task_struct *next) +{ + const struct sched_class *next_class = next->sched_class; + +#ifdef CONFIG_SMP + /* + * Pairs with the smp_load_acquire() issued by a CPU in + * kick_cpus_irq_workfn() who is waiting for this CPU to perform a + * resched. + */ + smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); +#endif + if (!static_branch_unlikely(&scx_ops_cpu_preempt)) + return; + + /* + * The callback is conceptually meant to convey that the CPU is no + * longer under the control of SCX. Therefore, don't invoke the callback + * if the next class is below SCX (in which case the BPF scheduler has + * actively decided not to schedule any tasks on the CPU). + */ + if (sched_class_above(&ext_sched_class, next_class)) + return; + + /* + * At this point we know that SCX was preempted by a higher priority + * sched_class, so invoke the ->cpu_release() callback if we have not + * done so already. We only send the callback once between SCX being + * preempted, and it regaining control of the CPU. + * + * ->cpu_release() complements ->cpu_acquire(), which is emitted the + * next time that balance_scx() is invoked. + */ + if (!rq->scx.cpu_released) { + if (SCX_HAS_OP(cpu_release)) { + struct scx_cpu_release_args args = { + .reason = preempt_reason_from_class(next_class), + .task = next, + }; + + SCX_CALL_OP(SCX_KF_CPU_RELEASE, + cpu_release, cpu_of(rq), &args); + } + rq->scx.cpu_released = true; + } +} + +static void put_prev_task_scx(struct rq *rq, struct task_struct *p, + struct task_struct *next) +{ + update_curr_scx(rq); + + /* see dequeue_task_scx() on why we skip when !QUEUED */ + if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); + + if (p->scx.flags & SCX_TASK_QUEUED) { + set_task_runnable(rq, p); + + /* + * If @p has slice left and is being put, @p is getting + * preempted by a higher priority scheduler class or core-sched + * forcing a different task. Leave it at the head of the local + * DSQ. + */ + if (p->scx.slice && !scx_rq_bypassing(rq)) { + dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + goto switch_class; + } + + /* + * If @p is runnable but we're about to enter a lower + * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell + * ops.enqueue() that @p is the only one available for this cpu, + * which should trigger an explicit follow-up scheduling event. + */ + if (sched_class_above(&ext_sched_class, next->sched_class)) { + WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last)); + do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); + } else { + do_enqueue_task(rq, p, 0, -1); + } + } + +switch_class: + if (next && next->sched_class != &ext_sched_class) + switch_class(rq, next); +} + +static struct task_struct *first_local_task(struct rq *rq) +{ + return list_first_entry_or_null(&rq->scx.local_dsq.list, + struct task_struct, scx.dsq_list.node); +} + +static struct task_struct *pick_task_scx(struct rq *rq) +{ + struct task_struct *prev = rq->curr; + struct task_struct *p; + bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + bool kick_idle = false; + + /* + * WORKAROUND: + * + * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just + * have gone through balance_scx(). Unfortunately, there currently is a + * bug where fair could say yes on balance() but no on pick_task(), + * which then ends up calling pick_task_scx() without preceding + * balance_scx(). + * + * Keep running @prev if possible and avoid stalling from entering idle + * without balancing. + * + * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() + * if pick_task_scx() is called without preceding balance_scx(). + */ + if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { + if (prev->scx.flags & SCX_TASK_QUEUED) { + keep_prev = true; + } else { + keep_prev = false; + kick_idle = true; + } + } else if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { + /* + * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is + * conditional on scx_enabled() and may have been skipped. + */ + WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); + keep_prev = false; + } + + /* + * If balance_scx() is telling us to keep running @prev, replenish slice + * if necessary and keep running @prev. Otherwise, pop the first one + * from the local DSQ. + */ + if (keep_prev) { + p = prev; + if (!p->scx.slice) + p->scx.slice = SCX_SLICE_DFL; + } else { + p = first_local_task(rq); + if (!p) { + if (kick_idle) + scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); + return NULL; + } + + if (unlikely(!p->scx.slice)) { + if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { + printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", + p->comm, p->pid, __func__); + scx_warned_zero_slice = true; + } + p->scx.slice = SCX_SLICE_DFL; + } + } + + return p; +} + +#ifdef CONFIG_SCHED_CORE +/** + * scx_prio_less - Task ordering for core-sched + * @a: task A + * @b: task B + * @in_fi: in forced idle state + * + * Core-sched is implemented as an additional scheduling layer on top of the + * usual sched_class'es and needs to find out the expected task ordering. For + * SCX, core-sched calls this function to interrogate the task ordering. + * + * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used + * to implement the default task ordering. The older the timestamp, the higher + * priority the task - the global FIFO ordering matching the default scheduling + * behavior. + * + * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to + * implement FIFO ordering within each local DSQ. See pick_task_scx(). + */ +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) +{ + /* + * The const qualifiers are dropped from task_struct pointers when + * calling ops.core_sched_before(). Accesses are controlled by the + * verifier. + */ + if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a))) + return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + (struct task_struct *)a, + (struct task_struct *)b); + else + return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); +} +#endif /* CONFIG_SCHED_CORE */ + +#ifdef CONFIG_SMP + +static bool test_and_clear_cpu_idle(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + /* + * SMT mask should be cleared whether we can claim @cpu or not. The SMT + * cluster is not wholly idle either way. This also prevents + * scx_pick_idle_cpu() from getting caught in an infinite loop. + */ + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + + /* + * If offline, @cpu is not its own sibling and + * scx_pick_idle_cpu() can get caught in an infinite loop as + * @cpu is never cleared from idle_masks.smt. Ensure that @cpu + * is eventually cleared. + * + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to + * reduce memory writes, which may help alleviate cache + * coherence pressure. + */ + if (cpumask_intersects(smt, idle_masks.smt)) + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); + else if (cpumask_test_cpu(cpu, idle_masks.smt)) + __cpumask_clear_cpu(cpu, idle_masks.smt); + } +#endif + return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); +} + +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) +{ + int cpu; + +retry: + if (sched_smt_active()) { + cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); + if (cpu < nr_cpu_ids) + goto found; + + if (flags & SCX_PICK_IDLE_CORE) + return -EBUSY; + } + + cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (test_and_clear_cpu_idle(cpu)) + return cpu; + else + goto retry; +} + +/* + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC + * domain is not defined). + */ +static unsigned int llc_weight(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sd->span_weight; +} + +/* + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC + * domain is not defined). + */ +static struct cpumask *llc_span(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sched_domain_span(sd); +} + +/* + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the + * NUMA domain is not defined). + */ +static unsigned int numa_weight(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return 0; + sg = sd->groups; + if (!sg) + return 0; + + return sg->group_weight; +} + +/* + * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA + * domain is not defined). + */ +static struct cpumask *numa_span(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return NULL; + sg = sd->groups; + if (!sg) + return NULL; + + return sched_group_span(sg); +} + +/* + * Return true if the LLC domains do not perfectly overlap with the NUMA + * domains, false otherwise. + */ +static bool llc_numa_mismatch(void) +{ + int cpu; + + /* + * We need to scan all online CPUs to verify whether their scheduling + * domains overlap. + * + * While it is rare to encounter architectures with asymmetric NUMA + * topologies, CPU hotplugging or virtualized environments can result + * in asymmetric configurations. + * + * For example: + * + * NUMA 0: + * - LLC 0: cpu0..cpu7 + * - LLC 1: cpu8..cpu15 [offline] + * + * NUMA 1: + * - LLC 0: cpu16..cpu23 + * - LLC 1: cpu24..cpu31 + * + * In this case, if we only check the first online CPU (cpu0), we might + * incorrectly assume that the LLC and NUMA domains are fully + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC + * domains). + */ + for_each_online_cpu(cpu) + if (llc_weight(cpu) != numa_weight(cpu)) + return true; + + return false; +} + +/* + * Initialize topology-aware scheduling. + * + * Detect if the system has multiple LLC or multiple NUMA domains and enable + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle + * selection policy. + * + * Assumption: the kernel's internal topology representation assumes that each + * CPU belongs to a single LLC domain, and that each LLC domain is entirely + * contained within a single NUMA node. + */ +static void update_selcpu_topology(void) +{ + bool enable_llc = false, enable_numa = false; + unsigned int nr_cpus; + s32 cpu = cpumask_first(cpu_online_mask); + + /* + * Enable LLC domain optimization only when there are multiple LLC + * domains among the online CPUs. If all online CPUs are part of a + * single LLC domain, the idle CPU selection logic can choose any + * online CPU without bias. + * + * Note that it is sufficient to check the LLC domain of the first + * online CPU to determine whether a single LLC domain includes all + * CPUs. + */ + rcu_read_lock(); + nr_cpus = llc_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus()) + enable_llc = true; + pr_debug("sched_ext: LLC=%*pb weight=%u\n", + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); + } + + /* + * Enable NUMA optimization only when there are multiple NUMA domains + * among the online CPUs and the NUMA domains don't perfectly overlaps + * with the LLC domains. + * + * If all CPUs belong to the same NUMA node and the same LLC domain, + * enabling both NUMA and LLC optimizations is unnecessary, as checking + * for an idle CPU in the same domain twice is redundant. + */ + nr_cpus = numa_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) + enable_numa = true; + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", + cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); + } + rcu_read_unlock(); + + pr_debug("sched_ext: LLC idle selection %s\n", + str_enabled_disabled(enable_llc)); + pr_debug("sched_ext: NUMA idle selection %s\n", + str_enabled_disabled(enable_numa)); + + if (enable_llc) + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); + if (enable_numa) + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); +} + +/* + * Built-in CPU idle selection policy: + * + * 1. Prioritize full-idle cores: + * - always prioritize CPUs from fully idle cores (both logical CPUs are + * idle) to avoid interference caused by SMT. + * + * 2. Reuse the same CPU: + * - prefer the last used CPU to take advantage of cached data (L1, L2) and + * branch prediction optimizations. + * + * 3. Pick a CPU within the same LLC (Last-Level Cache): + * - if the above conditions aren't met, pick a CPU that shares the same LLC + * to maintain cache locality. + * + * 4. Pick a CPU within the same NUMA node, if enabled: + * - choose a CPU from the same NUMA node to reduce memory access latency. + * + * 5. Pick any idle CPU usable by the task. + * + * Step 3 and 4 are performed only if the system has, respectively, multiple + * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and + * scx_selcpu_topo_numa). + * + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because + * we never call ops.select_cpu() for them, see select_task_rq(). + */ +static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *found) +{ + const struct cpumask *llc_cpus = NULL; + const struct cpumask *numa_cpus = NULL; + s32 cpu; + + *found = false; + + /* + * This is necessary to protect llc_cpus. + */ + rcu_read_lock(); + + /* + * Determine the scheduling domain only if the task is allowed to run + * on all CPUs. + * + * This is done primarily for efficiency, as it avoids the overhead of + * updating a cpumask every time we need to select an idle CPU (which + * can be costly in large SMP systems), but it also aligns logically: + * if a task's scheduling domain is restricted by user-space (through + * CPU affinity), the task will simply use the flat scheduling domain + * defined by user-space. + */ + if (p->nr_cpus_allowed >= num_possible_cpus()) { + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) + numa_cpus = numa_span(prev_cpu); + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) + llc_cpus = llc_span(prev_cpu); + } + + /* + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. + */ + if (wake_flags & SCX_WAKE_SYNC) { + cpu = smp_processor_id(); + + /* + * If the waker's CPU is cache affine and prev_cpu is idle, + * then avoid a migration. + */ + if (cpus_share_cache(cpu, prev_cpu) && + test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + /* + * If the waker's local DSQ is empty, and the system is under + * utilized, try to wake up @p to the local DSQ of the waker. + * + * Checking only for an empty local DSQ is insufficient as it + * could give the wakee an unfair advantage when the system is + * oversaturated. + * + * Checking only for the presence of idle CPUs is also + * insufficient as the local DSQ of the waker could have tasks + * piled up on it even if there is an idle core elsewhere on + * the system. + */ + if (!cpumask_empty(idle_masks.cpu) && + !(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto cpu_found; + } + } + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (sched_smt_active()) { + /* + * Keep using @prev_cpu if it's part of a fully idle core. + */ + if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && + test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + /* + * Search for any fully idle core in the same LLC domain. + */ + if (llc_cpus) { + cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any fully idle core in the same NUMA node. + */ + if (numa_cpus) { + cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any full idle core usable by the task. + */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Use @prev_cpu if it's idle. + */ + if (test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; + goto cpu_found; + } + + /* + * Search for any idle CPU in the same LLC domain. + */ + if (llc_cpus) { + cpu = scx_pick_idle_cpu(llc_cpus, 0); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any idle CPU in the same NUMA node. + */ + if (numa_cpus) { + cpu = scx_pick_idle_cpu(numa_cpus, 0); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any idle CPU usable by the task. + */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); + if (cpu >= 0) + goto cpu_found; + + rcu_read_unlock(); + return prev_cpu; + +cpu_found: + rcu_read_unlock(); + + *found = true; + return cpu; +} + +static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) +{ + /* + * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it + * can be a good migration opportunity with low cache and memory + * footprint. Returning a CPU different than @prev_cpu triggers + * immediate rq migration. However, for SCX, as the current rq + * association doesn't dictate where the task is going to run, this + * doesn't fit well. If necessary, we can later add a dedicated method + * which can decide to preempt self to force it through the regular + * scheduling path. + */ + if (unlikely(wake_flags & WF_EXEC)) + return prev_cpu; + + if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { + s32 cpu; + struct task_struct **ddsp_taskp; + + ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); + WARN_ON_ONCE(*ddsp_taskp); + *ddsp_taskp = p; + + cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, + select_cpu, p, prev_cpu, wake_flags); + *ddsp_taskp = NULL; + if (ops_cpu_valid(cpu, "from ops.select_cpu()")) + return cpu; + else + return prev_cpu; + } else { + bool found; + s32 cpu; + + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); + if (found) { + p->scx.slice = SCX_SLICE_DFL; + p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; + } + return cpu; + } +} + +static void task_woken_scx(struct rq *rq, struct task_struct *p) +{ + run_deferred(rq); +} + +static void set_cpus_allowed_scx(struct task_struct *p, + struct affinity_context *ac) +{ + set_cpus_allowed_common(p, ac); + + /* + * The effective cpumask is stored in @p->cpus_ptr which may temporarily + * differ from the configured one in @p->cpus_mask. Always tell the bpf + * scheduler the effective one. + * + * Fine-grained memory write control is enforced by BPF making the const + * designation pointless. Cast it away when calling the operation. + */ + if (SCX_HAS_OP(set_cpumask)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); +} + +static void reset_idle_masks(void) +{ + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. + */ + cpumask_copy(idle_masks.cpu, cpu_online_mask); + cpumask_copy(idle_masks.smt, cpu_online_mask); +} + +static void update_builtin_idle(int cpu, bool idle) +{ + assign_cpu(cpu, idle_masks.cpu, idle); + +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + + if (idle) { + /* + * idle_masks.smt handling is racy but that's fine as + * it's only for optimization and self-correcting. + */ + if (!cpumask_subset(smt, idle_masks.cpu)) + return; + cpumask_or(idle_masks.smt, idle_masks.smt, smt); + } else { + cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); + } + } +#endif +} + +/* + * Update the idle state of a CPU to @idle. + * + * If @do_notify is true, ops.update_idle() is invoked to notify the scx + * scheduler of an actual idle state transition (idle to busy or vice + * versa). If @do_notify is false, only the idle state in the idle masks is + * refreshed without invoking ops.update_idle(). + * + * This distinction is necessary, because an idle CPU can be "reserved" and + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as + * busy even if no tasks are dispatched. In this case, the CPU may return + * to idle without a true state transition. Refreshing the idle masks + * without invoking ops.update_idle() ensures accurate idle state tracking + * while avoiding unnecessary updates and maintaining balanced state + * transitions. + */ +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + */ + if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + + /* + * Update the idle masks: + * - for real idle transitions (do_notify == true) + * - for idle-to-idle transitions (indicated by the previous task + * being the idle thread, managed by pick_task_idle()) + * + * Skip updating idle masks if the previous task is not the idle + * thread, since set_next_task_idle() has already handled it when + * transitioning from a task to the idle thread (calling this + * function with do_notify == true). + * + * In this way we can avoid updating the idle masks twice, + * unnecessarily. + */ + if (static_branch_likely(&scx_builtin_idle_enabled)) + if (do_notify || is_idle_task(rq->curr)) + update_builtin_idle(cpu, idle); +} + +static void handle_hotplug(struct rq *rq, bool online) +{ + int cpu = cpu_of(rq); + + atomic_long_inc(&scx_hotplug_seq); + + if (scx_enabled()) + update_selcpu_topology(); + + if (online && SCX_HAS_OP(cpu_online)) + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); + else if (!online && SCX_HAS_OP(cpu_offline)) + SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu); + else + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, + online ? "online" : "offline"); +} + +void scx_rq_activate(struct rq *rq) +{ + handle_hotplug(rq, true); +} + +void scx_rq_deactivate(struct rq *rq) +{ + handle_hotplug(rq, false); +} + +static void rq_online_scx(struct rq *rq) +{ + rq->scx.flags |= SCX_RQ_ONLINE; +} + +static void rq_offline_scx(struct rq *rq) +{ + rq->scx.flags &= ~SCX_RQ_ONLINE; +} + +#else /* CONFIG_SMP */ + +static bool test_and_clear_cpu_idle(int cpu) { return false; } +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } +static void reset_idle_masks(void) {} + +#endif /* CONFIG_SMP */ + +static bool check_rq_for_timeouts(struct rq *rq) +{ + struct task_struct *p; + struct rq_flags rf; + bool timed_out = false; + + rq_lock_irqsave(rq, &rf); + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { + unsigned long last_runnable = p->scx.runnable_at; + + if (unlikely(time_after(jiffies, + last_runnable + scx_watchdog_timeout))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, + dur_ms / 1000, dur_ms % 1000); + timed_out = true; + break; + } + } + rq_unlock_irqrestore(rq, &rf); + + return timed_out; +} + +static void scx_watchdog_workfn(struct work_struct *work) +{ + int cpu; + + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + + for_each_online_cpu(cpu) { + if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) + break; + + cond_resched(); + } + queue_delayed_work(system_unbound_wq, to_delayed_work(work), + scx_watchdog_timeout / 2); +} + +void scx_tick(struct rq *rq) +{ + unsigned long last_check; + + if (!scx_enabled()) + return; + + last_check = READ_ONCE(scx_watchdog_timestamp); + if (unlikely(time_after(jiffies, + last_check + READ_ONCE(scx_watchdog_timeout)))) { + u32 dur_ms = jiffies_to_msecs(jiffies - last_check); + + scx_ops_error_kind(SCX_EXIT_ERROR_STALL, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); + } + + update_other_load_avgs(rq); +} + +static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) +{ + update_curr_scx(rq); + + /* + * While disabling, always resched and refresh core-sched timestamp as + * we can't trust the slice management or ops.core_sched_before(). + */ + if (scx_rq_bypassing(rq)) { + curr->scx.slice = 0; + touch_core_sched(rq, curr); + } else if (SCX_HAS_OP(tick)) { + SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); + } + + if (!curr->scx.slice) + resched_curr(rq); +} + +#ifdef CONFIG_EXT_GROUP_SCHED +static struct cgroup *tg_cgrp(struct task_group *tg) +{ + /* + * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, + * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the + * root cgroup. + */ + if (tg && tg->css.cgroup) + return tg->css.cgroup; + else + return &cgrp_dfl_root.cgrp; +} + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), + +#else /* CONFIG_EXT_GROUP_SCHED */ + +#define SCX_INIT_TASK_ARGS_CGROUP(tg) + +#endif /* CONFIG_EXT_GROUP_SCHED */ + +static enum scx_task_state scx_get_task_state(const struct task_struct *p) +{ + return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; +} + +static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) +{ + enum scx_task_state prev_state = scx_get_task_state(p); + bool warn = false; + + BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); + + switch (state) { + case SCX_TASK_NONE: + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_READY: + warn = prev_state == SCX_TASK_NONE; + break; + case SCX_TASK_ENABLED: + warn = prev_state != SCX_TASK_READY; + break; + default: + warn = true; + return; + } + + WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", + prev_state, state, p->comm, p->pid); + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + p->scx.flags |= state << SCX_TASK_STATE_SHIFT; +} + +static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) +{ + int ret; + + p->scx.disallow = false; + + if (SCX_HAS_OP(init_task)) { + struct scx_init_task_args args = { + SCX_INIT_TASK_ARGS_CGROUP(tg) + .fork = fork, + }; + + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args); + if (unlikely(ret)) { + ret = ops_sanitize_err("init_task", ret); + return ret; + } + } + + scx_set_task_state(p, SCX_TASK_INIT); + + if (p->scx.disallow) { + if (!fork) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + + /* + * We're in the load path and @p->policy will be applied + * right after. Reverting @p->policy here and rejecting + * %SCHED_EXT transitions from scx_check_setscheduler() + * guarantees that if ops.init_task() sets @p->disallow, + * @p can never be in SCX. + */ + if (p->policy == SCHED_EXT) { + p->policy = SCHED_NORMAL; + atomic_long_inc(&scx_nr_rejected); + } + + task_rq_unlock(rq, p, &rf); + } else if (p->policy == SCHED_EXT) { + scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork", + p->comm, p->pid); + } + } + + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; + return 0; +} + +static void scx_ops_enable_task(struct task_struct *p) +{ + u32 weight; + + lockdep_assert_rq_held(task_rq(p)); + + /* + * Set the weight before calling ops.enable() so that the scheduler + * doesn't see a stale value if they inspect the task struct. + */ + if (task_has_idle_policy(p)) + weight = WEIGHT_IDLEPRIO; + else + weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; + + p->scx.weight = sched_weight_to_cgroup(weight); + + if (SCX_HAS_OP(enable)) + SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); + scx_set_task_state(p, SCX_TASK_ENABLED); + + if (SCX_HAS_OP(set_weight)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); +} + +static void scx_ops_disable_task(struct task_struct *p) +{ + lockdep_assert_rq_held(task_rq(p)); + WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); + + if (SCX_HAS_OP(disable)) + SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); + scx_set_task_state(p, SCX_TASK_READY); +} + +static void scx_ops_exit_task(struct task_struct *p) +{ + struct scx_exit_task_args args = { + .cancelled = false, + }; + + lockdep_assert_rq_held(task_rq(p)); + + switch (scx_get_task_state(p)) { + case SCX_TASK_NONE: + return; + case SCX_TASK_INIT: + args.cancelled = true; + break; + case SCX_TASK_READY: + break; + case SCX_TASK_ENABLED: + scx_ops_disable_task(p); + break; + default: + WARN_ON_ONCE(true); + return; + } + + if (SCX_HAS_OP(exit_task)) + SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); + scx_set_task_state(p, SCX_TASK_NONE); +} + +void init_scx_entity(struct sched_ext_entity *scx) +{ + memset(scx, 0, sizeof(*scx)); + INIT_LIST_HEAD(&scx->dsq_list.node); + RB_CLEAR_NODE(&scx->dsq_priq); + scx->sticky_cpu = -1; + scx->holding_cpu = -1; + INIT_LIST_HEAD(&scx->runnable_node); + scx->runnable_at = jiffies; + scx->ddsp_dsq_id = SCX_DSQ_INVALID; + scx->slice = SCX_SLICE_DFL; +} + +void scx_pre_fork(struct task_struct *p) +{ + /* + * BPF scheduler enable/disable paths want to be able to iterate and + * update all tasks which can become complex when racing forks. As + * enable/disable are very cold paths, let's use a percpu_rwsem to + * exclude forks. + */ + percpu_down_read(&scx_fork_rwsem); +} + +int scx_fork(struct task_struct *p) +{ + percpu_rwsem_assert_held(&scx_fork_rwsem); + + if (scx_ops_init_task_enabled) + return scx_ops_init_task(p, task_group(p), true); + else + return 0; +} + +void scx_post_fork(struct task_struct *p) +{ + if (scx_ops_init_task_enabled) { + scx_set_task_state(p, SCX_TASK_READY); + + /* + * Enable the task immediately if it's running on sched_ext. + * Otherwise, it'll be enabled in switching_to_scx() if and + * when it's ever configured to run with a SCHED_EXT policy. + */ + if (p->sched_class == &ext_sched_class) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_ops_enable_task(p); + task_rq_unlock(rq, p, &rf); + } + } + + spin_lock_irq(&scx_tasks_lock); + list_add_tail(&p->scx.tasks_node, &scx_tasks); + spin_unlock_irq(&scx_tasks_lock); + + percpu_up_read(&scx_fork_rwsem); +} + +void scx_cancel_fork(struct task_struct *p) +{ + if (scx_enabled()) { + struct rq *rq; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); + scx_ops_exit_task(p); + task_rq_unlock(rq, p, &rf); + } + + percpu_up_read(&scx_fork_rwsem); +} + +void sched_ext_free(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&scx_tasks_lock, flags); + list_del_init(&p->scx.tasks_node); + spin_unlock_irqrestore(&scx_tasks_lock, flags); + + /* + * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> + * ENABLED transitions can't race us. Disable ops for @p. + */ + if (scx_get_task_state(p) != SCX_TASK_NONE) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + scx_ops_exit_task(p); + task_rq_unlock(rq, p, &rf); + } +} + +static void reweight_task_scx(struct rq *rq, struct task_struct *p, + const struct load_weight *lw) +{ + lockdep_assert_rq_held(task_rq(p)); + + p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); + if (SCX_HAS_OP(set_weight)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); +} + +static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +{ +} + +static void switching_to_scx(struct rq *rq, struct task_struct *p) +{ + scx_ops_enable_task(p); + + /* + * set_cpus_allowed_scx() is not called while @p is associated with a + * different scheduler class. Keep the BPF scheduler up-to-date. + */ + if (SCX_HAS_OP(set_cpumask)) + SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, + (struct cpumask *)p->cpus_ptr); +} + +static void switched_from_scx(struct rq *rq, struct task_struct *p) +{ + scx_ops_disable_task(p); +} + +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void switched_to_scx(struct rq *rq, struct task_struct *p) {} + +int scx_check_setscheduler(struct task_struct *p, int policy) +{ + lockdep_assert_rq_held(task_rq(p)); + + /* if disallow, reject transitioning into SCX */ + if (scx_enabled() && READ_ONCE(p->scx.disallow) && + p->policy != policy && policy == SCHED_EXT) + return -EACCES; + + return 0; +} + +#ifdef CONFIG_NO_HZ_FULL +bool scx_can_stop_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (scx_rq_bypassing(rq)) + return false; + + if (p->sched_class != &ext_sched_class) + return true; + + /* + * @rq can dispatch from different DSQs, so we can't tell whether it + * needs the tick or not by looking at nr_running. Allow stopping ticks + * iff the BPF scheduler indicated so. See set_next_task_scx(). + */ + return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; +} +#endif + +#ifdef CONFIG_EXT_GROUP_SCHED + +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +static bool scx_cgroup_enabled; +static bool cgroup_warned_missing_weight; +static bool cgroup_warned_missing_idle; + +static void scx_cgroup_warn_missing_weight(struct task_group *tg) +{ + if (scx_ops_enable_state() == SCX_OPS_DISABLED || + cgroup_warned_missing_weight) + return; + + if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) + return; + + pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n", + scx_ops.name); + cgroup_warned_missing_weight = true; +} + +static void scx_cgroup_warn_missing_idle(struct task_group *tg) +{ + if (!scx_cgroup_enabled || cgroup_warned_missing_idle) + return; + + if (!tg->idle) + return; + + pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n", + scx_ops.name); + cgroup_warned_missing_idle = true; +} + +int scx_tg_online(struct task_group *tg) +{ + int ret = 0; + + WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + + percpu_down_read(&scx_cgroup_rwsem); + + scx_cgroup_warn_missing_weight(tg); + + if (scx_cgroup_enabled) { + if (SCX_HAS_OP(cgroup_init)) { + struct scx_cgroup_init_args args = + { .weight = tg->scx_weight }; + + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + tg->css.cgroup, &args); + if (ret) + ret = ops_sanitize_err("cgroup_init", ret); + } + if (ret == 0) + tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + } else { + tg->scx_flags |= SCX_TG_ONLINE; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ret; +} + +void scx_tg_offline(struct task_group *tg) +{ + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + + percpu_down_read(&scx_cgroup_rwsem); + + if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); + tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + + percpu_up_read(&scx_cgroup_rwsem); +} + +int scx_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + int ret; + + /* released in scx_finish/cancel_attach() */ + percpu_down_read(&scx_cgroup_rwsem); + + if (!scx_cgroup_enabled) + return 0; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup *from = tg_cgrp(task_group(p)); + struct cgroup *to = tg_cgrp(css_tg(css)); + + WARN_ON_ONCE(p->scx.cgrp_moving_from); + + /* + * sched_move_task() omits identity migrations. Let's match the + * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() + * always match one-to-one. + */ + if (from == to) + continue; + + if (SCX_HAS_OP(cgroup_prep_move)) { + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, + p, from, css->cgroup); + if (ret) + goto err; + } + + p->scx.cgrp_moving_from = from; + } + + return 0; + +err: + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } + + percpu_up_read(&scx_cgroup_rwsem); + return ops_sanitize_err("cgroup_prep_move", ret); +} + +void scx_cgroup_move_task(struct task_struct *p) +{ + if (!scx_cgroup_enabled) + return; + + /* + * @p must have ops.cgroup_prep_move() called on it and thus + * cgrp_moving_from set. + */ + if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) + SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, + p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + p->scx.cgrp_moving_from = NULL; +} + +void scx_cgroup_finish_attach(void) +{ + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *p; + + if (!scx_cgroup_enabled) + goto out_unlock; + + cgroup_taskset_for_each(p, css, tset) { + if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, + p->scx.cgrp_moving_from, css->cgroup); + p->scx.cgrp_moving_from = NULL; + } +out_unlock: + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_weight(struct task_group *tg, unsigned long weight) +{ + percpu_down_read(&scx_cgroup_rwsem); + + if (scx_cgroup_enabled && tg->scx_weight != weight) { + if (SCX_HAS_OP(cgroup_set_weight)) + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, + tg_cgrp(tg), weight); + tg->scx_weight = weight; + } + + percpu_up_read(&scx_cgroup_rwsem); +} + +void scx_group_set_idle(struct task_group *tg, bool idle) +{ + percpu_down_read(&scx_cgroup_rwsem); + scx_cgroup_warn_missing_idle(tg); + percpu_up_read(&scx_cgroup_rwsem); +} + +static void scx_cgroup_lock(void) +{ + percpu_down_write(&scx_cgroup_rwsem); +} + +static void scx_cgroup_unlock(void) +{ + percpu_up_write(&scx_cgroup_rwsem); +} + +#else /* CONFIG_EXT_GROUP_SCHED */ + +static inline void scx_cgroup_lock(void) {} +static inline void scx_cgroup_unlock(void) {} + +#endif /* CONFIG_EXT_GROUP_SCHED */ + +/* + * Omitted operations: + * + * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task + * isn't tied to the CPU at that point. Preemption is implemented by resetting + * the victim task's slice to 0 and triggering reschedule on the target CPU. + * + * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. + * + * - task_fork/dead: We need fork/dead notifications for all tasks regardless of + * their current sched_class. Call them directly from sched core instead. + */ +DEFINE_SCHED_CLASS(ext) = { + .enqueue_task = enqueue_task_scx, + .dequeue_task = dequeue_task_scx, + .yield_task = yield_task_scx, + .yield_to_task = yield_to_task_scx, + + .wakeup_preempt = wakeup_preempt_scx, + + .balance = balance_scx, + .pick_task = pick_task_scx, + + .put_prev_task = put_prev_task_scx, + .set_next_task = set_next_task_scx, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_scx, + .task_woken = task_woken_scx, + .set_cpus_allowed = set_cpus_allowed_scx, + + .rq_online = rq_online_scx, + .rq_offline = rq_offline_scx, +#endif + + .task_tick = task_tick_scx, + + .switching_to = switching_to_scx, + .switched_from = switched_from_scx, + .switched_to = switched_to_scx, + .reweight_task = reweight_task_scx, + .prio_changed = prio_changed_scx, + + .update_curr = update_curr_scx, + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif +}; + +static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) +{ + memset(dsq, 0, sizeof(*dsq)); + + raw_spin_lock_init(&dsq->lock); + INIT_LIST_HEAD(&dsq->list); + dsq->id = dsq_id; +} + +static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) +{ + struct scx_dispatch_q *dsq; + int ret; + + if (dsq_id & SCX_DSQ_FLAG_BUILTIN) + return ERR_PTR(-EINVAL); + + dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) + return ERR_PTR(-ENOMEM); + + init_dsq(dsq, dsq_id); + + ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, + dsq_hash_params); + if (ret) { + kfree(dsq); + return ERR_PTR(ret); + } + return dsq; +} + +static void free_dsq_irq_workfn(struct irq_work *irq_work) +{ + struct llist_node *to_free = llist_del_all(&dsqs_to_free); + struct scx_dispatch_q *dsq, *tmp_dsq; + + llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) + kfree_rcu(dsq, rcu); +} + +static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); + +static void destroy_dsq(u64 dsq_id) +{ + struct scx_dispatch_q *dsq; + unsigned long flags; + + rcu_read_lock(); + + dsq = find_user_dsq(dsq_id); + if (!dsq) + goto out_unlock_rcu; + + raw_spin_lock_irqsave(&dsq->lock, flags); + + if (dsq->nr) { + scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", + dsq->id, dsq->nr); + goto out_unlock_dsq; + } + + if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) + goto out_unlock_dsq; + + /* + * Mark dead by invalidating ->id to prevent dispatch_enqueue() from + * queueing more tasks. As this function can be called from anywhere, + * freeing is bounced through an irq work to avoid nesting RCU + * operations inside scheduler locks. + */ + dsq->id = SCX_DSQ_INVALID; + llist_add(&dsq->free_node, &dsqs_to_free); + irq_work_queue(&free_dsq_irq_work); + +out_unlock_dsq: + raw_spin_unlock_irqrestore(&dsq->lock, flags); +out_unlock_rcu: + rcu_read_unlock(); +} + +#ifdef CONFIG_EXT_GROUP_SCHED +static void scx_cgroup_exit(void) +{ + struct cgroup_subsys_state *css; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + scx_cgroup_enabled = false; + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and exit all the inited ones, all online cgroups are exited. + */ + rcu_read_lock(); + css_for_each_descendant_post(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + + if (!(tg->scx_flags & SCX_TG_INITED)) + continue; + tg->scx_flags &= ~SCX_TG_INITED; + + if (!scx_ops.cgroup_exit) + continue; + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); +} + +static int scx_cgroup_init(void) +{ + struct cgroup_subsys_state *css; + int ret; + + percpu_rwsem_assert_held(&scx_cgroup_rwsem); + + cgroup_warned_missing_weight = false; + cgroup_warned_missing_idle = false; + + /* + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * cgroups and init, all online cgroups are initialized. + */ + rcu_read_lock(); + css_for_each_descendant_pre(css, &root_task_group.css) { + struct task_group *tg = css_tg(css); + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + + scx_cgroup_warn_missing_weight(tg); + scx_cgroup_warn_missing_idle(tg); + + if ((tg->scx_flags & + (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) + continue; + + if (!scx_ops.cgroup_init) { + tg->scx_flags |= SCX_TG_INITED; + continue; + } + + if (WARN_ON_ONCE(!css_tryget(css))) + continue; + rcu_read_unlock(); + + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + css->cgroup, &args); + if (ret) { + css_put(css); + scx_ops_error("ops.cgroup_init() failed (%d)", ret); + return ret; + } + tg->scx_flags |= SCX_TG_INITED; + + rcu_read_lock(); + css_put(css); + } + rcu_read_unlock(); + + WARN_ON_ONCE(scx_cgroup_enabled); + scx_cgroup_enabled = true; + + return 0; +} + +#else +static void scx_cgroup_exit(void) {} +static int scx_cgroup_init(void) { return 0; } +#endif + + +/******************************************************************************** + * Sysfs interface and ops enable/disable. + */ + +#define SCX_ATTR(_name) \ + static struct kobj_attribute scx_attr_##_name = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = scx_attr_##_name##_show, \ + } + +static ssize_t scx_attr_state_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%s\n", + scx_ops_enable_state_str[scx_ops_enable_state()]); +} +SCX_ATTR(state); + +static ssize_t scx_attr_switch_all_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); +} +SCX_ATTR(switch_all); + +static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); +} +SCX_ATTR(nr_rejected); + +static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); +} +SCX_ATTR(hotplug_seq); + +static ssize_t scx_attr_enable_seq_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq)); +} +SCX_ATTR(enable_seq); + +static struct attribute *scx_global_attrs[] = { + &scx_attr_state.attr, + &scx_attr_switch_all.attr, + &scx_attr_nr_rejected.attr, + &scx_attr_hotplug_seq.attr, + &scx_attr_enable_seq.attr, + NULL, +}; + +static const struct attribute_group scx_global_attr_group = { + .attrs = scx_global_attrs, +}; + +static void scx_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static ssize_t scx_attr_ops_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%s\n", scx_ops.name); +} +SCX_ATTR(ops); + +static struct attribute *scx_sched_attrs[] = { + &scx_attr_ops.attr, + NULL, +}; +ATTRIBUTE_GROUPS(scx_sched); + +static const struct kobj_type scx_ktype = { + .release = scx_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = scx_sched_groups, +}; + +static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); +} + +static const struct kset_uevent_ops scx_uevent_ops = { + .uevent = scx_uevent, +}; + +/* + * Used by sched_fork() and __setscheduler_prio() to pick the matching + * sched_class. dl/rt are already handled. + */ +bool task_should_scx(int policy) +{ + if (!scx_enabled() || + unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) + return false; + if (READ_ONCE(scx_switching_all)) + return true; + return policy == SCHED_EXT; +} + +/** + * scx_softlockup - sched_ext softlockup handler + * @dur_s: number of seconds of CPU stuck due to soft lockup + * + * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can + * live-lock the system by making many CPUs target the same DSQ to the point + * where soft-lockup detection triggers. This function is called from + * soft-lockup watchdog when the triggering point is close and tries to unjam + * the system by enabling the breather and aborting the BPF scheduler. + */ +void scx_softlockup(u32 dur_s) +{ + switch (scx_ops_enable_state()) { + case SCX_OPS_ENABLING: + case SCX_OPS_ENABLED: + break; + default: + return; + } + + /* allow only one instance, cleared at the end of scx_ops_bypass() */ + if (test_and_set_bit(0, &scx_in_softlockup)) + return; + + printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", + smp_processor_id(), dur_s, scx_ops.name); + + /* + * Some CPUs may be trapped in the dispatch paths. Enable breather + * immediately; otherwise, we might even be able to get to + * scx_ops_bypass(). + */ + atomic_inc(&scx_ops_breather_depth); + + scx_ops_error("soft lockup - CPU#%d stuck for %us", + smp_processor_id(), dur_s); +} + +static void scx_clear_softlockup(void) +{ + if (test_and_clear_bit(0, &scx_in_softlockup)) + atomic_dec(&scx_ops_breather_depth); +} + +/** + * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress + * @bypass: true for bypass, false for unbypass + * + * Bypassing guarantees that all runnable tasks make forward progress without + * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might + * be held by tasks that the BPF scheduler is forgetting to run, which + * unfortunately also excludes toggling the static branches. + * + * Let's work around by overriding a couple ops and modifying behaviors based on + * the DISABLING state and then cycling the queued tasks through dequeue/enqueue + * to force global FIFO scheduling. + * + * - ops.select_cpu() is ignored and the default select_cpu() is used. + * + * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order. + * %SCX_OPS_ENQ_LAST is also ignored. + * + * - ops.dispatch() is ignored. + * + * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice + * can't be trusted. Whenever a tick triggers, the running task is rotated to + * the tail of the queue with core_sched_at touched. + * + * - pick_next_task() suppresses zero slice warning. + * + * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * operations. + * + * - scx_prio_less() reverts to the default core_sched_at order. + */ +static void scx_ops_bypass(bool bypass) +{ + static DEFINE_RAW_SPINLOCK(bypass_lock); + int cpu; + unsigned long flags; + + raw_spin_lock_irqsave(&bypass_lock, flags); + if (bypass) { + scx_ops_bypass_depth++; + WARN_ON_ONCE(scx_ops_bypass_depth <= 0); + if (scx_ops_bypass_depth != 1) + goto unlock; + } else { + scx_ops_bypass_depth--; + WARN_ON_ONCE(scx_ops_bypass_depth < 0); + if (scx_ops_bypass_depth != 0) + goto unlock; + } + + atomic_inc(&scx_ops_breather_depth); + + /* + * No task property is changing. We just need to make sure all currently + * queued tasks are re-queued according to the new scx_rq_bypassing() + * state. As an optimization, walk each rq's runnable_list instead of + * the scx_tasks list. + * + * This function can't trust the scheduler and thus can't use + * cpus_read_lock(). Walk all possible CPUs instead of online. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct task_struct *p, *n; + + raw_spin_rq_lock(rq); + + if (bypass) { + WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); + rq->scx.flags |= SCX_RQ_BYPASSING; + } else { + WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING)); + rq->scx.flags &= ~SCX_RQ_BYPASSING; + } + + /* + * We need to guarantee that no tasks are on the BPF scheduler + * while bypassing. Either we see enabled or the enable path + * sees scx_rq_bypassing() before moving tasks to SCX. + */ + if (!scx_enabled()) { + raw_spin_rq_unlock(rq); + continue; + } + + /* + * The use of list_for_each_entry_safe_reverse() is required + * because each task is going to be removed from and added back + * to the runnable_list during iteration. Because they're added + * to the tail of the list, safe reverse iteration can still + * visit all nodes. + */ + list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, + scx.runnable_node) { + struct sched_enq_and_set_ctx ctx; + + /* cycling deq/enq is enough, see the function comment */ + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + sched_enq_and_set_task(&ctx); + } + + /* resched to restore ticks and idle state */ + if (cpu_online(cpu) || cpu == smp_processor_id()) + resched_curr(rq); + + raw_spin_rq_unlock(rq); + } + + atomic_dec(&scx_ops_breather_depth); +unlock: + raw_spin_unlock_irqrestore(&bypass_lock, flags); + scx_clear_softlockup(); +} + +static void free_exit_info(struct scx_exit_info *ei) +{ + kfree(ei->dump); + kfree(ei->msg); + kfree(ei->bt); + kfree(ei); +} + +static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) +{ + struct scx_exit_info *ei; + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) + return NULL; + + ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); + ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); + ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); + + if (!ei->bt || !ei->msg || !ei->dump) { + free_exit_info(ei); + return NULL; + } + + return ei; +} + +static const char *scx_exit_reason(enum scx_exit_kind kind) +{ + switch (kind) { + case SCX_EXIT_UNREG: + return "unregistered from user space"; + case SCX_EXIT_UNREG_BPF: + return "unregistered from BPF"; + case SCX_EXIT_UNREG_KERN: + return "unregistered from the main kernel"; + case SCX_EXIT_SYSRQ: + return "disabled by sysrq-S"; + case SCX_EXIT_ERROR: + return "runtime error"; + case SCX_EXIT_ERROR_BPF: + return "scx_bpf_error"; + case SCX_EXIT_ERROR_STALL: + return "runnable task stall"; + default: + return "<UNKNOWN>"; + } +} + +static void scx_ops_disable_workfn(struct kthread_work *work) +{ + struct scx_exit_info *ei = scx_exit_info; + struct scx_task_iter sti; + struct task_struct *p; + struct rhashtable_iter rht_iter; + struct scx_dispatch_q *dsq; + int i, kind, cpu; + + kind = atomic_read(&scx_exit_kind); + while (true) { + /* + * NONE indicates that a new scx_ops has been registered since + * disable was scheduled - don't kill the new ops. DONE + * indicates that the ops has already been disabled. + */ + if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) + return; + if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) + break; + } + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + + /* guarantee forward progress by bypassing scx_ops */ + scx_ops_bypass(true); + + switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { + case SCX_OPS_DISABLING: + WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); + break; + case SCX_OPS_DISABLED: + pr_warn("sched_ext: ops error detected without ops (%s)\n", + scx_exit_info->msg); + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != + SCX_OPS_DISABLING); + goto done; + default: + break; + } + + /* + * Here, every runnable task is guaranteed to make forward progress and + * we can safely use blocking synchronization constructs. Actually + * disable ops. + */ + mutex_lock(&scx_ops_enable_mutex); + + static_branch_disable(&__scx_switched_all); + WRITE_ONCE(scx_switching_all, false); + + /* + * Shut down cgroup support before tasks so that the cgroup attach path + * doesn't race against scx_ops_exit_task(). + */ + scx_cgroup_lock(); + scx_cgroup_exit(); + scx_cgroup_unlock(); + + /* + * The BPF scheduler is going away. All tasks including %TASK_DEAD ones + * must be switched out and exited synchronously. + */ + percpu_down_write(&scx_fork_rwsem); + + scx_ops_init_task_enabled = false; + + scx_task_iter_start(&sti); + while ((p = scx_task_iter_next_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); + struct sched_enq_and_set_ctx ctx; + + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + + p->sched_class = new_class; + check_class_changing(task_rq(p), p, old_class); + + sched_enq_and_set_task(&ctx); + + check_class_changed(task_rq(p), p, old_class, p->prio); + scx_ops_exit_task(p); + } + scx_task_iter_stop(&sti); + percpu_up_write(&scx_fork_rwsem); + + /* + * Invalidate all the rq clocks to prevent getting outdated + * rq clocks from a previous scx scheduler. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + scx_rq_clock_invalidate(rq); + } + + /* no task is on scx, turn off all the switches and flush in-progress calls */ + static_branch_disable(&__scx_ops_enabled); + for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) + static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_enq_last); + static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_enq_migration_disabled); + static_branch_disable(&scx_ops_cpu_preempt); + static_branch_disable(&scx_builtin_idle_enabled); + synchronize_rcu(); + + if (ei->kind >= SCX_EXIT_ERROR) { + pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", + scx_ops.name, ei->reason); + + if (ei->msg[0] != '\0') + pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg); +#ifdef CONFIG_STACKTRACE + stack_trace_print(ei->bt, ei->bt_len, 2); +#endif + } else { + pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", + scx_ops.name, ei->reason); + } + + if (scx_ops.exit) + SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + + cancel_delayed_work_sync(&scx_watchdog_work); + + /* + * Delete the kobject from the hierarchy eagerly in addition to just + * dropping a reference. Otherwise, if the object is deleted + * asynchronously, sysfs could observe an object of the same name still + * in the hierarchy when another scheduler is loaded. + */ + kobject_del(scx_root_kobj); + kobject_put(scx_root_kobj); + scx_root_kobj = NULL; + + memset(&scx_ops, 0, sizeof(scx_ops)); + + rhashtable_walk_enter(&dsq_hash, &rht_iter); + do { + rhashtable_walk_start(&rht_iter); + + while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) + destroy_dsq(dsq->id); + + rhashtable_walk_stop(&rht_iter); + } while (dsq == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&rht_iter); + + free_percpu(scx_dsp_ctx); + scx_dsp_ctx = NULL; + scx_dsp_max_batch = 0; + + free_exit_info(scx_exit_info); + scx_exit_info = NULL; + + mutex_unlock(&scx_ops_enable_mutex); + + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != + SCX_OPS_DISABLING); +done: + scx_ops_bypass(false); +} + +static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); + +static void schedule_scx_ops_disable_work(void) +{ + struct kthread_worker *helper = READ_ONCE(scx_ops_helper); + + /* + * We may be called spuriously before the first bpf_sched_ext_reg(). If + * scx_ops_helper isn't set up yet, there's nothing to do. + */ + if (helper) + kthread_queue_work(helper, &scx_ops_disable_work); +} + +static void scx_ops_disable(enum scx_exit_kind kind) +{ + int none = SCX_EXIT_NONE; + + if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) + kind = SCX_EXIT_ERROR; + + atomic_try_cmpxchg(&scx_exit_kind, &none, kind); + + schedule_scx_ops_disable_work(); +} + +static void dump_newline(struct seq_buf *s) +{ + trace_sched_ext_dump(""); + + /* @s may be zero sized and seq_buf triggers WARN if so */ + if (s->size) + seq_buf_putc(s, '\n'); +} + +static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) +{ + va_list args; + +#ifdef CONFIG_TRACEPOINTS + if (trace_sched_ext_dump_enabled()) { + /* protected by scx_dump_state()::dump_lock */ + static char line_buf[SCX_EXIT_MSG_LEN]; + + va_start(args, fmt); + vscnprintf(line_buf, sizeof(line_buf), fmt, args); + va_end(args); + + trace_sched_ext_dump(line_buf); + } +#endif + /* @s may be zero sized and seq_buf triggers WARN if so */ + if (s->size) { + va_start(args, fmt); + seq_buf_vprintf(s, fmt, args); + va_end(args); + + seq_buf_putc(s, '\n'); + } +} + +static void dump_stack_trace(struct seq_buf *s, const char *prefix, + const unsigned long *bt, unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) + dump_line(s, "%s%pS", prefix, (void *)bt[i]); +} + +static void ops_dump_init(struct seq_buf *s, const char *prefix) +{ + struct scx_dump_data *dd = &scx_dump_data; + + lockdep_assert_irqs_disabled(); + + dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ + dd->first = true; + dd->cursor = 0; + dd->s = s; + dd->prefix = prefix; +} + +static void ops_dump_flush(void) +{ + struct scx_dump_data *dd = &scx_dump_data; + char *line = dd->buf.line; + + if (!dd->cursor) + return; + + /* + * There's something to flush and this is the first line. Insert a blank + * line to distinguish ops dump. + */ + if (dd->first) { + dump_newline(dd->s); + dd->first = false; + } + + /* + * There may be multiple lines in $line. Scan and emit each line + * separately. + */ + while (true) { + char *end = line; + char c; + + while (*end != '\n' && *end != '\0') + end++; + + /* + * If $line overflowed, it may not have newline at the end. + * Always emit with a newline. + */ + c = *end; + *end = '\0'; + dump_line(dd->s, "%s%s", dd->prefix, line); + if (c == '\0') + break; + + /* move to the next line */ + end++; + if (*end == '\0') + break; + line = end; + } + + dd->cursor = 0; +} + +static void ops_dump_exit(void) +{ + ops_dump_flush(); + scx_dump_data.cpu = -1; +} + +static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, + struct task_struct *p, char marker) +{ + static unsigned long bt[SCX_EXIT_BT_LEN]; + char dsq_id_buf[19] = "(n/a)"; + unsigned long ops_state = atomic_long_read(&p->scx.ops_state); + unsigned int bt_len = 0; + + if (p->scx.dsq) + scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", + (unsigned long long)p->scx.dsq->id); + + dump_newline(s); + dump_line(s, " %c%c %s[%d] %+ldms", + marker, task_state_to_char(p), p->comm, p->pid, + jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); + dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", + scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, + p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, + ops_state >> SCX_OPSS_QSEQ_SHIFT); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", + p->scx.dsq_vtime, p->scx.slice, p->scx.weight); + dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); + + if (SCX_HAS_OP(dump_task)) { + ops_dump_init(s, " "); + SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); + ops_dump_exit(); + } + +#ifdef CONFIG_STACKTRACE + bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); +#endif + if (bt_len) { + dump_newline(s); + dump_stack_trace(s, " ", bt, bt_len); + } +} + +static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) +{ + static DEFINE_SPINLOCK(dump_lock); + static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; + struct scx_dump_ctx dctx = { + .kind = ei->kind, + .exit_code = ei->exit_code, + .reason = ei->reason, + .at_ns = ktime_get_ns(), + .at_jiffies = jiffies, + }; + struct seq_buf s; + unsigned long flags; + char *buf; + int cpu; + + spin_lock_irqsave(&dump_lock, flags); + + seq_buf_init(&s, ei->dump, dump_len); + + if (ei->kind == SCX_EXIT_NONE) { + dump_line(&s, "Debug dump triggered by %s", ei->reason); + } else { + dump_line(&s, "%s[%d] triggered exit kind %d:", + current->comm, current->pid, ei->kind); + dump_line(&s, " %s (%s)", ei->reason, ei->msg); + dump_newline(&s); + dump_line(&s, "Backtrace:"); + dump_stack_trace(&s, " ", ei->bt, ei->bt_len); + } + + if (SCX_HAS_OP(dump)) { + ops_dump_init(&s, ""); + SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); + ops_dump_exit(); + } + + dump_newline(&s); + dump_line(&s, "CPU states"); + dump_line(&s, "----------"); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p; + struct seq_buf ns; + size_t avail, used; + bool idle; + + rq_lock(rq, &rf); + + idle = list_empty(&rq->scx.runnable_list) && + rq->curr->sched_class == &idle_sched_class; + + if (idle && !SCX_HAS_OP(dump_cpu)) + goto next; + + /* + * We don't yet know whether ops.dump_cpu() will produce output + * and we may want to skip the default CPU dump if it doesn't. + * Use a nested seq_buf to generate the standard dump so that we + * can decide whether to commit later. + */ + avail = seq_buf_get_buf(&s, &buf); + seq_buf_init(&ns, buf, avail); + + dump_newline(&ns); + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", + cpu, rq->scx.nr_running, rq->scx.flags, + rq->scx.cpu_released, rq->scx.ops_qseq, + rq->scx.pnt_seq); + dump_line(&ns, " curr=%s[%d] class=%ps", + rq->curr->comm, rq->curr->pid, + rq->curr->sched_class); + if (!cpumask_empty(rq->scx.cpus_to_kick)) + dump_line(&ns, " cpus_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick)); + if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) + dump_line(&ns, " idle_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); + if (!cpumask_empty(rq->scx.cpus_to_preempt)) + dump_line(&ns, " cpus_to_preempt: %*pb", + cpumask_pr_args(rq->scx.cpus_to_preempt)); + if (!cpumask_empty(rq->scx.cpus_to_wait)) + dump_line(&ns, " cpus_to_wait : %*pb", + cpumask_pr_args(rq->scx.cpus_to_wait)); + + used = seq_buf_used(&ns); + if (SCX_HAS_OP(dump_cpu)) { + ops_dump_init(&ns, " "); + SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); + ops_dump_exit(); + } + + /* + * If idle && nothing generated by ops.dump_cpu(), there's + * nothing interesting. Skip. + */ + if (idle && used == seq_buf_used(&ns)) + goto next; + + /* + * $s may already have overflowed when $ns was created. If so, + * calling commit on it will trigger BUG. + */ + if (avail) { + seq_buf_commit(&s, seq_buf_used(&ns)); + if (seq_buf_has_overflowed(&ns)) + seq_buf_set_overflow(&s); + } + + if (rq->curr->sched_class == &ext_sched_class) + scx_dump_task(&s, &dctx, rq->curr, '*'); + + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) + scx_dump_task(&s, &dctx, p, ' '); + next: + rq_unlock(rq, &rf); + } + + if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) + memcpy(ei->dump + dump_len - sizeof(trunc_marker), + trunc_marker, sizeof(trunc_marker)); + + spin_unlock_irqrestore(&dump_lock, flags); +} + +static void scx_ops_error_irq_workfn(struct irq_work *irq_work) +{ + struct scx_exit_info *ei = scx_exit_info; + + if (ei->kind >= SCX_EXIT_ERROR) + scx_dump_state(ei, scx_ops.exit_dump_len); + + schedule_scx_ops_disable_work(); +} + +static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); + +static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, + s64 exit_code, + const char *fmt, ...) +{ + struct scx_exit_info *ei = scx_exit_info; + int none = SCX_EXIT_NONE; + va_list args; + + if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) + return; + + ei->exit_code = exit_code; +#ifdef CONFIG_STACKTRACE + if (kind >= SCX_EXIT_ERROR) + ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); +#endif + va_start(args, fmt); + vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); + va_end(args); + + /* + * Set ei->kind and ->reason for scx_dump_state(). They'll be set again + * in scx_ops_disable_workfn(). + */ + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + + irq_work_queue(&scx_ops_error_irq_work); +} + +static struct kthread_worker *scx_create_rt_helper(const char *name) +{ + struct kthread_worker *helper; + + helper = kthread_run_worker(0, name); + if (helper) + sched_set_fifo(helper->task); + return helper; +} + +static void check_hotplug_seq(const struct sched_ext_ops *ops) +{ + unsigned long long global_hotplug_seq; + + /* + * If a hotplug event has occurred between when a scheduler was + * initialized, and when we were able to attach, exit and notify user + * space about it. + */ + if (ops->hotplug_seq) { + global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); + if (ops->hotplug_seq != global_hotplug_seq) { + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); + } + } +} + +static int validate_ops(const struct sched_ext_ops *ops) +{ + /* + * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the + * ops.enqueue() callback isn't implemented. + */ + if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { + scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); + return -EINVAL; + } + + return 0; +} + +static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) +{ + struct scx_task_iter sti; + struct task_struct *p; + unsigned long timeout; + int i, cpu, node, ret; + + if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), + cpu_possible_mask)) { + pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); + return -EINVAL; + } + + mutex_lock(&scx_ops_enable_mutex); + + if (!scx_ops_helper) { + WRITE_ONCE(scx_ops_helper, + scx_create_rt_helper("sched_ext_ops_helper")); + if (!scx_ops_helper) { + ret = -ENOMEM; + goto err_unlock; + } + } + + if (!global_dsqs) { + struct scx_dispatch_q **dsqs; + + dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); + if (!dsqs) { + ret = -ENOMEM; + goto err_unlock; + } + + for_each_node_state(node, N_POSSIBLE) { + struct scx_dispatch_q *dsq; + + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) { + for_each_node_state(node, N_POSSIBLE) + kfree(dsqs[node]); + kfree(dsqs); + ret = -ENOMEM; + goto err_unlock; + } + + init_dsq(dsq, SCX_DSQ_GLOBAL); + dsqs[node] = dsq; + } + + global_dsqs = dsqs; + } + + if (scx_ops_enable_state() != SCX_OPS_DISABLED) { + ret = -EBUSY; + goto err_unlock; + } + + scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); + if (!scx_root_kobj) { + ret = -ENOMEM; + goto err_unlock; + } + + scx_root_kobj->kset = scx_kset; + ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); + if (ret < 0) + goto err; + + scx_exit_info = alloc_exit_info(ops->exit_dump_len); + if (!scx_exit_info) { + ret = -ENOMEM; + goto err_del; + } + + /* + * Set scx_ops, transition to ENABLING and clear exit info to arm the + * disable path. Failure triggers full disabling from here on. + */ + scx_ops = *ops; + + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != + SCX_OPS_DISABLED); + + atomic_set(&scx_exit_kind, SCX_EXIT_NONE); + scx_warned_zero_slice = false; + + atomic_long_set(&scx_nr_rejected, 0); + + for_each_possible_cpu(cpu) + cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + + /* + * Keep CPUs stable during enable so that the BPF scheduler can track + * online CPUs by watching ->on/offline_cpu() after ->init(). + */ + cpus_read_lock(); + + if (scx_ops.init) { + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); + if (ret) { + ret = ops_sanitize_err("init", ret); + cpus_read_unlock(); + scx_ops_error("ops.init() failed (%d)", ret); + goto err_disable; + } + } + + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable_cpuslocked(&scx_has_op[i]); + + check_hotplug_seq(ops); +#ifdef CONFIG_SMP + update_selcpu_topology(); +#endif + cpus_read_unlock(); + + ret = validate_ops(ops); + if (ret) + goto err_disable; + + WARN_ON_ONCE(scx_dsp_ctx); + scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; + scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, + scx_dsp_max_batch), + __alignof__(struct scx_dsp_ctx)); + if (!scx_dsp_ctx) { + ret = -ENOMEM; + goto err_disable; + } + + if (ops->timeout_ms) + timeout = msecs_to_jiffies(ops->timeout_ms); + else + timeout = SCX_WATCHDOG_MAX_TIMEOUT; + + WRITE_ONCE(scx_watchdog_timeout, timeout); + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + queue_delayed_work(system_unbound_wq, &scx_watchdog_work, + scx_watchdog_timeout / 2); + + /* + * Once __scx_ops_enabled is set, %current can be switched to SCX + * anytime. This can lead to stalls as some BPF schedulers (e.g. + * userspace scheduling) may not function correctly before all tasks are + * switched. Init in bypass mode to guarantee forward progress. + */ + scx_ops_bypass(true); + + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) + if (((void (**)(void))ops)[i]) + static_branch_enable(&scx_has_op[i]); + + if (ops->flags & SCX_OPS_ENQ_LAST) + static_branch_enable(&scx_ops_enq_last); + + if (ops->flags & SCX_OPS_ENQ_EXITING) + static_branch_enable(&scx_ops_enq_exiting); + if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) + static_branch_enable(&scx_ops_enq_migration_disabled); + if (scx_ops.cpu_acquire || scx_ops.cpu_release) + static_branch_enable(&scx_ops_cpu_preempt); + + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { + reset_idle_masks(); + static_branch_enable(&scx_builtin_idle_enabled); + } else { + static_branch_disable(&scx_builtin_idle_enabled); + } + + /* + * Lock out forks, cgroup on/offlining and moves before opening the + * floodgate so that they don't wander into the operations prematurely. + */ + percpu_down_write(&scx_fork_rwsem); + + WARN_ON_ONCE(scx_ops_init_task_enabled); + scx_ops_init_task_enabled = true; + + /* + * Enable ops for every task. Fork is excluded by scx_fork_rwsem + * preventing new tasks from being added. No need to exclude tasks + * leaving as sched_ext_free() can handle both prepped and enabled + * tasks. Prep all tasks first and then enable them with preemption + * disabled. + * + * All cgroups should be initialized before scx_ops_init_task() so that + * the BPF scheduler can reliably track each task's cgroup membership + * from scx_ops_init_task(). Lock out cgroup on/offlining and task + * migrations while tasks are being initialized so that + * scx_cgroup_can_attach() never sees uninitialized tasks. + */ + scx_cgroup_lock(); + ret = scx_cgroup_init(); + if (ret) + goto err_disable_unlock_all; + + scx_task_iter_start(&sti); + while ((p = scx_task_iter_next_locked(&sti))) { + /* + * @p may already be dead, have lost all its usages counts and + * be waiting for RCU grace period before being freed. @p can't + * be initialized for SCX in such cases and should be ignored. + */ + if (!tryget_task_struct(p)) + continue; + + scx_task_iter_unlock(&sti); + + ret = scx_ops_init_task(p, task_group(p), false); + if (ret) { + put_task_struct(p); + scx_task_iter_relock(&sti); + scx_task_iter_stop(&sti); + scx_ops_error("ops.init_task() failed (%d) for %s[%d]", + ret, p->comm, p->pid); + goto err_disable_unlock_all; + } + + scx_set_task_state(p, SCX_TASK_READY); + + put_task_struct(p); + scx_task_iter_relock(&sti); + } + scx_task_iter_stop(&sti); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + /* + * All tasks are READY. It's safe to turn on scx_enabled() and switch + * all eligible tasks. + */ + WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); + static_branch_enable(&__scx_ops_enabled); + + /* + * We're fully committed and can't fail. The task READY -> ENABLED + * transitions here are synchronized against sched_ext_free() through + * scx_tasks_lock. + */ + percpu_down_write(&scx_fork_rwsem); + scx_task_iter_start(&sti); + while ((p = scx_task_iter_next_locked(&sti))) { + const struct sched_class *old_class = p->sched_class; + const struct sched_class *new_class = + __setscheduler_class(p->policy, p->prio); + struct sched_enq_and_set_ctx ctx; + + if (old_class != new_class && p->se.sched_delayed) + dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + + sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + + p->scx.slice = SCX_SLICE_DFL; + p->sched_class = new_class; + check_class_changing(task_rq(p), p, old_class); + + sched_enq_and_set_task(&ctx); + + check_class_changed(task_rq(p), p, old_class, p->prio); + } + scx_task_iter_stop(&sti); + percpu_up_write(&scx_fork_rwsem); + + scx_ops_bypass(false); + + if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { + WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); + goto err_disable; + } + + if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) + static_branch_enable(&__scx_switched_all); + + pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", + scx_ops.name, scx_switched_all() ? "" : " (partial)"); + kobject_uevent(scx_root_kobj, KOBJ_ADD); + mutex_unlock(&scx_ops_enable_mutex); + + atomic_long_inc(&scx_enable_seq); + + return 0; + +err_del: + kobject_del(scx_root_kobj); +err: + kobject_put(scx_root_kobj); + scx_root_kobj = NULL; + if (scx_exit_info) { + free_exit_info(scx_exit_info); + scx_exit_info = NULL; + } +err_unlock: + mutex_unlock(&scx_ops_enable_mutex); + return ret; + +err_disable_unlock_all: + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + scx_ops_bypass(false); +err_disable: + mutex_unlock(&scx_ops_enable_mutex); + /* + * Returning an error code here would not pass all the error information + * to userspace. Record errno using scx_ops_error() for cases + * scx_ops_error() wasn't already invoked and exit indicating success so + * that the error is notified through ops.exit() with all the details. + * + * Flush scx_ops_disable_work to ensure that error is reported before + * init completion. + */ + scx_ops_error("scx_ops_enable() failed (%d)", ret); + kthread_flush_work(&scx_ops_disable_work); + return 0; +} + + +/******************************************************************************** + * bpf_struct_ops plumbing. + */ +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> + +static const struct btf_type *task_struct_type; + +static bool bpf_scx_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type != BPF_READ) + return false; + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (off % size != 0) + return false; + + return btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, int off, + int size) +{ + const struct btf_type *t; + + t = btf_type_by_id(reg->btf, reg->btf_id); + if (t == task_struct_type) { + if (off >= offsetof(struct task_struct, scx.slice) && + off + size <= offsetofend(struct task_struct, scx.slice)) + return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.dsq_vtime) && + off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) + return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.disallow) && + off + size <= offsetofend(struct task_struct, scx.disallow)) + return SCALAR_VALUE; + } + + return -EACCES; +} + +static const struct bpf_func_proto * +bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; + default: + return bpf_base_func_proto(func_id, prog); + } +} + +static const struct bpf_verifier_ops bpf_scx_verifier_ops = { + .get_func_proto = bpf_scx_get_func_proto, + .is_valid_access = bpf_scx_is_valid_access, + .btf_struct_access = bpf_scx_btf_struct_access, +}; + +static int bpf_scx_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct sched_ext_ops *uops = udata; + struct sched_ext_ops *ops = kdata; + u32 moff = __btf_member_bit_offset(t, member) / 8; + int ret; + + switch (moff) { + case offsetof(struct sched_ext_ops, dispatch_max_batch): + if (*(u32 *)(udata + moff) > INT_MAX) + return -E2BIG; + ops->dispatch_max_batch = *(u32 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, flags): + if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) + return -EINVAL; + ops->flags = *(u64 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, name): + ret = bpf_obj_name_cpy(ops->name, uops->name, + sizeof(ops->name)); + if (ret < 0) + return ret; + if (ret == 0) + return -EINVAL; + return 1; + case offsetof(struct sched_ext_ops, timeout_ms): + if (msecs_to_jiffies(*(u32 *)(udata + moff)) > + SCX_WATCHDOG_MAX_TIMEOUT) + return -E2BIG; + ops->timeout_ms = *(u32 *)(udata + moff); + return 1; + case offsetof(struct sched_ext_ops, exit_dump_len): + ops->exit_dump_len = + *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; + return 1; + case offsetof(struct sched_ext_ops, hotplug_seq): + ops->hotplug_seq = *(u64 *)(udata + moff); + return 1; + } + + return 0; +} + +static int bpf_scx_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct sched_ext_ops, init_task): +#ifdef CONFIG_EXT_GROUP_SCHED + case offsetof(struct sched_ext_ops, cgroup_init): + case offsetof(struct sched_ext_ops, cgroup_exit): + case offsetof(struct sched_ext_ops, cgroup_prep_move): +#endif + case offsetof(struct sched_ext_ops, cpu_online): + case offsetof(struct sched_ext_ops, cpu_offline): + case offsetof(struct sched_ext_ops, init): + case offsetof(struct sched_ext_ops, exit): + break; + default: + if (prog->sleepable) + return -EINVAL; + } + + return 0; +} + +static int bpf_scx_reg(void *kdata, struct bpf_link *link) +{ + return scx_ops_enable(kdata, link); +} + +static void bpf_scx_unreg(void *kdata, struct bpf_link *link) +{ + scx_ops_disable(SCX_EXIT_UNREG); + kthread_flush_work(&scx_ops_disable_work); +} + +static int bpf_scx_init(struct btf *btf) +{ + task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); + + return 0; +} + +static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) +{ + /* + * sched_ext does not support updating the actively-loaded BPF + * scheduler, as registering a BPF scheduler can always fail if the + * scheduler returns an error code for e.g. ops.init(), ops.init_task(), + * etc. Similarly, we can always race with unregistration happening + * elsewhere, such as with sysrq. + */ + return -EOPNOTSUPP; +} + +static int bpf_scx_validate(void *kdata) +{ + return 0; +} + +static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } +static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {} +static void sched_ext_ops__tick(struct task_struct *p) {} +static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__running(struct task_struct *p) {} +static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {} +static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {} +static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; } +static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; } +static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {} +static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {} +static void sched_ext_ops__update_idle(s32 cpu, bool idle) {} +static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {} +static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {} +static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } +static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {} +static void sched_ext_ops__enable(struct task_struct *p) {} +static void sched_ext_ops__disable(struct task_struct *p) {} +#ifdef CONFIG_EXT_GROUP_SCHED +static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } +static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {} +static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } +static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} +#endif +static void sched_ext_ops__cpu_online(s32 cpu) {} +static void sched_ext_ops__cpu_offline(s32 cpu) {} +static s32 sched_ext_ops__init(void) { return -EINVAL; } +static void sched_ext_ops__exit(struct scx_exit_info *info) {} +static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {} +static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} +static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {} + +static struct sched_ext_ops __bpf_ops_sched_ext_ops = { + .select_cpu = sched_ext_ops__select_cpu, + .enqueue = sched_ext_ops__enqueue, + .dequeue = sched_ext_ops__dequeue, + .dispatch = sched_ext_ops__dispatch, + .tick = sched_ext_ops__tick, + .runnable = sched_ext_ops__runnable, + .running = sched_ext_ops__running, + .stopping = sched_ext_ops__stopping, + .quiescent = sched_ext_ops__quiescent, + .yield = sched_ext_ops__yield, + .core_sched_before = sched_ext_ops__core_sched_before, + .set_weight = sched_ext_ops__set_weight, + .set_cpumask = sched_ext_ops__set_cpumask, + .update_idle = sched_ext_ops__update_idle, + .cpu_acquire = sched_ext_ops__cpu_acquire, + .cpu_release = sched_ext_ops__cpu_release, + .init_task = sched_ext_ops__init_task, + .exit_task = sched_ext_ops__exit_task, + .enable = sched_ext_ops__enable, + .disable = sched_ext_ops__disable, +#ifdef CONFIG_EXT_GROUP_SCHED + .cgroup_init = sched_ext_ops__cgroup_init, + .cgroup_exit = sched_ext_ops__cgroup_exit, + .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, + .cgroup_move = sched_ext_ops__cgroup_move, + .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, + .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, +#endif + .cpu_online = sched_ext_ops__cpu_online, + .cpu_offline = sched_ext_ops__cpu_offline, + .init = sched_ext_ops__init, + .exit = sched_ext_ops__exit, + .dump = sched_ext_ops__dump, + .dump_cpu = sched_ext_ops__dump_cpu, + .dump_task = sched_ext_ops__dump_task, +}; + +static struct bpf_struct_ops bpf_sched_ext_ops = { + .verifier_ops = &bpf_scx_verifier_ops, + .reg = bpf_scx_reg, + .unreg = bpf_scx_unreg, + .check_member = bpf_scx_check_member, + .init_member = bpf_scx_init_member, + .init = bpf_scx_init, + .update = bpf_scx_update, + .validate = bpf_scx_validate, + .name = "sched_ext_ops", + .owner = THIS_MODULE, + .cfi_stubs = &__bpf_ops_sched_ext_ops +}; + + +/******************************************************************************** + * System integration and init. + */ + +static void sysrq_handle_sched_ext_reset(u8 key) +{ + if (scx_ops_helper) + scx_ops_disable(SCX_EXIT_SYSRQ); + else + pr_info("sched_ext: BPF scheduler not yet used\n"); +} + +static const struct sysrq_key_op sysrq_sched_ext_reset_op = { + .handler = sysrq_handle_sched_ext_reset, + .help_msg = "reset-sched-ext(S)", + .action_msg = "Disable sched_ext and revert all tasks to CFS", + .enable_mask = SYSRQ_ENABLE_RTNICE, +}; + +static void sysrq_handle_sched_ext_dump(u8 key) +{ + struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; + + if (scx_enabled()) + scx_dump_state(&ei, 0); +} + +static const struct sysrq_key_op sysrq_sched_ext_dump_op = { + .handler = sysrq_handle_sched_ext_dump, + .help_msg = "dump-sched-ext(D)", + .action_msg = "Trigger sched_ext debug dump", + .enable_mask = SYSRQ_ENABLE_RTNICE, +}; + +static bool can_skip_idle_kick(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + /* + * We can skip idle kicking if @rq is going to go through at least one + * full SCX scheduling cycle before going idle. Just checking whether + * curr is not idle is insufficient because we could be racing + * balance_one() trying to pull the next task from a remote rq, which + * may fail, and @rq may become idle afterwards. + * + * The race window is small and we don't and can't guarantee that @rq is + * only kicked while idle anyway. Skip only when sure. + */ + return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_IN_BALANCE); +} + +static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) +{ + struct rq *rq = cpu_rq(cpu); + struct scx_rq *this_scx = &this_rq->scx; + bool should_wait = false; + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + /* + * During CPU hotplug, a CPU may depend on kicking itself to make + * forward progress. Allow kicking self regardless of online state. + */ + if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { + if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { + if (rq->curr->sched_class == &ext_sched_class) + rq->curr->scx.slice = 0; + cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); + } + + if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { + pseqs[cpu] = rq->scx.pnt_seq; + should_wait = true; + } + + resched_curr(rq); + } else { + cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + } + + raw_spin_rq_unlock_irqrestore(rq, flags); + + return should_wait; +} + +static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_rq_lock_irqsave(rq, flags); + + if (!can_skip_idle_kick(rq) && + (cpu_online(cpu) || cpu == cpu_of(this_rq))) + resched_curr(rq); + + raw_spin_rq_unlock_irqrestore(rq, flags); +} + +static void kick_cpus_irq_workfn(struct irq_work *irq_work) +{ + struct rq *this_rq = this_rq(); + struct scx_rq *this_scx = &this_rq->scx; + unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + bool should_wait = false; + s32 cpu; + + for_each_cpu(cpu, this_scx->cpus_to_kick) { + should_wait |= kick_one_cpu(cpu, this_rq, pseqs); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); + } + + for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { + kick_one_cpu_if_idle(cpu, this_rq); + cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); + } + + if (!should_wait) + return; + + for_each_cpu(cpu, this_scx->cpus_to_wait) { + unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; + + if (cpu != cpu_of(this_rq)) { + /* + * Pairs with smp_store_release() issued by this CPU in + * switch_class() on the resched path. + * + * We busy-wait here to guarantee that no other task can + * be scheduled on our core before the target CPU has + * entered the resched path. + */ + while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) + cpu_relax(); + } + + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + } +} + +/** + * print_scx_info - print out sched_ext scheduler state + * @log_lvl: the log level to use when printing + * @p: target task + * + * If a sched_ext scheduler is enabled, print the name and state of the + * scheduler. If @p is on sched_ext, print further information about the task. + * + * This function can be safely called on any task as long as the task_struct + * itself is accessible. While safe, this function isn't synchronized and may + * print out mixups or garbages of limited length. + */ +void print_scx_info(const char *log_lvl, struct task_struct *p) +{ + enum scx_ops_enable_state state = scx_ops_enable_state(); + const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; + char runnable_at_buf[22] = "?"; + struct sched_class *class; + unsigned long runnable_at; + + if (state == SCX_OPS_DISABLED) + return; + + /* + * Carefully check if the task was running on sched_ext, and then + * carefully copy the time it's been runnable, and its state. + */ + if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || + class != &ext_sched_class) { + printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, + scx_ops_enable_state_str[state], all); + return; + } + + if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, + sizeof(runnable_at))) + scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", + jiffies_delta_msecs(runnable_at, jiffies)); + + /* print everything onto one line to conserve console space */ + printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", + log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, + runnable_at_buf); +} + +static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) +{ + /* + * SCX schedulers often have userspace components which are sometimes + * involved in critial scheduling paths. PM operations involve freezing + * userspace which can lead to scheduling misbehaviors including stalls. + * Let's bypass while PM operations are in progress. + */ + switch (event) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + case PM_RESTORE_PREPARE: + scx_ops_bypass(true); + break; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + case PM_POST_RESTORE: + scx_ops_bypass(false); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block scx_pm_notifier = { + .notifier_call = scx_pm_handler, +}; + +void __init init_sched_ext_class(void) +{ + s32 cpu, v; + + /* + * The following is to prevent the compiler from optimizing out the enum + * definitions so that BPF scheduler implementations can use them + * through the generated vmlinux.h. + */ + WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | + SCX_TG_ONLINE); + + BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); +#ifdef CONFIG_SMP + BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); +#endif + scx_kick_cpus_pnt_seqs = + __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, + __alignof__(scx_kick_cpus_pnt_seqs[0])); + BUG_ON(!scx_kick_cpus_pnt_seqs); + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); + INIT_LIST_HEAD(&rq->scx.runnable_list); + INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); + + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); + init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); + init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + + if (cpu_online(cpu)) + cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; + } + + register_sysrq_key('S', &sysrq_sched_ext_reset_op); + register_sysrq_key('D', &sysrq_sched_ext_dump_op); + INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); +} + + +/******************************************************************************** + * Helpers that can be called from the BPF scheduler. + */ +#include <linux/btf_ids.h> + +__bpf_kfunc_start_defs(); + +static bool check_builtin_idle_enabled(void) +{ + if (static_branch_likely(&scx_builtin_idle_enabled)) + return true; + + scx_ops_error("built-in idle tracking is disabled"); + return false; +} + +/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @is_idle: out parameter indicating whether the returned CPU is idle + * + * Can only be called from ops.select_cpu() if the built-in CPU selection is + * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is + * currently idle and thus a good candidate for direct dispatching. + */ +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *is_idle) +{ + if (!ops_cpu_valid(prev_cpu, NULL)) + goto prev_cpu; + + if (!check_builtin_idle_enabled()) + goto prev_cpu; + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + +#ifdef CONFIG_SMP + return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); +#endif + +prev_cpu: + *is_idle = false; + return prev_cpu; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) + +static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_select_cpu, +}; + +static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) +{ + if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + return false; + + lockdep_assert_irqs_disabled(); + + if (unlikely(!p)) { + scx_ops_error("called with NULL task"); + return false; + } + + if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { + scx_ops_error("invalid enq_flags 0x%llx", enq_flags); + return false; + } + + return true; +} + +static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct task_struct *ddsp_task; + + ddsp_task = __this_cpu_read(direct_dispatch_task); + if (ddsp_task) { + mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + return; + } + + if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { + scx_ops_error("dispatch buffer overflow"); + return; + } + + dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ + .task = p, + .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, + .dsq_id = dsq_id, + .enq_flags = enq_flags, + }; +} + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into + * @slice: duration @p can run for in nsecs, 0 to keep the current value + * @enq_flags: SCX_ENQ_* + * + * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to + * call this function spuriously. Can be called from ops.enqueue(), + * ops.select_cpu(), and ops.dispatch(). + * + * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch + * and @p must match the task being enqueued. + * + * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p + * will be directly inserted into the corresponding dispatch queue after + * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be + * inserted into the local DSQ of the CPU returned by ops.select_cpu(). + * @enq_flags are OR'd with the enqueue flags on the enqueue path before the + * task is inserted. + * + * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id + * and this function can be called upto ops.dispatch_max_batch times to insert + * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the + * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. + * + * This function doesn't have any locking restrictions and may be called under + * BPF locks (in the future when BPF introduces more flexible locking). + * + * @p is allowed to run for @slice. The scheduling path is triggered on slice + * exhaustion. If zero, the current residual slice is maintained. If + * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with + * scx_bpf_kick_cpu() to trigger scheduling. + */ +__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) +{ + if (!scx_dsq_insert_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + scx_dsq_insert_commit(p, dsq_id, enq_flags); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()"); + scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags); +} + +/** + * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into + * @slice: duration @p can run for in nsecs, 0 to keep the current value + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * + * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. + * Tasks queued into the priority queue are ordered by @vtime. All other aspects + * are identical to scx_bpf_dsq_insert(). + * + * @vtime ordering is according to time_before64() which considers wrapping. A + * numerically larger vtime may indicate an earlier position in the ordering and + * vice-versa. + * + * A DSQ can only be used as a FIFO or priority queue at any given time and this + * function must not be called on a DSQ which already has one or more FIFO tasks + * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and + * SCX_DSQ_GLOBAL) cannot be used as priority queues. + */ +__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + if (!scx_dsq_insert_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()"); + scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_enqueue_dispatch, +}; + +static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, + struct task_struct *p, u64 dsq_id, u64 enq_flags) +{ + struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; + struct rq *this_rq, *src_rq, *locked_rq; + bool dispatched = false; + bool in_balance; + unsigned long flags; + + if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) + return false; + + /* + * Can be called from either ops.dispatch() locking this_rq() or any + * context where no rq lock is held. If latter, lock @p's task_rq which + * we'll likely need anyway. + */ + src_rq = task_rq(p); + + local_irq_save(flags); + this_rq = this_rq(); + in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE; + + if (in_balance) { + if (this_rq != src_rq) { + raw_spin_rq_unlock(this_rq); + raw_spin_rq_lock(src_rq); + } + } else { + raw_spin_rq_lock(src_rq); + } + + /* + * If the BPF scheduler keeps calling this function repeatedly, it can + * cause similar live-lock conditions as consume_dispatch_q(). Insert a + * breather if necessary. + */ + scx_ops_breather(src_rq); + + locked_rq = src_rq; + raw_spin_lock(&src_dsq->lock); + + /* + * Did someone else get to it? @p could have already left $src_dsq, got + * re-enqueud, or be in the process of being consumed by someone else. + */ + if (unlikely(p->scx.dsq != src_dsq || + u32_before(kit->cursor.priv, p->scx.dsq_seq) || + p->scx.holding_cpu >= 0) || + WARN_ON_ONCE(src_rq != task_rq(p))) { + raw_spin_unlock(&src_dsq->lock); + goto out; + } + + /* @p is still on $src_dsq and stable, determine the destination */ + dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); + + /* + * Apply vtime and slice updates before moving so that the new time is + * visible before inserting into $dst_dsq. @p is still on $src_dsq but + * this is safe as we're locking it. + */ + if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) + p->scx.dsq_vtime = kit->vtime; + if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) + p->scx.slice = kit->slice; + + /* execute move */ + locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq); + dispatched = true; +out: + if (in_balance) { + if (this_rq != locked_rq) { + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(this_rq); + } + } else { + raw_spin_rq_unlock_irqrestore(locked_rq, flags); + } + + kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME); + return dispatched; +} + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots + * + * Can only be called from ops.dispatch(). + */ +__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) +{ + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return 0; + + return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); +} + +/** + * scx_bpf_dispatch_cancel - Cancel the latest dispatch + * + * Cancel the latest dispatch. Can be called multiple times to cancel further + * dispatches. Can only be called from ops.dispatch(). + */ +__bpf_kfunc void scx_bpf_dispatch_cancel(void) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return; + + if (dspc->cursor > 0) + dspc->cursor--; + else + scx_ops_error("dispatch buffer underflow"); +} + +/** + * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ + * @dsq_id: DSQ to move task from + * + * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's + * local DSQ for execution. Can only be called from ops.dispatch(). + * + * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() + * before trying to move from the specified DSQ. It may also grab rq locks and + * thus can't be called under any BPF locks. + * + * Returns %true if a task has been moved, %false if there isn't any task to + * move. + */ +__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) +{ + struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_dispatch_q *dsq; + + if (!scx_kf_allowed(SCX_KF_DISPATCH)) + return false; + + flush_dispatch_buf(dspc->rq); + + dsq = find_user_dsq(dsq_id); + if (unlikely(!dsq)) { + scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); + return false; + } + + if (consume_dispatch_q(dspc->rq, dsq)) { + /* + * A successfully consumed task can be dequeued before it starts + * running while the CPU is trying to migrate other dispatched + * tasks. Bump nr_tasks to tell balance_scx() to retry on empty + * local DSQ. + */ + dspc->nr_tasks++; + return true; + } else { + return false; + } +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); + return scx_bpf_dsq_move_to_local(dsq_id); +} + +/** + * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs + * @it__iter: DSQ iterator in progress + * @slice: duration the moved task can run for in nsecs + * + * Override the slice of the next task that will be moved from @it__iter using + * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous + * slice duration is kept. + */ +__bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, + u64 slice) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; + + kit->slice = slice; + kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( + struct bpf_iter_scx_dsq *it__iter, u64 slice) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()"); + scx_bpf_dsq_move_set_slice(it__iter, slice); +} + +/** + * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs + * @it__iter: DSQ iterator in progress + * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ + * + * Override the vtime of the next task that will be moved from @it__iter using + * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice + * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the + * override is ignored and cleared. + */ +__bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, + u64 vtime) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; + + kit->vtime = vtime; + kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( + struct bpf_iter_scx_dsq *it__iter, u64 vtime) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()"); + scx_bpf_dsq_move_set_vtime(it__iter, vtime); +} + +/** + * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ + * @it__iter: DSQ iterator in progress + * @p: task to transfer + * @dsq_id: DSQ to move @p to + * @enq_flags: SCX_ENQ_* + * + * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ + * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can + * be the destination. + * + * For the transfer to be successful, @p must still be on the DSQ and have been + * queued before the DSQ iteration started. This function doesn't care whether + * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have + * been queued before the iteration started. + * + * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update. + * + * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq + * lock (e.g. BPF timers or SYSCALL programs). + * + * Returns %true if @p has been consumed, %false if @p had already been consumed + * or dequeued. + */ +__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); + return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags); +} + +/** + * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ + * @it__iter: DSQ iterator in progress + * @p: task to transfer + * @dsq_id: DSQ to move @p to + * @enq_flags: SCX_ENQ_* + * + * Transfer @p which is on the DSQ currently iterated by @it__iter to the + * priority queue of the DSQ specified by @dsq_id. The destination must be a + * user DSQ as only user DSQs support priority queue. + * + * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() + * and scx_bpf_dsq_move_set_vtime() to update. + * + * All other aspects are identical to scx_bpf_dsq_move(). See + * scx_bpf_dsq_insert_vtime() for more information on @vtime. + */ +__bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()"); + return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) +BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) +BTF_ID_FLAGS(func, scx_bpf_consume) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_dispatch) + +static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_dispatch, +}; + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of + * processed tasks. Can only be called from ops.cpu_release(). + */ +__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +{ + LIST_HEAD(tasks); + u32 nr_enqueued = 0; + struct rq *rq; + struct task_struct *p, *n; + + if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + return 0; + + rq = cpu_rq(smp_processor_id()); + lockdep_assert_rq_held(rq); + + /* + * The BPF scheduler may choose to dispatch tasks back to + * @rq->scx.local_dsq. Move all candidate tasks off to a private list + * first to avoid processing the same tasks repeatedly. + */ + list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, + scx.dsq_list.node) { + /* + * If @p is being migrated, @p's current CPU may not agree with + * its allowed CPUs and the migration_cpu_stop is about to + * deactivate and re-activate @p anyway. Skip re-enqueueing. + * + * While racing sched property changes may also dequeue and + * re-enqueue a migrating task while its current CPU and allowed + * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to + * the current local DSQ for running tasks and thus are not + * visible to the BPF scheduler. + */ + if (p->migration_pending) + continue; + + dispatch_dequeue(rq, p); + list_add_tail(&p->scx.dsq_list.node, &tasks); + } + + list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { + list_del_init(&p->scx.dsq_list.node); + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + nr_enqueued++; + } + + return nr_enqueued; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) +BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) + +static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_cpu_release, +}; + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_create_dsq - Create a custom DSQ + * @dsq_id: DSQ to create + * @node: NUMA node to allocate from + * + * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable + * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. + */ +__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) +{ + if (unlikely(node >= (int)nr_node_ids || + (node < 0 && node != NUMA_NO_NODE))) + return -EINVAL; + return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_unlocked) +BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_unlocked) + +static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_unlocked, +}; + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. + */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct rq *this_rq; + unsigned long irq_flags; + + if (!ops_cpu_valid(cpu, NULL)) + return; + + local_irq_save(irq_flags); + + this_rq = this_rq(); + + /* + * While bypassing for PM ops, IRQ handling may not be online which can + * lead to irq_work_queue() malfunction such as infinite busy wait for + * IRQ status update. Suppress kicking. + */ + if (scx_rq_bypassing(this_rq)) + goto out; + + /* + * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting + * rq locks. We can probably be smarter and avoid bouncing if called + * from ops which don't hold a rq lock. + */ + if (flags & SCX_KICK_IDLE) { + struct rq *target_rq = cpu_rq(cpu); + + if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) + scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + + if (raw_spin_rq_trylock(target_rq)) { + if (can_skip_idle_kick(target_rq)) { + raw_spin_rq_unlock(target_rq); + goto out; + } + raw_spin_rq_unlock(target_rq); + } + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); + } else { + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); + + if (flags & SCX_KICK_PREEMPT) + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); + if (flags & SCX_KICK_WAIT) + cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); + } + + irq_work_queue(&this_rq->scx.kick_cpus_irq_work); +out: + local_irq_restore(irq_flags); +} + +/** + * scx_bpf_dsq_nr_queued - Return the number of queued tasks + * @dsq_id: id of the DSQ + * + * Return the number of tasks in the DSQ matching @dsq_id. If not found, + * -%ENOENT is returned. + */ +__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) +{ + struct scx_dispatch_q *dsq; + s32 ret; + + preempt_disable(); + + if (dsq_id == SCX_DSQ_LOCAL) { + ret = READ_ONCE(this_rq()->scx.local_dsq.nr); + goto out; + } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { + s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + + if (ops_cpu_valid(cpu, NULL)) { + ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); + goto out; + } + } else { + dsq = find_user_dsq(dsq_id); + if (dsq) { + ret = READ_ONCE(dsq->nr); + goto out; + } + } + ret = -ENOENT; +out: + preempt_enable(); + return ret; +} + +/** + * scx_bpf_destroy_dsq - Destroy a custom DSQ + * @dsq_id: DSQ to destroy + * + * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with + * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is + * empty and no further tasks are dispatched to it. Ignored if called on a DSQ + * which doesn't exist. Can be called from any online scx_ops operations. + */ +__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) +{ + destroy_dsq(dsq_id); +} + +/** + * bpf_iter_scx_dsq_new - Create a DSQ iterator + * @it: iterator to initialize + * @dsq_id: DSQ to iterate + * @flags: %SCX_DSQ_ITER_* + * + * Initialize BPF iterator @it which can be used with bpf_for_each() to walk + * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes + * tasks which are already queued when this function is invoked. + */ +__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, + u64 flags) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > + sizeof(struct bpf_iter_scx_dsq)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != + __alignof__(struct bpf_iter_scx_dsq)); + + if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) + return -EINVAL; + + kit->dsq = find_user_dsq(dsq_id); + if (!kit->dsq) + return -ENOENT; + + INIT_LIST_HEAD(&kit->cursor.node); + kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; + kit->cursor.priv = READ_ONCE(kit->dsq->seq); + + return 0; +} + +/** + * bpf_iter_scx_dsq_next - Progress a DSQ iterator + * @it: iterator to progress + * + * Return the next task. See bpf_iter_scx_dsq_new(). + */ +__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; + struct task_struct *p; + unsigned long flags; + + if (!kit->dsq) + return NULL; + + raw_spin_lock_irqsave(&kit->dsq->lock, flags); + + if (list_empty(&kit->cursor.node)) + p = NULL; + else + p = container_of(&kit->cursor, struct task_struct, scx.dsq_list); + + /* + * Only tasks which were queued before the iteration started are + * visible. This bounds BPF iterations and guarantees that vtime never + * jumps in the other direction while iterating. + */ + do { + p = nldsq_next_task(kit->dsq, p, rev); + } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq))); + + if (p) { + if (rev) + list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); + else + list_move(&kit->cursor.node, &p->scx.dsq_list.node); + } else { + list_del_init(&kit->cursor.node); + } + + raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); + + return p; +} + +/** + * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator + * @it: iterator to destroy + * + * Undo scx_iter_scx_dsq_new(). + */ +__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) +{ + struct bpf_iter_scx_dsq_kern *kit = (void *)it; + + if (!kit->dsq) + return; + + if (!list_empty(&kit->cursor.node)) { + unsigned long flags; + + raw_spin_lock_irqsave(&kit->dsq->lock, flags); + list_del_init(&kit->cursor.node); + raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); + } + kit->dsq = NULL; +} + +__bpf_kfunc_end_defs(); + +static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, + char *fmt, unsigned long long *data, u32 data__sz) +{ + struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; + s32 ret; + + if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || + (data__sz && !data)) { + scx_ops_error("invalid data=%p and data__sz=%u", + (void *)data, data__sz); + return -EINVAL; + } + + ret = copy_from_kernel_nofault(data_buf, data, data__sz); + if (ret < 0) { + scx_ops_error("failed to read data fields (%d)", ret); + return ret; + } + + ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, + &bprintf_data); + if (ret < 0) { + scx_ops_error("format preparation failed (%d)", ret); + return ret; + } + + ret = bstr_printf(line_buf, line_size, fmt, + bprintf_data.bin_args); + bpf_bprintf_cleanup(&bprintf_data); + if (ret < 0) { + scx_ops_error("(\"%s\", %p, %u) failed to format", + fmt, data, data__sz); + return ret; + } + + return ret; +} + +static s32 bstr_format(struct scx_bstr_buf *buf, + char *fmt, unsigned long long *data, u32 data__sz) +{ + return __bstr_format(buf->data, buf->line, sizeof(buf->line), + fmt, data, data__sz); +} + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. + * @exit_code: Exit value to pass to user space via struct scx_exit_info. + * @fmt: error message format string + * @data: format string parameters packaged using ___bpf_fill() macro + * @data__sz: @data len, must end in '__sz' for the verifier + * + * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops + * disabling. + */ +__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, + unsigned long long *data, u32 data__sz) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); + if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s", + scx_exit_bstr_buf.line); + raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); +} + +/** + * scx_bpf_error_bstr - Indicate fatal error + * @fmt: error message format string + * @data: format string parameters packaged using ___bpf_fill() macro + * @data__sz: @data len, must end in '__sz' for the verifier + * + * Indicate that the BPF scheduler encountered a fatal error and initiate ops + * disabling. + */ +__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, + u32 data__sz) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); + if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s", + scx_exit_bstr_buf.line); + raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); +} + +/** + * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler + * @fmt: format string + * @data: format string parameters packaged using ___bpf_fill() macro + * @data__sz: @data len, must end in '__sz' for the verifier + * + * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and + * dump_task() to generate extra debug dump specific to the BPF scheduler. + * + * The extra dump may be multiple lines. A single line may be split over + * multiple calls. The last line is automatically terminated. + */ +__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, + u32 data__sz) +{ + struct scx_dump_data *dd = &scx_dump_data; + struct scx_bstr_buf *buf = &dd->buf; + s32 ret; + + if (raw_smp_processor_id() != dd->cpu) { + scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends"); + return; + } + + /* append the formatted string to the line buf */ + ret = __bstr_format(buf->data, buf->line + dd->cursor, + sizeof(buf->line) - dd->cursor, fmt, data, data__sz); + if (ret < 0) { + dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", + dd->prefix, fmt, data, data__sz, ret); + return; + } + + dd->cursor += ret; + dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); + + if (!dd->cursor) + return; + + /* + * If the line buf overflowed or ends in a newline, flush it into the + * dump. This is to allow the caller to generate a single line over + * multiple calls. As ops_dump_flush() can also handle multiple lines in + * the line buf, the only case which can lead to an unexpected + * truncation is when the caller keeps generating newlines in the middle + * instead of the end consecutively. Don't do that. + */ + if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') + ops_dump_flush(); +} + +/** + * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU + * @cpu: CPU of interest + * + * Return the maximum relative capacity of @cpu in relation to the most + * performant CPU in the system. The return value is in the range [1, + * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). + */ +__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) +{ + if (ops_cpu_valid(cpu, NULL)) + return arch_scale_cpu_capacity(cpu); + else + return SCX_CPUPERF_ONE; +} + +/** + * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU + * @cpu: CPU of interest + * + * Return the current relative performance of @cpu in relation to its maximum. + * The return value is in the range [1, %SCX_CPUPERF_ONE]. + * + * The current performance level of a CPU in relation to the maximum performance + * available in the system can be calculated as follows: + * + * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE + * + * The result is in the range [1, %SCX_CPUPERF_ONE]. + */ +__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) +{ + if (ops_cpu_valid(cpu, NULL)) + return arch_scale_freq_capacity(cpu); + else + return SCX_CPUPERF_ONE; +} + +/** + * scx_bpf_cpuperf_set - Set the relative performance target of a CPU + * @cpu: CPU of interest + * @perf: target performance level [0, %SCX_CPUPERF_ONE] + * + * Set the target performance level of @cpu to @perf. @perf is in linear + * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the + * schedutil cpufreq governor chooses the target frequency. + * + * The actual performance level chosen, CPU grouping, and the overhead and + * latency of the operations are dependent on the hardware and cpufreq driver in + * use. Consult hardware and cpufreq documentation for more information. The + * current performance level can be monitored using scx_bpf_cpuperf_cur(). + */ +__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) +{ + if (unlikely(perf > SCX_CPUPERF_ONE)) { + scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + return; + } + + if (ops_cpu_valid(cpu, NULL)) { + struct rq *rq = cpu_rq(cpu); + + rq->scx.cpuperf_target = perf; + + rcu_read_lock_sched_notrace(); + cpufreq_update_util(cpu_rq(cpu), 0); + rcu_read_unlock_sched_notrace(); + } +} + +/** + * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs + * + * All valid CPU IDs in the system are smaller than the returned value. + */ +__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) +{ + return nr_cpu_ids; +} + +/** + * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) +{ + return cpu_possible_mask; +} + +/** + * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) +{ + return cpu_online_mask; +} + +/** + * scx_bpf_put_cpumask - Release a possible/online cpumask + * @cpumask: cpumask to release + */ +__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) +{ + /* + * Empty function body because we aren't actually acquiring or releasing + * a reference to a global cpumask, which is read-only in the caller and + * is never released. The acquire / release semantics here are just used + * to make the cpumask is a trusted pointer in the caller. + */ +} + +/** + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking + * per-CPU cpumask. + * + * Returns NULL if idle tracking is not enabled, or running on a UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) +{ + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_masks.cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, + * per-physical-core cpumask. Can be used to determine if an entire physical + * core is free. + * + * Returns NULL if idle tracking is not enabled, or running on a UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) +{ + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_masks.smt; + else + return idle_masks.cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to + * either the percpu, or SMT idle-tracking cpumask. + * @idle_mask: &cpumask to use + */ +__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) +{ + /* + * Empty function body because we aren't actually acquiring or releasing + * a reference to a global idle cpumask, which is read-only in the + * caller and is never released. The acquire / release semantics here + * are just used to make the cpumask a trusted pointer in the caller. + */ +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!check_builtin_idle_enabled()) + return false; + + if (ops_cpu_valid(cpu, NULL)) + return test_and_clear_cpu_idle(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. + * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + if (!check_builtin_idle_enabled()) + return -EBUSY; + + return scx_pick_idle_cpu(cpus_allowed, flags); +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + s32 cpu; + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +/** + * scx_bpf_task_running - Is task currently running? + * @p: task of interest + */ +__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) +{ + return task_rq(p)->curr == p; +} + +/** + * scx_bpf_task_cpu - CPU a task is currently associated with + * @p: task of interest + */ +__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) +{ + return task_cpu(p); +} + +/** + * scx_bpf_cpu_rq - Fetch the rq of a CPU + * @cpu: CPU of the rq + */ +__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) +{ + if (!ops_cpu_valid(cpu, NULL)) + return NULL; + + return cpu_rq(cpu); +} + +/** + * scx_bpf_task_cgroup - Return the sched cgroup of a task + * @p: task of interest + * + * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with + * from the scheduler's POV. SCX operations should use this function to + * determine @p's current cgroup as, unlike following @p->cgroups, + * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all + * rq-locked operations. Can be called on the parameter tasks of rq-locked + * operations. The restriction guarantees that @p's rq is locked by the caller. + */ +#ifdef CONFIG_CGROUP_SCHED +__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) +{ + struct task_group *tg = p->sched_task_group; + struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + + if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + goto out; + + cgrp = tg_cgrp(tg); + +out: + cgroup_get(cgrp); + return cgrp; +} +#endif + +/** + * scx_bpf_now - Returns a high-performance monotonically non-decreasing + * clock for the current CPU. The clock returned is in nanoseconds. + * + * It provides the following properties: + * + * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently + * to account for execution time and track tasks' runtime properties. + * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which + * eventually reads a hardware timestamp counter -- is neither performant nor + * scalable. scx_bpf_now() aims to provide a high-performance clock by + * using the rq clock in the scheduler core whenever possible. + * + * 2) High enough resolution for the BPF scheduler use cases: In most BPF + * scheduler use cases, the required clock resolution is lower than the most + * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically + * uses the rq clock in the scheduler core whenever it is valid. It considers + * that the rq clock is valid from the time the rq clock is updated + * (update_rq_clock) until the rq is unlocked (rq_unpin_lock). + * + * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now() + * guarantees the clock never goes backward when comparing them in the same + * CPU. On the other hand, when comparing clocks in different CPUs, there + * is no such guarantee -- the clock can go backward. It provides a + * monotonically *non-decreasing* clock so that it would provide the same + * clock values in two different scx_bpf_now() calls in the same CPU + * during the same period of when the rq clock is valid. + */ +__bpf_kfunc u64 scx_bpf_now(void) +{ + struct rq *rq; + u64 clock; + + preempt_disable(); + + rq = this_rq(); + if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) { + /* + * If the rq clock is valid, use the cached rq clock. + * + * Note that scx_bpf_now() is re-entrant between a process + * context and an interrupt context (e.g., timer interrupt). + * However, we don't need to consider the race between them + * because such race is not observable from a caller. + */ + clock = READ_ONCE(rq->scx.clock); + } else { + /* + * Otherwise, return a fresh rq clock. + * + * The rq clock is updated outside of the rq lock. + * In this case, keep the updated rq clock invalid so the next + * kfunc call outside the rq lock gets a fresh rq clock. + */ + clock = sched_clock_cpu(cpu_of(rq)); + } + + preempt_enable(); + + return clock; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_any) +BTF_ID_FLAGS(func, scx_bpf_kick_cpu) +BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) +BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) +BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) +BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_cpu_rq) +#ifdef CONFIG_CGROUP_SCHED +BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) +#endif +BTF_ID_FLAGS(func, scx_bpf_now) +BTF_KFUNCS_END(scx_kfunc_ids_any) + +static const struct btf_kfunc_id_set scx_kfunc_set_any = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_any, +}; + +static int __init scx_init(void) +{ + int ret; + + /* + * kfunc registration can't be done from init_sched_ext_class() as + * register_btf_kfunc_id_set() needs most of the system to be up. + * + * Some kfuncs are context-sensitive and can only be called from + * specific SCX ops. They are grouped into BTF sets accordingly. + * Unfortunately, BPF currently doesn't have a way of enforcing such + * restrictions. Eventually, the verifier should be able to enforce + * them. For now, register them the same and make each kfunc explicitly + * check using scx_kf_allowed(). + */ + if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_select_cpu)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_enqueue_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_dispatch)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_cpu_release)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_unlocked)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, + &scx_kfunc_set_unlocked)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &scx_kfunc_set_any)) || + (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, + &scx_kfunc_set_any))) { + pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); + return ret; + } + + ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); + if (ret) { + pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); + return ret; + } + + ret = register_pm_notifier(&scx_pm_notifier); + if (ret) { + pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); + return ret; + } + + scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); + if (!scx_kset) { + pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); + return -ENOMEM; + } + + ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); + if (ret < 0) { + pr_err("sched_ext: Failed to add global attributes\n"); + return ret; + } + + return 0; +} +__initcall(scx_init); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h new file mode 100644 index 000000000000..1079b56b0f7a --- /dev/null +++ b/kernel/sched/ext.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifdef CONFIG_SCHED_CLASS_EXT + +void scx_tick(struct rq *rq); +void init_scx_entity(struct sched_ext_entity *scx); +void scx_pre_fork(struct task_struct *p); +int scx_fork(struct task_struct *p); +void scx_post_fork(struct task_struct *p); +void scx_cancel_fork(struct task_struct *p); +bool scx_can_stop_tick(struct rq *rq); +void scx_rq_activate(struct rq *rq); +void scx_rq_deactivate(struct rq *rq); +int scx_check_setscheduler(struct task_struct *p, int policy); +bool task_should_scx(int policy); +void init_sched_ext_class(void); + +static inline u32 scx_cpuperf_target(s32 cpu) +{ + if (scx_enabled()) + return cpu_rq(cpu)->scx.cpuperf_target; + else + return 0; +} + +static inline bool task_on_scx(const struct task_struct *p) +{ + return scx_enabled() && p->sched_class == &ext_sched_class; +} + +#ifdef CONFIG_SCHED_CORE +bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi); +#endif + +#else /* CONFIG_SCHED_CLASS_EXT */ + +static inline void scx_tick(struct rq *rq) {} +static inline void scx_pre_fork(struct task_struct *p) {} +static inline int scx_fork(struct task_struct *p) { return 0; } +static inline void scx_post_fork(struct task_struct *p) {} +static inline void scx_cancel_fork(struct task_struct *p) {} +static inline u32 scx_cpuperf_target(s32 cpu) { return 0; } +static inline bool scx_can_stop_tick(struct rq *rq) { return true; } +static inline void scx_rq_activate(struct rq *rq) {} +static inline void scx_rq_deactivate(struct rq *rq) {} +static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } +static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline void init_sched_ext_class(void) {} + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify); + +static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + if (scx_enabled()) + __scx_update_idle(rq, idle, do_notify); +} +#else +static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} +#endif + +#ifdef CONFIG_CGROUP_SCHED +#ifdef CONFIG_EXT_GROUP_SCHED +int scx_tg_online(struct task_group *tg); +void scx_tg_offline(struct task_group *tg); +int scx_cgroup_can_attach(struct cgroup_taskset *tset); +void scx_cgroup_move_task(struct task_struct *p); +void scx_cgroup_finish_attach(void); +void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); +void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); +void scx_group_set_idle(struct task_group *tg, bool idle); +#else /* CONFIG_EXT_GROUP_SCHED */ +static inline int scx_tg_online(struct task_group *tg) { return 0; } +static inline void scx_tg_offline(struct task_group *tg) {} +static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } +static inline void scx_cgroup_move_task(struct task_struct *p) {} +static inline void scx_cgroup_finish_attach(void) {} +static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} +static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} +static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} +#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04fa8dbcfa4d..c798d2795243 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,25 +20,43 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include "sched.h" - -#include <trace/events/sched.h> +#include <linux/energy_model.h> +#include <linux/mmap_lock.h> +#include <linux/hugetlb_inline.h> +#include <linux/jiffies.h> +#include <linux/mm_api.h> +#include <linux/highmem.h> +#include <linux/spinlock_api.h> +#include <linux/cpumask_api.h> +#include <linux/lockdep_api.h> +#include <linux/softirq.h> +#include <linux/refcount_api.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <linux/sched/cond_resched.h> +#include <linux/sched/cputime.h> +#include <linux/sched/isolation.h> +#include <linux/sched/nohz.h> +#include <linux/sched/prio.h> + +#include <linux/cpuidle.h> +#include <linux/interrupt.h> +#include <linux/memory-tiers.h> +#include <linux/mempolicy.h> +#include <linux/mutex_api.h> +#include <linux/profile.h> +#include <linux/psi.h> +#include <linux/ratelimit.h> +#include <linux/task_work.h> +#include <linux/rbtree_augmented.h> + +#include <asm/switch_to.h> + +#include <uapi/linux/sched/types.h> -/* - * Targeted preemption latency for CPU-bound tasks: - * - * NOTE: this latency value is not the same as the concept of - * 'timeslice length' - timeslices in CFS are of variable length - * and have no persistent notion like in traditional, time-slice - * based scheduling concepts. - * - * (to see the precise effective timeslice length of your workload, - * run vmstat and monitor the context-switches (cs) field) - * - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) - */ -unsigned int sysctl_sched_latency = 6000000ULL; -static unsigned int normalized_sysctl_sched_latency = 6000000ULL; +#include "sched.h" +#include "stats.h" +#include "autogroup.h" /* * The initial- and re-scaling of tunables is configurable @@ -46,55 +64,26 @@ static unsigned int normalized_sysctl_sched_latency = 6000000ULL; * Options are: * * SCHED_TUNABLESCALING_NONE - unscaled, always *1 - * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ -enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; +unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; - -/* - * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity - */ -static unsigned int sched_nr_latency = 8; - -/* - * After fork, child runs first. If set to 0 (default) then - * parent will (try to) run first. - */ -unsigned int sysctl_sched_child_runs_first __read_mostly; - -/* - * SCHED_OTHER wake-up granularity. - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. - * - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int sysctl_sched_base_slice = 750000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { - int _shift = 0; - - if (kstrtoint(str, 0, &_shift)) - pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n"); - - sched_thermal_decay_shift = clamp(_shift, 0, 10); + pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n"); return 1; } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); @@ -115,6 +104,13 @@ int __weak arch_asym_cpu_priority(int cpu) */ #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) +/* + * The margin used when comparing CPU capacities. + * is 'cap1' noticeably greater than 'cap2' + * + * (default: ~5%) + */ +#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -128,7 +124,44 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ -unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + +#ifdef CONFIG_NUMA_BALANCING +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +#endif + +#ifdef CONFIG_SYSCTL +static const struct ctl_table sched_fair_sysctls[] = { +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, +#endif +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_promote_rate_limit_MBps", + .data = &sysctl_numa_balancing_promote_rate_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#endif /* CONFIG_NUMA_BALANCING */ +}; + +static int __init sched_fair_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_fair_sysctls); + return 0; +} +late_initcall(sched_fair_sysctl_init); #endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) @@ -185,9 +218,7 @@ static void update_sysctl(void) #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_min_granularity); - SET_SYSCTL(sched_latency); - SET_SYSCTL(sched_wakeup_granularity); + SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } @@ -231,27 +262,40 @@ static void __update_inv_weight(struct load_weight *lw) static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) { u64 fact = scale_load_down(weight); + u32 fact_hi = (u32)(fact >> 32); int shift = WMULT_SHIFT; + int fs; __update_inv_weight(lw); - if (unlikely(fact >> 32)) { - while (fact >> 32) { - fact >>= 1; - shift--; - } + if (unlikely(fact_hi)) { + fs = fls(fact_hi); + shift -= fs; + fact >>= fs; } fact = mul_u32_u32(fact, lw->inv_weight); - while (fact >> 32) { - fact >>= 1; - shift--; + fact_hi = (u32)(fact >> 32); + if (fact_hi) { + fs = fls(fact_hi); + shift -= fs; + fact >>= fs; } return mul_u64_u32_shr(delta_exec, fact, shift); } +/* + * delta /= w + */ +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) +{ + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); + + return delta; +} const struct sched_class fair_sched_class; @@ -260,46 +304,11 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (!path) - return; - - if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) - autogroup_path(cfs_rq->tg, path, len); - else if (cfs_rq && cfs_rq->tg->css.cgroup) - cgroup_path(cfs_rq->tg->css.cgroup, path, len); - else - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -375,8 +384,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) /* * With cfs_rq being unthrottled/throttled during an enqueue, - * it can happen the tmp_alone_branch points the a leaf that - * we finally want to del. In this case, tmp_alone_branch moves + * it can happen the tmp_alone_branch points to the leaf that + * we finally want to delete. In this case, tmp_alone_branch moves * to the prev element but it will point to rq->leaf_cfs_rq_list * at the end of the enqueue. */ @@ -393,7 +402,7 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq) SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); } -/* Iterate thr' all leaf cfs_rq's on a runqueue */ +/* Iterate through all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ leaf_cfs_rq_list) @@ -408,7 +417,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse) return NULL; } -static inline struct sched_entity *parent_entity(struct sched_entity *se) +static inline struct sched_entity *parent_entity(const struct sched_entity *se) { return se->parent; } @@ -445,40 +454,27 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } -#else /* !CONFIG_FAIR_GROUP_SCHED */ - -static inline struct task_struct *task_of(struct sched_entity *se) +static int tg_is_idle(struct task_group *tg) { - return container_of(se, struct task_struct, se); + return tg->idle > 0; } -#define for_each_sched_entity(se) \ - for (; se; se = NULL) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) { - return &task_rq(p)->cfs; + return cfs_rq->idle > 0; } -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static int se_is_idle(struct sched_entity *se) { - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; + if (entity_is_task(se)) + return task_has_idle_policy(task_of(se)); + return cfs_rq_is_idle(group_cfs_rq(se)); } -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} +#else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (path) - strlcpy(path, "(null)", len); -} +#define for_each_sched_entity(se) \ + for (; se; se = NULL) static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { @@ -506,6 +502,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } +static inline int tg_is_idle(struct task_group *tg) +{ + return 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + return task_has_idle_policy(task_of(se)); +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline @@ -515,7 +526,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ -static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) +static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - max_vruntime); if (delta > 0) @@ -524,7 +535,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) return max_vruntime; } -static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - min_vruntime); if (delta < 0) @@ -533,17 +544,222 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline int entity_before(struct sched_entity *a, - struct sched_entity *b) +static inline bool entity_before(const struct sched_entity *a, + const struct sched_entity *b) { - return (s64)(a->vruntime - b->vruntime) < 0; + /* + * Tiebreak on vruntime seems unnecessary since it can + * hardly happen. + */ + return (s64)(a->deadline - b->deadline) < 0; } -static void update_min_vruntime(struct cfs_rq *cfs_rq) +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return (s64)(se->vruntime - cfs_rq->min_vruntime); +} + +#define __node_2_se(node) \ + rb_entry((node), struct sched_entity, run_node) + +/* + * Compute virtual time from the per-task service numbers: + * + * Fair schedulers conserve lag: + * + * \Sum lag_i = 0 + * + * Where lag_i is given by: + * + * lag_i = S - s_i = w_i * (V - v_i) + * + * Where S is the ideal service time and V is it's virtual time counterpart. + * Therefore: + * + * \Sum lag_i = 0 + * \Sum w_i * (V - v_i) = 0 + * \Sum w_i * V - w_i * v_i = 0 + * + * From which we can solve an expression for V in v_i (which we have in + * se->vruntime): + * + * \Sum v_i * w_i \Sum v_i * w_i + * V = -------------- = -------------- + * \Sum w_i W + * + * Specifically, this is the weighted average of all entity virtual runtimes. + * + * [[ NOTE: this is only equal to the ideal scheduler under the condition + * that join/leave operations happen at lag_i = 0, otherwise the + * virtual time has non-contiguous motion equivalent to: + * + * V +-= lag_i / W + * + * Also see the comment in place_entity() that deals with this. ]] + * + * However, since v_i is u64, and the multiplication could easily overflow + * transform it into a relative form that uses smaller quantities: + * + * Substitute: v_i == (v_i - v0) + v0 + * + * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i + * V = ---------------------------- = --------------------- + v0 + * W W + * + * Which we track using: + * + * v0 := cfs_rq->min_vruntime + * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime + * \Sum w_i := cfs_rq->avg_load + * + * Since min_vruntime is a monotonic increasing variable that closely tracks + * the per-task service, these deltas: (v_i - v), will be in the order of the + * maximal (virtual) lag induced in the system due to quantisation. + * + * Also, we use scale_load_down() to reduce the size. + * + * As measured, the max (key * weight) value was ~44 bits for a kernel build. + */ +static void +avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime += key * weight; + cfs_rq->avg_load += weight; +} + +static void +avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime -= key * weight; + cfs_rq->avg_load -= weight; +} + +static inline +void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +{ + /* + * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load + */ + cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; +} + +/* + * Specifically: avg_runtime() + 0 must result in entity_eligible() := true + * For this to be so, the result of this function must have a left bias. + */ +u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); + s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; + + if (curr && curr->on_rq) { + unsigned long weight = scale_load_down(curr->load.weight); + + avg += entity_key(cfs_rq, curr) * weight; + load += weight; + } + + if (load) { + /* sign flips effective floor / ceiling */ + if (avg < 0) + avg -= (load - 1); + avg = div_s64(avg, load); + } + + return cfs_rq->min_vruntime + avg; +} + +/* + * lag_i = S - s_i = w_i * (V - v_i) + * + * However, since V is approximated by the weighted average of all entities it + * is possible -- by addition/removal/reweight to the tree -- to move V around + * and end up with a larger lag than we started with. + * + * Limit this to either double the slice length with a minimum of TICK_NSEC + * since that is the timing granularity. + * + * EEVDF gives the following limit for a steady state system: + * + * -r_max < lag < max(r_max, q) + * + * XXX could add max_slice to the augmented data to track this. + */ +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + s64 vlag, limit; + + SCHED_WARN_ON(!se->on_rq); + vlag = avg_vruntime(cfs_rq) - se->vruntime; + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); + + se->vlag = clamp(vlag, -limit, limit); +} + +/* + * Entity is eligible once it received less service than it ought to have, + * eg. lag >= 0. + * + * lag_i = S - s_i = w_i*(V - v_i) + * + * lag_i >= 0 -> V >= v_i + * + * \Sum (v_i - v)*w_i + * V = ------------------ + v + * \Sum w_i + * + * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) + * + * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due + * to the loss in precision caused by the division. + */ +static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) +{ + struct sched_entity *curr = cfs_rq->curr; + s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; + + if (curr && curr->on_rq) { + unsigned long weight = scale_load_down(curr->load.weight); + + avg += entity_key(cfs_rq, curr) * weight; + load += weight; + } + + return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; +} + +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return vruntime_eligible(cfs_rq, se->vruntime); +} + +static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) +{ + u64 min_vruntime = cfs_rq->min_vruntime; + /* + * open coded max_vruntime() to allow updating avg_vruntime + */ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta > 0) { + avg_vruntime_update(cfs_rq, delta); + min_vruntime = vruntime; + } + return min_vruntime; +} + +static void update_min_vruntime(struct cfs_rq *cfs_rq) +{ + struct sched_entity *se = __pick_root_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; u64 vruntime = cfs_rq->min_vruntime; if (curr) { @@ -553,60 +769,108 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } - if (leftmost) { /* non-empty tree */ - struct sched_entity *se; - se = rb_entry(leftmost, struct sched_entity, run_node); - + if (se) { if (!curr) - vruntime = se->vruntime; + vruntime = se->min_vruntime; else - vruntime = min_vruntime(vruntime, se->vruntime); + vruntime = min_vruntime(vruntime, se->min_vruntime); } /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif + cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); +} + +static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) +{ + struct sched_entity *root = __pick_root_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 min_slice = ~0ULL; + + if (curr && curr->on_rq) + min_slice = curr->slice; + + if (root) + min_slice = min(min_slice, root->min_slice); + + return min_slice; +} + +static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +{ + return entity_before(__node_2_se(a), __node_2_se(b)); +} + +#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) + +static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (vruntime_gt(min_vruntime, se, rse)) + se->min_vruntime = rse->min_vruntime; + } +} + +static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (rse->min_slice < se->min_slice) + se->min_slice = rse->min_slice; + } } /* - * Enqueue an entity into the rb-tree: + * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ -static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - bool leftmost = true; + u64 old_min_vruntime = se->min_vruntime; + u64 old_min_slice = se->min_slice; + struct rb_node *node = &se->run_node; - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (entity_before(se, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } + se->min_vruntime = se->vruntime; + __min_vruntime_update(se, node->rb_right); + __min_vruntime_update(se, node->rb_left); - rb_link_node(&se->run_node, parent, link); - rb_insert_color_cached(&se->run_node, - &cfs_rq->tasks_timeline, leftmost); + se->min_slice = se->slice; + __min_slice_update(se, node->rb_right); + __min_slice_update(se, node->rb_left); + + return se->min_vruntime == old_min_vruntime && + se->min_slice == old_min_slice; +} + +RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, + run_node, min_vruntime, min_vruntime_update); + +/* + * Enqueue an entity into the rb-tree: + */ +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + avg_vruntime_add(cfs_rq, se); + se->min_vruntime = se->vruntime; + se->min_slice = se->slice; + rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + __entity_less, &min_vruntime_cb); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); + rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + &min_vruntime_cb); + avg_vruntime_sub(cfs_rq, se); +} + +struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node; + + if (!root) + return NULL; + + return __node_2_se(root); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) @@ -616,17 +880,91 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) if (!left) return NULL; - return rb_entry(left, struct sched_entity, run_node); + return __node_2_se(left); } -static struct sched_entity *__pick_next_entity(struct sched_entity *se) +/* + * Earliest Eligible Virtual Deadline First + * + * In order to provide latency guarantees for different request sizes + * EEVDF selects the best runnable task from two criteria: + * + * 1) the task must be eligible (must be owed service) + * + * 2) from those tasks that meet 1), we select the one + * with the earliest virtual deadline. + * + * We can do this in O(log n) time due to an augmented RB-tree. The + * tree keeps the entries sorted on deadline, but also functions as a + * heap based on the vruntime by keeping: + * + * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime) + * + * Which allows tree pruning through eligibility. + */ +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) { - struct rb_node *next = rb_next(&se->run_node); + struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; + struct sched_entity *se = __pick_first_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + struct sched_entity *best = NULL; - if (!next) - return NULL; + /* + * We can safely skip eligibility check if there is only one entity + * in this cfs_rq, saving some cycles. + */ + if (cfs_rq->nr_queued == 1) + return curr && curr->on_rq ? curr : se; + + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) + curr = NULL; + + /* + * Once selected, run a task until it either becomes non-eligible or + * until it gets a new slice. See the HACK in set_next_entity(). + */ + if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) + return curr; + + /* Pick the leftmost entity if it's eligible */ + if (se && entity_eligible(cfs_rq, se)) { + best = se; + goto found; + } - return rb_entry(next, struct sched_entity, run_node); + /* Heap search for the EEVD entity */ + while (node) { + struct rb_node *left = node->rb_left; + + /* + * Eligible entities in left subtree are always better + * choices, since they have earlier deadlines. + */ + if (left && vruntime_eligible(cfs_rq, + __node_2_se(left)->min_vruntime)) { + node = left; + continue; + } + + se = __node_2_se(node); + + /* + * The left subtree either is empty or has no eligible + * entity, so check the current node since it is the one + * with earliest deadline that might be eligible. + */ + if (entity_eligible(cfs_rq, se)) { + best = se; + break; + } + + node = node->rb_right; + } +found: + if (!best || (curr && entity_before(curr, best))) + best = curr; + + return best; } #ifdef CONFIG_SCHED_DEBUG @@ -637,99 +975,55 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) if (!last) return NULL; - return rb_entry(last, struct sched_entity, run_node); + return __node_2_se(last); } /************************************************************** * Scheduling class statistics methods: */ - -int sched_proc_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +#ifdef CONFIG_SMP +int sched_update_scaling(void) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); - if (ret || !write) - return ret; - - sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, - sysctl_sched_min_granularity); - #define WRT_SYSCTL(name) \ (normalized_sysctl_##name = sysctl_##name / (factor)) - WRT_SYSCTL(sched_min_granularity); - WRT_SYSCTL(sched_latency); - WRT_SYSCTL(sched_wakeup_granularity); + WRT_SYSCTL(sched_base_slice); #undef WRT_SYSCTL return 0; } #endif +#endif -/* - * delta /= w - */ -static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) -{ - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = __calc_delta(delta, NICE_0_LOAD, &se->load); - - return delta; -} +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); /* - * The idea is to set a period in which each task runs once. - * - * When there are too many tasks (sched_nr_latency) we have to stretch - * this period because otherwise the slices get too small. - * - * p = (nr <= nl) ? l : l*nr/nl + * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i + * this is probably good enough. */ -static u64 __sched_period(unsigned long nr_running) +static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (unlikely(nr_running > sched_nr_latency)) - return nr_running * sysctl_sched_min_granularity; - else - return sysctl_sched_latency; -} - -/* - * We calculate the wall-time slice from the period by taking a part - * proportional to the weight. - * - * s = p*P[w/rw] - */ -static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); - - for_each_sched_entity(se) { - struct load_weight *load; - struct load_weight lw; - - cfs_rq = cfs_rq_of(se); - load = &cfs_rq->load; + if ((s64)(se->vruntime - se->deadline) < 0) + return false; - if (unlikely(!se->on_rq)) { - lw = cfs_rq->load; + /* + * For EEVDF the virtual time slope is determined by w_i (iow. + * nice) while the request time r_i is determined by + * sysctl_sched_base_slice. + */ + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice; - update_load_add(&lw, se->load.weight); - load = &lw; - } - slice = __calc_delta(slice, se->load.weight, load); - } - return slice; -} + /* + * EEVDF: vd_i = ve_i + r_i / w_i + */ + se->deadline = se->vruntime + calc_delta_fair(se->slice, se); -/* - * We calculate the vruntime slice of a to-be-inserted task. - * - * vs = s/w - */ -static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return calc_delta_fair(sched_slice(cfs_rq, se), se); + /* + * The task has consumed its request, reschedule. + */ + return true; } #include "pelt.h" @@ -755,16 +1049,15 @@ void init_entity_runnable_average(struct sched_entity *se) if (entity_is_task(se)) sa->load_avg = scale_load_down(se->load.weight); - /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ + /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */ } -static void attach_entity_cfs_rq(struct sched_entity *se); - /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: * - * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1) + * * se_weight(se) * * However, in many cases, the above util_avg does not give a desired * value. Moreover, the sum of the util_avgs may be divergent, such @@ -794,20 +1087,6 @@ void post_init_entity_util_avg(struct task_struct *p) long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; - if (cap > 0) { - if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; - sa->util_avg /= (cfs_rq->avg.load_avg + 1); - - if (sa->util_avg > cap) - sa->util_avg = cap; - } else { - sa->util_avg = cap; - } - } - - sa->runnable_avg = sa->util_avg; - if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: @@ -823,7 +1102,19 @@ void post_init_entity_util_avg(struct task_struct *p) return; } - attach_entity_cfs_rq(se); + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se_weight(se); + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + } + + sa->runnable_avg = sa->util_avg; } #else /* !CONFIG_SMP */ @@ -833,177 +1124,209 @@ void init_entity_runnable_average(struct sched_entity *se) void post_init_entity_util_avg(struct task_struct *p) { } -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static void update_tg_load_avg(struct cfs_rq *cfs_rq) { } #endif /* CONFIG_SMP */ +static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) +{ + u64 now = rq_clock_task(rq); + s64 delta_exec; + + delta_exec = now - curr->exec_start; + if (unlikely(delta_exec <= 0)) + return delta_exec; + + curr->exec_start = now; + curr->sum_exec_runtime += delta_exec; + + if (schedstat_enabled()) { + struct sched_statistics *stats; + + stats = __schedstats_from_se(curr); + __schedstat_set(stats->exec_max, + max(delta_exec, stats->exec_max)); + } + + return delta_exec; +} + +static inline void update_curr_task(struct task_struct *p, s64 delta_exec) +{ + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); + cgroup_account_cputime(p, delta_exec); +} + +static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + if (!sched_feat(PREEMPT_SHORT)) + return false; + + if (curr->vlag == curr->deadline) + return false; + + return !entity_eligible(cfs_rq, curr); +} + +static inline bool do_preempt_short(struct cfs_rq *cfs_rq, + struct sched_entity *pse, struct sched_entity *se) +{ + if (!sched_feat(PREEMPT_SHORT)) + return false; + + if (pse->slice >= se->slice) + return false; + + if (!entity_eligible(cfs_rq, pse)) + return false; + + if (entity_before(pse, se)) + return true; + + if (!entity_eligible(cfs_rq, se)) + return true; + + return false; +} + +/* + * Used by other classes to account runtime. + */ +s64 update_curr_common(struct rq *rq) +{ + struct task_struct *donor = rq->donor; + s64 delta_exec; + + delta_exec = update_curr_se(rq, &donor->se); + if (likely(delta_exec > 0)) + update_curr_task(donor, delta_exec); + + return delta_exec; +} + /* * Update the current task's runtime statistics. */ static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_clock_task(rq_of(cfs_rq)); - u64 delta_exec; + struct rq *rq = rq_of(cfs_rq); + s64 delta_exec; + bool resched; if (unlikely(!curr)) return; - delta_exec = now - curr->exec_start; - if (unlikely((s64)delta_exec <= 0)) + delta_exec = update_curr_se(rq, curr); + if (unlikely(delta_exec <= 0)) return; - curr->exec_start = now; - - schedstat_set(curr->statistics.exec_max, - max(delta_exec, curr->statistics.exec_max)); - - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq->exec_clock, delta_exec); - curr->vruntime += calc_delta_fair(delta_exec, curr); + resched = update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); + struct task_struct *p = task_of(curr); + + update_curr_task(p, delta_exec); - trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cgroup_account_cputime(curtask, delta_exec); - account_group_exec_runtime(curtask, delta_exec); + /* + * If the fair_server is active, we need to account for the + * fair_server time whether or not the task is running on + * behalf of fair_server or not: + * - If the task is running on behalf of fair_server, we need + * to limit its time based on the assigned runtime. + * - Fair task that runs outside of fair_server should account + * against fair_server such that it can account for this time + * and possibly avoid running this period. + */ + if (dl_server_active(&rq->fair_server)) + dl_server_update(&rq->fair_server, delta_exec); } account_cfs_rq_runtime(cfs_rq, delta_exec); + + if (cfs_rq->nr_queued == 1) + return; + + if (resched || did_preempt_short(cfs_rq, curr)) { + resched_curr_lazy(rq); + clear_buddies(cfs_rq, curr); + } } static void update_curr_fair(struct rq *rq) { - update_curr(cfs_rq_of(&rq->curr->se)); + update_curr(cfs_rq_of(&rq->donor->se)); } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 wait_start, prev_wait_start; + struct sched_statistics *stats; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(se->statistics.wait_start); + stats = __schedstats_from_se(se); - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; + if (entity_is_task(se)) + p = task_of(se); - __schedstat_set(se->statistics.wait_start, wait_start); + __update_stats_wait_start(rq_of(cfs_rq), p, stats); } static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *p; - u64 delta; + struct sched_statistics *stats; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); + stats = __schedstats_from_se(se); - if (entity_is_task(se)) { + /* + * When the sched_schedstat changes from 0 to 1, some sched se + * maybe already in the runqueue, the se->statistics.wait_start + * will be 0.So it will let the delta wrong. We need to avoid this + * scenario. + */ + if (unlikely(!schedstat_val(stats->wait_start))) + return; + + if (entity_is_task(se)) p = task_of(se); - if (task_on_rq_migrating(p)) { - /* - * Preserve migrating task's wait time so wait_start - * time stamp can be adjusted to accumulate wait time - * prior to migration. - */ - __schedstat_set(se->statistics.wait_start, delta); - return; - } - trace_sched_stat_wait(p, delta); - } - __schedstat_set(se->statistics.wait_max, - max(schedstat_val(se->statistics.wait_max), delta)); - __schedstat_inc(se->statistics.wait_count); - __schedstat_add(se->statistics.wait_sum, delta); - __schedstat_set(se->statistics.wait_start, 0); + __update_stats_wait_end(rq_of(cfs_rq), p, stats); } static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { + struct sched_statistics *stats; struct task_struct *tsk = NULL; - u64 sleep_start, block_start; if (!schedstat_enabled()) return; - sleep_start = schedstat_val(se->statistics.sleep_start); - block_start = schedstat_val(se->statistics.block_start); + stats = __schedstats_from_se(se); if (entity_is_task(se)) tsk = task_of(se); - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - __schedstat_set(se->statistics.sleep_max, delta); - - __schedstat_set(se->statistics.sleep_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.block_max))) - __schedstat_set(se->statistics.block_max, delta); - - __schedstat_set(se->statistics.block_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - if (tsk->in_iowait) { - __schedstat_add(se->statistics.iowait_sum, delta); - __schedstat_inc(se->statistics.iowait_count); - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } + __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats); } /* * Task is being enqueued - update stats: */ static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { if (!schedstat_enabled()) return; @@ -1013,14 +1336,14 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * a dequeue/enqueue event is a NOP) */ if (se != cfs_rq->curr) - update_stats_wait_start(cfs_rq, se); + update_stats_wait_start_fair(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) - update_stats_enqueue_sleeper(cfs_rq, se); + update_stats_enqueue_sleeper_fair(cfs_rq, se); } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { if (!schedstat_enabled()) @@ -1031,16 +1354,19 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * waiting task: */ if (se != cfs_rq->curr) - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end_fair(cfs_rq, se); if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { struct task_struct *tsk = task_of(se); + unsigned int state; - if (tsk->state & TASK_INTERRUPTIBLE) - __schedstat_set(se->statistics.sleep_start, + /* XXX racy against TTWU */ + state = READ_ONCE(tsk->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(tsk->stats.sleep_start, rq_clock(rq_of(cfs_rq))); - if (tsk->state & TASK_UNINTERRUPTIBLE) - __schedstat_set(se->statistics.block_start, + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(tsk->stats.block_start, rq_clock(rq_of(cfs_rq))); } } @@ -1061,6 +1387,50 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +static inline bool is_core_idle(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + int sibling; + + for_each_cpu(sibling, cpu_smt_mask(cpu)) { + if (cpu == sibling) + continue; + + if (!idle_cpu(sibling)) + return false; + } +#endif + + return true; +} + +#ifdef CONFIG_NUMA +#define NUMA_IMBALANCE_MIN 2 + +static inline long +adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr) +{ + /* + * Allow a NUMA imbalance if busy CPUs is less than the maximum + * threshold. Above this threshold, individual tasks may be contending + * for both memory bandwidth and any shared HT resources. This is an + * approximation as the number of running tasks may not be related to + * the number of busy CPUs due to sched_setaffinity. + */ + if (dst_running > imb_numa_nr) + return imbalance; + + /* + * Allow a small imbalance based on a simple pair of communicating + * tasks that remain local when the destination is lightly loaded. + */ + if (imbalance <= NUMA_IMBALANCE_MIN) + return 0; + + return imbalance; +} +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_NUMA_BALANCING /* * Approximate time to scan a full NUMA task in ms. The task scan period is @@ -1076,6 +1446,9 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +/* The page with hint page fault latency < threshold in ms is considered hot */ +unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; + struct numa_group { refcount_t refcount; @@ -1088,11 +1461,12 @@ struct numa_group { unsigned long total_faults; unsigned long max_faults_cpu; /* + * faults[] array is split into two regions: faults_mem and faults_cpu. + * * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted * more by CPU use than by memory faults. */ - unsigned long *faults_cpu; unsigned long faults[]; }; @@ -1103,7 +1477,7 @@ struct numa_group { static struct numa_group *deref_task_numa_group(struct task_struct *p) { return rcu_dereference_check(p->numa_group, p == current || - (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu))); + (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu))); } static struct numa_group *deref_curr_numa_group(struct task_struct *p) @@ -1133,7 +1507,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) return rss / nr_scan_pages; } -/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ +/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ #define MAX_SCAN_WINDOW 2560 static unsigned int task_scan_min(struct task_struct *p) @@ -1266,8 +1640,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { - return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + - group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; + return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] + + group->faults[task_faults_idx(NUMA_CPU, nid, 1)]; } static inline unsigned long group_faults_priv(struct numa_group *ng) @@ -1308,10 +1682,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng) /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, - int maxdist, bool task) + int lim_dist, bool task) { unsigned long score = 0; - int node; + int node, max_dist; /* * All nodes are directly connected, and the same distance @@ -1320,9 +1694,11 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, if (sched_numa_topology_type == NUMA_DIRECT) return 0; + /* sched_max_numa_distance may be changed in parallel. */ + max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, - * which should be ok given the number of nodes rarely exceeds 8. + * which should be OK given the number of nodes rarely exceeds 8. */ for_each_online_node(node) { unsigned long faults; @@ -1332,7 +1708,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * The furthest away nodes in the system are not interesting * for placement; nid was already counted. */ - if (dist == sched_max_numa_distance || node == nid) + if (dist >= max_dist || node == nid) continue; /* @@ -1342,8 +1718,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * "hoplimit", only nodes closer by than "hoplimit" are part * of each group. Skip other nodes. */ - if (sched_numa_topology_type == NUMA_BACKPLANE && - dist >= maxdist) + if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) continue; /* Add up the faults from nearby nodes. */ @@ -1361,8 +1736,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * This seems to result in good task placement. */ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { - faults *= (sched_max_numa_distance - dist); - faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + faults *= (max_dist - dist); + faults /= (max_dist - LOCAL_DISTANCE); } score += faults; @@ -1416,15 +1791,169 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, return 1000 * faults / total_faults; } -bool should_numa_migrate_memory(struct task_struct *p, struct page * page, +/* + * If memory tiering mode is enabled, cpupid of slow memory page is + * used to record scan time instead of CPU and PID. When tiering mode + * is disabled at run time, the scan time (in cpupid) will be + * interpreted as CPU and PID. So CPU needs to be checked to avoid to + * access out of array bound. + */ +static inline bool cpupid_valid(int cpupid) +{ + return cpupid_to_cpu(cpupid) < nr_cpu_ids; +} + +/* + * For memory tiering mode, if there are enough free pages (more than + * enough watermark defined here) in fast memory node, to take full + * advantage of fast memory capacity, all recently accessed slow + * memory pages will be migrated to fast memory node without + * considering hot threshold. + */ +static bool pgdat_free_space_enough(struct pglist_data *pgdat) +{ + int z; + unsigned long enough_wmark; + + enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, + pgdat->node_present_pages >> 4); + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + if (zone_watermark_ok(zone, 0, + promo_wmark_pages(zone) + enough_wmark, + ZONE_MOVABLE, 0)) + return true; + } + return false; +} + +/* + * For memory tiering mode, when page tables are scanned, the scan + * time will be recorded in struct page in addition to make page + * PROT_NONE for slow memory page. So when the page is accessed, in + * hint page fault handler, the hint page fault latency is calculated + * via, + * + * hint page fault latency = hint page fault time - scan time + * + * The smaller the hint page fault latency, the higher the possibility + * for the page to be hot. + */ +static int numa_hint_fault_latency(struct folio *folio) +{ + int last_time, time; + + time = jiffies_to_msecs(jiffies); + last_time = folio_xchg_access_time(folio, time); + + return (time - last_time) & PAGE_ACCESS_TIME_MASK; +} + +/* + * For memory tiering mode, too high promotion/demotion throughput may + * hurt application latency. So we provide a mechanism to rate limit + * the number of pages that are tried to be promoted. + */ +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, + unsigned long rate_limit, int nr) +{ + unsigned long nr_cand; + unsigned int now, start; + + now = jiffies_to_msecs(jiffies); + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + start = pgdat->nbp_rl_start; + if (now - start > MSEC_PER_SEC && + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) + pgdat->nbp_rl_nr_cand = nr_cand; + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) + return true; + return false; +} + +#define NUMA_MIGRATION_ADJUST_STEPS 16 + +static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, + unsigned long rate_limit, + unsigned int ref_th) +{ + unsigned int now, start, th_period, unit_th, th; + unsigned long nr_cand, ref_cand, diff_cand; + + now = jiffies_to_msecs(jiffies); + th_period = sysctl_numa_balancing_scan_period_max; + start = pgdat->nbp_th_start; + if (now - start > th_period && + cmpxchg(&pgdat->nbp_th_start, start, now) == start) { + ref_cand = rate_limit * + sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC; + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + diff_cand = nr_cand - pgdat->nbp_th_nr_cand; + unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS; + th = pgdat->nbp_threshold ? : ref_th; + if (diff_cand > ref_cand * 11 / 10) + th = max(th - unit_th, unit_th); + else if (diff_cand < ref_cand * 9 / 10) + th = min(th + unit_th, ref_th * 2); + pgdat->nbp_th_nr_cand = nr_cand; + pgdat->nbp_threshold = th; + } +} + +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int src_nid, int dst_cpu) { struct numa_group *ng = deref_curr_numa_group(p); int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; + /* + * Cannot migrate to memoryless nodes. + */ + if (!node_state(dst_nid, N_MEMORY)) + return false; + + /* + * The pages in slow memory node should be migrated according + * to hot/cold instead of private/shared. + */ + if (folio_use_access_time(folio)) { + struct pglist_data *pgdat; + unsigned long rate_limit; + unsigned int latency, th, def_th; + + pgdat = NODE_DATA(dst_nid); + if (pgdat_free_space_enough(pgdat)) { + /* workload changed, reset hot threshold */ + pgdat->nbp_threshold = 0; + return true; + } + + def_th = sysctl_numa_balancing_hot_threshold; + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ + (20 - PAGE_SHIFT); + numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); + + th = pgdat->nbp_threshold ? : def_th; + latency = numa_hint_fault_latency(folio); + if (latency >= th) + return false; + + return !numa_promotion_rate_limit(pgdat, rate_limit, + folio_nr_pages(folio)); + } + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid); + + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) + return false; /* * Allow first faults or private faults to migrate immediately early in @@ -1506,6 +2035,7 @@ enum numa_type { /* Cached statistics for all CPUs within a node */ struct numa_stats { unsigned long load; + unsigned long runnable; unsigned long util; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; @@ -1515,28 +2045,12 @@ struct numa_stats { int idle_cpu; }; -static inline bool is_core_idle(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - int sibling; - - for_each_cpu(sibling, cpu_smt_mask(cpu)) { - if (cpu == sibling) - continue; - - if (!idle_cpu(cpu)) - return false; - } -#endif - - return true; -} - struct task_numa_env { struct task_struct *p; int src_cpu, src_nid; int dst_cpu, dst_nid; + int imb_numa_nr; struct numa_stats src_stats, dst_stats; @@ -1549,19 +2063,20 @@ struct task_numa_env { }; static unsigned long cpu_load(struct rq *rq); -static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); +static unsigned long cpu_runnable(struct rq *rq); static inline enum numa_type numa_classify(unsigned int imbalance_pct, struct numa_stats *ns) { if ((ns->nr_running > ns->weight) && - ((ns->compute_capacity * 100) < (ns->util * imbalance_pct))) + (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || + ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) return node_overloaded; if ((ns->nr_running < ns->weight) || - ((ns->compute_capacity * 100) > (ns->util * imbalance_pct))) + (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) && + ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100)))) return node_has_spare; return node_fully_busy; @@ -1569,11 +2084,11 @@ numa_type numa_classify(unsigned int imbalance_pct, #ifdef CONFIG_SCHED_SMT /* Forward declarations of select_idle_sibling helpers */ -static inline bool test_idle_cores(int cpu, bool def); +static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) { if (!static_branch_likely(&sched_smt_present) || - idle_core >= 0 || !test_idle_cores(cpu, false)) + idle_core >= 0 || !test_idle_cores(cpu)) return idle_core; /* @@ -1612,11 +2127,12 @@ static void update_numa_stats(struct task_numa_env *env, struct rq *rq = cpu_rq(cpu); ns->load += cpu_load(rq); - ns->util += cpu_util(cpu); - ns->nr_running += rq->cfs.h_nr_running; + ns->runnable += cpu_runnable(rq); + ns->util += cpu_util_cfs(cpu); + ns->nr_running += rq->cfs.h_nr_runnable; ns->compute_capacity += capacity_of(cpu); - if (find_idle && !rq->nr_running && idle_cpu(cpu)) { + if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { if (READ_ONCE(rq->numa_migrate_on) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) continue; @@ -1648,7 +2164,7 @@ static void task_numa_assign(struct task_numa_env *env, int start = env->dst_cpu; /* Find alternative idle CPU. */ - for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) { if (cpu == env->best_cpu || !idle_cpu(cpu) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { continue; @@ -1788,6 +2304,15 @@ static bool task_numa_compare(struct task_numa_env *env, */ cur_ng = rcu_dereference(cur->numa_group); if (cur_ng == p_ng) { + /* + * Do not swap within a group or between tasks that have + * no group if there is spare capacity. Swapping does + * not address the load imbalance and helps one task at + * the cost of punishing another. + */ + if (env->dst_stats.node_type == node_has_spare) + goto unlock; + imp = taskimp + task_weight(cur, env->src_nid, dist) - task_weight(cur, env->dst_nid, dist); /* @@ -1927,7 +2452,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, src_running = env->src_stats.nr_running - 1; dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); - imbalance = adjust_numa_imbalance(imbalance, src_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running, + env->imb_numa_nr); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -1992,8 +2518,10 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) + if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + env.imb_numa_nr = sd->imb_numa_nr; + } rcu_read_unlock(); /* @@ -2028,7 +2556,7 @@ static int task_numa_migrate(struct task_struct *p) */ ng = deref_curr_numa_group(p); if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -2116,7 +2644,7 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find out how many nodes on the workload is actively running on. Do this by + * Find out how many nodes the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. @@ -2126,13 +2654,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group) unsigned long faults, max_faults = 0; int nid, active_nodes = 0; - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults > max_faults) max_faults = faults; } - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults * ACTIVE_NODE_FRACTION > max_faults) active_nodes++; @@ -2170,7 +2698,7 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is - * completely idle or all activity is areas that are not of interest + * completely idle or all activity is in areas that are not of interest * to automatic numa balancing. Related to that, if there were failed * migration then it implies we are migrating too quickly or the local * node is overloaded. In either case, scan slower @@ -2286,7 +2814,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) dist = sched_max_numa_distance; - for_each_online_node(node) { + for_each_node_state(node, N_CPU) { score = group_weight(p, node, dist); if (score > max_score) { max_score = score; @@ -2305,7 +2833,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) * inside the highest scoring group of nodes. The nodemask tricks * keep the complexity of the search down. */ - nodes = node_online_map; + nodes = node_states[N_CPU]; for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { unsigned long max_faults = 0; nodemask_t max_group = NODE_MASK_NONE; @@ -2427,7 +2955,7 @@ static void task_numa_placement(struct task_struct *p) * is at the beginning of the numa_faults array. */ ng->faults[mem_idx] += diff; - ng->faults_cpu[mem_idx] += f_diff; + ng->faults[cpu_idx] += f_diff; ng->total_faults += diff; group_faults += ng->faults[mem_idx]; } @@ -2444,6 +2972,9 @@ static void task_numa_placement(struct task_struct *p) } } + /* Cannot migrate task to CPU-less node */ + max_nid = numa_nearest_node(max_nid, N_CPU); + if (ng) { numa_group_count_active_nodes(ng); spin_unlock_irq(group_lock); @@ -2481,7 +3012,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!deref_curr_numa_group(p))) { unsigned int size = sizeof(struct numa_group) + - 4*nr_node_ids*sizeof(unsigned long); + NR_NUMA_HINT_FAULT_STATS * + nr_node_ids * sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) @@ -2492,9 +3024,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; - /* Second half of the array tracks nids where faults happen */ - grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * - nr_node_ids; for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2551,7 +3080,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!join) return; - BUG_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled()); double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { @@ -2578,7 +3107,7 @@ no_join: } /* - * Get rid of NUMA staticstics associated with a task (either current or dead). + * Get rid of NUMA statistics associated with a task (either current or dead). * If @final is set, the task is dead and has reached refcount zero, so we can * safely free all relevant data structures. Otherwise, there might be * concurrent reads from places like load balancing and procfs, and we should @@ -2636,6 +3165,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; + /* + * NUMA faults statistics are unnecessary for the slow memory + * node for memory tiering mode. + */ + if (!node_is_toptier(mem_node) && + (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING || + !cpupid_valid(last_cpupid))) + return; + /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * @@ -2706,6 +3244,45 @@ static void reset_ptenuma_scan(struct task_struct *p) p->mm->numa_scan_offset = 0; } +static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) +{ + unsigned long pids; + /* + * Allow unconditional access first two times, so that all the (pages) + * of VMAs get prot_none fault introduced irrespective of accesses. + * This is also done to avoid any side effect of task scanning + * amplifying the unfairness of disjoint set of VMAs' access. + */ + if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2) + return true; + + pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; + if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) + return true; + + /* + * Complete a scan that has already started regardless of PID access, or + * some VMAs may never be scanned in multi-threaded applications: + */ + if (mm->numa_scan_offset > vma->vm_start) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); + return true; + } + + /* + * This vma has not been accessed for a while, and if the number + * the threads in the same process is low, which means no other + * threads can help scan this vma, force a vma scan. + */ + if (READ_ONCE(mm->numa_scan_seq) > + (vma->numab_state->prev_scan_seq + get_nr_threads(current))) + return true; + + return false; +} + +#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) + /* * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). @@ -2720,6 +3297,9 @@ static void task_numa_work(struct callback_head *work) unsigned long start, end; unsigned long nr_pte_updates = 0; long pages, virtpages; + struct vma_iterator vmi; + bool vma_pids_skipped; + bool vma_pids_forced = false; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -2753,7 +3333,7 @@ static void task_numa_work(struct callback_head *work) } next_scan = now + msecs_to_jiffies(p->numa_scan_period); - if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan)) return; /* @@ -2762,7 +3342,6 @@ static void task_numa_work(struct callback_head *work) */ p->node_stamp += 2 * TICK_NSEC; - start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ virtpages = pages * 8; /* Scan up to this much virtual space */ @@ -2772,34 +3351,118 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, start); + + /* + * VMAs are skipped if the current PID has not trapped a fault within + * the VMA recently. Allow scanning to be forced if there is no + * suitable VMA remaining. + */ + vma_pids_skipped = false; + +retry_pids: + start = mm->numa_scan_offset; + vma_iter_init(&vmi, mm, start); + vma = vma_next(&vmi); if (!vma) { reset_ptenuma_scan(p); start = 0; - vma = mm->mmap; + vma_iter_set(&vmi, start); + vma = vma_next(&vmi); } - for (; vma; vma = vma->vm_next) { + + for (; vma; vma = vma_next(&vmi)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); continue; } /* * Shared library pages mapped by multiple processes are not * migrated as it is expected they are cache replicated. Avoid - * hinting faults in read-only file-backed mappings or the vdso + * hinting faults in read-only file-backed mappings or the vDSO * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); continue; + } /* * Skip inaccessible VMAs to avoid any confusion between - * PROT_NONE and NUMA hinting ptes + * PROT_NONE and NUMA hinting PTEs */ - if (!vma_is_accessible(vma)) + if (!vma_is_accessible(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); continue; + } + + /* Initialise new per-VMA NUMAB state. */ + if (!vma->numab_state) { + struct vma_numab_state *ptr; + + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + if (!ptr) + continue; + + if (cmpxchg(&vma->numab_state, NULL, ptr)) { + kfree(ptr); + continue; + } + + vma->numab_state->start_scan_seq = mm->numa_scan_seq; + + vma->numab_state->next_scan = now + + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + + /* Reset happens after 4 times scan delay of scan start */ + vma->numab_state->pids_active_reset = vma->numab_state->next_scan + + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + + /* + * Ensure prev_scan_seq does not match numa_scan_seq, + * to prevent VMAs being skipped prematurely on the + * first scan: + */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1; + } + + /* + * Scanning the VMAs of short lived tasks add more overhead. So + * delay the scan for new VMAs. + */ + if (mm->numa_scan_seq && time_before(jiffies, + vma->numab_state->next_scan)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); + continue; + } + + /* RESET access PIDs regularly for old VMAs. */ + if (mm->numa_scan_seq && + time_after(jiffies, vma->numab_state->pids_active_reset)) { + vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset + + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]); + vma->numab_state->pids_active[1] = 0; + } + + /* Do not rescan VMAs twice within the same sequence. */ + if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) { + mm->numa_scan_offset = vma->vm_end; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); + continue; + } + + /* + * Do not scan the VMA if task has not accessed it, unless no other + * VMA candidate exists. + */ + if (!vma_pids_forced && !vma_is_accessed(mm, vma)) { + vma_pids_skipped = true; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); + continue; + } do { start = max(start, vma->vm_start); @@ -2810,7 +3473,7 @@ static void task_numa_work(struct callback_head *work) /* * Try to scan sysctl_numa_balancing_size worth of * hpages that have at least one present PTE that - * is not already pte-numa. If the VMA contains + * is not already PTE-numa. If the VMA contains * areas that are unused or already full of prot_numa * PTEs, scan up to virtpages, to skip through those * areas faster. @@ -2825,6 +3488,26 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); + + /* VMA scan is complete, do not scan until next sequence. */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq; + + /* + * Only force scan within one VMA at a time, to limit the + * cost of scanning a potentially uninteresting VMA. + */ + if (vma_pids_forced) + break; + } + + /* + * If no VMAs are remaining and VMAs were skipped due to the PID + * not accessing the VMA previously, then force a scan to ensure + * forward progress: + */ + if (!vma && !vma_pids_forced && vma_pids_skipped) { + vma_pids_forced = true; + goto retry_pids; } out: @@ -2867,9 +3550,12 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) p->node_stamp = 0; p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_migrate_retry = 0; /* Protect against double add, see task_tick_numa and task_numa_work */ p->numa_work.next = &p->numa_work; p->numa_faults = NULL; + p->numa_pages_migrated = 0; + p->total_numa_faults = 0; RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; @@ -2907,7 +3593,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) /* * We don't care about NUMA placement if we don't have memory. */ - if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* @@ -2925,7 +3611,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) - task_work_add(curr, work, true); + task_work_add(curr, work, TWA_RESUME); } } @@ -2994,7 +3680,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &rq->cfs_tasks); } #endif - cfs_rq->nr_running++; + cfs_rq->nr_queued++; } static void @@ -3007,7 +3693,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } #endif - cfs_rq->nr_running--; + cfs_rq->nr_queued--; } /* @@ -3071,6 +3757,9 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } #else static inline void @@ -3079,44 +3768,74 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif +static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { + bool curr = cfs_rq->curr == se; + if (se->on_rq) { /* commit outstanding execution time */ - if (cfs_rq->curr == se) - update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); + update_curr(cfs_rq); + update_entity_lag(cfs_rq, se); + se->deadline -= se->vruntime; + se->rel_deadline = 1; + if (!curr) + __dequeue_entity(cfs_rq, se); + update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); + /* + * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), + * we need to scale se->vlag when w_i changes. + */ + se->vlag = div_s64(se->vlag * se->load.weight, weight); + if (se->rel_deadline) + se->deadline = div_s64(se->deadline * se->load.weight, weight); + update_load_set(&se->load, weight); #ifdef CONFIG_SMP do { - u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); #endif enqueue_load_avg(cfs_rq, se); - if (se->on_rq) - account_entity_enqueue(cfs_rq, se); + if (se->on_rq) { + update_load_add(&cfs_rq->load, se->load.weight); + place_entity(cfs_rq, se, 0); + if (!curr) + __enqueue_entity(cfs_rq, se); + /* + * The entity's vruntime has been adjusted, so let's check + * whether the rq-wide min_vruntime needs updated too. Since + * the calculations above require stable min_vruntime rather + * than up-to-date one, we do the update at the end of the + * reweight process. + */ + update_min_vruntime(cfs_rq); + } } -void reweight_task(struct task_struct *p, int prio) +static void reweight_task_fair(struct rq *rq, struct task_struct *p, + const struct load_weight *lw) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct load_weight *load = &se->load; - unsigned long weight = scale_load(sched_prio_to_weight[prio]); - reweight_entity(cfs_rq, se, weight); - load->inv_weight = sched_prio_to_wmult[prio]; + reweight_entity(cfs_rq, se, lw->weight); + load->inv_weight = lw->inv_weight; } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP /* @@ -3128,7 +3847,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (1) - * \Sum grq->load.weight + * \Sum grq->load.weight * * Now, because computing that sum is prohibitively expensive to compute (been * there, done that) we approximate it with this average stuff. The average @@ -3142,7 +3861,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->avg.load_avg * ge->load.weight = ------------------------------ (3) - * tg->load_avg + * tg->load_avg * * Where: tg->load_avg ~= \Sum grq->avg.load_avg * @@ -3158,7 +3877,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- = tg->weight (4) - * grp->load.weight + * grp->load.weight * * That is, the sum collapses because all other CPUs are idle; the UP scenario. * @@ -3177,7 +3896,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (6) - * tg_load_avg' + * tg_load_avg' * * Where: * @@ -3227,8 +3946,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); - /* * Recomputes the group entity based on the current state of its group * runqueue. @@ -3238,7 +3955,11 @@ static void update_cfs_group(struct sched_entity *se) struct cfs_rq *gcfs_rq = group_cfs_rq(se); long shares; - if (!gcfs_rq) + /* + * When a group becomes empty, preserve its weight. This matters for + * DELAY_DEQUEUE. + */ + if (!gcfs_rq || !gcfs_rq->load.weight) return; if (throttled_hierarchy(gcfs_rq)) @@ -3246,14 +3967,11 @@ static void update_cfs_group(struct sched_entity *se) #ifndef CONFIG_SMP shares = READ_ONCE(gcfs_rq->tg->shares); - - if (likely(se->load.weight == shares)) - return; #else - shares = calc_group_shares(gcfs_rq); + shares = calc_group_shares(gcfs_rq); #endif - - reweight_entity(cfs_rq_of(se), se, shares); + if (unlikely(se->load.weight != shares)) + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -3279,18 +3997,87 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) * As is, the util number is not freq-invariant (we'd have to * implement arch_scale_freq_capacity() for that). * - * See cpu_util(). + * See cpu_util_cfs(). */ cpufreq_update_util(rq, flags); } } #ifdef CONFIG_SMP +static inline bool load_avg_is_decayed(struct sched_avg *sa) +{ + if (sa->load_sum) + return false; + + if (sa->util_sum) + return false; + + if (sa->runnable_sum) + return false; + + /* + * _avg must be null when _sum are null because _avg = _sum / divider + * Make sure that rounding and/or propagation of PELT values never + * break this. + */ + SCHED_WARN_ON(sa->load_avg || + sa->util_avg || + sa->runnable_avg); + + return true; +} + +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return u64_u32_load_copy(cfs_rq->avg.last_update_time, + cfs_rq->last_update_time_copy); +} #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list + * immediately before a parent cfs_rq, and cfs_rqs are removed from the list + * bottom-up, we only have to test whether the cfs_rq before us on the list + * is our child. + * If cfs_rq is not on the list, test whether a child needs its to be added to + * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details). + */ +static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) +{ + struct cfs_rq *prev_cfs_rq; + struct list_head *prev; + struct rq *rq = rq_of(cfs_rq); + + if (cfs_rq->on_list) { + prev = cfs_rq->leaf_cfs_rq_list.prev; + } else { + prev = rq->tmp_alone_branch; + } + + if (prev == &rq->leaf_cfs_rq_list) + return false; + + prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); + + return (prev_cfs_rq->tg->parent == cfs_rq->tg); +} + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (!load_avg_is_decayed(&cfs_rq->avg)) + return false; + + if (child_cfs_rq_on_list(cfs_rq)) + return false; + + return true; +} + /** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed - * @force: update regardless of how small the difference * * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. * However, because tg->load_avg is a global value there are performance @@ -3302,9 +4089,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) * * Updating tg's load_avg is necessary before update_cfs_share(). */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { - long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + long delta; + u64 now; /* * No need to update load_avg for root_task_group as it is not used. @@ -3312,12 +4100,69 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) if (cfs_rq->tg == &root_task_group) return; - if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + /* rq has been offline and doesn't contribute to the share anymore: */ + if (!cpu_active(cpu_of(rq_of(cfs_rq)))) + return; + + /* + * For migration heavy workloads, access to tg->load_avg can be + * unbound. Limit the update rate to at most once per ms. + */ + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC) + return; + + delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; + cfs_rq->last_update_tg_load_avg = now; } } +static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq) +{ + long delta; + u64 now; + + /* + * No need to update load_avg for root_task_group, as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + delta = 0 - cfs_rq->tg_load_avg_contrib; + atomic_long_add(delta, &cfs_rq->tg->load_avg); + cfs_rq->tg_load_avg_contrib = 0; + cfs_rq->last_update_tg_load_avg = now; +} + +/* CPU offline callback: */ +static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) +{ + struct task_group *tg; + + lockdep_assert_rq_held(rq); + + /* + * The rq clock has already been updated in + * set_rq_offline(), so we should skip updating + * the rq clock again in unthrottle_cfs_rq(). + */ + rq_clock_start_loop_update(rq); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + clear_tg_load_avg(cfs_rq); + } + rcu_read_unlock(); + + rq_clock_stop_loop_update(rq); +} + /* * Called within set_task_rq() right before setting a task's CPU. The * caller only guarantees p->pi_lock is held; no other assumptions, @@ -3342,32 +4187,13 @@ void set_task_rq_fair(struct sched_entity *se, if (!(se->avg.last_update_time && prev)) return; -#ifndef CONFIG_64BIT - { - u64 p_last_update_time_copy; - u64 n_last_update_time_copy; - - do { - p_last_update_time_copy = prev->load_last_update_time_copy; - n_last_update_time_copy = next->load_last_update_time_copy; - - smp_rmb(); - - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; + p_last_update_time = cfs_rq_last_update_time(prev); + n_last_update_time = cfs_rq_last_update_time(next); - } while (p_last_update_time != p_last_update_time_copy || - n_last_update_time != n_last_update_time_copy); - } -#else - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; -#endif __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time = n_last_update_time; } - /* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant @@ -3435,51 +4261,66 @@ void set_task_rq_fair(struct sched_entity *se, * XXX: only do this for the part of runnable > running ? * */ - static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 new_sum, divider; + + /* Nothing to update */ + if (!delta_avg) + return; + /* * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details. */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + divider = get_pelt_divider(&cfs_rq->avg); - /* Nothing to update */ - if (!delta) - return; /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * divider; + new_sum = se->avg.util_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.util_sum; + se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + add_positive(&cfs_rq->avg.util_avg, delta_avg); + add_positive(&cfs_rq->avg.util_sum, delta_sum); + + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 new_sum, divider; + + /* Nothing to update */ + if (!delta_avg) + return; + /* * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details. */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; - - /* Nothing to update */ - if (!delta) - return; + divider = get_pelt_divider(&cfs_rq->avg); /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * divider; + new_sum = se->avg.runnable_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.runnable_sum; + se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + add_positive(&cfs_rq->avg.runnable_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); } static inline void @@ -3500,7 +4341,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details. */ - divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + divider = get_pelt_divider(&cfs_rq->avg); if (runnable_sum >= 0) { /* @@ -3515,7 +4356,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * assuming all tasks are equally runnable. */ if (scale_load_down(gcfs_rq->load.weight)) { - load_sum = div_s64(gcfs_rq->avg.load_sum, + load_sum = div_u64(gcfs_rq->avg.load_sum, scale_load_down(gcfs_rq->load.weight)); } @@ -3532,16 +4373,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); - load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, divider); + load_sum = se_weight(se) * runnable_sum; + load_avg = div_u64(load_sum, divider); - delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; delta_avg = load_avg - se->avg.load_avg; + if (!delta_avg) + return; + + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; add_positive(&cfs_rq->avg.load_avg, delta_avg); add_positive(&cfs_rq->avg.load_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3610,7 +4457,9 @@ static inline bool skip_blocked_update(struct sched_entity *se) #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} + +static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {} static inline int propagate_entity_load_avg(struct sched_entity *se) { @@ -3621,18 +4470,100 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_NO_HZ_COMMON +static inline void migrate_se_pelt_lag(struct sched_entity *se) +{ + u64 throttled = 0, now, lut; + struct cfs_rq *cfs_rq; + struct rq *rq; + bool is_idle; + + if (load_avg_is_decayed(&se->avg)) + return; + + cfs_rq = cfs_rq_of(se); + rq = rq_of(cfs_rq); + + rcu_read_lock(); + is_idle = is_idle_task(rcu_dereference(rq->curr)); + rcu_read_unlock(); + + /* + * The lag estimation comes with a cost we don't want to pay all the + * time. Hence, limiting to the case where the source CPU is idle and + * we know we are at the greatest risk to have an outdated clock. + */ + if (!is_idle) + return; + + /* + * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where: + * + * last_update_time (the cfs_rq's last_update_time) + * = cfs_rq_clock_pelt()@cfs_rq_idle + * = rq_clock_pelt()@cfs_rq_idle + * - cfs->throttled_clock_pelt_time@cfs_rq_idle + * + * cfs_idle_lag (delta between rq's update and cfs_rq's update) + * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle + * + * rq_idle_lag (delta between now and rq's update) + * = sched_clock_cpu() - rq_clock()@rq_idle + * + * We can then write: + * + * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time + + * sched_clock_cpu() - rq_clock()@rq_idle + * Where: + * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle + * rq_clock()@rq_idle is rq->clock_idle + * cfs->throttled_clock_pelt_time@cfs_rq_idle + * is cfs_rq->throttled_pelt_idle + */ + +#ifdef CONFIG_CFS_BANDWIDTH + throttled = u64_u32_load(cfs_rq->throttled_pelt_idle); + /* The clock has been stopped for throttling */ + if (throttled == U64_MAX) + return; +#endif + now = u64_u32_load(rq->clock_pelt_idle); + /* + * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case + * is observed the old clock_pelt_idle value and the new clock_idle, + * which lead to an underestimation. The opposite would lead to an + * overestimation. + */ + smp_rmb(); + lut = cfs_rq_last_update_time(cfs_rq); + + now -= throttled; + if (now < lut) + /* + * cfs_rq->avg.last_update_time is more recent than our + * estimation, let's use it. + */ + now = lut; + else + now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle); + + __update_load_avg_blocked_se(now, se); +} +#else +static void migrate_se_pelt_lag(struct sched_entity *se) {} +#endif + /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_pelt() * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) - * avg. The immediate corollary is that all (fair) tasks must be attached, see - * post_init_entity_util_avg(). + * avg. The immediate corollary is that all (fair) tasks must be attached. * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed load. + * Return: true if the load decayed or we removed load. * * Since both these conditions indicate a changed cfs_rq->avg.load we should * call update_tg_load_avg() when this function returns true. @@ -3646,7 +4577,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq->removed.nr) { unsigned long r; - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); @@ -3658,14 +4589,31 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_load; sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * divider); + /* See sa->util_sum below */ + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); r = removed_util; sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * divider); + /* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); r = removed_runnable; sub_positive(&sa->runnable_avg, r); sub_positive(&sa->runnable_sum, r * divider); + /* See sa->util_sum above */ + sa->runnable_sum = max_t(u32, sa->runnable_sum, + sa->runnable_avg * PELT_MIN_DIVIDER); /* * removed_runnable is the unweighted version of removed_load so we @@ -3678,12 +4626,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) } decayed |= __update_load_avg_cfs_rq(now, cfs_rq); - -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->load_last_update_time_copy = sa->last_update_time; -#endif - + u64_u32_store_copy(sa->last_update_time, + cfs_rq->last_update_time_copy, + sa->last_update_time); return decayed; } @@ -3701,7 +4646,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details. */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); /* * When we attach the @se to the @cfs_rq, we must align the decay @@ -3723,11 +4668,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s se->avg.runnable_sum = se->avg.runnable_avg * divider; - se->avg.load_sum = divider; - if (se_weight(se)) { - se->avg.load_sum = - div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); - } + se->avg.load_sum = se->avg.load_avg * divider; + if (se_weight(se) < se->avg.load_sum) + se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se)); + else + se->avg.load_sum = 1; enqueue_load_avg(cfs_rq, se); cfs_rq->avg.util_avg += se->avg.util_avg; @@ -3755,8 +4700,15 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -3771,6 +4723,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 #define DO_ATTACH 0x4 +#define DO_DETACH 0x8 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -3780,7 +4733,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * Track task load average for carrying it to new CPU after migrated, and - * track group sched_entity load average for task_h_load calc in migration + * track group sched_entity load average for task_h_load calculation in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) __update_load_avg_se(now, cfs_rq, se); @@ -3798,37 +4751,23 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * IOW we're enqueueing a task on a new CPU. */ attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); + } else if (flags & DO_DETACH) { + /* + * DO_DETACH means we're here from dequeue_entity() + * and we are migrating task out of the CPU. + */ + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq); } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); if (flags & UPDATE_TG) - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); } } -#ifndef CONFIG_64BIT -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - u64 last_update_time_copy; - u64 last_update_time; - - do { - last_update_time_copy = cfs_rq->load_last_update_time_copy; - smp_rmb(); - last_update_time = cfs_rq->avg.last_update_time; - } while (last_update_time != last_update_time_copy); - - return last_update_time; -} -#else -static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.last_update_time; -} -#endif - /* * Synchronize entity load avg of dequeued entity without locking * the previous rq. @@ -3853,8 +4792,8 @@ static void remove_entity_load_avg(struct sched_entity *se) /* * tasks cannot exit without having gone through wake_up_new_task() -> - * post_init_entity_util_avg() which will have added things to the - * cfs_rq, so we can remove unconditionally. + * enqueue_task_fair() which will have added things to the cfs_rq, + * so we can remove unconditionally. */ sync_entity_load_avg(se); @@ -3877,38 +4816,27 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); } -static inline unsigned long _task_util_est(struct task_struct *p) +static inline unsigned long task_runnable(struct task_struct *p) { - struct util_est ue = READ_ONCE(p->se.avg.util_est); - - return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED); + return READ_ONCE(p->se.avg.runnable_avg); } -static inline unsigned long task_util_est(struct task_struct *p) +static inline unsigned long _task_util_est(struct task_struct *p) { - return max(task_util(p), _task_util_est(p)); + return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED; } -#ifdef CONFIG_UCLAMP_TASK -static inline unsigned long uclamp_task_util(struct task_struct *p) -{ - return clamp(task_util_est(p), - uclamp_eff_value(p, UCLAMP_MIN), - uclamp_eff_value(p, UCLAMP_MAX)); -} -#else -static inline unsigned long uclamp_task_util(struct task_struct *p) +static inline unsigned long task_util_est(struct task_struct *p) { - return task_util_est(p); + return max(task_util(p), _task_util_est(p)); } -#endif static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) @@ -3919,38 +4847,39 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, return; /* Update root cfs_rq's estimated utilization */ - enqueued = cfs_rq->avg.util_est.enqueued; + enqueued = cfs_rq->avg.util_est; enqueued += _task_util_est(p); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); -} + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); -/* - * Check if a (signed) value is within a specified (unsigned) margin, - * based on the observation that: - * - * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) - * - * NOTE: this only works when value + maring < INT_MAX. - */ -static inline bool within_margin(int value, int margin) -{ - return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); + trace_sched_util_est_cfs_tp(cfs_rq); } -static void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, + struct task_struct *p) { - long last_ewma_diff; - struct util_est ue; - int cpu; + unsigned int enqueued; if (!sched_feat(UTIL_EST)) return; /* Update root cfs_rq's estimated utilization */ - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + enqueued = cfs_rq->avg.util_est; + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); +} + +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) + +static inline void util_est_update(struct cfs_rq *cfs_rq, + struct task_struct *p, + bool task_sleep) +{ + unsigned int ewma, dequeued, last_ewma_diff; + + if (!sched_feat(UTIL_EST)) + return; /* * Skip update of task's estimated utilization when the task has not @@ -3959,82 +4888,232 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) if (!task_sleep) return; + /* Get current estimate of utilization */ + ewma = READ_ONCE(p->se.avg.util_est); + /* * If the PELT values haven't changed since enqueue time, * skip the util_est update. */ - ue = p->se.avg.util_est; - if (ue.enqueued & UTIL_AVG_UNCHANGED) + if (ewma & UTIL_AVG_UNCHANGED) return; + /* Get utilization at dequeue */ + dequeued = task_util(p); + /* * Reset EWMA on utilization increases, the moving average is used only * to smooth utilization decreases. */ - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); - if (sched_feat(UTIL_EST_FASTUP)) { - if (ue.ewma < ue.enqueued) { - ue.ewma = ue.enqueued; - goto done; - } + if (ewma <= dequeued) { + ewma = dequeued; + goto done; } /* - * Skip update of task's estimated utilization when its EWMA is + * Skip update of task's estimated utilization when its members are * already ~1% close to its last activation value. */ - last_ewma_diff = ue.enqueued - ue.ewma; - if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) - return; + last_ewma_diff = ewma - dequeued; + if (last_ewma_diff < UTIL_EST_MARGIN) + goto done; /* * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - cpu = cpu_of(rq_of(cfs_rq)); - if (task_util(p) > capacity_orig_of(cpu)) + if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) return; /* + * To avoid underestimate of task utilization, skip updates of EWMA if + * we cannot grant that thread got all CPU time it wanted. + */ + if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) + goto done; + + + /* * Update Task's estimated utilization * * When *p completes an activation we can consolidate another sample - * of the task size. This is done by storing the current PELT value - * as ue.enqueued and by using this value to update the Exponential - * Weighted Moving Average (EWMA): + * of the task size. This is done by using this value to update the + * Exponential Weighted Moving Average (EWMA): * * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) - * = w * ( last_ewma_diff ) + ewma(t-1) - * = w * (last_ewma_diff + ewma(t-1) / w) + * = w * ( -last_ewma_diff ) + ewma(t-1) + * = w * (-last_ewma_diff + ewma(t-1) / w) * * Where 'w' is the weight of new samples, which is configured to be * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) */ - ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; - ue.ewma += last_ewma_diff; - ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; + ewma <<= UTIL_EST_WEIGHT_SHIFT; + ewma -= last_ewma_diff; + ewma >>= UTIL_EST_WEIGHT_SHIFT; done: - WRITE_ONCE(p->se.avg.util_est, ue); + ewma |= UTIL_AVG_UNCHANGED; + WRITE_ONCE(p->se.avg.util_est, ewma); + + trace_sched_util_est_se_tp(&p->se); } -static inline int task_fits_capacity(struct task_struct *p, long capacity) +static inline unsigned long get_actual_cpu_capacity(int cpu) { - return fits_capacity(uclamp_task_util(p), capacity); + unsigned long capacity = arch_scale_cpu_capacity(cpu); + + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu)); + + return capacity; +} + +static inline int util_fits_cpu(unsigned long util, + unsigned long uclamp_min, + unsigned long uclamp_max, + int cpu) +{ + unsigned long capacity = capacity_of(cpu); + unsigned long capacity_orig; + bool fits, uclamp_max_fits; + + /* + * Check if the real util fits without any uclamp boost/cap applied. + */ + fits = fits_capacity(util, capacity); + + if (!uclamp_is_used()) + return fits; + + /* + * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and + * uclamp_max. We only care about capacity pressure (by using + * capacity_of()) for comparing against the real util. + * + * If a task is boosted to 1024 for example, we don't want a tiny + * pressure to skew the check whether it fits a CPU or not. + * + * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it + * should fit a little cpu even if there's some pressure. + * + * Only exception is for HW or cpufreq pressure since it has a direct impact + * on available OPP of the system. + * + * We honour it for uclamp_min only as a drop in performance level + * could result in not getting the requested minimum performance level. + * + * For uclamp_max, we can tolerate a drop in performance level as the + * goal is to cap the task. So it's okay if it's getting less. + */ + capacity_orig = arch_scale_cpu_capacity(cpu); + + /* + * We want to force a task to fit a cpu as implied by uclamp_max. + * But we do have some corner cases to cater for.. + * + * + * C=z + * | ___ + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | + * | | | | | | | (util somewhere in this region) + * | | | | | | | + * | | | | | | | + * +---------------------------------------- + * CPU0 CPU1 CPU2 + * + * In the above example if a task is capped to a specific performance + * point, y, then when: + * + * * util = 80% of x then it does not fit on CPU0 and should migrate + * to CPU1 + * * util = 80% of y then it is forced to fit on CPU1 to honour + * uclamp_max request. + * + * which is what we're enforcing here. A task always fits if + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig, + * the normal upmigration rules should withhold still. + * + * Only exception is when we are on max capacity, then we need to be + * careful not to block overutilized state. This is so because: + * + * 1. There's no concept of capping at max_capacity! We can't go + * beyond this performance level anyway. + * 2. The system is being saturated when we're operating near + * max capacity, it doesn't make sense to block overutilized. + */ + uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE); + uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig); + fits = fits || uclamp_max_fits; + + /* + * + * C=z + * | ___ (region a, capped, util >= uclamp_max) + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max) + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min + * | | | | | | | + * | | | | | | | (region c, boosted, util < uclamp_min) + * +---------------------------------------- + * CPU0 CPU1 CPU2 + * + * a) If util > uclamp_max, then we're capped, we don't care about + * actual fitness value here. We only care if uclamp_max fits + * capacity without taking margin/pressure into account. + * See comment above. + * + * b) If uclamp_min <= util <= uclamp_max, then the normal + * fits_capacity() rules apply. Except we need to ensure that we + * enforce we remain within uclamp_max, see comment above. + * + * c) If util < uclamp_min, then we are boosted. Same as (b) but we + * need to take into account the boosted value fits the CPU without + * taking margin/pressure into account. + * + * Cases (a) and (b) are handled in the 'fits' variable already. We + * just need to consider an extra check for case (c) after ensuring we + * handle the case uclamp_min > uclamp_max. + */ + uclamp_min = min(uclamp_min, uclamp_max); + if (fits && (util < uclamp_min) && + (uclamp_min > get_actual_cpu_capacity(cpu))) + return -1; + + return fits; +} + +static inline int task_fits_cpu(struct task_struct *p, int cpu) +{ + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); + unsigned long util = task_util_est(p); + /* + * Return true only if the cpu fully fits the task requirements, which + * include the utilization but also the performance hints. + */ + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { - if (!static_branch_unlikely(&sched_asym_cpucapacity)) - return; + int cpu = cpu_of(rq); - if (!p) { - rq->misfit_task_load = 0; + if (!sched_asym_cpucap_active()) return; - } - if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + /* + * Affinity allows us to go somewhere higher? Or are we on biggest + * available CPU already? Or do we fit into this CPU ? + */ + if (!p || (p->nr_cpus_allowed == 1) || + (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) || + task_fits_cpu(p, cpu)) { + rq->misfit_task_load = 0; return; } @@ -4048,9 +5127,15 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) #else /* CONFIG_SMP */ +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + return !cfs_rq->nr_queued; +} + #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 +#define DO_DETACH 0x0 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { @@ -4064,7 +5149,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int newidle_balance(struct rq *rq, struct rq_flags *rf) +static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -4073,178 +5158,210 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ -static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) { -#ifdef CONFIG_SCHED_DEBUG - s64 d = se->vruntime - cfs_rq->min_vruntime; - - if (d < 0) - d = -d; + struct sched_entity *se = &p->se; - if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq->nr_spread_over); -#endif + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + if (attr->sched_runtime) { + se->custom_slice = 1; + se->slice = clamp_t(u64, attr->sched_runtime, + NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ + NSEC_PER_MSEC*100); /* HZ=100 / 10 */ + } else { + se->custom_slice = 0; + se->slice = sysctl_sched_base_slice; + } } static void -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vruntime = cfs_rq->min_vruntime; + u64 vslice, vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; + + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. + * Due to how V is constructed as the weighted average of entities, + * adding tasks with positive lag, or removing tasks with negative lag + * will move 'time' backwards, this can screw around with the lag of + * other tasks. + * + * EEVDF: placement strategy #1 / #2 */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice(cfs_rq, se); + if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { + struct sched_entity *curr = cfs_rq->curr; + unsigned long load; - /* sleeps up to a single latency don't count. */ - if (!initial) { - unsigned long thresh = sysctl_sched_latency; + lag = se->vlag; /* - * Halve their sleep time's effect, to allow - * for a gentler effect of sleepers: + * If we want to place a task and preserve lag, we have to + * consider the effect of the new entity on the weighted + * average and compensate for this, otherwise lag can quickly + * evaporate. + * + * Lag is defined as: + * + * lag_i = S - s_i = w_i * (V - v_i) + * + * To avoid the 'w_i' term all over the place, we only track + * the virtual lag: + * + * vl_i = V - v_i <=> v_i = V - vl_i + * + * And we take V to be the weighted average of all v: + * + * V = (\Sum w_j*v_j) / W + * + * Where W is: \Sum w_j + * + * Then, the weighted average after adding an entity with lag + * vl_i is given by: + * + * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) + * = (W*V + w_i*(V - vl_i)) / (W + w_i) + * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) + * = (V*(W + w_i) - w_i*l) / (W + w_i) + * = V - w_i*vl_i / (W + w_i) + * + * And the actual lag after adding an entity with vl_i is: + * + * vl'_i = V' - v_i + * = V - w_i*vl_i / (W + w_i) - (V - vl_i) + * = vl_i - w_i*vl_i / (W + w_i) + * + * Which is strictly less than vl_i. So in order to preserve lag + * we should inflate the lag before placement such that the + * effective lag after placement comes out right. + * + * As such, invert the above relation for vl'_i to get the vl_i + * we need to use such that the lag after placement is the lag + * we computed before dequeue. + * + * vl'_i = vl_i - w_i*vl_i / (W + w_i) + * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i) + * + * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i + * = W*vl_i + * + * vl_i = (W + w_i)*vl'_i / W */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; + load = cfs_rq->avg_load; + if (curr && curr->on_rq) + load += scale_load_down(curr->load.weight); - vruntime -= thresh; + lag *= load + scale_load_down(se->load.weight); + if (WARN_ON_ONCE(!load)) + load = 1; + lag = div_s64(lag, load); } - /* ensure we never gain time by being placed backwards. */ - se->vruntime = max_vruntime(se->vruntime, vruntime); -} - -static void check_enqueue_throttle(struct cfs_rq *cfs_rq); + se->vruntime = vruntime - lag; -static inline void check_schedstat_required(void) -{ -#ifdef CONFIG_SCHEDSTATS - if (schedstat_enabled()) + if (se->rel_deadline) { + se->deadline += se->vruntime; + se->rel_deadline = 0; return; - - /* Force schedstat enabled if a dependent tracepoint is active */ - if (trace_sched_stat_wait_enabled() || - trace_sched_stat_sleep_enabled() || - trace_sched_stat_iowait_enabled() || - trace_sched_stat_blocked_enabled() || - trace_sched_stat_runtime_enabled()) { - printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " - "stat_blocked and stat_runtime require the " - "kernel parameter schedstats=enable or " - "kernel.sched_schedstats=1\n"); } -#endif + + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks + * off with half a slice to ease into the competition. + */ + if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) + vslice /= 2; + + /* + * EEVDF: vd_i = ve_i + r_i/w_i + */ + se->deadline = se->vruntime + vslice; } -static inline bool cfs_bandwidth_used(void); +static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); -/* - * MIGRATION - * - * dequeue - * update_curr() - * update_min_vruntime() - * vruntime -= min_vruntime - * - * enqueue - * update_curr() - * update_min_vruntime() - * vruntime += min_vruntime - * - * this way the vruntime transition between RQs is done when both - * min_vruntime are up-to-date. - * - * WAKEUP (remote) - * - * ->migrate_task_rq_fair() (p->state == TASK_WAKING) - * vruntime -= min_vruntime - * - * enqueue - * update_curr() - * update_min_vruntime() - * vruntime += min_vruntime - * - * this way we don't have the most up-to-date min_vruntime on the originating - * CPU and an up-to-date min_vruntime on the destination CPU. - */ +static void +requeue_delayed_entity(struct sched_entity *se); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); bool curr = cfs_rq->curr == se; /* * If we're the current task, we must renormalise before calling * update_curr(). */ - if (renorm && curr) - se->vruntime += cfs_rq->min_vruntime; + if (curr) + place_entity(cfs_rq, se, flags); update_curr(cfs_rq); /* - * Otherwise, renormalise after, such that we're placed at the current - * moment in time, instead of some random moment in the past. Being - * placed in the past could significantly boost this task to the - * fairness detriment of existing tasks. - */ - if (renorm && !curr) - se->vruntime += cfs_rq->min_vruntime; - - /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_runnable of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); se_update_runnable(se); + /* + * XXX update_load_avg() above will have attached us to the pelt sum; + * but update_cfs_group() here will re-adjust the weight and have to + * undo/redo all that. Seems wasteful. + */ update_cfs_group(se); + + /* + * XXX now that the entity has been re-weighted, and it's lag adjusted, + * we can place the entity. + */ + if (!curr) + place_entity(cfs_rq, se, flags); + account_entity_enqueue(cfs_rq, se); - if (flags & ENQUEUE_WAKEUP) - place_entity(cfs_rq, se, 0); + /* Entity has migrated, no longer consider this task hot */ + if (flags & ENQUEUE_MIGRATED) + se->exec_start = 0; check_schedstat_required(); - update_stats_enqueue(cfs_rq, se, flags); - check_spread(cfs_rq, se); + update_stats_enqueue_fair(cfs_rq, se, flags); if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - /* - * When bandwidth control is enabled, cfs might have been removed - * because of a parent been throttled but cfs->nr_running > 1. Try to - * add it unconditionnally. - */ - if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) - list_add_leaf_cfs_rq(cfs_rq); - - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); -} - -static void __clear_buddies_last(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last != se) - break; + if (!throttled_hierarchy(cfs_rq)) { + list_add_leaf_cfs_rq(cfs_rq); + } else { +#ifdef CONFIG_CFS_BANDWIDTH + struct rq *rq = rq_of(cfs_rq); - cfs_rq->last = NULL; + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +#endif + } } } @@ -4259,68 +5376,122 @@ static void __clear_buddies_next(struct sched_entity *se) } } -static void __clear_buddies_skip(struct sched_entity *se) +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { + if (cfs_rq->next == se) + __clear_buddies_next(se); +} + +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +static void set_delayed(struct sched_entity *se) +{ + se->sched_delayed = 1; + + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since dequeue_entities() + * will account it for blocked tasks. + */ + if (!entity_is_task(se)) + return; + for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip != se) - break; - cfs_rq->skip = NULL; + cfs_rq->h_nr_runnable--; + if (cfs_rq_throttled(cfs_rq)) + break; } } -static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void clear_delayed(struct sched_entity *se) { - if (cfs_rq->last == se) - __clear_buddies_last(se); + se->sched_delayed = 0; - if (cfs_rq->next == se) - __clear_buddies_next(se); + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since a dequeue has + * already accounted for it or an enqueue of a task + * below it will account for it in enqueue_task_fair(). + */ + if (!entity_is_task(se)) + return; - if (cfs_rq->skip == se) - __clear_buddies_skip(se); + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + cfs_rq->h_nr_runnable++; + if (cfs_rq_throttled(cfs_rq)) + break; + } } -static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static inline void finish_delayed_dequeue_entity(struct sched_entity *se) +{ + clear_delayed(se); + if (sched_feat(DELAY_ZERO) && se->vlag > 0) + se->vlag = 0; +} -static void +static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - /* - * Update run-time statistics of the 'current'. - */ + bool sleep = flags & DEQUEUE_SLEEP; + int action = UPDATE_TG; + update_curr(cfs_rq); + clear_buddies(cfs_rq, se); + + if (flags & DEQUEUE_DELAYED) { + SCHED_WARN_ON(!se->sched_delayed); + } else { + bool delay = sleep; + /* + * DELAY_DEQUEUE relies on spurious wakeups, special task + * states must not suffer spurious wakeups, excempt them. + */ + if (flags & DEQUEUE_SPECIAL) + delay = false; + + SCHED_WARN_ON(delay && se->sched_delayed); + + if (sched_feat(DELAY_DEQUEUE) && delay && + !entity_eligible(cfs_rq, se)) { + update_load_avg(cfs_rq, se, 0); + set_delayed(se); + return false; + } + } + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Subtract its load from the cfs_rq->runnable_avg. + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_runnable of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(cfs_rq, se, action); se_update_runnable(se); - update_stats_dequeue(cfs_rq, se, flags); + update_stats_dequeue_fair(cfs_rq, se, flags); - clear_buddies(cfs_rq, se); + update_entity_lag(cfs_rq, se); + if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { + se->deadline -= se->vruntime; + se->rel_deadline = 1; + } if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; account_entity_dequeue(cfs_rq, se); - /* - * Normalize after update_curr(); which will also have moved - * min_vruntime if @se is the one holding it back. But before doing - * update_min_vruntime() again, which will discount @se's position and - * can move min_vruntime forward still more. - */ - if (!(flags & DEQUEUE_SLEEP)) - se->vruntime -= cfs_rq->min_vruntime; - /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); @@ -4330,55 +5501,25 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Now advance min_vruntime if @se was the entity holding it back, * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- ie. we'll be penalized. + * further than we started -- i.e. we'll be penalized. */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); -} -/* - * Preempt the current task with a newly woken task if needed: - */ -static void -check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - unsigned long ideal_runtime, delta_exec; - struct sched_entity *se; - s64 delta; + if (flags & DEQUEUE_DELAYED) + finish_delayed_dequeue_entity(se); - ideal_runtime = sched_slice(cfs_rq, curr); - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); - /* - * The current task ran long enough, ensure it doesn't get - * re-elected due to buddy favours. - */ - clear_buddies(cfs_rq, curr); - return; - } - - /* - * Ensure that a task that missed wakeup preemption by a - * narrow margin doesn't have to wait for a full slice. - * This also mitigates buddy induced latencies under load. - */ - if (delta_exec < sysctl_sched_min_granularity) - return; + if (cfs_rq->nr_queued == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); - se = __pick_first_entity(cfs_rq); - delta = curr->vruntime - se->vruntime; - - if (delta < 0) - return; - - if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); + return true; } static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { + clear_buddies(cfs_rq, se); + /* 'current' is not kept within the tree. */ if (se->on_rq) { /* @@ -4386,31 +5527,39 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); + /* + * HACK, stash a copy of deadline at the point of pick in vlag, + * which isn't used until dequeue. + */ + se->vlag = se->deadline; } update_stats_curr_start(cfs_rq, se); + SCHED_WARN_ON(cfs_rq->curr); cfs_rq->curr = se; /* * Track our maximum slice length, if the CPU's load is at - * least twice that of our own weight (i.e. dont track it + * least twice that of our own weight (i.e. don't track it * when there are only lesser-weight tasks around): */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, - max((u64)schedstat_val(se->statistics.slice_max), - se->sum_exec_runtime - se->prev_sum_exec_runtime)); + struct sched_statistics *stats; + + stats = __schedstats_from_se(se); + __schedstat_set(stats->slice_max, + max((u64)stats->slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime)); } se->prev_sum_exec_runtime = se->sum_exec_runtime; } -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); /* * Pick the next process, keeping these things in mind, in this order: @@ -4420,53 +5569,28 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * -pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { - struct sched_entity *left = __pick_first_entity(cfs_rq); struct sched_entity *se; /* - * If curr is set we have to see if its left of the leftmost entity - * still in the tree, provided there was anything in the tree at all. - */ - if (!left || (curr && entity_before(curr, left))) - left = curr; - - se = left; /* ideally we run the leftmost entity */ - - /* - * Avoid running the skip buddy, if running something else can - * be done without getting too unfair. + * Picking the ->next buddy will affect latency but not fairness. */ - if (cfs_rq->skip == se) { - struct sched_entity *second; - - if (se == curr) { - second = __pick_first_entity(cfs_rq); - } else { - second = __pick_next_entity(se); - if (!second || (curr && entity_before(curr, second))) - second = curr; - } - - if (second && wakeup_preempt_entity(second, left) < 1) - se = second; + if (sched_feat(PICK_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { + /* ->next will never be delayed */ + SCHED_WARN_ON(cfs_rq->next->sched_delayed); + return cfs_rq->next; } - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) - se = cfs_rq->last; - - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) - se = cfs_rq->next; - - clear_buddies(cfs_rq, se); - + se = pick_eevdf(cfs_rq); + if (se->sched_delayed) { + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + /* + * Must not reference @se again, see __block_task(). + */ + return NULL; + } return se; } @@ -4484,15 +5608,14 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); - if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); + update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ update_load_avg(cfs_rq, prev, 0); } + SCHED_WARN_ON(cfs_rq->curr != prev); cfs_rq->curr = NULL; } @@ -4516,19 +5639,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } - /* - * don't let the period tick interfere with the hrtick preemption - */ - if (!sched_feat(DOUBLE_TICK) && - hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) - return; #endif - - if (cfs_rq->nr_running > 1) - check_preempt_tick(cfs_rq, curr); } @@ -4588,8 +5702,20 @@ static inline u64 sched_cfs_bandwidth_slice(void) */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - if (cfs_b->quota != RUNTIME_INF) - cfs_b->runtime = cfs_b->quota; + s64 runtime; + + if (unlikely(cfs_b->quota == RUNTIME_INF)) + return; + + cfs_b->runtime += cfs_b->quota; + runtime = cfs_b->runtime_snap - cfs_b->runtime; + if (runtime > 0) { + cfs_b->burst_time += runtime; + cfs_b->nr_burst++; + } + + cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst); + cfs_b->runtime_snap = cfs_b->runtime; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4700,12 +5826,23 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; - /* Add cfs_rq with already running entity in the list */ - if (cfs_rq->nr_running >= 1) + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); + + if (cfs_rq->throttled_clock_self) { + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; + + cfs_rq->throttled_clock_self = 0; + + if (SCHED_WARN_ON((s64)delta < 0)) + delta = 0; + + cfs_rq->throttled_clock_self_time += delta; + } } return 0; @@ -4718,8 +5855,12 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); + + SCHED_WARN_ON(cfs_rq->throttled_clock_self); + if (cfs_rq->nr_queued) + cfs_rq->throttled_clock_self = rq_clock(rq); } cfs_rq->throttle_count++; @@ -4731,7 +5872,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, dequeue = 1; + long queued_delta, runnable_delta, idle_delta, dequeue = 1; + long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -4761,37 +5903,73 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); + int flags; + /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) - break; + goto done; - if (dequeue) { - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - } else { - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); + /* + * Abuse SPECIAL to avoid delayed dequeue in this instance. + * This avoids teaching dequeue_entities() about throttled + * entities and keeps things relatively simple. + */ + flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; + if (se->sched_delayed) + flags |= DEQUEUE_DELAYED; + dequeue_entity(qcfs_rq, se, flags); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_delta = cfs_rq->h_nr_queued; + + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; + + if (qcfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + break; } + } + + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); - if (qcfs_rq->load.weight) - dequeue = 0; + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_delta = cfs_rq->h_nr_queued; + + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; } - if (!se) - sub_nr_running(rq, task_delta); + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, queued_delta); + /* Stop the fair server if throttling resulted in no runnable tasks */ + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) + dl_server_stop(&rq->fair_server); +done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); + SCHED_WARN_ON(cfs_rq->throttled_clock); + if (cfs_rq->nr_queued) + cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -4800,7 +5978,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta; + long queued_delta, runnable_delta, idle_delta; + long rq_h_nr_queued = rq->cfs.h_nr_queued; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4809,93 +5988,196 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + if (cfs_rq->throttled_clock) { + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + cfs_rq->throttled_clock = 0; + } list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); - if (!cfs_rq->load.weight) - return; + if (!cfs_rq->load.weight) { + if (!cfs_rq->on_list) + return; + /* + * Nothing to run but something to decay (on_list)? + * Complete the branch. + */ + for_each_sched_entity(se) { + if (list_add_leaf_cfs_rq(cfs_rq_of(se))) + break; + } + goto unthrottle_throttle; + } - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { - if (se->on_rq) + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + + /* Handle any unfinished DELAY_DEQUEUE business first. */ + if (se->sched_delayed) { + int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; + + dequeue_entity(qcfs_rq, se, flags); + } else if (se->on_rq) break; - cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_delta = cfs_rq->h_nr_queued; - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; } for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(qcfs_rq, se, UPDATE_TG); se_update_runnable(se); - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_delta = cfs_rq->h_nr_queued; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; - - /* - * One parent has been throttled and cfs_rq removed from the - * list. Add it back to not break the leaf list. - */ - if (throttled_hierarchy(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); } + /* Start the fair server if un-throttling resulted in new runnable tasks */ + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) + dl_server_start(&rq->fair_server); + /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, task_delta); + add_nr_running(rq, queued_delta); unthrottle_throttle: + assert_list_leaf_cfs_rq(rq); + + /* Determine whether we need to wake up potentially idle CPU: */ + if (rq->curr == rq->idle && rq->cfs.nr_queued) + resched_curr(rq); +} + +#ifdef CONFIG_SMP +static void __cfsb_csd_unthrottle(void *arg) +{ + struct cfs_rq *cursor, *tmp; + struct rq *rq = arg; + struct rq_flags rf; + + rq_lock(rq, &rf); + /* - * The cfs_rq_throttled() breaks in the above iteration can result in - * incomplete leaf list maintenance, resulting in triggering the - * assertion below. + * Iterating over the list can trigger several call to + * update_rq_clock() in unthrottle_cfs_rq(). + * Do it once and skip the potential next ones. */ - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + update_rq_clock(rq); + rq_clock_start_loop_update(rq); - if (list_add_leaf_cfs_rq(cfs_rq)) - break; + /* + * Since we hold rq lock we're safe from concurrent manipulation of + * the CSD list. However, this RCU critical section annotates the + * fact that we pair with sched_free_group_rcu(), so that we cannot + * race with group being freed in the window between removing it + * from the list and advancing to the next entry in the list. + */ + rcu_read_lock(); + + list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, + throttled_csd_list) { + list_del_init(&cursor->throttled_csd_list); + + if (cfs_rq_throttled(cursor)) + unthrottle_cfs_rq(cursor); } - assert_list_leaf_cfs_rq(rq); + rcu_read_unlock(); - /* Determine whether we need to wake up potentially idle CPU: */ - if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_curr(rq); + rq_clock_stop_loop_update(rq); + rq_unlock(rq, &rf); } -static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { - struct cfs_rq *cfs_rq; + struct rq *rq = rq_of(cfs_rq); + bool first; + + if (rq == this_rq()) { + unthrottle_cfs_rq(cfs_rq); + return; + } + + /* Already enqueued */ + if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) + return; + + first = list_empty(&rq->cfsb_csd_list); + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); + if (first) + smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); +} +#else +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + unthrottle_cfs_rq(cfs_rq); +} +#endif + +static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + lockdep_assert_rq_held(rq_of(cfs_rq)); + + if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || + cfs_rq->runtime_remaining <= 0)) + return; + + __unthrottle_cfs_rq_async(cfs_rq); +} + +static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +{ + int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; + bool throttled = false; + struct cfs_rq *cfs_rq, *tmp; + struct rq_flags rf; + struct rq *rq; + LIST_HEAD(local_unthrottle); rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { - struct rq *rq = rq_of(cfs_rq); - struct rq_flags rf; + rq = rq_of(cfs_rq); + + if (!remaining) { + throttled = true; + break; + } rq_lock_irqsave(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; - /* By the above check, this should never be true */ + /* Already queued for async unthrottle */ + if (!list_empty(&cfs_rq->throttled_csd_list)) + goto next; + + /* By the above checks, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); raw_spin_lock(&cfs_b->lock); @@ -4909,16 +6191,44 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) cfs_rq->runtime_remaining += runtime; /* we check whether we're throttled above */ - if (cfs_rq->runtime_remaining > 0) - unthrottle_cfs_rq(cfs_rq); + if (cfs_rq->runtime_remaining > 0) { + if (cpu_of(rq) != this_cpu) { + unthrottle_cfs_rq_async(cfs_rq); + } else { + /* + * We currently only expect to be unthrottling + * a single cfs_rq locally. + */ + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + list_add_tail(&cfs_rq->throttled_csd_list, + &local_unthrottle); + } + } else { + throttled = true; + } next: rq_unlock_irqrestore(rq, &rf); + } - if (!remaining) - break; + list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle, + throttled_csd_list) { + struct rq *rq = rq_of(cfs_rq); + + rq_lock_irqsave(rq, &rf); + + list_del_init(&cfs_rq->throttled_csd_list); + + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + + rq_unlock_irqrestore(rq, &rf); } + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + rcu_read_unlock(); + + return throttled; } /* @@ -4938,6 +6248,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u throttled = !list_empty(&cfs_b->throttled_cfs_rq); cfs_b->nr_periods += overrun; + /* Refill extra burst quota even if cfs_b->idle */ + __refill_cfs_bandwidth_runtime(cfs_b); + /* * idle depends on !throttled (for the case of a large deficit), and if * we're going inactive then everything else can be deferred @@ -4945,8 +6258,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u if (cfs_b->idle && !throttled) goto out_deactivate; - __refill_cfs_bandwidth_runtime(cfs_b); - if (!throttled) { /* mark as potentially idle for the upcoming period */ cfs_b->idle = 1; @@ -4962,10 +6273,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - distribute_cfs_runtime(cfs_b); + throttled = distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - - throttled = !list_empty(&cfs_b->throttled_cfs_rq); } /* @@ -4999,7 +6308,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; - u64 remaining; + s64 remaining; /* if the call-back is running a quota refresh is already occurring */ if (hrtimer_callback_running(refresh_timer)) @@ -5007,7 +6316,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) /* is a quota refresh about to occur? */ remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); - if (remaining < min_expire) + if (remaining < (s64)min_expire) return 1; return 0; @@ -5060,7 +6369,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued) return; __return_cfs_rq_runtime(cfs_rq); @@ -5093,15 +6402,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) return; distribute_cfs_runtime(cfs_b); - - raw_spin_lock_irqsave(&cfs_b->lock, flags); - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } /* * When a group wakes up we want to make sure that its quota is not already * expired/exceeded, otherwise it may be allowed to steal additional ticks of - * runtime as update_curr() throttling can not not trigger until it's on-rq. + * runtime as update_curr() throttling can not trigger until it's on-rq. */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { @@ -5136,7 +6442,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -5199,6 +6505,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (new < max_cfs_quota_period) { cfs_b->period = ns_to_ktime(new); cfs_b->quota *= 2; + cfs_b->burst *= 2; pr_warn_ratelimited( "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", @@ -5224,16 +6531,22 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) { raw_spin_lock_init(&cfs_b->lock); cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); + cfs_b->burst = 0; + cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); cfs_b->period_timer.function = sched_cfs_period_timer; + + /* Add a random offset so that timers interleave */ + hrtimer_set_expires(&cfs_b->period_timer, + get_random_u32_below(cfs_b->period)); hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; cfs_b->slack_started = false; @@ -5243,6 +6556,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); + INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -5259,12 +6573,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + int __maybe_unused i; + /* init_cfs_bandwidth() was not called */ if (!cfs_b->throttled_cfs_rq.next) return; hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); + + /* + * It is possible that we still have some cfs_rq's pending on a CSD + * list, though this race is very rare. In order for this to occur, we + * must have raced with the last task leaving the group while there + * exist throttled cfs_rq(s), and the period_timer must have queued the + * CSD item but the remote cpu has not yet processed it. To handle this, + * we can simply flush all pending CSD work inline here. We're + * guaranteed at this point that no additional cfs_rq of this group can + * join a CSD list. + */ +#ifdef CONFIG_SMP + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + if (list_empty(&rq->cfsb_csd_list)) + continue; + + local_irq_save(flags); + __cfsb_csd_unthrottle(rq); + local_irq_restore(flags); + } +#endif } /* @@ -5274,12 +6614,12 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * bits doesn't do much. */ -/* cpu online calback */ +/* cpu online callback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { struct task_group *tg; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -5298,7 +6638,18 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct task_group *tg; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); + + // Do not unthrottle for an active CPU + if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask)) + return; + + /* + * The rq clock has already been updated in the + * set_rq_offline(), so we should skip updating + * the rq clock again in unthrottle_cfs_rq(). + */ + rq_clock_start_loop_update(rq); rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { @@ -5308,29 +6659,68 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) continue; /* - * clock_task is not advancing so we just need to make sure - * there's some valid quota amount - */ - cfs_rq->runtime_remaining = 1; - /* * Offline rq is schedulable till CPU is completely disabled * in take_cpu_down(), so we prevent new cfs throttling here. */ cfs_rq->runtime_enabled = 0; - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); + if (!cfs_rq_throttled(cfs_rq)) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = 1; + unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); -} -#else /* CONFIG_CFS_BANDWIDTH */ + rq_clock_stop_loop_update(rq); +} -static inline bool cfs_bandwidth_used(void) +bool cfs_task_bw_constrained(struct task_struct *p) { + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + if (!cfs_bandwidth_used()) + return false; + + if (cfs_rq->runtime_enabled || + tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF) + return true; + return false; } +#ifdef CONFIG_NO_HZ_FULL +/* called from pick_next_task_fair() */ +static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) +{ + int cpu = cpu_of(rq); + + if (!cfs_bandwidth_used()) + return; + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (rq->nr_running != 1) + return; + + /* + * We know there is only one task runnable and we've just picked it. The + * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will + * be otherwise able to stop the tick. Just need to check if we are using + * bandwidth control. + */ + if (cfs_task_bw_constrained(p)) + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#endif + +#else /* CONFIG_CFS_BANDWIDTH */ + static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -5353,9 +6743,8 @@ static inline int throttled_lb_pair(struct task_group *tg, return 0; } -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} - #ifdef CONFIG_FAIR_GROUP_SCHED +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} #endif @@ -5366,9 +6755,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} - +#ifdef CONFIG_CGROUP_SCHED +bool cfs_task_bw_constrained(struct task_struct *p) +{ + return false; +} +#endif #endif /* CONFIG_CFS_BANDWIDTH */ +#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) +static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} +#endif + /************************************************** * CFS operations on tasks: */ @@ -5377,17 +6775,16 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); SCHED_WARN_ON(task_rq(p) != rq); - if (rq->cfs.h_nr_running > 1) { - u64 slice = sched_slice(cfs_rq, se); + if (rq->cfs.h_nr_queued > 1) { u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; + u64 slice = se->slice; s64 delta = slice - ran; if (delta < 0) { - if (rq->curr == p) + if (task_current_donor(rq, p)) resched_curr(rq); return; } @@ -5402,13 +6799,12 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) */ static void hrtick_update(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *donor = rq->donor; - if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) return; - if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, donor); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -5422,28 +6818,55 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP -static inline unsigned long cpu_util(int cpu); - static inline bool cpu_overutilized(int cpu) { - return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); + unsigned long rq_util_min, rq_util_max; + + if (!sched_energy_enabled()) + return false; + + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + + /* Return true only if the utilization doesn't fit CPU's capacity */ + return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); +} + +/* + * overutilized value make sense only if EAS is enabled + */ +static inline bool is_rd_overutilized(struct root_domain *rd) +{ + return !sched_energy_enabled() || READ_ONCE(rd->overutilized); } -static inline void update_overutilized_status(struct rq *rq) +static inline void set_rd_overutilized(struct root_domain *rd, bool flag) { - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { - WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); - } + if (!sched_energy_enabled()) + return; + + WRITE_ONCE(rd->overutilized, flag); + trace_sched_overutilized_tp(rd, flag); +} + +static inline void check_update_overutilized_status(struct rq *rq) +{ + /* + * overutilized field is used for load balancing decisions only + * if energy aware scheduler is being used + */ + + if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) + set_rd_overutilized(rq->rd, 1); } #else -static inline void update_overutilized_status(struct rq *rq) { } +static inline void check_update_overutilized_status(struct rq *rq) { } #endif /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) { - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + return unlikely(rq->nr_running == rq->cfs.h_nr_idle && rq->nr_running); } @@ -5454,6 +6877,37 @@ static int sched_idle_cpu(int cpu) } #endif +static void +requeue_delayed_entity(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * se->sched_delayed should imply: se->on_rq == 1. + * Because a delayed entity is one that is still on + * the runqueue competing until elegibility. + */ + SCHED_WARN_ON(!se->sched_delayed); + SCHED_WARN_ON(!se->on_rq); + + if (sched_feat(DELAY_ZERO)) { + update_entity_lag(cfs_rq, se); + if (se->vlag > 0) { + cfs_rq->nr_queued--; + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->vlag = 0; + place_entity(cfs_rq, se, 0); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; + } + } + + update_load_avg(cfs_rq, se, 0); + clear_delayed(se); +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5464,7 +6918,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int idle_h_nr_running = task_has_idle_policy(p); + int h_nr_idle = task_has_idle_policy(p); + int h_nr_runnable = 1; + int task_new = !(flags & ENQUEUE_WAKEUP); + int rq_h_nr_queued = rq->cfs.h_nr_queued; + u64 slice = 0; /* * The code below (indirectly) updates schedutil which looks at @@ -5472,7 +6930,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * Let's add the task's estimated utilization to the cfs_rq's * estimated utilization, before we update schedutil. */ - util_est_enqueue(&rq->cfs, p); + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) + util_est_enqueue(&rq->cfs, p); + + if (flags & ENQUEUE_DELAYED) { + requeue_delayed_entity(se); + return; + } /* * If in_iowait is set, the code below may not trigger any cpufreq @@ -5482,14 +6946,35 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); + if (task_new && se->sched_delayed) + h_nr_runnable = 0; + for_each_sched_entity(se) { - if (se->on_rq) + if (se->on_rq) { + if (se->sched_delayed) + requeue_delayed_entity(se); break; + } cfs_rq = cfs_rq_of(se); + + /* + * Basically set the slice of group entries to the min_slice of + * their respective cfs_rq. This ensures the group can service + * its entities in the desired time-frame. + */ + if (slice) { + se->slice = slice; + se->custom_slice = 1; + } enqueue_entity(cfs_rq, se, flags); + slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; + + if (cfs_rq_is_idle(cfs_rq)) + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -5505,19 +6990,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) se_update_runnable(se); update_cfs_group(se); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; + se->slice = slice; + slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; + + if (cfs_rq_is_idle(cfs_rq)) + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; + } - /* - * One parent has been throttled and cfs_rq removed from the - * list. Add it back to not break the leaf list. - */ - if (throttled_hierarchy(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { + /* Account for idle runtime */ + if (!rq->nr_running) + dl_server_update_idle_time(rq, rq->curr); + dl_server_start(&rq->fair_server); } /* At this point se is NULL and we are at root level*/ @@ -5537,25 +7029,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * into account, but that is not straightforward to implement, * and the following generally works well enough in practice. */ - if (flags & ENQUEUE_WAKEUP) - update_overutilized_status(rq); + if (!task_new) + check_update_overutilized_status(rq); enqueue_throttle: - if (cfs_bandwidth_used()) { - /* - * When bandwidth control is enabled; the cfs_rq_throttled() - * breaks in the above iteration can result in incomplete - * leaf list maintenance, resulting in triggering the assertion - * below. - */ - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - - if (list_add_leaf_cfs_rq(cfs_rq)) - break; - } - } - assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -5564,31 +7041,63 @@ enqueue_throttle: static void set_next_buddy(struct sched_entity *se); /* - * The dequeue_task method is called before nr_running is - * decreased. We remove the task from the rbtree and - * update the fair scheduling stats: + * Basically dequeue_task_fair(), except it can deal with dequeue_entity() + * failing half-way through and resume the dequeue later. + * + * Returns: + * -1 - dequeue delayed + * 0 - dequeue throttled + * 1 - dequeue complete */ -static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - int task_sleep = flags & DEQUEUE_SLEEP; - int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + int rq_h_nr_queued = rq->cfs.h_nr_queued; + bool task_sleep = flags & DEQUEUE_SLEEP; + bool task_delayed = flags & DEQUEUE_DELAYED; + struct task_struct *p = NULL; + int h_nr_idle = 0; + int h_nr_queued = 0; + int h_nr_runnable = 0; + struct cfs_rq *cfs_rq; + u64 slice = 0; + + if (entity_is_task(se)) { + p = task_of(se); + h_nr_queued = 1; + h_nr_idle = task_has_idle_policy(p); + if (task_sleep || task_delayed || !se->sched_delayed) + h_nr_runnable = 1; + } else { + cfs_rq = group_cfs_rq(se); + slice = cfs_rq_min_slice(cfs_rq); + } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, flags); - cfs_rq->h_nr_running--; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (!dequeue_entity(cfs_rq, se, flags)) { + if (p && &p->se == se) + return -1; + + break; + } + + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; + + if (cfs_rq_is_idle(cfs_rq)) + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; + return 0; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + slice = cfs_rq_min_slice(cfs_rq); + /* Avoid re-evaluating load for this entity: */ se = parent_entity(se); /* @@ -5600,6 +7109,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; } flags |= DEQUEUE_SLEEP; + flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); } for_each_sched_entity(se) { @@ -5609,32 +7119,76 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) se_update_runnable(se); update_cfs_group(se); - cfs_rq->h_nr_running--; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; + se->slice = slice; + slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; + + if (cfs_rq_is_idle(cfs_rq)) + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; - + return 0; } -dequeue_throttle: - if (!se) - sub_nr_running(rq, 1); + sub_nr_running(rq, h_nr_queued); + + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) + dl_server_stop(&rq->fair_server); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; - util_est_dequeue(&rq->cfs, p, task_sleep); + if (p && task_delayed) { + SCHED_WARN_ON(!task_sleep); + SCHED_WARN_ON(p->on_rq != 1); + + /* Fix-up what dequeue_task_fair() skipped */ + hrtick_update(rq); + + /* + * Fix-up what block_task() skipped. + * + * Must be last, @p might not be valid after this. + */ + __block_task(rq, p); + } + + return 1; +} + +/* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and + * update the fair scheduling stats: + */ +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) + util_est_dequeue(&rq->cfs, p); + + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + if (dequeue_entities(rq, &p->se, flags) < 0) + return false; + + /* + * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). + */ + hrtick_update(rq); + return true; } #ifdef CONFIG_SMP -/* Working cpumask for: load_balance, load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); +/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ +static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); +static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); #ifdef CONFIG_NO_HZ_COMMON @@ -5642,6 +7196,7 @@ static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; int has_blocked; /* Idle CPUS has blocked load */ + int needs_update; /* Newly idle CPUs need their next_balance collated */ unsigned long next_balance; /* in jiffy units */ unsigned long next_blocked; /* Next update of blocked load in jiffies */ } nohz ____cacheline_aligned; @@ -5792,6 +7347,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + return nr_cpumask_bits; } @@ -5849,23 +7407,23 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); - schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); - if (target == nr_cpumask_bits) + schedstat_inc(p->stats.nr_wakeups_affine_attempts); + if (target != this_cpu) return prev_cpu; schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); + schedstat_inc(p->stats.nr_wakeups_affine); return target; } static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* - * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. + * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group. */ static int -find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -5880,11 +7438,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { + struct rq *rq = cpu_rq(i); + + if (!sched_core_cookie_match(rq, p)) + continue; + if (sched_idle_cpu(i)) return i; if (available_idle_cpu(i)) { - struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { /* @@ -5917,7 +7479,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, +static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { int new_cpu = cpu; @@ -5942,13 +7504,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p continue; } - group = find_idlest_group(sd, p, cpu); + group = sched_balance_find_dst_group(sd, p, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_group_cpu(group, p, cpu); + new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu); if (new_cpu == cpu) { /* Now try balancing at a lower domain level of 'cpu': */ sd = sd->child; @@ -5970,6 +7532,15 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; } +static inline int __select_idle_cpu(int cpu, struct task_struct *p) +{ + if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && + sched_cpu_cookie_match(cpu_rq(cpu), p)) + return cpu; + + return -1; +} + #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -5983,7 +7554,7 @@ static inline void set_idle_cores(int cpu, int val) WRITE_ONCE(sds->has_idle_cores, val); } -static inline bool test_idle_cores(int cpu, bool def) +static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; @@ -5991,7 +7562,7 @@ static inline bool test_idle_cores(int cpu, bool def) if (sds) return READ_ONCE(sds->has_idle_cores); - return def; + return false; } /* @@ -6007,7 +7578,7 @@ void __update_idle_core(struct rq *rq) int cpu; rcu_read_lock(); - if (test_idle_cores(core, true)) + if (test_idle_cores(core)) goto unlock; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -6028,54 +7599,49 @@ unlock: * there are no idle cores left in the system; tracked through * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. */ -static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu; - - if (!static_branch_likely(&sched_smt_present)) - return -1; - - if (!test_idle_cores(target, false)) - return -1; - - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - - for_each_cpu_wrap(core, cpus, target) { - bool idle = true; + bool idle = true; + int cpu; - for_each_cpu(cpu, cpu_smt_mask(core)) { - if (!available_idle_cpu(cpu)) { - idle = false; - break; + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (!available_idle_cpu(cpu)) { + idle = false; + if (*idle_cpu == -1) { + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) { + *idle_cpu = cpu; + break; + } + continue; } + break; } - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); - - if (idle) - return core; + if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus)) + *idle_cpu = cpu; } - /* - * Failed to find an idle core; stop looking for one. - */ - set_idle_cores(target, 0); + if (idle) + return core; + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); return -1; } /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, int target) +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { int cpu; - if (!static_branch_likely(&sched_smt_present)) - return -1; - - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { + if (cpu == target) + continue; + /* + * Check if the CPU is in the LLC scheduling domain of @target. + * Due to isolcpus, there is no guarantee that all the siblings are in the domain. + */ + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6086,12 +7652,21 @@ static int select_idle_smt(struct task_struct *p, int target) #else /* CONFIG_SCHED_SMT */ -static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static inline void set_idle_cores(int cpu, int val) { - return -1; } -static inline int select_idle_smt(struct task_struct *p, int target) +static inline bool test_idle_cores(int cpu) +{ + return false; +} + +static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) +{ + return __select_idle_cpu(core, p); +} + +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { return -1; } @@ -6103,52 +7678,68 @@ static inline int select_idle_smt(struct task_struct *p, int target) * comparing the average scan cost (tracked in sd->avg_scan_cost) against the * average idle time for this rq (as found in rq->avg_idle). */ -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - struct sched_domain *this_sd; - u64 avg_cost, avg_idle; - u64 time; - int this = smp_processor_id(); - int cpu, nr = INT_MAX; - - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; - - /* - * Due to large variance we need a large fuzz factor; hackbench in - * particularly is sensitive here. - */ - avg_idle = this_rq()->avg_idle / 512; - avg_cost = this_sd->avg_scan_cost + 1; + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); + int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct sched_domain_shared *sd_share; - if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) - return -1; + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP)) { - u64 span_avg = sd->span_weight * avg_idle; - if (span_avg > 4*avg_cost) - nr = div_u64(span_avg, avg_cost); - else - nr = 4; + if (sched_feat(SIS_UTIL)) { + sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + if (sd_share) { + /* because !--nr is the condition to stop scan */ + nr = READ_ONCE(sd_share->nr_idle_scan) + 1; + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; + } } - time = cpu_clock(this); + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + + if (sg->flags & SD_CLUSTER) { + for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_group_span(sg)); + } + } - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + for_each_cpu_wrap(cpu, cpus, target + 1) { + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; - for_each_cpu_wrap(cpu, cpus, target) { - if (!--nr) - return -1; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - break; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + break; + } } - time = cpu_clock(this) - time; - update_avg(&this_sd->avg_scan_cost, time); + if (has_idle_core) + set_idle_cores(target, false); - return cpu; + return idle_cpu; } /* @@ -6159,71 +7750,109 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { - unsigned long best_cap = 0; + unsigned long task_util, util_min, util_max, best_cap = 0; + int fits, best_fits = 0; int cpu, best_cpu = -1; struct cpumask *cpus; - sync_entity_load_avg(&p->se); - - cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + cpus = this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); + for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (task_fits_capacity(p, cpu_cap)) + + fits = util_fits_cpu(task_util, util_min, util_max, cpu); + + /* This CPU fits with all requirements */ + if (fits > 0) return cpu; + /* + * Only the min performance hint (i.e. uclamp_min) doesn't fit. + * Look for the CPU with best capacity. + */ + else if (fits < 0) + cpu_cap = get_actual_cpu_capacity(cpu); - if (cpu_cap > best_cap) { + /* + * First, select CPU which fits better (-1 being better than 0). + * Then, select the one with best capacity at same level. + */ + if ((fits < best_fits) || + ((fits == best_fits) && (cpu_cap > best_cap))) { best_cap = cpu_cap; best_cpu = cpu; + best_fits = fits; } } return best_cpu; } +static inline bool asym_fits_cpu(unsigned long util, + unsigned long util_min, + unsigned long util_max, + int cpu) +{ + if (sched_asym_cpucap_active()) + /* + * Return true only if the cpu fully fits the task requirements + * which include the utilization and the performance hints. + */ + return (util_fits_cpu(util, util_min, util_max, cpu) > 0); + + return true; +} + /* * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { + bool has_idle_core = false; struct sched_domain *sd; - int i, recent_used_cpu; + unsigned long task_util, util_min, util_max; + int i, recent_used_cpu, prev_aff = -1; /* - * For asymmetric CPU capacity systems, our domain of interest is - * sd_asym_cpucapacity rather than sd_llc. + * On asymmetric system, update task utilization because we will check + * that the task fits with CPU's capacity. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); - /* - * On an asymmetric CPU capacity system where an exclusive - * cpuset defines a symmetric island (i.e. one unique - * capacity_orig value through the cpuset), the key will be set - * but the CPUs within that cpuset will not have a domain with - * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric - * capacity path. - */ - if (!sd) - goto symmetric; - - i = select_idle_capacity(p, sd, target); - return ((unsigned)i < nr_cpumask_bits) ? i : target; + if (sched_asym_cpucap_active()) { + sync_entity_load_avg(&p->se); + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); } -symmetric: - if (available_idle_cpu(target) || sched_idle_cpu(target)) + /* + * per-cpu select_rq_mask usage + */ + lockdep_assert_irqs_disabled(); + + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_cpu(task_util, util_min, util_max, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev))) - return prev; + (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + asym_fits_cpu(task_util, util_min, util_max, prev)) { + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(prev, target)) + return prev; + + prev_aff = prev; + } /* * Allow a per-cpu kthread to stack with the wakee if the @@ -6234,95 +7863,197 @@ symmetric: * pattern is IO completions. */ if (is_per_cpu_kthread(current) && + in_task() && prev == smp_processor_id() && - this_rq()->nr_running <= 1) { + this_rq()->nr_running <= 1 && + asym_fits_cpu(task_util, util_min, util_max, prev)) { return prev; } /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { + cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && + asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(recent_used_cpu, target)) + return recent_used_cpu; + + } else { + recent_used_cpu = -1; + } + + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (sched_asym_cpucap_active()) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); /* - * Replace recent_used_cpu with prev as it is a potential - * candidate for the next wake: + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. */ - p->recent_used_cpu = prev; - return recent_used_cpu; + if (sd) { + i = select_idle_capacity(p, sd, target); + return ((unsigned)i < nr_cpumask_bits) ? i : target; + } } sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; - i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; + if (sched_smt_active()) { + has_idle_core = test_idle_cores(target); - i = select_idle_cpu(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; + if (!has_idle_core && cpus_share_cache(prev, target)) { + i = select_idle_smt(p, sd, prev); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } + } - i = select_idle_smt(p, target); + i = select_idle_cpu(p, sd, has_idle_core, target); if ((unsigned)i < nr_cpumask_bits) return i; + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster + * first. But prev_cpu or recent_used_cpu may also be a good candidate, + * use them if possible when no idle CPU found in select_idle_cpu(). + */ + if ((unsigned int)prev_aff < nr_cpumask_bits) + return prev_aff; + if ((unsigned int)recent_used_cpu < nr_cpumask_bits) + return recent_used_cpu; + return target; } /** - * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks - * @cpu: the CPU to get the utilization of - * - * The unit of the return value must be the one of capacity so we can compare - * the utilization with the capacity of the CPU that is available for CFS task - * (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * The estimated utilization of a CPU is defined to be the maximum between its - * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks - * currently RUNNABLE on that CPU. - * This allows to properly represent the expected utilization of a CPU which - * has just got a big task running since a long sleep period. At the same time - * however it preserves the benefits of the "blocked utilization" in - * describing the potential for other tasks waking up on the same CPU. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - * - * Return: the (estimated) utilization for the specified CPU - */ -static inline unsigned long cpu_util(int cpu) + * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks. + * @cpu: the CPU to get the utilization for + * @p: task for which the CPU utilization should be predicted or NULL + * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL + * @boost: 1 to enable boosting, otherwise 0 + * + * The unit of the return value must be the same as the one of CPU capacity + * so that CPU utilization can be compared with CPU capacity. + * + * CPU utilization is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on that CPU. + * It represents the amount of CPU capacity currently used by CFS tasks in + * the range [0..max CPU capacity] with max CPU capacity being the CPU + * capacity at f_max. + * + * The estimated CPU utilization is defined as the maximum between CPU + * utilization and sum of the estimated utilization of the currently + * runnable tasks on that CPU. It preserves a utilization "snapshot" of + * previously-executed tasks, which helps better deduce how busy a CPU will + * be when a long-sleeping task wakes up. The contribution to CPU utilization + * of such a task would be significantly decayed at this point of time. + * + * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization). + * CPU contention for CFS tasks can be detected by CPU runnable > CPU + * utilization. Boosting is implemented in cpu_util() so that internal + * users (e.g. EAS) can use it next to external users (e.g. schedutil), + * latter via cpu_util_cfs_boost(). + * + * CPU utilization can be higher than the current CPU capacity + * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because + * of rounding errors as well as task migrations or wakeups of new tasks. + * CPU utilization has to be capped to fit into the [0..max CPU capacity] + * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) + * could be seen as over-utilized even though CPU1 has 20% of spare CPU + * capacity. CPU utilization is allowed to overshoot current CPU capacity + * though since this is useful for predicting the CPU capacity required + * after task migrations (scheduler-driven DVFS). + * + * Return: (Boosted) (estimated) utilization for the specified CPU. + */ +static unsigned long +cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) { - struct cfs_rq *cfs_rq; - unsigned int util; + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long runnable; - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); + if (boost) { + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + util = max(util, runnable); + } + + /* + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its + * contribution. If @p migrates from another CPU to @cpu add its + * contribution. In all the other cases @cpu is not impacted by the + * migration so its util_avg is already correct. + */ + if (p && task_cpu(p) == cpu && dst_cpu != cpu) + lsub_positive(&util, task_util(p)); + else if (p && task_cpu(p) != cpu && dst_cpu == cpu) + util += task_util(p); - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + if (sched_feat(UTIL_EST)) { + unsigned long util_est; + + util_est = READ_ONCE(cfs_rq->avg.util_est); - return min_t(unsigned long, util, capacity_orig_of(cpu)); + /* + * During wake-up @p isn't enqueued yet and doesn't contribute + * to any cpu_rq(cpu)->cfs.avg.util_est. + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p + * has been enqueued. + * + * During exec (@dst_cpu = -1) @p is enqueued and does + * contribute to cpu_rq(cpu)->cfs.util_est. + * Remove it to "simulate" cpu_util without @p's contribution. + * + * Despite the task_on_rq_queued(@p) check there is still a + * small window for a possible race when an exec + * select_task_rq_fair() races with LB's detach_task(). + * + * detach_task() + * deactivate_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * -------------------------------- A + * dequeue_task() \ + * dequeue_task_fair() + Race Time + * util_est_dequeue() / + * -------------------------------- B + * + * The additional check "current == p" is required to further + * reduce the race window. + */ + if (dst_cpu == cpu) + util_est += _task_util_est(p); + else if (p && unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&util_est, _task_util_est(p)); + + util = max(util, util_est); + } + + return min(util, arch_scale_cpu_capacity(cpu)); +} + +unsigned long cpu_util_cfs(int cpu) +{ + return cpu_util(cpu, NULL, -1, 0); +} + +unsigned long cpu_util_cfs_boost(int cpu) +{ + return cpu_util(cpu, NULL, -1, 1); } /* @@ -6340,168 +8071,255 @@ static inline unsigned long cpu_util(int cpu) */ static unsigned long cpu_util_without(int cpu, struct task_struct *p) { - struct cfs_rq *cfs_rq; - unsigned int util; - /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util(cpu); + p = NULL; - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); + return cpu_util(cpu, p, -1, 0); +} - /* Discount task's util from CPU's util */ - lsub_positive(&util, task_util(p)); +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the IRQ utilization. + * + * The DL bandwidth number OTOH is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long *min, + unsigned long *max) +{ + unsigned long util, irq, scale; + struct rq *rq = cpu_rq(cpu); + + scale = arch_scale_cpu_capacity(cpu); /* - * Covered cases: - * - * a) if *p is the only task sleeping on this CPU, then: - * cpu_util (== task_util) > util_est (== 0) - * and thus we return: - * cpu_util_without = (cpu_util - task_util) = 0 - * - * b) if other tasks are SLEEPING on this CPU, which is now exiting - * IDLE, then: - * cpu_util >= task_util - * cpu_util > util_est (== 0) - * and thus we discount *p's blocked utilization to return: - * cpu_util_without = (cpu_util - task_util) >= 0 - * - * c) if other tasks are RUNNABLE on that CPU and - * util_est > cpu_util - * then we use util_est since it returns a more restrictive - * estimation of the spare capacity on that CPU, by just - * considering the expected utilization of tasks already - * runnable on that CPU. - * - * Cases a) and b) are covered by the above code, while case c) is - * covered by the following code when estimated utilization is - * enabled. + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). */ - if (sched_feat(UTIL_EST)) { - unsigned int estimated = - READ_ONCE(cfs_rq->avg.util_est.enqueued); + irq = cpu_util_irq(rq); + if (unlikely(irq >= scale)) { + if (min) + *min = scale; + if (max) + *max = scale; + return scale; + } + if (min) { /* - * Despite the following checks we still have a small window - * for a possible race, when an execl's select_task_rq_fair() - * races with LB's detach_task(): - * - * detach_task() - * p->on_rq = TASK_ON_RQ_MIGRATING; - * ---------------------------------- A - * deactivate_task() \ - * dequeue_task() + RaceTime - * util_est_dequeue() / - * ---------------------------------- B - * - * The additional check on "current == p" it's required to - * properly fix the execl regression and it helps in further - * reducing the chances for the above race. + * The minimum utilization returns the highest level between: + * - the computed DL bandwidth needed with the IRQ pressure which + * steals time to the deadline task. + * - The minimum performance requirement for CFS and/or RT. */ - if (unlikely(task_on_rq_queued(p) || current == p)) - lsub_positive(&estimated, _task_util_est(p)); + *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); - util = max(util, estimated); + /* + * When an RT task is runnable and uclamp is not used, we must + * ensure that the task will run at maximum compute capacity. + */ + if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) + *min = max(*min, scale); } /* - * Utilization (estimated) can exceed the CPU capacity, thus let's - * clamp to the maximum CPU capacity to ensure consistency with - * the cpu_util call. + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + */ + util = util_cfs + cpu_util_rt(rq); + util += cpu_util_dl(rq); + + /* + * The maximum hint is a soft bandwidth requirement, which can be lower + * than the actual utilization because of uclamp_max requirements. + */ + if (max) + *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); + + if (util >= scale) + return scale; + + /* + * There is still idle time; further improve the number by using the + * IRQ metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: + * + * max - irq + * U' = irq + --------- * U + * max */ - return min_t(unsigned long, util, capacity_orig_of(cpu)); + util = scale_irq_capacity(util, irq, scale); + util += irq; + + return min(scale, util); +} + +unsigned long sched_cpu_util(int cpu) +{ + return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); } /* - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) - * to @dst_cpu. + * energy_env - Utilization landscape for energy estimation. + * @task_busy_time: Utilization contribution by the task for which we test the + * placement. Given by eenv_task_busy_time(). + * @pd_busy_time: Utilization of the whole perf domain without the task + * contribution. Given by eenv_pd_busy_time(). + * @cpu_cap: Maximum CPU capacity for the perf domain. + * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap). */ -static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +struct energy_env { + unsigned long task_busy_time; + unsigned long pd_busy_time; + unsigned long cpu_cap; + unsigned long pd_cap; +}; + +/* + * Compute the task busy time for compute_energy(). This time cannot be + * injected directly into effective_cpu_util() because of the IRQ scaling. + * The latter only makes sense with the most recent CPUs where the task has + * run. + */ +static inline void eenv_task_busy_time(struct energy_env *eenv, + struct task_struct *p, int prev_cpu) { - struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; - unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu); + unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu)); - /* - * If @p migrates from @cpu to another, remove its contribution. Or, - * if @p migrates from another CPU to @cpu, add its contribution. In - * the other cases, @cpu is not impacted by the migration, so the - * util_avg should already be correct. - */ - if (task_cpu(p) == cpu && dst_cpu != cpu) - sub_positive(&util, task_util(p)); - else if (task_cpu(p) != cpu && dst_cpu == cpu) - util += task_util(p); + if (unlikely(irq >= max_cap)) + busy_time = max_cap; + else + busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap); - if (sched_feat(UTIL_EST)) { - util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + eenv->task_busy_time = busy_time; +} - /* - * During wake-up, the task isn't enqueued yet and doesn't - * appear in the cfs_rq->avg.util_est.enqueued of any rq, - * so just add it (if needed) to "simulate" what will be - * cpu_util() after the task has been enqueued. - */ - if (dst_cpu == cpu) - util_est += _task_util_est(p); +/* + * Compute the perf_domain (PD) busy time for compute_energy(). Based on the + * utilization for each @pd_cpus, it however doesn't take into account + * clamping since the ratio (utilization / cpu_capacity) is already enough to + * scale the EM reported power consumption at the (eventually clamped) + * cpu_capacity. + * + * The contribution of the task @p for which we want to estimate the + * energy cost is removed (by cpu_util()) and must be calculated + * separately (see eenv_task_busy_time). This ensures: + * + * - A stable PD utilization, no matter which CPU of that PD we want to place + * the task on. + * + * - A fair comparison between CPUs as the task contribution (task_util()) + * will always be the same no matter which CPU utilization we rely on + * (util_avg or util_est). + * + * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't + * exceed @eenv->pd_cap. + */ +static inline void eenv_pd_busy_time(struct energy_env *eenv, + struct cpumask *pd_cpus, + struct task_struct *p) +{ + unsigned long busy_time = 0; + int cpu; - util = max(util, util_est); + for_each_cpu(cpu, pd_cpus) { + unsigned long util = cpu_util(cpu, p, -1, 0); + + busy_time += effective_cpu_util(cpu, util, NULL, NULL); } - return min(util, capacity_orig_of(cpu)); + eenv->pd_busy_time = min(eenv->pd_cap, busy_time); } /* - * compute_energy(): Estimates the energy that @pd would consume if @p was - * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of @pd's CPUs after the task migration, and uses the Energy Model - * to compute what would be the energy if we decided to actually migrate that - * task. + * Compute the maximum utilization for compute_energy() when the task @p + * is placed on the cpu @dst_cpu. + * + * Returns the maximum utilization among @eenv->cpus. This utilization can't + * exceed @eenv->cpu_cap. */ -static long -compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +static inline unsigned long +eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, + struct task_struct *p, int dst_cpu) { - struct cpumask *pd_mask = perf_domain_span(pd); - unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); - unsigned long max_util = 0, sum_util = 0; + unsigned long max_util = 0; int cpu; - /* - * The capacity state of CPUs of the current rd can be driven by CPUs - * of another rd if they belong to the same pd. So, account for the - * utilization of these CPUs too by masking pd with cpu_online_mask - * instead of the rd span. - * - * If an entire pd is outside of the current rd, it will not appear in - * its pd list and will not be accounted by compute_energy(). - */ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); - struct task_struct *tsk = cpu == dst_cpu ? p : NULL; - - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, - ENERGY_UTIL, NULL); + for_each_cpu(cpu, pd_cpus) { + struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; + unsigned long util = cpu_util(cpu, p, dst_cpu, 1); + unsigned long eff_util, min, max; /* * Performance domain frequency: utilization clamping * must be considered since it affects the selection * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. + * NOTE: in case RT tasks are running, by default the min + * utilization can be max OPP. */ - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, - FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); + eff_util = effective_cpu_util(cpu, util, &min, &max); + + /* Task's uclamp can modify min and max value */ + if (tsk && uclamp_is_used()) { + min = max(min, uclamp_eff_value(p, UCLAMP_MIN)); + + /* + * If there is no active max uclamp constraint, + * directly use task's one, otherwise keep max. + */ + if (uclamp_rq_is_idle(cpu_rq(cpu))) + max = uclamp_eff_value(p, UCLAMP_MAX); + else + max = max(max, uclamp_eff_value(p, UCLAMP_MAX)); + } + + eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max); + max_util = max(max_util, eff_util); } - return em_pd_energy(pd->em_pd, max_util, sum_util); + return min(max_util, eenv->cpu_cap); +} + +/* + * compute_energy(): Use the Energy Model to estimate the energy that @pd would + * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task + * contribution is ignored. + */ +static inline unsigned long +compute_energy(struct energy_env *eenv, struct perf_domain *pd, + struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu) +{ + unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); + unsigned long busy_time = eenv->pd_busy_time; + unsigned long energy; + + if (dst_cpu >= 0) + busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); + + energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); + + trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time); + + return energy; } /* @@ -6536,7 +8354,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * NOTE: Forkees are not accepted in the energy-aware wake-up path because * they don't have any useful utilization data yet and it's not possible to * forecast their impact on energy consumption. Consequently, they will be - * placed by find_idlest_cpu() on the least loaded CPU, which might turn out + * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out * to be energy-inefficient in some use-cases. The alternative would be to * bias new tasks towards specific types of CPUs first, or to try to infer * their util_avg from the parent task, but those heuristics could hurt @@ -6545,17 +8363,23 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) */ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; - struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - unsigned long cpu_cap, util, base_energy = 0; - int cpu, best_energy_cpu = prev_cpu; + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0; + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; + struct root_domain *rd = this_rq()->rd; + int cpu, best_energy_cpu, target = -1; + int prev_fits = -1, best_fits = -1; + unsigned long best_actual_cap = 0; + unsigned long prev_actual_cap = 0; struct sched_domain *sd; struct perf_domain *pd; + struct energy_env eenv; rcu_read_lock(); pd = rcu_dereference(rd->pd); - if (!pd || READ_ONCE(rd->overutilized)) - goto fail; + if (!pd) + goto unlock; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -6565,113 +8389,198 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) - goto fail; + goto unlock; + + target = prev_cpu; sync_entity_load_avg(&p->se); - if (!task_util_est(p)) + if (!task_util_est(p) && p_util_min == 0) goto unlock; + eenv_task_busy_time(&eenv, p, prev_cpu); + for (; pd; pd = pd->next) { - unsigned long cur_delta, spare_cap, max_spare_cap = 0; - unsigned long base_energy_pd; + unsigned long util_min = p_util_min, util_max = p_util_max; + unsigned long cpu_cap, cpu_actual_cap, util; + long prev_spare_cap = -1, max_spare_cap = -1; + unsigned long rq_util_min, rq_util_max; + unsigned long cur_delta, base_energy; int max_spare_cap_cpu = -1; + int fits, max_fits = -1; - /* Compute the 'base' energy of the pd, without @p */ - base_energy_pd = compute_energy(p, -1, pd); - base_energy += base_energy_pd; + cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); + + if (cpumask_empty(cpus)) + continue; + + /* Account external pressure for the energy estimation */ + cpu = cpumask_first(cpus); + cpu_actual_cap = get_actual_cpu_capacity(cpu); + + eenv.cpu_cap = cpu_actual_cap; + eenv.pd_cap = 0; + + for_each_cpu(cpu, cpus) { + struct rq *rq = cpu_rq(cpu); + + eenv.pd_cap += cpu_actual_cap; + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) + continue; - for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; - util = cpu_util_next(cpu, p, cpu); + util = cpu_util(cpu, p, cpu, 0); cpu_cap = capacity_of(cpu); - spare_cap = cpu_cap - util; /* * Skip CPUs that cannot satisfy the capacity request. * IOW, placing the task there would make the CPU * overutilized. Take uclamp into account to see how * much capacity we can get out of the CPU; this is - * aligned with schedutil_cpu_util(). + * aligned with sched_cpu_util(). */ - util = uclamp_rq_util_with(cpu_rq(cpu), util, p); - if (!fits_capacity(util, cpu_cap)) + if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { + /* + * Open code uclamp_rq_util_with() except for + * the clamp() part. I.e.: apply max aggregation + * only. util_fits_cpu() logic requires to + * operate on non clamped util but must use the + * max-aggregated uclamp_{min, max}. + */ + rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN); + rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX); + + util_min = max(rq_util_min, p_util_min); + util_max = max(rq_util_max, p_util_max); + } + + fits = util_fits_cpu(util, util_min, util_max, cpu); + if (!fits) continue; - /* Always use prev_cpu as a candidate. */ + lsub_positive(&cpu_cap, util); + if (cpu == prev_cpu) { - prev_delta = compute_energy(p, prev_cpu, pd); - prev_delta -= base_energy_pd; - best_delta = min(best_delta, prev_delta); + /* Always use prev_cpu as a candidate. */ + prev_spare_cap = cpu_cap; + prev_fits = fits; + } else if ((fits > max_fits) || + ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) { + /* + * Find the CPU with the maximum spare capacity + * among the remaining CPUs in the performance + * domain. + */ + max_spare_cap = cpu_cap; + max_spare_cap_cpu = cpu; + max_fits = fits; } + } + + if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) + continue; + + eenv_pd_busy_time(&eenv, cpus, p); + /* Compute the 'base' energy of the pd, without @p */ + base_energy = compute_energy(&eenv, pd, cpus, p, -1); + + /* Evaluate the energy impact of using prev_cpu. */ + if (prev_spare_cap > -1) { + prev_delta = compute_energy(&eenv, pd, cpus, p, + prev_cpu); + /* CPU utilization has changed */ + if (prev_delta < base_energy) + goto unlock; + prev_delta -= base_energy; + prev_actual_cap = cpu_actual_cap; + best_delta = min(best_delta, prev_delta); + } + + /* Evaluate the energy impact of using max_spare_cap_cpu. */ + if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { + /* Current best energy cpu fits better */ + if (max_fits < best_fits) + continue; /* - * Find the CPU with the maximum spare capacity in - * the performance domain + * Both don't fit performance hint (i.e. uclamp_min) + * but best energy cpu has better capacity. */ - if (spare_cap > max_spare_cap) { - max_spare_cap = spare_cap; - max_spare_cap_cpu = cpu; - } - } + if ((max_fits < 0) && + (cpu_actual_cap <= best_actual_cap)) + continue; - /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { - cur_delta = compute_energy(p, max_spare_cap_cpu, pd); - cur_delta -= base_energy_pd; - if (cur_delta < best_delta) { - best_delta = cur_delta; - best_energy_cpu = max_spare_cap_cpu; - } + cur_delta = compute_energy(&eenv, pd, cpus, p, + max_spare_cap_cpu); + /* CPU utilization has changed */ + if (cur_delta < base_energy) + goto unlock; + cur_delta -= base_energy; + + /* + * Both fit for the task but best energy cpu has lower + * energy impact. + */ + if ((max_fits > 0) && (best_fits > 0) && + (cur_delta >= best_delta)) + continue; + + best_delta = cur_delta; + best_energy_cpu = max_spare_cap_cpu; + best_fits = max_fits; + best_actual_cap = cpu_actual_cap; } } -unlock: rcu_read_unlock(); - /* - * Pick the best CPU if prev_cpu cannot be used, or if it saves at - * least 6% of the energy used by prev_cpu. - */ - if (prev_delta == ULONG_MAX) - return best_energy_cpu; - - if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) - return best_energy_cpu; + if ((best_fits > prev_fits) || + ((best_fits > 0) && (best_delta < prev_delta)) || + ((best_fits < 0) && (best_actual_cap > prev_actual_cap))) + target = best_energy_cpu; - return prev_cpu; + return target; -fail: +unlock: rcu_read_unlock(); - return -1; + return target; } /* * select_task_rq_fair: Select target runqueue for the waking task in domains - * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * * Balances load by selecting the idlest CPU in the idlest group, or under * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. * * Returns the target CPU number. - * - * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) { + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; - int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + /* SD_flags and WF_flags share the first nibble */ + int sd_flag = wake_flags & 0xF; - if (sd_flag & SD_BALANCE_WAKE) { + /* + * required for stable ->cpus_allowed + */ + lockdep_assert_held(&p->pi_lock); + if (wake_flags & WF_TTWU) { record_wakee(p); - if (sched_energy_enabled()) { + if ((wake_flags & WF_CURRENT_CPU) && + cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + + if (!is_rd_overutilized(this_rq()->rd)) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -6696,6 +8605,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f break; } + /* + * Usually only true for WF_EXEC and WF_FORK, as sched_domains + * usually do not have SD_BALANCE_WAKE set. That means wakeup + * will usually go to the fast path. + */ if (tmp->flags & sd_flag) sd = tmp; else if (!want_affine) @@ -6704,22 +8618,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (unlikely(sd)) { /* Slow path */ - new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); - } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ + new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); + } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - - if (want_affine) - current->recent_used_cpu = cpu; } rcu_read_unlock(); return new_cpu; } -static void detach_entity_cfs_rq(struct sched_entity *se); - /* * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -6727,181 +8635,126 @@ static void detach_entity_cfs_rq(struct sched_entity *se); */ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { - /* - * As blocked tasks retain absolute vruntime the migration needs to - * deal with this by subtracting the old and adding the new - * min_vruntime -- the latter is done by enqueue_entity() when placing - * the task on the new runqueue. - */ - if (p->state == TASK_WAKING) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy = cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime = cfs_rq->min_vruntime; - } while (min_vruntime != min_vruntime_copy); -#else - min_vruntime = cfs_rq->min_vruntime; -#endif + struct sched_entity *se = &p->se; - se->vruntime -= min_vruntime; - } + if (!task_on_rq_migrating(p)) { + remove_entity_load_avg(se); - if (p->on_rq == TASK_ON_RQ_MIGRATING) { /* - * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' - * rq->lock and can modify state directly. - */ - lockdep_assert_held(&task_rq(p)->lock); - detach_entity_cfs_rq(&p->se); - - } else { - /* - * We are supposed to update the task to "current" time, then - * its up to date and ready to go to new CPU/cfs_rq. But we - * have difficulty in getting what current time is, so simply - * throw away the out-of-date time. This will result in the - * wakee task is less decayed, but giving the wakee more load - * sounds not bad. + * Here, the task's PELT values have been updated according to + * the current rq's clock. But if that clock hasn't been + * updated in a while, a substantial idle time will be missed, + * leading to an inflation after wake-up on the new rq. + * + * Estimate the missing time from the cfs_rq last_update_time + * and update sched_avg to improve the PELT continuity after + * migration. */ - remove_entity_load_avg(&p->se); + migrate_se_pelt_lag(se); } /* Tell new CPU we are migrated */ - p->se.avg.last_update_time = 0; - - /* We have migrated, no longer consider this task hot */ - p->se.exec_start = 0; + se->avg.last_update_time = 0; update_scan_period(p, new_cpu); } static void task_dead_fair(struct task_struct *p) { - remove_entity_load_avg(&p->se); -} - -static int -balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - if (rq->nr_running) - return 1; + struct sched_entity *se = &p->se; - return newidle_balance(rq, rf) != 0; -} -#endif /* CONFIG_SMP */ + if (se->sched_delayed) { + struct rq_flags rf; + struct rq *rq; -static unsigned long wakeup_gran(struct sched_entity *se) -{ - unsigned long gran = sysctl_sched_wakeup_granularity; + rq = task_rq_lock(p, &rf); + if (se->sched_delayed) { + update_rq_clock(rq); + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + } + task_rq_unlock(rq, p, &rf); + } - /* - * Since its curr running now, convert the gran from real-time - * to virtual-time in his units. - * - * By using 'se' instead of 'curr' we penalize light tasks, so - * they get preempted easier. That is, if 'se' < 'curr' then - * the resulting gran will be larger, therefore penalizing the - * lighter, if otoh 'se' > 'curr' then the resulting gran will - * be smaller, again penalizing the lighter task. - * - * This is especially important for buddies when the leftmost - * task is higher priority than the buddy. - */ - return calc_delta_fair(gran, se); + remove_entity_load_avg(se); } /* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * + * Set the max capacity the task is allowed to run at for misfit detection. */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +static void set_task_max_allowed_capacity(struct task_struct *p) { - s64 gran, vdiff = curr->vruntime - se->vruntime; + struct asym_cap_data *entry; - if (vdiff <= 0) - return -1; + if (!sched_asym_cpucap_active()) + return; - gran = wakeup_gran(se); - if (vdiff > gran) - return 1; + rcu_read_lock(); + list_for_each_entry_rcu(entry, &asym_cap_list, link) { + cpumask_t *cpumask; - return 0; + cpumask = cpu_capacity_span(entry); + if (!cpumask_intersects(p->cpus_ptr, cpumask)) + continue; + + p->max_allowed_capacity = entry->capacity; + break; + } + rcu_read_unlock(); } -static void set_last_buddy(struct sched_entity *se) +static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; + set_cpus_allowed_common(p, ctx); + set_task_max_allowed_capacity(p); +} - for_each_sched_entity(se) { - if (SCHED_WARN_ON(!se->on_rq)) - return; - cfs_rq_of(se)->last = se; - } +static int +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + if (sched_fair_runnable(rq)) + return 1; + + return sched_balance_newidle(rq, rf) != 0; } +#else +static inline void set_task_max_allowed_capacity(struct task_struct *p) {} +#endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->next = se; } } -static void set_skip_buddy(struct sched_entity *se) -{ - for_each_sched_entity(se) - cfs_rq_of(se)->skip = se; -} - /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se, *pse = &p->se; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int scale = cfs_rq->nr_running >= sched_nr_latency; - int next_buddy_marked = 0; + struct task_struct *donor = rq->donor; + struct sched_entity *se = &donor->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(donor); + int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) return; /* * This is possible from callers such as attach_tasks(), in which we - * unconditionally check_prempt_curr() after an enqueue (which may have + * unconditionally wakeup_preempt() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; - if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { + if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { set_next_buddy(pse); - next_buddy_marked = 1; } /* @@ -6914,122 +8767,120 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ * prevents us from potentially nominating it as a false LAST_BUDDY * below. */ - if (test_tsk_need_resched(curr)) + if (test_tsk_need_resched(rq->curr)) + return; + + if (!sched_feat(WAKEUP_PREEMPTION)) return; - /* Idle tasks are by definition preempted by non-idle tasks. */ - if (unlikely(task_has_idle_policy(curr)) && - likely(!task_has_idle_policy(p))) + find_matching_se(&se, &pse); + WARN_ON_ONCE(!pse); + + cse_is_idle = se_is_idle(se); + pse_is_idle = se_is_idle(pse); + + /* + * Preempt an idle entity in favor of a non-idle entity (and don't preempt + * in the inverse case). + */ + if (cse_is_idle && !pse_is_idle) goto preempt; + if (cse_is_idle != pse_is_idle) + return; /* - * Batch and idle tasks do not preempt non-idle tasks (their preemption - * is driven by the tick): + * BATCH and IDLE tasks do not preempt others. */ - if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + if (unlikely(!normal_policy(p->policy))) return; - find_matching_se(&se, &pse); - update_curr(cfs_rq_of(se)); - BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) { - /* - * Bias pick_next to pick the sched entity that is - * triggering this preemption. - */ - if (!next_buddy_marked) - set_next_buddy(pse); + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + /* + * If @p has a shorter slice than current and @p is eligible, override + * current's slice protection in order to allow preemption. + * + * Note that even if @p does not turn out to be the most eligible + * task at this moment, current's slice protection will be lost. + */ + if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) + se->vlag = se->deadline + 1; + + /* + * If @p has become the most eligible task, force preemption. + */ + if (pick_eevdf(cfs_rq) == pse) goto preempt; - } return; preempt: - resched_curr(rq); - /* - * Only set the backward buddy when the current task is still - * on the rq. This can happen when a wakeup gets interleaved - * with schedule on the ->pre_schedule() or idle_balance() - * point, either of which can * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, - * for obvious reasons its a bad idea to schedule back to it. - */ - if (unlikely(!se->on_rq || curr == rq->idle)) - return; + resched_curr_lazy(rq); +} + +static struct task_struct *pick_task_fair(struct rq *rq) +{ + struct sched_entity *se; + struct cfs_rq *cfs_rq; + +again: + cfs_rq = &rq->cfs; + if (!cfs_rq->nr_queued) + return NULL; + + do { + /* Might not have done put_prev_entity() */ + if (cfs_rq->curr && cfs_rq->curr->on_rq) + update_curr(cfs_rq); + + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto again; + + se = pick_next_entity(rq, cfs_rq); + if (!se) + goto again; + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); - if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) - set_last_buddy(se); + return task_of(se); } +static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); + struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; struct task_struct *p; int new_tasks; again: - if (!sched_fair_runnable(rq)) + p = pick_task_fair(rq); + if (!p) goto idle; + se = &p->se; #ifdef CONFIG_FAIR_GROUP_SCHED - if (!prev || prev->sched_class != &fair_sched_class) + if (prev->sched_class != &fair_sched_class) goto simple; + __put_prev_set_next_dl_server(rq, prev, p); + /* * Because of the set_next_buddy() in dequeue_task_fair() it is rather * likely that a next task is from the same cgroup as the current. * * Therefore attempt to avoid putting and setting the entire cgroup * hierarchy, only change the part that actually changes. - */ - - do { - struct sched_entity *curr = cfs_rq->curr; - - /* - * Since we got here without doing put_prev_entity() we also - * have to consider cfs_rq->curr. If it is still a runnable - * entity, update_curr() will update its vruntime, otherwise - * forget we've ever seen it. - */ - if (curr) { - if (curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; - - /* - * This call to check_cfs_rq_runtime() will do the - * throttle and dequeue its entity in the parent(s). - * Therefore the nr_running test will indeed - * be correct. - */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) { - cfs_rq = &rq->cfs; - - if (!cfs_rq->nr_running) - goto idle; - - goto simple; - } - } - - se = pick_next_entity(cfs_rq, curr); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - - p = task_of(se); - - /* + * * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the * least amount of cfs_rqs. */ if (prev != p) { struct sched_entity *pse = &prev->se; + struct cfs_rq *cfs_rq; while (!(cfs_rq = is_same_group(se, pse))) { int se_depth = se->depth; @@ -7047,47 +8898,25 @@ again: put_prev_entity(cfs_rq, pse); set_next_entity(cfs_rq, se); - } - goto done; -simple: -#endif - if (prev) - put_prev_task(rq, prev); - - do { - se = pick_next_entity(cfs_rq, NULL); - set_next_entity(cfs_rq, se); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); + __set_next_task_fair(rq, p, true); + } - p = task_of(se); + return p; -done: __maybe_unused; -#ifdef CONFIG_SMP - /* - * Move the next running task to the front of - * the list, so our cfs_tasks list becomes MRU - * one. - */ - list_move(&p->se.group_node, &rq->cfs_tasks); +simple: #endif - - if (hrtick_enabled(rq)) - hrtick_start_fair(rq, p); - - update_misfit_status(p, rq); - + put_prev_set_next_task(rq, prev, p); return p; idle: if (!rf) return NULL; - new_tasks = newidle_balance(rq, rf); + new_tasks = sched_balance_newidle(rq, rf); /* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is + * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -7106,15 +8935,34 @@ idle: return NULL; } -static struct task_struct *__pick_next_task_fair(struct rq *rq) +static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) +{ + return pick_next_task_fair(rq, prev, NULL); +} + +static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) +{ + return !!dl_se->rq->cfs.nr_queued; +} + +static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) +{ + return pick_task_fair(dl_se->rq); +} + +void fair_server_init(struct rq *rq) { - return pick_next_task_fair(rq, NULL, NULL); + struct sched_dl_entity *dl_se = &rq->fair_server; + + init_dl_entity(dl_se); + + dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); } /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; @@ -7127,8 +8975,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) /* * sched_yield() is very simple - * - * The magic of dealing with the ->skip buddy is in pick_next_entity. */ static void yield_task_fair(struct rq *rq) { @@ -7144,24 +8990,22 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); - if (curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - /* - * Tell update_rq_clock() that we've just updated, - * so we don't do microscopic update in schedule() - * and double the fastpath cost. - */ - rq_clock_skip_update(rq); - } + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq_clock_skip_update(rq); - set_skip_buddy(se); + se->deadline += calc_delta_fair(se->slice, se); } -static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -7169,7 +9013,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; - /* Tell the scheduler that we'd really like pse to run next. */ + /* Tell the scheduler that we'd really like se to run next. */ set_next_buddy(se); yield_task_fair(rq); @@ -7227,7 +9071,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * topology where each level pairs two lower groups (or better). This results * in O(log n) layers. Furthermore we reduce the number of CPUs going up the * tree to only the first of the previous level and we decrease the frequency - * of load-balance at each level inv. proportional to the number of CPUs in + * of load-balance at each level inversely proportional to the number of CPUs in * the groups. * * This yields: @@ -7316,11 +9160,16 @@ enum group_type { */ group_fully_busy, /* - * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity - * and must be migrated to a more powerful CPU. + * One task doesn't fit with CPU's capacity and must be migrated to a + * more powerful CPU. */ group_misfit_task, /* + * Balance SMT group that's fully busy. Can benefit from migration + * a task on SMT with busy sibling to another CPU on idle core. + */ + group_smt_balance, + /* * SD_ASYM_PACKING only: One local CPU with higher capacity is available, * and the task should be migrated to it instead of running on the * current CPU. @@ -7349,8 +9198,7 @@ enum migration_type { #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 -#define LBF_NOHZ_STATS 0x10 -#define LBF_NOHZ_AGAIN 0x20 +#define LBF_ACTIVE_LB 0x10 struct lb_env { struct sched_domain *sd; @@ -7386,7 +9234,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); if (p->sched_class != &fair_sched_class) return 0; @@ -7394,16 +9242,27 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (unlikely(task_has_idle_policy(p))) return 0; + /* SMT siblings share cache */ + if (env->sd->flags & SD_SHARE_CPUCAPACITY) + return 0; + /* * Buddy candidates are cache hot: */ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && - (&p->se == cfs_rq_of(&p->se)->next || - &p->se == cfs_rq_of(&p->se)->last)) + (&p->se == cfs_rq_of(&p->se)->next)) return 1; if (sysctl_sched_migration_cost == -1) return 1; + + /* + * Don't migrate task if the task's cookie does not match + * with the destination CPU's core cookie. + */ + if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p)) + return 1; + if (sysctl_sched_migration_cost == 0) return 0; @@ -7414,43 +9273,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_NUMA_BALANCING /* - * Returns 1, if task migration degrades locality - * Returns 0, if task migration improves locality i.e migration preferred. - * Returns -1, if task migration is not affected by locality. + * Returns a positive value, if task migration degrades locality. + * Returns 0, if task migration is not affected by locality. + * Returns a negative value, if task migration improves locality i.e migration preferred. */ -static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; if (!static_branch_likely(&sched_numa_balancing)) - return -1; + return 0; if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return -1; + return 0; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return -1; + return 0; /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) { if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) return 1; else - return -1; + return 0; } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return 0; + return -1; /* Leaving a core idle is often worse than degrading locality. */ if (env->idle == CPU_IDLE) - return -1; + return 0; dist = node_distance(src_nid, dst_nid); if (numa_group) { @@ -7461,41 +9320,85 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) dst_weight = task_weight(p, dst_nid, dist); } - return dst_weight < src_weight; + return src_weight - dst_weight; } #else -static inline int migrate_degrades_locality(struct task_struct *p, +static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return -1; + return 0; } #endif /* + * Check whether the task is ineligible on the destination cpu + * + * When the PLACE_LAG scheduling feature is enabled and + * dst_cfs_rq->nr_queued is greater than 1, if the task + * is ineligible, it will also be ineligible when + * it is migrated to the destination cpu. + */ +static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu) +{ + struct cfs_rq *dst_cfs_rq; + +#ifdef CONFIG_FAIR_GROUP_SCHED + dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu]; +#else + dst_cfs_rq = &cpu_rq(dest_cpu)->cfs; +#endif + if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued && + !entity_eligible(task_cfs_rq(p), &p->se)) + return 1; + + return 0; +} + +/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot; + long degrades, hot; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) + p->sched_task_hot = 0; /* * We do not migrate tasks that are: - * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_ptr, or - * 3) running (obviously), or - * 4) are cache-hot on their current CPU. + * 1) delayed dequeued unless we migrate load, or + * 2) throttled_lb_pair, or + * 3) cannot be migrated to this CPU due to cpus_ptr, or + * 4) running (obviously), or + * 5) are cache-hot on their current CPU. */ + if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) + return 0; + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; + /* + * We want to prioritize the migration of eligible tasks. + * For ineligible tasks we soft-limit them and only allow + * them to migrate when nr_balance_failed is non-zero to + * avoid load-balancing trying very hard to balance the load. + */ + if (!env->sd->nr_balance_failed && + task_is_ineligible_on_dst_cpu(p, env->dst_cpu)) + return 0; + + /* Disregard percpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; - schedstat_inc(p->se.statistics.nr_failed_migrations_affine); + schedstat_inc(p->stats.nr_failed_migrations_affine); env->flags |= LBF_SOME_PINNED; @@ -7504,10 +9407,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * - * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have - * already computed one in current iteration. + * Avoid computing new_dst_cpu + * - for NEWLY_IDLE + * - if we have already computed one in current iteration + * - if it's an active balance */ - if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) + if (env->idle == CPU_NEWLY_IDLE || + env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB)) return 0; /* Prevent to re-select dst_cpu via env's CPUs: */ @@ -7522,34 +9428,37 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; } - /* Record that we found atleast one task that could run on dst_cpu */ + /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_running(env->src_rq, p)) { - schedstat_inc(p->se.statistics.nr_failed_migrations_running); + if (task_on_cpu(env->src_rq, p)) { + schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } /* * Aggressive migration if: - * 1) destination numa is preferred - * 2) task is cache cold, or - * 3) too many balance attempts have failed. - */ - tsk_cache_hot = migrate_degrades_locality(p, env); - if (tsk_cache_hot == -1) - tsk_cache_hot = task_hot(p, env); - - if (tsk_cache_hot <= 0 || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot == 1) { - schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->se.statistics.nr_forced_migrations); - } + * 1) active balance + * 2) destination numa is preferred + * 3) task is cache cold, or + * 4) too many balance attempts have failed. + */ + if (env->flags & LBF_ACTIVE_LB) + return 1; + + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); + else + hot = degrades > 0; + + if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { + if (hot) + p->sched_task_hot = 1; return 1; } - schedstat_inc(p->se.statistics.nr_failed_migrations_hot); + schedstat_inc(p->stats.nr_failed_migrations_hot); return 0; } @@ -7558,7 +9467,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) */ static void detach_task(struct task_struct *p, struct lb_env *env) { - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); + + if (p->sched_task_hot) { + p->sched_task_hot = 0; + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->stats.nr_forced_migrations); + } deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); @@ -7574,7 +9489,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); list_for_each_entry_reverse(p, &env->src_rq->cfs_tasks, se.group_node) { @@ -7595,8 +9510,6 @@ static struct task_struct *detach_one_task(struct lb_env *env) return NULL; } -static const unsigned int sched_nr_migrate_break = 32; - /* * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". @@ -7610,7 +9523,16 @@ static int detach_tasks(struct lb_env *env) struct task_struct *p; int detached = 0; - lockdep_assert_held(&env->src_rq->lock); + lockdep_assert_rq_held(env->src_rq); + + /* + * Source run queue has been emptied by another CPU, clear + * LBF_ALL_PINNED flag as we will not test any task. + */ + if (env->src_rq->nr_running <= 1) { + env->flags &= ~LBF_ALL_PINNED; + return 0; + } if (env->imbalance <= 0) return 0; @@ -7620,11 +9542,9 @@ static int detach_tasks(struct lb_env *env) * We don't want to steal all, otherwise we may be treated likewise, * which could at worst lead to a livelock crash. */ - if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) + if (env->idle && env->src_rq->nr_running <= 1) break; - p = list_last_entry(tasks, struct task_struct, se.group_node); - env->loop++; /* We've more or less seen every task there is, call it quits */ if (env->loop > env->loop_max) @@ -7632,11 +9552,13 @@ static int detach_tasks(struct lb_env *env) /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sched_nr_migrate_break; + env->loop_break += SCHED_NR_MIGRATE_BREAK; env->flags |= LBF_NEED_BREAK; break; } + p = list_last_entry(tasks, struct task_struct, se.group_node); + if (!can_migrate_task(p, env)) goto next; @@ -7661,8 +9583,7 @@ static int detach_tasks(struct lb_env *env) * scheduler fails to find a good waiting task to * migrate. */ - if (load/2 > env->imbalance && - env->sd->nr_balance_failed <= env->sd->cache_nice_tries) + if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= load; @@ -7671,7 +9592,7 @@ static int detach_tasks(struct lb_env *env) case migrate_util: util = task_util_est(p); - if (util > env->imbalance) + if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= util; @@ -7683,7 +9604,7 @@ static int detach_tasks(struct lb_env *env) case migrate_misfit: /* This is not a misfit task */ - if (task_fits_capacity(p, capacity_of(env->src_cpu))) + if (task_fits_cpu(p, env->src_cpu)) goto next; env->imbalance = 0; @@ -7714,6 +9635,9 @@ static int detach_tasks(struct lb_env *env) continue; next: + if (p->sched_task_hot) + schedstat_inc(p->stats.nr_failed_migrations_hot); + list_move(&p->se.group_node, tasks); } @@ -7732,11 +9656,11 @@ next: */ static void attach_task(struct rq *rq, struct task_struct *p) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); - BUG_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } /* @@ -7790,81 +9714,56 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) static inline bool others_have_blocked(struct rq *rq) { - if (READ_ONCE(rq->avg_rt.util_avg)) + if (cpu_util_rt(rq)) return true; - if (READ_ONCE(rq->avg_dl.util_avg)) + if (cpu_util_dl(rq)) return true; - if (thermal_load_avg(rq)) + if (hw_load_avg(rq)) return true; -#ifdef CONFIG_HAVE_SCHED_AVG_IRQ - if (READ_ONCE(rq->avg_irq.util_avg)) + if (cpu_util_irq(rq)) return true; -#endif return false; } -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +static inline void update_blocked_load_tick(struct rq *rq) { - rq->last_blocked_load_update_tick = jiffies; + WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); +} +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +{ if (!has_blocked) rq->has_blocked_load = 0; } #else static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } +static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif static bool __update_blocked_others(struct rq *rq, bool *done) { - const struct sched_class *curr_class; - u64 now = rq_clock_pelt(rq); - unsigned long thermal_pressure; - bool decayed; + bool updated; /* * update_load_avg() can call cpufreq_update_util(). Make sure that RT, * DL and IRQ signals have been updated before updating CFS. */ - curr_class = rq->curr->sched_class; - - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); - - decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | - update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | - update_irq_load_avg(rq, 0); + updated = update_other_load_avgs(rq); if (others_have_blocked(rq)) *done = false; - return decayed; + return updated; } #ifdef CONFIG_FAIR_GROUP_SCHED -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load.weight) - return false; - - if (cfs_rq->avg.load_sum) - return false; - - if (cfs_rq->avg.util_sum) - return false; - - if (cfs_rq->avg.runnable_sum) - return false; - - return true; -} - static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq, *pos; @@ -7879,7 +9778,10 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) struct sched_entity *se; if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); + + if (cfs_rq->nr_queued == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); if (cfs_rq == &rq->cfs) decayed = true; @@ -7888,7 +9790,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) - update_load_avg(cfs_rq_of(se), se, 0); + update_load_avg(cfs_rq_of(se), se, UPDATE_TG); /* * There can be a lot of idle CPU cgroups. Don't let fully @@ -7970,13 +9872,14 @@ static unsigned long task_h_load(struct task_struct *p) } #endif -static void update_blocked_averages(int cpu) +static void sched_balance_update_blocked_averages(int cpu) { bool decayed = false, done = true; struct rq *rq = cpu_rq(cpu); struct rq_flags rf; rq_lock_irqsave(rq, &rf); + update_blocked_load_tick(rq); update_rq_clock(rq); decayed |= __update_blocked_others(rq, &done); @@ -7988,24 +9891,25 @@ static void update_blocked_averages(int cpu) rq_unlock_irqrestore(rq, &rf); } -/********** Helpers for find_busiest_group ************************/ +/********** Helpers for sched_balance_find_src_group ************************/ /* - * sg_lb_stats - stats of a sched_group required for load_balancing + * sg_lb_stats - stats of a sched_group required for load-balancing: */ struct sg_lb_stats { - unsigned long avg_load; /*Avg load across the CPUs of the group */ - unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long group_capacity; - unsigned long group_util; /* Total utilization over the CPUs of the group */ - unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ - unsigned int sum_nr_running; /* Nr of tasks running in the group */ - unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ - unsigned int idle_cpus; + unsigned long avg_load; /* Avg load over the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long group_capacity; /* Capacity over the CPUs of the group */ + unsigned long group_util; /* Total utilization over the CPUs of the group */ + unsigned long group_runnable; /* Total runnable time over the CPUs of the group */ + unsigned int sum_nr_running; /* Nr of all tasks running in the group */ + unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */ + unsigned int idle_cpus; /* Nr of idle CPUs in the group */ unsigned int group_weight; enum group_type group_type; - unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ - unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -8013,19 +9917,18 @@ struct sg_lb_stats { }; /* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. + * sd_lb_stats - stats of a sched_domain required for load-balancing: */ struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *local; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_capacity; /* Total capacity of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - unsigned int prefer_sibling; /* tasks should go to sibling first */ - - struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ - struct sg_lb_stats local_stat; /* Statistics of the local group */ + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *local; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_capacity; /* Total capacity of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + unsigned int prefer_sibling; /* Tasks should go to sibling first */ + + struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */ + struct sg_lb_stats local_stat; /* Statistics of the local group */ }; static inline void init_sd_lb_stats(struct sd_lb_stats *sds) @@ -8049,10 +9952,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } -static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) +static unsigned long scale_rt_capacity(int cpu) { + unsigned long max = get_actual_cpu_capacity(cpu); struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -8064,12 +9967,9 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) /* * avg_rt.util_avg and avg_dl.util_avg track binary signals * (running and not running) with weights 0 and 1024 respectively. - * avg_thermal.load_avg tracks thermal pressure and the weighted - * average uses the actual delta max capacity(load). */ - used = READ_ONCE(rq->avg_rt.util_avg); - used += READ_ONCE(rq->avg_dl.util_avg); - used += thermal_load_avg(rq); + used = cpu_util_rt(rq); + used += cpu_util_dl(rq); if (unlikely(used >= max)) return 1; @@ -8081,15 +9981,15 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); - if (!capacity) capacity = 1; cpu_rq(cpu)->cpu_capacity = capacity; + trace_sched_cpu_capacity_tp(cpu_rq(cpu)); + sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; sdg->sgc->max_capacity = capacity; @@ -8159,19 +10059,13 @@ static inline int check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { return ((rq->cpu_capacity * sd->imbalance_pct) < - (rq->cpu_capacity_orig * 100)); + (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } -/* - * Check whether a rq has a misfit task and if it looks like we can actually - * help that task: we can migrate the task to a CPU of higher capacity, or - * the task's current CPU is heavily pressured. - */ -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +/* Check if the rq has a misfit task */ +static inline bool check_misfit_status(struct rq *rq) { - return rq->misfit_task_load && - (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || - check_cpu_capacity(rq, sd)); + return rq->misfit_task_load; } /* @@ -8195,7 +10089,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). And calculate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditions to allow it + * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the @@ -8211,7 +10105,7 @@ static inline int sg_imbalanced(struct sched_group *group) /* * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. - * We consider that a group has spare capacity if the * number of task is + * We consider that a group has spare capacity if the number of task is * smaller than the number of CPUs or if the utilization is lower than the * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into @@ -8262,26 +10156,6 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) return false; } -/* - * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller - * per-CPU capacity than sched_group ref. - */ -static inline bool -group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) -{ - return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity); -} - -/* - * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller - * per-CPU capacity_orig than sched_group ref. - */ -static inline bool -group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) -{ - return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity); -} - static inline enum group_type group_classify(unsigned int imbalance_pct, struct sched_group *group, @@ -8296,6 +10170,9 @@ group_type group_classify(unsigned int imbalance_pct, if (sgs->group_asym_packing) return group_asym_packing; + if (sgs->group_smt_balance) + return group_smt_balance; + if (sgs->group_misfit_task_load) return group_misfit_task; @@ -8305,70 +10182,180 @@ group_type group_classify(unsigned int imbalance_pct, return group_has_spare; } -static bool update_nohz_stats(struct rq *rq, bool force) +/** + * sched_use_asym_prio - Check whether asym_packing priority must be used + * @sd: The scheduling domain of the load balancing + * @cpu: A CPU + * + * Always use CPU priority when balancing load between SMT siblings. When + * balancing load between cores, it is not sufficient that @cpu is idle. Only + * use CPU priority if the whole core is idle. + * + * Returns: True if the priority of @cpu must be followed. False otherwise. + */ +static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) { -#ifdef CONFIG_NO_HZ_COMMON - unsigned int cpu = rq->cpu; + if (!(sd->flags & SD_ASYM_PACKING)) + return false; - if (!rq->has_blocked_load) + if (!sched_smt_active()) + return true; + + return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu); +} + +static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu) +{ + /* + * First check if @dst_cpu can do asym_packing load balance. Only do it + * if it has higher priority than @src_cpu. + */ + return sched_use_asym_prio(sd, dst_cpu) && + sched_asym_prefer(dst_cpu, src_cpu); +} + +/** + * sched_group_asym - Check if the destination CPU can do asym_packing balance + * @env: The load balancing environment + * @sgs: Load-balancing statistics of the candidate busiest group + * @group: The candidate busiest group + * + * @env::dst_cpu can do asym_packing if it has higher priority than the + * preferred CPU of @group. + * + * Return: true if @env::dst_cpu can do with asym_packing load balance. False + * otherwise. + */ +static inline bool +sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) +{ + /* + * CPU priorities do not make sense for SMT cores with more than one + * busy sibling. + */ + if ((group->flags & SD_SHARE_CPUCAPACITY) && + (sgs->group_weight - sgs->idle_cpus != 1)) return false; - if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); +} + +/* One group has more than one SMT CPU while the other group does not */ +static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, + struct sched_group *sg2) +{ + if (!sg1 || !sg2) return false; - if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) - return true; + return (sg1->flags & SD_SHARE_CPUCAPACITY) != + (sg2->flags & SD_SHARE_CPUCAPACITY); +} - update_blocked_averages(cpu); +static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + if (!env->idle) + return false; + + /* + * For SMT source group, it is better to move a task + * to a CPU that doesn't have multiple tasks sharing its CPU capacity. + * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY + * will not be on. + */ + if (group->flags & SD_SHARE_CPUCAPACITY && + sgs->sum_h_nr_running > 1) + return true; - return rq->has_blocked_load; -#else return false; -#endif +} + +static inline long sibling_imbalance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sg_lb_stats *busiest, + struct sg_lb_stats *local) +{ + int ncores_busiest, ncores_local; + long imbalance; + + if (!env->idle || !busiest->sum_nr_running) + return 0; + + ncores_busiest = sds->busiest->cores; + ncores_local = sds->local->cores; + + if (ncores_busiest == ncores_local) { + imbalance = busiest->sum_nr_running; + lsub_positive(&imbalance, local->sum_nr_running); + return imbalance; + } + + /* Balance such that nr_running/ncores ratio are same on both groups */ + imbalance = ncores_local * busiest->sum_nr_running; + lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running); + /* Normalize imbalance and do rounding on normalization */ + imbalance = 2 * imbalance + ncores_local + ncores_busiest; + imbalance /= ncores_local + ncores_busiest; + + /* Take advantage of resource in an empty sched group */ + if (imbalance <= 1 && local->sum_nr_running == 0 && + busiest->sum_nr_running > 1) + imbalance = 2; + + return imbalance; +} + +static inline bool +sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) +{ + /* + * When there is more than 1 task, the group_overloaded case already + * takes care of cpu with reduced capacity + */ + if (rq->cfs.h_nr_runnable != 1) + return false; + + return check_cpu_capacity(rq, sd); } /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. + * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. - * @sg_status: Holds flag indicating the status of the sched_group + * @sg_overloaded: sched_group is overloaded + * @sg_overutilized: sched_group is overutilized */ static inline void update_sg_lb_stats(struct lb_env *env, + struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, - int *sg_status) + bool *sg_overloaded, + bool *sg_overutilized) { - int i, nr_running, local_group; + int i, nr_running, local_group, sd_flags = env->sd->flags; + bool balancing_at_rd = !env->sd->parent; memset(sgs, 0, sizeof(*sgs)); - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + local_group = group == sds->local; for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); + unsigned long load = cpu_load(rq); - if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) - env->flags |= LBF_NOHZ_AGAIN; - - sgs->group_load += cpu_load(rq); - sgs->group_util += cpu_util(i); + sgs->group_load += load; + sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); - sgs->sum_h_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable; nr_running = rq->nr_running; sgs->sum_nr_running += nr_running; - if (nr_running > 1) - *sg_status |= SG_OVERLOAD; - if (cpu_overutilized(i)) - *sg_status |= SG_OVERUTILIZED; + *sg_overutilized = 1; -#ifdef CONFIG_NUMA_BALANCING - sgs->nr_numa_running += rq->nr_numa_running; - sgs->nr_preferred_running += rq->nr_preferred_running; -#endif /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -8378,29 +10365,46 @@ static inline void update_sg_lb_stats(struct lb_env *env, continue; } + /* Overload indicator is only updated at root domain */ + if (balancing_at_rd && nr_running > 1) + *sg_overloaded = 1; + +#ifdef CONFIG_NUMA_BALANCING + /* Only fbq_classify_group() uses this to classify NUMA groups */ + if (sd_flags & SD_NUMA) { + sgs->nr_numa_running += rq->nr_numa_running; + sgs->nr_preferred_running += rq->nr_preferred_running; + } +#endif if (local_group) continue; - /* Check for a misfit task on the cpu */ - if (env->sd->flags & SD_ASYM_CPUCAPACITY && - sgs->group_misfit_task_load < rq->misfit_task_load) { - sgs->group_misfit_task_load = rq->misfit_task_load; - *sg_status |= SG_OVERLOAD; + if (sd_flags & SD_ASYM_CPUCAPACITY) { + /* Check for a misfit task on the cpu */ + if (sgs->group_misfit_task_load < rq->misfit_task_load) { + sgs->group_misfit_task_load = rq->misfit_task_load; + *sg_overloaded = 1; + } + } else if (env->idle && sched_reduced_capacity(rq, env->sd)) { + /* Check for a task running on a CPU with reduced capacity */ + if (sgs->group_misfit_task_load < load) + sgs->group_misfit_task_load = load; } } - /* Check if dst CPU is idle and preferred to this group */ - if (env->sd->flags & SD_ASYM_PACKING && - env->idle != CPU_NOT_IDLE && - sgs->sum_h_nr_running && - sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { - sgs->group_asym_packing = 1; - } - sgs->group_capacity = group->sgc->capacity; sgs->group_weight = group->group_weight; + /* Check if dst CPU is idle and preferred to this group */ + if (!local_group && env->idle && sgs->sum_h_nr_running && + sched_group_asym(env, sgs, group)) + sgs->group_asym_packing = 1; + + /* Check for loaded SMT group to be balanced to dst CPU */ + if (!local_group && smt_balance(env, sgs, group)) + sgs->group_smt_balance = 1; + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); /* Computing avg_load makes sense only when group is overloaded */ @@ -8439,8 +10443,9 @@ static bool update_sd_pick_busiest(struct lb_env *env, * CPUs in the group should either be possible to resolve * internally or be covered by avg_load imbalance (eventually). */ - if (sgs->group_type == group_misfit_task && - (!group_smaller_max_cpu_capacity(sg, sds->local) || + if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && + (sgs->group_type == group_misfit_task) && + (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || sds->local_stat.group_type != group_has_spare)) return false; @@ -8458,9 +10463,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, switch (sgs->group_type) { case group_overloaded: /* Select the overloaded group with highest avg_load. */ - if (sgs->avg_load <= busiest->avg_load) - return false; - break; + return sgs->avg_load > busiest->avg_load; case group_imbalanced: /* @@ -8471,18 +10474,24 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_asym_packing: /* Prefer to move from lowest priority CPU's work */ - if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu)) - return false; - break; + return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); case group_misfit_task: /* * If we have more than one misfit sg go with the biggest * misfit. */ - if (sgs->group_misfit_task_load < busiest->group_misfit_task_load) - return false; - break; + return sgs->group_misfit_task_load > busiest->group_misfit_task_load; + + case group_smt_balance: + /* + * Check if we have spare CPUs on either SMT group to + * choose has spare or fully busy handling. + */ + if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) + goto has_spare; + + fallthrough; case group_fully_busy: /* @@ -8493,15 +10502,40 @@ static bool update_sd_pick_busiest(struct lb_env *env, * contention when accessing shared HW resources. * * XXX for now avg_load is not computed and always 0 so we - * select the 1st one. + * select the 1st one, except if @sg is composed of SMT + * siblings. */ - if (sgs->avg_load <= busiest->avg_load) + + if (sgs->avg_load < busiest->avg_load) return false; + + if (sgs->avg_load == busiest->avg_load) { + /* + * SMT sched groups need more help than non-SMT groups. + * If @sg happens to also be SMT, either choice is good. + */ + if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) + return false; + } + break; case group_has_spare: /* - * Select not overloaded group with lowest number of idle cpus + * Do not pick sg with SMT CPUs over sg with pure CPUs, + * as we do not want to pull task off SMT core with one task + * and make the core idle. + */ + if (smt_vs_nonsmt_groups(sds->busiest, sg)) { + if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1) + return false; + else + return true; + } +has_spare: + + /* + * Select not overloaded group with lowest number of idle CPUs * and highest number of running tasks. We could also compare * the spare capacity which is more stable but it can end up * that the group has less spare capacity but finally more idle @@ -8524,7 +10558,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, */ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && (sgs->group_type <= group_fully_busy) && - (group_smaller_min_cpu_capacity(sds->local, sg))) + (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu)))) return false; return true; @@ -8599,10 +10633,8 @@ static int idle_cpu_without(int cpu, struct task_struct *p) * be computed and tested before calling idle_cpu_without(). */ -#ifdef CONFIG_SMP if (rq->ttwu_pending) return 0; -#endif return 1; } @@ -8623,6 +10655,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, memset(sgs, 0, sizeof(*sgs)); + /* Assume that task can't fit any CPU of the group */ + if (sd->flags & SD_ASYM_CPUCAPACITY) + sgs->group_misfit_task_load = 1; + for_each_cpu(i, sched_group_span(group)) { struct rq *rq = cpu_rq(i); unsigned int local; @@ -8631,7 +10667,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_util += cpu_util_without(i, p); sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); - sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local; nr_running = rq->nr_running - local; sgs->sum_nr_running += nr_running; @@ -8642,12 +10678,12 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, if (!nr_running && idle_cpu_without(i, p)) sgs->idle_cpus++; - } + /* Check if task fits in the CPU */ + if (sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load && + task_fits_cpu(p, i)) + sgs->group_misfit_task_load = 0; - /* Check if task fits in the group */ - if (sd->flags & SD_ASYM_CPUCAPACITY && - !task_fits_capacity(p, group->sgc->max_capacity)) { - sgs->group_misfit_task_load = 1; } sgs->group_capacity = group->sgc->capacity; @@ -8692,6 +10728,7 @@ static bool update_pick_idlest(struct sched_group *idlest, case group_imbalanced: case group_asym_packing: + case group_smt_balance: /* Those types are not used in the slow wakeup path */ return false; @@ -8703,8 +10740,14 @@ static bool update_pick_idlest(struct sched_group *idlest, case group_has_spare: /* Select group with most idle CPUs */ - if (idlest_sgs->idle_cpus >= sgs->idle_cpus) + if (idlest_sgs->idle_cpus > sgs->idle_cpus) return false; + + /* Select group with lowest group_util */ + if (idlest_sgs->idle_cpus == sgs->idle_cpus && + idlest_sgs->group_util <= sgs->group_util) + return false; + break; } @@ -8712,13 +10755,13 @@ static bool update_pick_idlest(struct sched_group *idlest, } /* - * find_idlest_group() finds and returns the least busy CPU group within the + * sched_balance_find_dst_group() finds and returns the least busy CPU group within the * domain. * * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups; struct sg_lb_stats local_sgs, tmp_sgs; @@ -8729,9 +10772,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) .group_type = group_overloaded, }; - imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - do { int local_group; @@ -8740,6 +10780,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) p->cpus_ptr)) continue; + /* Skip over this group if no cookie matched */ + if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group)) + continue; + local_group = cpumask_test_cpu(this_cpu, sched_group_span(group)); @@ -8785,6 +10829,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) switch (local_sgs.group_type) { case group_overloaded: case group_fully_busy: + + /* Calculate allowed imbalance based on load */ + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + /* * When comparing groups across NUMA domains, it's possible for * the local domain to be very lightly loaded relative to the @@ -8811,6 +10860,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) case group_imbalanced: case group_asym_packing: + case group_smt_balance: /* Those type are not used in the slow wakeup path */ return NULL; @@ -8821,7 +10871,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) break; case group_has_spare: +#ifdef CONFIG_NUMA if (sd->flags & SD_NUMA) { + int imb_numa_nr = sd->imb_numa_nr; #ifdef CONFIG_NUMA_BALANCING int idlest_cpu; /* @@ -8834,16 +10886,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) idlest_cpu = cpumask_first(sched_group_span(idlest)); if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) return idlest; -#endif +#endif /* CONFIG_NUMA_BALANCING */ /* - * Otherwise, keep the task on this node to stay close - * its wakeup source and improve locality. If there is - * a real need of migration, periodic load balance will + * Otherwise, keep the task close to the wakeup source + * and improve locality if the number of running tasks + * would remain below threshold where an imbalance is + * allowed while accounting for the possibility the + * task is pinned to a subset of CPUs. If there is a + * real need of migration, periodic load balance will * take care of it. */ - if (local_sgs.idle_cpus) + if (p->nr_cpus_allowed != NR_CPUS) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); + + cpumask_and(cpus, sched_group_span(local), p->cpus_ptr); + imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr); + } + + imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus); + if (!adjust_numa_imbalance(imbalance, + local_sgs.sum_nr_running + 1, + imb_numa_nr)) { return NULL; + } } +#endif /* CONFIG_NUMA */ /* * Select group with highest number of idle CPUs. We could also @@ -8859,6 +10926,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; } +static void update_idle_cpu_scan(struct lb_env *env, + unsigned long sum_util) +{ + struct sched_domain_shared *sd_share; + int llc_weight, pct; + u64 x, y, tmp; + /* + * Update the number of CPUs to scan in LLC domain, which could + * be used as a hint in select_idle_cpu(). The update of sd_share + * could be expensive because it is within a shared cache line. + * So the write of this hint only occurs during periodic load + * balancing, rather than CPU_NEWLY_IDLE, because the latter + * can fire way more frequently than the former. + */ + if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) + return; + + llc_weight = per_cpu(sd_llc_size, env->dst_cpu); + if (env->sd->span_weight != llc_weight) + return; + + sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); + if (!sd_share) + return; + + /* + * The number of CPUs to search drops as sum_util increases, when + * sum_util hits 85% or above, the scan stops. + * The reason to choose 85% as the threshold is because this is the + * imbalance_pct(117) when a LLC sched group is overloaded. + * + * let y = SCHED_CAPACITY_SCALE - p * x^2 [1] + * and y'= y / SCHED_CAPACITY_SCALE + * + * x is the ratio of sum_util compared to the CPU capacity: + * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE) + * y' is the ratio of CPUs to be scanned in the LLC domain, + * and the number of CPUs to scan is calculated by: + * + * nr_scan = llc_weight * y' [2] + * + * When x hits the threshold of overloaded, AKA, when + * x = 100 / pct, y drops to 0. According to [1], + * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000 + * + * Scale x by SCHED_CAPACITY_SCALE: + * x' = sum_util / llc_weight; [3] + * + * and finally [1] becomes: + * y = SCHED_CAPACITY_SCALE - + * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4] + * + */ + /* equation [3] */ + x = sum_util; + do_div(x, llc_weight); + + /* equation [4] */ + pct = env->sd->imbalance_pct; + tmp = x * x * pct * pct; + do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); + tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); + y = SCHED_CAPACITY_SCALE - tmp; + + /* equation [2] */ + y *= llc_weight; + do_div(y, SCHED_CAPACITY_SCALE); + if ((int)y != sd_share->nr_idle_scan) + WRITE_ONCE(sd_share->nr_idle_scan, (int)y); +} + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -8867,16 +11005,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { - struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int sg_status = 0; - -#ifdef CONFIG_NO_HZ_COMMON - if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) - env->flags |= LBF_NOHZ_STATS; -#endif + unsigned long sum_util = 0; + bool sg_overloaded = 0, sg_overutilized = 0; do { struct sg_lb_stats *sgs = &tmp_sgs; @@ -8892,70 +11025,44 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, sgs, &sg_status); - - if (local_group) - goto next_group; - + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); - if (update_sd_pick_busiest(env, sds, sg, sgs)) { + if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; } -next_group: /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; + sum_util += sgs->group_util; sg = sg->next; } while (sg != env->sd->groups); - /* Tag domain that child domain prefers tasks go to siblings first */ - sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + /* + * Indicate that the child domain of the busiest group prefers tasks + * go to a child's sibling domains first. NB the flags of a sched group + * are those of the child domain. + */ + if (sds->busiest) + sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING); -#ifdef CONFIG_NO_HZ_COMMON - if ((env->flags & LBF_NOHZ_AGAIN) && - cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { - - WRITE_ONCE(nohz.next_blocked, - jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD)); - } -#endif if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); if (!env->sd->parent) { - struct root_domain *rd = env->dst_rq->rd; - /* update overload indicator if we are at root domain */ - WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD); + set_rd_overloaded(env->dst_rq->rd, sg_overloaded); /* Update over-utilization (tipping point, U >= 0) indicator */ - WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); - } else if (sg_status & SG_OVERUTILIZED) { - struct root_domain *rd = env->dst_rq->rd; - - WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); - trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); + } else if (sg_overutilized) { + set_rd_overutilized(env->dst_rq->rd, sg_overutilized); } -} -static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) -{ - unsigned int imbalance_min; - - /* - * Allow a small imbalance based on a simple pair of communicating - * tasks that remain local when the source domain is almost idle. - */ - imbalance_min = 2; - if (src_nr_running <= imbalance_min) - return 0; - - return imbalance; + update_idle_cpu_scan(env, sum_util); } /** @@ -8972,9 +11079,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s busiest = &sds->busiest_stat; if (busiest->group_type == group_misfit_task) { - /* Set imbalance to allow misfit tasks to be balanced. */ - env->migration_type = migrate_misfit; - env->imbalance = 1; + if (env->sd->flags & SD_ASYM_CPUCAPACITY) { + /* Set imbalance to allow misfit tasks to be balanced. */ + env->migration_type = migrate_misfit; + env->imbalance = 1; + } else { + /* + * Set load imbalance to allow moving task from cpu + * with reduced capacity. + */ + env->migration_type = migrate_load; + env->imbalance = busiest->group_misfit_task_load; + } return; } @@ -8988,6 +11104,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return; } + if (busiest->group_type == group_smt_balance) { + /* Reduce number of tasks sharing CPU capacity */ + env->migration_type = migrate_task; + env->imbalance = 1; + return; + } + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages @@ -9005,7 +11128,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * emptying busiest. */ if (local->group_type == group_has_spare) { - if (busiest->group_type > group_fully_busy) { + if ((busiest->group_type > group_fully_busy) && + !(env->sd->flags & SD_SHARE_LLC)) { /* * If busiest is overloaded, try to fill spare * capacity. This might end up creating spare capacity @@ -9025,7 +11149,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * waiting task in this overloaded busiest group. Let's * try to pull it. */ - if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) { + if (env->idle && env->imbalance == 0) { env->migration_type = migrate_task; env->imbalance = 1; } @@ -9034,29 +11158,34 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } if (busiest->group_weight == 1 || sds->prefer_sibling) { - unsigned int nr_diff = busiest->sum_nr_running; /* * When prefer sibling, evenly spread running tasks on * groups. */ env->migration_type = migrate_task; - lsub_positive(&nr_diff, local->sum_nr_running); - env->imbalance = nr_diff >> 1; + env->imbalance = sibling_imbalance(env, sds, busiest, local); } else { /* * If there is no overload, we just want to even the number of - * idle cpus. + * idle CPUs. */ env->migration_type = migrate_task; - env->imbalance = max_t(long, 0, (local->idle_cpus - - busiest->idle_cpus) >> 1); + env->imbalance = max_t(long, 0, + (local->idle_cpus - busiest->idle_cpus)); } +#ifdef CONFIG_NUMA /* Consider allowing a small imbalance between NUMA groups */ - if (env->sd->flags & SD_NUMA) + if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running); + local->sum_nr_running + 1, + env->sd->imb_numa_nr); + } +#endif + + /* Number of tasks to move to restore balance */ + env->imbalance >>= 1; return; } @@ -9074,8 +11203,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / local->group_capacity; - sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / - sds->total_capacity; /* * If the local group is more loaded than the selected * busiest group don't try to pull any tasks. @@ -9084,6 +11211,19 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->imbalance = 0; return; } + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; + + /* + * If the local group is more loaded than the average system + * load, don't try to pull any tasks. + */ + if (local->avg_load >= sds->avg_load) { + env->imbalance = 0; + return; + } + } /* @@ -9101,7 +11241,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ) / SCHED_CAPACITY_SCALE; } -/******* find_busiest_group() helpers end here *********************/ +/******* sched_balance_find_src_group() helpers end here *********************/ /* * Decision matrix according to the local and busiest group type: @@ -9109,7 +11249,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced - * misfit_task force N/A N/A N/A force force + * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load @@ -9124,17 +11264,16 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ /** - * find_busiest_group - Returns the busiest group within the sched_domain + * sched_balance_find_src_group - Returns the busiest group within the sched_domain * if there is an imbalance. + * @env: The load balancing environment. * * Also calculates the amount of runnable load which should be moved * to restore balance. * - * @env: The load balancing environment. - * * Return: - The busiest group if imbalance exists. */ -static struct sched_group *find_busiest_group(struct lb_env *env) +static struct sched_group *sched_balance_find_src_group(struct lb_env *env) { struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; @@ -9147,24 +11286,20 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds); - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) - goto out_balanced; - } - - local = &sds.local_stat; - busiest = &sds.busiest_stat; - /* There is no busy sibling group to pull tasks from */ if (!sds.busiest) goto out_balanced; + busiest = &sds.busiest_stat; + /* Misfit tasks should be dealt with regardless of the avg load */ if (busiest->group_type == group_misfit_task) goto force_balance; + if (!is_rd_overutilized(env->dst_rq->rd) && + rcu_dereference(env->dst_rq->rd->pd)) + goto out_balanced; + /* ASYM feature bypasses nice load balance check */ if (busiest->group_type == group_asym_packing) goto force_balance; @@ -9177,6 +11312,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; + local = &sds.local_stat; /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -9216,22 +11352,32 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto out_balanced; } - /* Try to move all excess tasks to child's sibling domain */ + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. + */ if (sds.prefer_sibling && local->group_type == group_has_spare && - busiest->sum_nr_running > local->sum_nr_running + 1) + sibling_imbalance(env, &sds, busiest, local) > 1) goto force_balance; if (busiest->group_type != group_overloaded) { - if (env->idle == CPU_NOT_IDLE) + if (!env->idle) { /* * If the busiest group is not overloaded (and as a * result the local one too) but this CPU is already * busy, let another idle CPU try to pull task. */ goto out_balanced; + } + + if (busiest->group_type == group_smt_balance && + smt_vs_nonsmt_groups(sds.local, sds.busiest)) { + /* Let non SMT CPU pull from SMT CPU sharing with sibling */ + goto force_balance; + } if (busiest->group_weight > 1 && - local->idle_cpus <= (busiest->idle_cpus + 1)) + local->idle_cpus <= (busiest->idle_cpus + 1)) { /* * If the busiest group is not overloaded * and there is no imbalance between this and busiest @@ -9242,12 +11388,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * there is more than 1 CPU per group. */ goto out_balanced; + } - if (busiest->sum_h_nr_running == 1) + if (busiest->sum_h_nr_running == 1) { /* * busiest doesn't have any tasks waiting to run */ goto out_balanced; + } } force_balance: @@ -9261,9 +11409,9 @@ out_balanced: } /* - * find_busiest_queue - find the busiest runqueue among the CPUs in the group. + * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group. */ -static struct rq *find_busiest_queue(struct lb_env *env, +static struct rq *sched_balance_find_src_rq(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; @@ -9301,8 +11449,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + nr_running = rq->cfs.h_nr_runnable; + if (!nr_running) + continue; + capacity = capacity_of(i); - nr_running = rq->cfs.h_nr_running; /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -9311,10 +11462,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, * average load. */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && - capacity_of(env->dst_cpu) < capacity && + !capacity_greater(capacity_of(env->dst_cpu), capacity) && nr_running == 1) continue; + /* + * Make sure we only pull tasks from a CPU of lower priority + * when balancing between SMT siblings. + * + * If balancing between cores, let lower priority CPUs help + * SMT cores with more than one busy sibling. + */ + if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1) + continue; + switch (env->migration_type) { case migrate_load: /* @@ -9348,7 +11509,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, break; case migrate_util: - util = cpu_util(cpu_of(rq)); + util = cpu_util_cfs_boost(i); /* * Don't try to pull utilization from a CPU with one @@ -9399,30 +11560,55 @@ static inline bool asym_active_balance(struct lb_env *env) { /* - * ASYM_PACKING needs to force migrate tasks from busy but - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. + * ASYM_PACKING needs to force migrate tasks from busy but lower + * priority CPUs in order to pack all tasks in the highest priority + * CPUs. When done between cores, do it only if the whole core if the + * whole core is idle. + * + * If @env::src_cpu is an SMT core with busy siblings, let + * the lower priority @env::dst_cpu help it. Do not follow + * CPU priority. */ - return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu); + return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) && + (sched_asym_prefer(env->dst_cpu, env->src_cpu) || + !sched_use_asym_prio(env->sd, env->src_cpu)); } static inline bool -voluntary_active_balance(struct lb_env *env) +imbalanced_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + /* + * The imbalanced case includes the case of pinned tasks preventing a fair + * distribution of the load on the system but also the even distribution of the + * threads on a system with spare capacity + */ + if ((env->migration_type == migrate_task) && + (sd->nr_balance_failed > sd->cache_nice_tries+2)) + return 1; + + return 0; +} + +static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; if (asym_active_balance(env)) return 1; + if (imbalanced_active_balance(env)) + return 1; + /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. * It's worth migrating the task if the src_cpu's capacity is reduced * because of other sched_class or IRQs if more capacity stays * available on dst_cpu. */ - if ((env->idle != CPU_NOT_IDLE) && - (env->src_rq->cfs.h_nr_running == 1)) { + if (env->idle && + (env->src_rq->cfs.h_nr_runnable == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) return 1; @@ -9434,22 +11620,13 @@ voluntary_active_balance(struct lb_env *env) return 0; } -static int need_active_balance(struct lb_env *env) -{ - struct sched_domain *sd = env->sd; - - if (voluntary_active_balance(env)) - return 1; - - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); -} - static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { + struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask); struct sched_group *sg = env->sd->groups; - int cpu; + int cpu, idle_smt = -1; /* * Ensure the balancing environment is consistent; can happen @@ -9461,28 +11638,83 @@ static int should_we_balance(struct lb_env *env) /* * In the newly idle case, we will allow all the CPUs * to do the newly idle load balance. + * + * However, we bail out if we already have tasks or a wakeup pending, + * to optimize wakeup latency. */ - if (env->idle == CPU_NEWLY_IDLE) + if (env->idle == CPU_NEWLY_IDLE) { + if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending) + return 0; return 1; + } + cpumask_copy(swb_cpus, group_balance_mask(sg)); /* Try to find first idle CPU */ - for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { + for_each_cpu_and(cpu, swb_cpus, env->cpus) { if (!idle_cpu(cpu)) continue; - /* Are we the first idle CPU? */ + /* + * Don't balance to idle SMT in busy core right away when + * balancing cores, but remember the first idle SMT CPU for + * later consideration. Find CPU on an idle core first. + */ + if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { + if (idle_smt == -1) + idle_smt = cpu; + /* + * If the core is not idle, and first SMT sibling which is + * idle has been found, then its not needed to check other + * SMT siblings for idleness: + */ +#ifdef CONFIG_SCHED_SMT + cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu)); +#endif + continue; + } + + /* + * Are we the first idle core in a non-SMT domain or higher, + * or the first idle CPU in a SMT domain? + */ return cpu == env->dst_cpu; } + /* Are we the first idle CPU with busy siblings? */ + if (idle_smt != -1) + return idle_smt == env->dst_cpu; + /* Are we the first CPU of this group ? */ return group_balance_cpu(sg) == env->dst_cpu; } +static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd, + enum cpu_idle_type idle) +{ + if (!schedstat_enabled()) + return; + + switch (env->migration_type) { + case migrate_load: + __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance); + break; + case migrate_util: + __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance); + break; + case migrate_task: + __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance); + break; + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; + } +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ -static int load_balance(int this_cpu, struct rq *this_rq, +static int sched_balance_rq(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { @@ -9492,14 +11724,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); - struct lb_env env = { .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_span(sd->groups), + .dst_grpmask = group_balance_mask(sd->groups), .idle = idle, - .loop_break = sched_nr_migrate_break, + .loop_break = SCHED_NR_MIGRATE_BREAK, .cpus = cpus, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), @@ -9515,34 +11746,35 @@ redo: goto out_balanced; } - group = find_busiest_group(&env); + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); goto out_balanced; } - busiest = find_busiest_queue(&env, group); + busiest = sched_balance_find_src_rq(&env, group); if (!busiest) { schedstat_inc(sd->lb_nobusyq[idle]); goto out_balanced; } - BUG_ON(busiest == env.dst_rq); + WARN_ON_ONCE(busiest == env.dst_rq); - schedstat_add(sd->lb_imbalance[idle], env.imbalance); + update_lb_imbalance_stat(&env, sd, idle); env.src_cpu = busiest->cpu; env.src_rq = busiest; ld_moved = 0; + /* Clear this flag as soon as we find a pullable task */ + env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* - * Attempt to move tasks. If find_busiest_group has found + * Attempt to move tasks. If sched_balance_find_src_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - env.flags |= LBF_ALL_PINNED; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -9591,7 +11823,7 @@ more_balance: * load to given_cpu. In rare situations, this may cause * conflicts (balance_cpu and given_cpu/ilb_cpu deciding * _independently_ and at _same_ time to move some load to - * given_cpu) causing exceess load to be moved to given_cpu. + * given_cpu) causing excess load to be moved to given_cpu. * This however should not happen so much in practice and * moreover subsequent load balance cycles should correct the * excess load moved. @@ -9605,7 +11837,7 @@ more_balance: env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_DST_PINNED; env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; /* * Go back to "more_balance" rather than "redo" since we @@ -9637,7 +11869,7 @@ more_balance: */ if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; goto redo; } goto out_all_pinned; @@ -9651,14 +11883,18 @@ more_balance: * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. + * + * Similarly for migration_misfit which is not related to + * load/util migration, don't pollute nr_balance_failed. */ - if (idle != CPU_NEWLY_IDLE) + if (idle != CPU_NEWLY_IDLE && + env.migration_type != migrate_misfit) sd->nr_balance_failed++; if (need_active_balance(&env)) { unsigned long flags; - raw_spin_lock_irqsave(&busiest->lock, flags); + raw_spin_rq_lock_irqsave(busiest, flags); /* * Don't kick the active_load_balance_cpu_stop, @@ -9666,12 +11902,13 @@ more_balance: * moved to this_cpu: */ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { - raw_spin_unlock_irqrestore(&busiest->lock, - flags); - env.flags |= LBF_ALL_PINNED; + raw_spin_rq_unlock_irqrestore(busiest, flags); goto out_one_pinned; } + /* Record that we found at least one task that could run on this_cpu */ + env.flags &= ~LBF_ALL_PINNED; + /* * ->active_balance synchronizes accesses to * ->active_balance_work. Once set, it's cleared @@ -9682,32 +11919,23 @@ more_balance: busiest->push_cpu = this_cpu; active_balance = 1; } - raw_spin_unlock_irqrestore(&busiest->lock, flags); + preempt_disable(); + raw_spin_rq_unlock_irqrestore(busiest, flags); if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); } - - /* We've kicked active balancing, force task migration. */ - sd->nr_balance_failed = sd->cache_nice_tries+1; + preempt_enable(); } - } else + } else { sd->nr_balance_failed = 0; + } - if (likely(!active_balance) || voluntary_active_balance(&env)) { + if (likely(!active_balance) || need_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * detach_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; } goto out; @@ -9739,12 +11967,17 @@ out_one_pinned: ld_moved = 0; /* - * newidle_balance() disregards balance intervals, so we could + * sched_balance_newidle() disregards balance intervals, so we could * repeatedly reach this code, which would lead to balance_interval - * skyrocketting in a short amount of time. Skip the balance_interval + * skyrocketing in a short amount of time. Skip the balance_interval * increase logic to avoid that. + * + * Similarly misfit migration which is not necessarily an indication of + * the system being busy and requires lb to backoff to let it settle + * down. */ - if (env.idle == CPU_NEWLY_IDLE) + if (env.idle == CPU_NEWLY_IDLE || + env.migration_type == migrate_misfit) goto out; /* tune up the balancing interval */ @@ -9766,6 +11999,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); + + /* + * Reduce likelihood of busy balancing at higher domains racing with + * balancing at lower domains by preventing their balancing periods + * from being multiples of each other. + */ + if (cpu_busy) + interval -= 1; + interval = clamp(interval, 1UL, max_load_balance_interval); return interval; @@ -9823,7 +12065,7 @@ static int active_load_balance_cpu_stop(void *data) * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-CPU setup. */ - BUG_ON(busiest_rq == target_rq); + WARN_ON_ONCE(busiest_rq == target_rq); /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); @@ -9840,13 +12082,7 @@ static int active_load_balance_cpu_stop(void *data) .src_cpu = busiest_rq->cpu, .src_rq = busiest_rq, .idle = CPU_IDLE, - /* - * can_migrate_task() doesn't need to compute new_dst_cpu - * for active balancing. Since we have CPU_IDLE, but no - * @dst_grpmask we need to make that test go away with lying - * about DST_PINNED. - */ - .flags = LBF_DST_PINNED, + .flags = LBF_ACTIVE_LB, }; schedstat_inc(sd->alb_count); @@ -9874,10 +12110,23 @@ out_unlock: return 0; } -static DEFINE_SPINLOCK(balancing); +/* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); /* - * Scale the max load_balance interval with the number of CPUs in the system. + * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ void update_max_interval(void) @@ -9885,13 +12134,37 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } +static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +{ + if (cost > sd->max_newidle_lb_cost) { + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + sd->max_newidle_lb_cost = cost; + sd->last_decay_max_lb_cost = jiffies; + } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + /* + * Decay the newidle max times by ~1% per second to ensure that + * it is not outdated and the current max cost is actually + * shorter. + */ + sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; + sd->last_decay_max_lb_cost = jiffies; + + return true; + } + + return false; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) +static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; @@ -9908,14 +12181,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) for_each_domain(cpu, sd) { /* * Decay the newidle max times here because this is a regular - * visit to all the domains. Decay ~1% per second. + * visit to all the domains. */ - if (time_after(jiffies, sd->next_decay_max_lb_cost)) { - sd->max_newidle_lb_cost = - (sd->max_newidle_lb_cost * 253) / 256; - sd->next_decay_max_lb_cost = jiffies + HZ; - need_decay = 1; - } + need_decay = update_newidle_cost(sd, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -9933,25 +12201,25 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { - if (!spin_trylock(&balancing)) + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) goto out; } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { + if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ - idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; - busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); + idle = idle_cpu(cpu); + busy = !idle && !sched_idle_cpu(cpu); } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); } if (need_serialize) - spin_unlock(&balancing); + atomic_set_release(&sched_balance_running, 0); out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; @@ -9973,22 +12241,9 @@ out: * When the cpu is attached to null domain for ex, it will not be * updated. */ - if (likely(update_next_balance)) { + if (likely(update_next_balance)) rq->next_balance = next_balance; -#ifdef CONFIG_NO_HZ_COMMON - /* - * If this CPU has been elected to perform the nohz idle - * balance. Other idle CPUs have already rebalanced with - * nohz_idle_balance() and nohz.next_balance has been - * updated accordingly. This CPU is now running the idle load - * balance for itself and we need to update the - * nohz.next_balance accordingly. - */ - if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) - nohz.next_balance = rq->next_balance; -#endif - } } static inline int on_null_domain(struct rq *rq) @@ -9998,40 +12253,58 @@ static inline int on_null_domain(struct rq *rq) #ifdef CONFIG_NO_HZ_COMMON /* - * idle load balancing details - * - When one of the busy CPUs notice that there may be an idle rebalancing + * NOHZ idle load balancing (ILB) details: + * + * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set - * anywhere yet. */ - static inline int find_new_ilb(void) { - int ilb; + const struct cpumask *hk_mask; + int ilb_cpu; + + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); + + for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { + + if (ilb_cpu == smp_processor_id()) + continue; - for_each_cpu_and(ilb, nohz.idle_cpus_mask, - housekeeping_cpumask(HK_FLAG_MISC)) { - if (idle_cpu(ilb)) - return ilb; + if (idle_cpu(ilb_cpu)) + return ilb_cpu; } - return nr_cpu_ids; + return -1; } /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). + * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU + * SMP function call (IPI). + * + * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set + * (if there is one). */ static void kick_ilb(unsigned int flags) { int ilb_cpu; - nohz.next_balance++; + /* + * Increase nohz.next_balance only when if full ilb is triggered but + * not if we only update stats. + */ + if (flags & NOHZ_BALANCE_KICK) + nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); + if (ilb_cpu < 0) + return; - if (ilb_cpu >= nr_cpu_ids) + /* + * Don't bother if no new NOHZ balance work items for ilb_cpu, + * i.e. all bits in flags are already set in ilb_cpu. + */ + if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags) return; /* @@ -10044,7 +12317,7 @@ static void kick_ilb(unsigned int flags) /* * This way we generate an IPI on the target CPU which - * is idle. And the softirq performing nohz idle load balance + * is idle, and the softirq performing NOHZ idle load balancing * will be run before returning from the IPI. */ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); @@ -10073,7 +12346,7 @@ static void nohz_balancer_kick(struct rq *rq) /* * None are in tickless mode and hence no need for NOHZ idle load - * balancing. + * balancing: */ if (likely(!atomic_read(&nohz.nr_cpus))) return; @@ -10086,7 +12359,7 @@ static void nohz_balancer_kick(struct rq *rq) goto out; if (rq->nr_running >= 2) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto out; } @@ -10095,12 +12368,11 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(rq->sd); if (sd) { /* - * If there's a CFS task and the current CPU has reduced - * capacity; kick the ILB to see if there's a better CPU to run - * on. + * If there's a runnable CFS task and the current CPU has reduced + * capacity, kick the ILB to see if there's a better CPU to run on: */ - if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { - flags = NOHZ_KICK_MASK; + if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } @@ -10111,10 +12383,13 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_PACKING; see if there's a more preferred CPU * currently idle; in which case, kick the ILB to move tasks * around. + * + * When balancing between cores, all the SMT siblings of the + * preferred CPU must be idle. */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { - if (sched_asym_prefer(i, cpu)) { - flags = NOHZ_KICK_MASK; + if (sched_asym(sd, i, cpu)) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } @@ -10126,8 +12401,8 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU * to run the misfit task on. */ - if (check_misfit_status(rq, sd)) { - flags = NOHZ_KICK_MASK; + if (check_misfit_status(rq)) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -10145,22 +12420,25 @@ static void nohz_balancer_kick(struct rq *rq) if (sds) { /* * If there is an imbalance between LLC domains (IOW we could - * increase the overall cache use), we need some less-loaded LLC - * domain to pull some load. Likewise, we may need to spread + * increase the overall cache utilization), we need a less-loaded LLC + * domain to pull some load from. Likewise, we may need to spread * load within the current LLC domain (e.g. packed SMT cores but * other CPUs are idle). We can't really know from here how busy - * the others are - so just get a nohz balance going if it looks + * the others are - so just get a NOHZ balance going if it looks * like this LLC domain has tasks we could move. */ nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } unlock: rcu_read_unlock(); out: + if (READ_ONCE(nohz.needs_update)) + flags |= NOHZ_NEXT_KICK; + if (flags) kick_ilb(flags); } @@ -10225,10 +12503,6 @@ void nohz_balance_enter_idle(int cpu) if (!cpu_active(cpu)) return; - /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) - return; - /* * Can be set safely without rq->lock held * If a clear happens, it will have evaluated last additions because @@ -10257,29 +12531,45 @@ void nohz_balance_enter_idle(int cpu) /* * Ensures that if nohz_idle_balance() fails to observe our * @idle_cpus_mask store, it must observe the @has_blocked - * store. + * and @needs_update stores. */ smp_mb__after_atomic(); set_cpu_sd_state_idle(cpu); + WRITE_ONCE(nohz.needs_update, 1); out: /* * Each time a cpu enter idle, we assume that it has blocked load and - * enable the periodic update of the load of idle cpus + * enable the periodic update of the load of idle CPUs */ WRITE_ONCE(nohz.has_blocked, 1); } +static bool update_nohz_stats(struct rq *rq) +{ + unsigned int cpu = rq->cpu; + + if (!rq->has_blocked_load) + return false; + + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return false; + + if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) + return true; + + sched_balance_update_blocked_averages(cpu); + + return rq->has_blocked_load; +} + /* - * Internal function that runs load balance for all idle cpus. The load balance + * Internal function that runs load balance for all idle CPUs. The load balance * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. - * The function returns false if the loop has stopped before running - * through all idle CPUs. */ -static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, - enum cpu_idle_type idle) +static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) { /* Earliest time when we have to do rebalance again */ unsigned long now = jiffies; @@ -10288,7 +12578,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, int update_next_balance = 0; int this_cpu = this_rq->cpu; int balance_cpu; - int ret = false; struct rq *rq; SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); @@ -10296,12 +12585,17 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, /* * We assume there will be no idle load after this update and clear * the has_blocked flag. If a cpu enters idle in the mean time, it will - * set the has_blocked flag and trig another update of idle load. + * set the has_blocked flag and trigger another update of idle load. * Because a cpu that becomes idle, is added to idle_cpus_mask before * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. + * + * Same applies to idle_cpus_mask vs needs_update. */ - WRITE_ONCE(nohz.has_blocked, 0); + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.has_blocked, 0); + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 0); /* * Ensures that if we miss the CPU, we must see the has_blocked @@ -10309,8 +12603,12 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, */ smp_mb(); - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { - if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) + /* + * Start with the next CPU after this_cpu so we will end with this_cpu and let a + * chance for other idle cpu to pull load. + */ + for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) { + if (!idle_cpu(balance_cpu)) continue; /* @@ -10318,14 +12616,18 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, * work being done for other CPUs. Next load * balancing owner will pick it up. */ - if (need_resched()) { - has_blocked_load = true; + if (!idle_cpu(this_cpu) && need_resched()) { + if (flags & NOHZ_STATS_KICK) + has_blocked_load = true; + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 1); goto abort; } rq = cpu_rq(balance_cpu); - has_blocked_load |= update_nohz_stats(rq, true); + if (flags & NOHZ_STATS_KICK) + has_blocked_load |= update_nohz_stats(rq); /* * If time for next balance is due, @@ -10339,7 +12641,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(rq, CPU_IDLE); + sched_balance_domains(rq, CPU_IDLE); } if (time_after(next_balance, rq->next_balance)) { @@ -10348,26 +12650,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, } } - /* Newly idle CPU doesn't need an update */ - if (idle != CPU_NEWLY_IDLE) { - update_blocked_averages(this_cpu); - has_blocked_load |= this_rq->has_blocked_load; - } - - if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(this_rq, CPU_IDLE); - - WRITE_ONCE(nohz.next_blocked, - now + msecs_to_jiffies(LOAD_AVG_PERIOD)); - - /* The full idle balance loop has been done */ - ret = true; - -abort: - /* There is still blocked load, enable periodic update */ - if (has_blocked_load) - WRITE_ONCE(nohz.has_blocked, 1); - /* * next_balance will be updated only when there is a need. * When the CPU is attached to null domain for ex, it will not be @@ -10376,12 +12658,19 @@ abort: if (likely(update_next_balance)) nohz.next_balance = next_balance; - return ret; + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.next_blocked, + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); + +abort: + /* There is still blocked load, enable periodic update */ + if (has_blocked_load) + WRITE_ONCE(nohz.has_blocked, 1); } /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. + * rebalancing for all the CPUs for whom scheduler ticks are stopped. */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { @@ -10395,21 +12684,43 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (idle != CPU_IDLE) return false; - _nohz_idle_balance(this_rq, flags, idle); + _nohz_idle_balance(this_rq, flags); return true; } -static void nohz_newidle_balance(struct rq *this_rq) +/* + * Check if we need to directly run the ILB for updating blocked load before + * entering idle state. Here we run ILB directly without issuing IPIs. + * + * Note that when this function is called, the tick may not yet be stopped on + * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and + * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates + * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle + * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is + * called from this function on (this) CPU that's not yet in the mask. That's + * OK because the goal of nohz_run_idle_balance() is to run ILB only for + * updating the blocked load of already idle CPUs without waking up one of + * those idle CPUs and outside the preempt disable / IRQ off phase of the local + * cpu about to enter idle, because it can take a long time. + */ +void nohz_run_idle_balance(int cpu) { - int this_cpu = this_rq->cpu; + unsigned int flags; + + flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu)); /* - * This CPU doesn't want to be disturbed by scheduler - * housekeeping + * Update the blocked load only if no SCHED_SOFTIRQ is about to happen + * (i.e. NOHZ_STATS_KICK set) and will do the same. */ - if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) - return; + if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); +} + +static void nohz_newidle_balance(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu; /* Will wake up very soon. No time for doing anything else*/ if (this_rq->avg_idle < sysctl_sched_migration_cost) @@ -10420,16 +12731,11 @@ static void nohz_newidle_balance(struct rq *this_rq) time_before(jiffies, READ_ONCE(nohz.next_blocked))) return; - raw_spin_unlock(&this_rq->lock); /* - * This CPU is going to be idle and blocked load of idle CPUs - * need to be updated. Run the ilb locally as it is a good - * candidate for ilb instead of waking up another idle CPU. - * Kick an normal ilb if we failed to do the update. + * Set the need to trigger ILB in order to update blocked load + * before entering idle state. */ - if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) - kick_ilb(NOHZ_STATS_KICK); - raw_spin_lock(&this_rq->lock); + atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); } #else /* !CONFIG_NO_HZ_COMMON */ @@ -10444,7 +12750,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * idle_balance is called by schedule() if this_cpu is about to become + * sched_balance_newidle is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: @@ -10452,18 +12758,28 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; + int continue_balancing = 1; + u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0; - u64 curr_cost = 0; update_misfit_status(NULL, this_rq); + /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. + * There is a task waiting to run. No need to search for one. + * Return 0; the task will be enqueued when switching to idle. + */ + if (this_rq->ttwu_pending) + return 0; + + /* + * We must set idle_stamp _before_ calling sched_balance_rq() + * for CPU_NEWLY_IDLE, such that we measure the this duration + * as idle time. */ this_rq->idle_stamp = rq_clock(this_rq); @@ -10481,82 +12797,83 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) */ rq_unpin_lock(this_rq, rf); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !READ_ONCE(this_rq->rd->overload)) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + + if (!get_rd_overloaded(this_rq->rd) || + (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) update_next_balance(sd, &next_balance); rcu_read_unlock(); - nohz_newidle_balance(this_rq); - goto out; } + rcu_read_unlock(); - raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); + + t0 = sched_clock_cpu(this_cpu); + sched_balance_update_blocked_averages(this_cpu); - update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - int continue_balancing = 1; - u64 t0, domain_cost; + u64 domain_cost; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, &next_balance); + update_next_balance(sd, &next_balance); + + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) break; - } if (sd->flags & SD_BALANCE_NEWIDLE) { - t0 = sched_clock_cpu(this_cpu); - pulled_task = load_balance(this_cpu, this_rq, + pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); - domain_cost = sched_clock_cpu(this_cpu) - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; + t1 = sched_clock_cpu(this_cpu); + domain_cost = t1 - t0; + update_newidle_cost(sd, domain_cost); curr_cost += domain_cost; + t0 = t1; } - update_next_balance(sd, &next_balance); - /* * Stop searching for tasks to pull if there are * now runnable tasks on this rq. */ - if (pulled_task || this_rq->nr_running > 0) + if (pulled_task || !continue_balancing) break; } rcu_read_unlock(); - raw_spin_lock(&this_rq->lock); + raw_spin_rq_lock(this_rq); if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; -out: /* * While browsing the domains, we released the rq lock, a task could * have been enqueued in the meantime. Since we're not going idle, * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) + if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_queued) + pulled_task = -1; + +out: /* Move the next balance forward */ if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; - /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) - pulled_task = -1; - if (pulled_task) this_rq->idle_stamp = 0; + else + nohz_newidle_balance(this_rq); rq_repin_lock(this_rq, rf); @@ -10564,19 +12881,21 @@ out: } /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + * This softirq handler is triggered via SCHED_SOFTIRQ from two places: + * + * - directly from the local sched_tick() for periodic load balancing + * + * - indirectly from a remote sched_tick() for NOHZ idle balancing + * through the SMP cross-call nohz_csd_func() */ -static __latent_entropy void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void sched_balance_softirq(void) { struct rq *this_rq = this_rq(); - enum cpu_idle_type idle = this_rq->idle_balance ? - CPU_IDLE : CPU_NOT_IDLE; - + enum cpu_idle_type idle = this_rq->idle_balance; /* - * If this CPU has a pending nohz_balance_kick, then do the + * If this CPU has a pending NOHZ_BALANCE_KICK, then do the * balancing on behalf of the other idle CPUs whose ticks are - * stopped. Do nohz_idle_balance *before* rebalance_domains to + * stopped. Do nohz_idle_balance *before* sched_balance_domains to * give the idle CPUs a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. @@ -10585,17 +12904,20 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) return; /* normal load balance */ - update_blocked_averages(this_rq->cpu); - rebalance_domains(this_rq, idle); + sched_balance_update_blocked_averages(this_rq->cpu); + sched_balance_domains(this_rq, idle); } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq) +void sched_balance_trigger(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* + * Don't need to rebalance while attached to NULL domain or + * runqueue CPU is not active + */ + if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq)))) return; if (time_after_eq(jiffies, rq->next_balance)) @@ -10617,10 +12939,140 @@ static void rq_offline_fair(struct rq *rq) /* Ensure any throttled groups are reachable by pick_next_task */ unthrottle_offline_cfs_rqs(rq); + + /* Ensure that we remove rq contribution to group share: */ + clear_tg_offline_cfs_rqs(rq); } #endif /* CONFIG_SMP */ +#ifdef CONFIG_SCHED_CORE +static inline bool +__entity_slice_used(struct sched_entity *se, int min_nr_tasks) +{ + u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; + u64 slice = se->slice; + + return (rtime * min_nr_tasks > slice); +} + +#define MIN_NR_TASKS_DURING_FORCEIDLE 2 +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) +{ + if (!sched_core_enabled(rq)) + return; + + /* + * If runqueue has only one task which used up its slice and + * if the sibling is forced idle, then trigger schedule to + * give forced idle task a chance. + * + * sched_slice() considers only this active rq and it gets the + * whole slice. But during force idle, we have siblings acting + * like a single runqueue and hence we need to consider runnable + * tasks on this CPU and the forced idle CPU. Ideally, we should + * go through the forced idle rq, but that would be a perf hit. + * We can assume that the forced idle CPU has at least + * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check + * if we need to give up the CPU. + */ + if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 && + __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) + resched_curr(rq); +} + +/* + * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. + */ +static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, + bool forceidle) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (forceidle) { + if (cfs_rq->forceidle_seq == fi_seq) + break; + cfs_rq->forceidle_seq = fi_seq; + } + + cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; + } +} + +void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) +{ + struct sched_entity *se = &p->se; + + if (p->sched_class != &fair_sched_class) + return; + + se_fi_update(se, rq->core->core_forceidle_seq, in_fi); +} + +bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) +{ + struct rq *rq = task_rq(a); + const struct sched_entity *sea = &a->se; + const struct sched_entity *seb = &b->se; + struct cfs_rq *cfs_rqa; + struct cfs_rq *cfs_rqb; + s64 delta; + + SCHED_WARN_ON(task_rq(b)->core != rq->core); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Find an se in the hierarchy for tasks a and b, such that the se's + * are immediate siblings. + */ + while (sea->cfs_rq->tg != seb->cfs_rq->tg) { + int sea_depth = sea->depth; + int seb_depth = seb->depth; + + if (sea_depth >= seb_depth) + sea = parent_entity(sea); + if (sea_depth <= seb_depth) + seb = parent_entity(seb); + } + + se_fi_update(sea, rq->core->core_forceidle_seq, in_fi); + se_fi_update(seb, rq->core->core_forceidle_seq, in_fi); + + cfs_rqa = sea->cfs_rq; + cfs_rqb = seb->cfs_rq; +#else + cfs_rqa = &task_rq(a)->cfs; + cfs_rqb = &task_rq(b)->cfs; +#endif + + /* + * Find delta after normalizing se's vruntime with its cfs_rq's + * min_vruntime_fi, which would have been updated in prior calls + * to se_fi_update(). + */ + delta = (s64)(sea->vruntime - seb->vruntime) + + (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + + return delta > 0; +} + +static int task_is_throttled_fair(struct task_struct *p, int cpu) +{ + struct cfs_rq *cfs_rq; + +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq = task_group(p)->cfs_rq[cpu]; +#else + cfs_rq = &cpu_rq(cpu)->cfs; +#endif + return throttled_hierarchy(cfs_rq); +} +#else +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} +#endif + /* * scheduler tick hitting a task of our scheduling class. * @@ -10643,7 +13095,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); update_misfit_status(curr, rq); - update_overutilized_status(task_rq(curr)); + check_update_overutilized_status(task_rq(curr)); + + task_tick_core(rq, curr); } /* @@ -10653,33 +13107,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, *curr; - struct rq *rq = this_rq(); - struct rq_flags rf; - - rq_lock(rq, &rf); - update_rq_clock(rq); - - cfs_rq = task_cfs_rq(current); - curr = cfs_rq->curr; - if (curr) { - update_curr(cfs_rq); - se->vruntime = curr->vruntime; - } - place_entity(cfs_rq, se, 1); - - if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { - /* - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - swap(curr->vruntime, se->vruntime); - resched_curr(rq); - } - - se->vruntime -= cfs_rq->min_vruntime; - rq_unlock(rq, &rf); + set_task_max_allowed_capacity(p); } /* @@ -10692,7 +13120,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->cfs.nr_running == 1) + if (rq->cfs.nr_queued == 1) return; /* @@ -10700,39 +13128,11 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (rq->curr == p) { + if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else - check_preempt_curr(rq, p, 0); -} - -static inline bool vruntime_normalized(struct task_struct *p) -{ - struct sched_entity *se = &p->se; - - /* - * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, - * the dequeue_entity(.flags=0) will already have normalized the - * vruntime. - */ - if (p->on_rq) - return true; - - /* - * When !on_rq, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - A forked child which is waiting for being woken up by - * wake_up_new_task(). - * - A task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). - */ - if (!se->sum_exec_runtime || - (p->state == TASK_WAKING && p->sched_remote_wakeup)) - return true; - - return false; + wakeup_preempt(rq, p, 0); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -10742,7 +13142,13 @@ static inline bool vruntime_normalized(struct task_struct *p) */ static void propagate_entity_cfs_rq(struct sched_entity *se) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + return; + + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ se = se->parent; @@ -10750,10 +13156,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + update_load_avg(cfs_rq, se, UPDATE_TG); + if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(cfs_rq, se, UPDATE_TG); + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); } } #else @@ -10764,10 +13173,21 @@ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); +#ifdef CONFIG_SMP + /* + * In case the task sched_avg hasn't been attached: + * - A forked task which hasn't been woken up by wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() but is + * waiting for actually being woken up by sched_ttwu_pending(). + */ + if (!se->avg.last_update_time) + return; +#endif + /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } @@ -10775,34 +13195,16 @@ static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; - } detach_entity_cfs_rq(se); } @@ -10810,12 +13212,8 @@ static void detach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); attach_entity_cfs_rq(se); - - if (!vruntime_normalized(p)) - se->vruntime += cfs_rq->min_vruntime; } static void switched_from_fair(struct rq *rq, struct task_struct *p) @@ -10825,27 +13223,26 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) static void switched_to_fair(struct rq *rq, struct task_struct *p) { + SCHED_WARN_ON(p->se.sched_delayed); + attach_task_cfs_rq(p); + set_task_max_allowed_capacity(p); + if (task_on_rq_queued(p)) { /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (rq->curr == p) + if (task_current_donor(rq, p)) resched_curr(rq); else - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } } -/* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. - */ -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { struct sched_entity *se = &p->se; @@ -10858,6 +13255,27 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) list_move(&se->group_node, &rq->cfs_tasks); } #endif + if (!first) + return; + + SCHED_WARN_ON(se->sched_delayed); + + if (hrtick_enabled_fair(rq)) + hrtick_start_fair(rq, p); + + update_misfit_status(p, rq); + sched_fair_update_stop_tick(rq, p); +} + +/* + * Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +{ + struct sched_entity *se = &p->se; for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -10866,60 +13284,43 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) /* ensure bandwidth has been allocated on our new cfs_rq */ account_cfs_rq_runtime(cfs_rq, 0); } + + __set_next_task_fair(rq, p, first); } void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifndef CONFIG_64BIT - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_set_group_fair(struct task_struct *p) +static void task_change_group_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - - set_task_rq(p, task_cpu(p)); - se->depth = se->parent ? se->parent->depth + 1 : 0; -} + /* + * We couldn't detach or attach a forked task which + * hasn't been woken up by wake_up_new_task(). + */ + if (READ_ONCE(p->__state) == TASK_NEW) + return; -static void task_move_group_fair(struct task_struct *p) -{ detach_task_cfs_rq(p); - set_task_rq(p, task_cpu(p)); #ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; #endif + set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } -static void task_change_group_fair(struct task_struct *p, int type) -{ - switch (type) { - case TASK_SET_GROUP: - task_set_group_fair(p); - break; - - case TASK_MOVE_GROUP: - task_move_group_fair(p); - break; - } -} - void free_fair_sched_group(struct task_group *tg) { int i; - destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); - for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -10946,7 +13347,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; - init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), @@ -10954,7 +13355,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!cfs_rq) goto err; - se = kzalloc_node(sizeof(struct sched_entity), + se = kzalloc_node(sizeof(struct sched_entity_stats), GFP_KERNEL, cpu_to_node(i)); if (!se) goto err_free_rq; @@ -10992,26 +13393,35 @@ void online_fair_sched_group(struct task_group *tg) void unregister_fair_sched_group(struct task_group *tg) { - unsigned long flags; - struct rq *rq; int cpu; + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + for_each_possible_cpu(cpu) { - if (tg->se[cpu]) - remove_entity_load_avg(tg->se[cpu]); + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; + struct sched_entity *se = tg->se[cpu]; + struct rq *rq = cpu_rq(cpu); + + if (se) { + if (se->sched_delayed) { + guard(rq_lock_irqsave)(rq); + if (se->sched_delayed) { + update_rq_clock(rq); + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + } + list_del_leaf_cfs_rq(cfs_rq); + } + remove_entity_load_avg(se); + } /* * Only empty task groups can be destroyed; so we can speculatively * check on_list without danger of it being re-added. */ - if (!tg->cfs_rq[cpu]->on_list) - continue; - - rq = cpu_rq(cpu); - - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + if (cfs_rq->on_list) { + guard(rq_lock_irqsave)(rq); + list_del_leaf_cfs_rq(cfs_rq); + } } } @@ -11048,10 +13458,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, static DEFINE_MUTEX(shares_mutex); -int sched_group_set_shares(struct task_group *tg, unsigned long shares) +static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + lockdep_assert_held(&shares_mutex); + /* * We can't change the weight of the root cgroup. */ @@ -11060,9 +13472,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - mutex_lock(&shares_mutex); if (tg->shares == shares) - goto done; + return 0; tg->shares = shares; for_each_possible_cpu(i) { @@ -11080,22 +13491,87 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) rq_unlock_irqrestore(rq, &rf); } -done: - mutex_unlock(&shares_mutex); return 0; } -#else /* CONFIG_FAIR_GROUP_SCHED */ - -void free_fair_sched_group(struct task_group *tg) { } -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +int sched_group_set_shares(struct task_group *tg, unsigned long shares) { - return 1; + int ret; + + mutex_lock(&shares_mutex); + if (tg_is_idle(tg)) + ret = -EINVAL; + else + ret = __sched_group_set_shares(tg, shares); + mutex_unlock(&shares_mutex); + + return ret; } -void online_fair_sched_group(struct task_group *tg) { } +int sched_group_set_idle(struct task_group *tg, long idle) +{ + int i; -void unregister_fair_sched_group(struct task_group *tg) { } + if (tg == &root_task_group) + return -EINVAL; + + if (idle < 0 || idle > 1) + return -EINVAL; + + mutex_lock(&shares_mutex); + + if (tg->idle == idle) { + mutex_unlock(&shares_mutex); + return 0; + } + + tg->idle = idle; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se = tg->se[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + bool was_idle = cfs_rq_is_idle(grp_cfs_rq); + long idle_task_delta; + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + + grp_cfs_rq->idle = idle; + if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) + goto next_cpu; + + idle_task_delta = grp_cfs_rq->h_nr_queued - + grp_cfs_rq->h_nr_idle; + if (!cfs_rq_is_idle(grp_cfs_rq)) + idle_task_delta *= -1; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!se->on_rq) + break; + + cfs_rq->h_nr_idle += idle_task_delta; + + /* Already accounted at parent level and above. */ + if (cfs_rq_is_idle(cfs_rq)) + break; + } + +next_cpu: + rq_unlock_irqrestore(rq, &rf); + } + + /* Idle groups have minimum weight. */ + if (tg_is_idle(tg)) + __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO)); + else + __sched_group_set_shares(tg, NICE_0_LOAD); + + mutex_unlock(&shares_mutex); + return 0; +} #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -11110,7 +13586,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) - rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); + rr_interval = NS_TO_JIFFIES(se->slice); return rr_interval; } @@ -11118,15 +13594,16 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class = { - .next = &idle_sched_class, +DEFINE_SCHED_CLASS(fair) = { + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .check_preempt_curr = check_preempt_wakeup, + .wakeup_preempt = check_preempt_wakeup_fair, + .pick_task = pick_task_fair, .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, @@ -11140,12 +13617,13 @@ const struct sched_class fair_sched_class = { .rq_offline = rq_offline_fair, .task_dead = task_dead_fair, - .set_cpus_allowed = set_cpus_allowed_common, + .set_cpus_allowed = set_cpus_allowed_fair, #endif .task_tick = task_tick_fair, .task_fork = task_fork_fair, + .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, @@ -11158,6 +13636,10 @@ const struct sched_class fair_sched_class = { .task_change_group = task_change_group_fair, #endif +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_fair, +#endif + #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif @@ -11202,93 +13684,27 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { #ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); - -#ifdef CONFIG_NO_HZ_COMMON - nohz.next_balance = jiffies; - nohz.next_blocked = jiffies; - zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); -#endif -#endif /* SMP */ - -} + int i; -/* - * Helper functions to facilitate extracting info from tracepoints. - */ + for_each_possible_cpu(i) { + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i), + GFP_KERNEL, cpu_to_node(i)); -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) -{ -#ifdef CONFIG_SMP - return cfs_rq ? &cfs_rq->avg : NULL; -#else - return NULL; +#ifdef CONFIG_CFS_BANDWIDTH + INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); + INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list); #endif -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); - -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) -{ - if (!cfs_rq) { - if (str) - strlcpy(str, "(null)", len); - else - return NULL; } - cfs_rq_tg_path(cfs_rq, str, len); - return str; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); - -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) -{ - return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); - -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_rt : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); - -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_dl : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); + open_softirq(SCHED_SOFTIRQ, sched_balance_softirq); -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) - return rq ? &rq->avg_irq : NULL; -#else - return NULL; +#ifdef CONFIG_NO_HZ_COMMON + nohz.next_balance = jiffies; + nohz.next_blocked = jiffies; + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); - -int sched_trace_rq_cpu(struct rq *rq) -{ - return rq ? cpu_of(rq) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); +#endif /* SMP */ -const struct cpumask *sched_trace_rd_span(struct root_domain *rd) -{ -#ifdef CONFIG_SMP - return rd ? rd->span : NULL; -#else - return NULL; -#endif } -EXPORT_SYMBOL_GPL(sched_trace_rd_span); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 7481cd96f391..3c12d9f93331 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -1,16 +1,28 @@ /* SPDX-License-Identifier: GPL-2.0 */ + /* - * Only give sleepers 50% of their service deficit. This allows - * them to run sooner, but does not allow tons of sleepers to - * rip the spread apart. + * Using the avg_vruntime, do the right thing and preserve lag across + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. */ -SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) - +SCHED_FEAT(PLACE_LAG, true) +/* + * Give new tasks half a slice to ease into the competition. + */ +SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) +/* + * Preserve relative virtual deadline on 'migration'. + */ +SCHED_FEAT(PLACE_REL_DEADLINE, true) +/* + * Inhibit (wakeup) preemption until the current task has either matched the + * 0-lag point or until is has exhausted it's slice. + */ +SCHED_FEAT(RUN_TO_PARITY, true) /* - * Place new tasks ahead so that they do not starve already running - * tasks + * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for + * current. */ -SCHED_FEAT(START_DEBIT, true) +SCHED_FEAT(PREEMPT_SHORT, true) /* * Prefer to schedule the task we woke last (assuming it failed @@ -20,42 +32,60 @@ SCHED_FEAT(START_DEBIT, true) SCHED_FEAT(NEXT_BUDDY, false) /* - * Prefer to schedule the task that ran last (when we did - * wake-preempt) as that likely will touch the same data, increases - * cache locality. + * Allow completely ignoring cfs_rq->next; which can be set from various + * places: + * - NEXT_BUDDY (wakeup preemption) + * - yield_to_task() + * - cgroup dequeue / pick */ -SCHED_FEAT(LAST_BUDDY, true) +SCHED_FEAT(PICK_BUDDY, true) /* - * Consider buddies to be cache hot, decreases the likelyness of a + * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality. */ SCHED_FEAT(CACHE_HOT_BUDDY, true) /* + * Delay dequeueing tasks until they get selected or woken. + * + * By delaying the dequeue for non-eligible tasks, they remain in the + * competition and can burn off their negative lag. When they get selected + * they'll have positive lag by definition. + * + * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0. + */ +SCHED_FEAT(DELAY_DEQUEUE, true) +SCHED_FEAT(DELAY_ZERO, true) + +/* * Allow wakeup-time preemption of the current task: */ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) -SCHED_FEAT(DOUBLE_TICK, false) +SCHED_FEAT(HRTICK_DL, false) /* * Decrement CPU capacity based on time not spent running tasks */ SCHED_FEAT(NONTASK_CAPACITY, true) +#ifdef CONFIG_PREEMPT_RT +SCHED_FEAT(TTWU_QUEUE, false) +#else + /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, true) +#endif /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_AVG_CPU, false) -SCHED_FEAT(SIS_PROP, true) +SCHED_FEAT(SIS_UTIL, true) /* * Issue a WARN when we do multiple update_rq_clock() calls @@ -77,7 +107,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(RT_RUNTIME_SHARE, false) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) @@ -89,4 +119,5 @@ SCHED_FEAT(WA_BIAS, true) * UtilEstimation. Use estimated CPU utilization. */ SCHED_FEAT(UTIL_EST, true) -SCHED_FEAT(UTIL_EST_FASTUP, true) + +SCHED_FEAT(LATENCY_WARN, false) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1ae95b9150d3..2c85c86b455f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -6,9 +6,6 @@ * (NOTE: these are not related to SCHED_IDLE batch scheduled * tasks which are handled in sched/fair.c ) */ -#include "sched.h" - -#include <trace/events/power.h> /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; @@ -54,17 +51,22 @@ __setup("hlt", cpu_idle_nopoll_setup); static noinline int __cpuidle cpu_idle_poll(void) { - rcu_idle_enter(); - trace_cpu_idle_rcuidle(0, smp_processor_id()); - local_irq_enable(); + instrumentation_begin(); + trace_cpu_idle(0, smp_processor_id()); stop_critical_timings(); + ct_cpuidle_enter(); + raw_local_irq_enable(); while (!tif_need_resched() && - (cpu_idle_force_poll || tick_check_broadcast_expired())) + (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); + raw_local_irq_disable(); + + ct_cpuidle_exit(); start_critical_timings(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); - rcu_idle_exit(); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + local_irq_enable(); + instrumentation_end(); return 1; } @@ -73,13 +75,31 @@ static noinline int __cpuidle cpu_idle_poll(void) void __weak arch_cpu_idle_prepare(void) { } void __weak arch_cpu_idle_enter(void) { } void __weak arch_cpu_idle_exit(void) { } -void __weak arch_cpu_idle_dead(void) { } +void __weak __noreturn arch_cpu_idle_dead(void) { while (1); } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; - local_irq_enable(); } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE +DEFINE_STATIC_KEY_FALSE(arch_needs_tick_broadcast); + +static inline void cond_tick_broadcast_enter(void) +{ + if (static_branch_unlikely(&arch_needs_tick_broadcast)) + tick_broadcast_enter(); +} + +static inline void cond_tick_broadcast_exit(void) +{ + if (static_branch_unlikely(&arch_needs_tick_broadcast)) + tick_broadcast_exit(); +} +#else +static inline void cond_tick_broadcast_enter(void) { } +static inline void cond_tick_broadcast_exit(void) { } +#endif + /** * default_idle_call - Default CPU idle routine. * @@ -87,13 +107,22 @@ void __weak arch_cpu_idle(void) */ void __cpuidle default_idle_call(void) { - if (current_clr_polling_and_test()) { - local_irq_enable(); - } else { + instrumentation_begin(); + if (!current_clr_polling_and_test()) { + cond_tick_broadcast_enter(); + trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); + + ct_cpuidle_enter(); arch_cpu_idle(); + ct_cpuidle_exit(); + start_critical_timings(); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + cond_tick_broadcast_exit(); } + local_irq_enable(); + instrumentation_end(); } static int call_cpuidle_s2idle(struct cpuidle_driver *drv, @@ -131,7 +160,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * * NOTE: no locks or semaphores should be used here * - * On archs that support TIF_POLLING_NRFLAG, is called with polling + * On architectures that support TIF_POLLING_NRFLAG, is called with polling * set, and it returns with polling set. If it ever stops polling, it * must clear the polling bit. */ @@ -143,22 +172,15 @@ static void cpuidle_idle_call(void) /* * Check if the idle task must be rescheduled. If it is the - * case, exit the function after re-enabling the local irq. + * case, exit the function after re-enabling the local IRQ. */ if (need_resched()) { local_irq_enable(); return; } - /* - * The RCU framework needs to be told that we are entering an idle - * section, so no more rcu read side critical sections and one more - * step to the grace period - */ - if (cpuidle_not_available(drv, dev)) { tick_nohz_idle_stop_tick(); - rcu_idle_enter(); default_idle_call(); goto exit_idle; @@ -168,7 +190,7 @@ static void cpuidle_idle_call(void) * Suspend-to-idle ("s2idle") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only * activity happens here and in interrupts (if any). In that case bypass - * the cpuidle governor and go stratight for the deepest idle state + * the cpuidle governor and go straight for the deepest idle state * available. Possibly also suspend the local tick and the entire * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. @@ -178,21 +200,17 @@ static void cpuidle_idle_call(void) u64 max_latency_ns; if (idle_should_enter_s2idle()) { - rcu_idle_enter(); entered_state = call_cpuidle_s2idle(drv, dev); if (entered_state > 0) goto exit_idle; - rcu_idle_exit(); - max_latency_ns = U64_MAX; } else { max_latency_ns = dev->forced_idle_latency_limit_ns; } tick_nohz_idle_stop_tick(); - rcu_idle_enter(); next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); @@ -209,8 +227,6 @@ static void cpuidle_idle_call(void) else tick_nohz_idle_retain_tick(); - rcu_idle_enter(); - entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome @@ -222,12 +238,10 @@ exit_idle: __current_set_polling(); /* - * It is up to the idle functions to reenable local interrupts + * It is up to the idle functions to re-enable local interrupts */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); - - rcu_idle_exit(); } /* @@ -238,6 +252,12 @@ exit_idle: static void do_idle(void) { int cpu = smp_processor_id(); + + /* + * Check if we need to update blocked load + */ + nohz_run_idle_balance(cpu); + /* * If the arch has a polling bit, we maintain an invariant: * @@ -251,20 +271,49 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - rmb(); + /* + * Interrupts shouldn't be re-enabled from that point on until + * the CPU sleeping instruction is reached. Otherwise an interrupt + * may fire and queue a timer that would be ignored until the CPU + * wakes from the sleeping instruction. And testing need_resched() + * doesn't tell about pending needed timer reprogram. + * + * Several cases to consider: + * + * - SLEEP-UNTIL-PENDING-INTERRUPT based instructions such as + * "wfi" or "mwait" are fine because they can be entered with + * interrupt disabled. + * + * - sti;mwait() couple is fine because the interrupts are + * re-enabled only upon the execution of mwait, leaving no gap + * in-between. + * + * - ROLLBACK based idle handlers with the sleeping instruction + * called with interrupts enabled are NOT fine. In this scheme + * when the interrupt detects it has interrupted an idle handler, + * it rolls back to its beginning which performs the + * need_resched() check before re-executing the sleeping + * instruction. This can leak a pending needed timer reprogram. + * If such a scheme is really mandatory due to the lack of an + * appropriate CPU sleeping instruction, then a FAST-FORWARD + * must instead be applied: when the interrupt detects it has + * interrupted an idle handler, it must resume to the end of + * this idle handler so that the generic idle loop is iterated + * again to reprogram the tick. + */ local_irq_disable(); if (cpu_is_offline(cpu)) { - tick_nohz_idle_stop_tick(); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } arch_cpu_idle_enter(); + rcu_nocb_flush_deferred_wakeup(); /* - * In poll mode we reenable interrupts and spin. Also if we + * In poll mode we re-enable interrupts and spin. Also if we * detected in the wakeup from idle path that the tick * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. @@ -300,7 +349,7 @@ static void do_idle(void) * RCU relies on this call to be done outside of an RCU read-side * critical section. */ - flush_smp_call_function_from_idle(); + flush_smp_call_function_queue(); schedule_idle(); if (unlikely(klp_patch_pending(current))) @@ -341,6 +390,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); WARN_ON_ONCE(!duration_ns); + WARN_ON_ONCE(current->mm); rcu_sleep_check(); preempt_disable(); @@ -348,10 +398,10 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) cpuidle_use_deepest_state(latency_ns); it.done = 0; - hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - it.timer.function = idle_inject_timer_fn; + hrtimer_setup_on_stack(&it.timer, idle_inject_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); hrtimer_start(&it.timer, ns_to_ktime(duration_ns), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); while (!READ_ONCE(it.done)) do_idle(); @@ -366,6 +416,7 @@ EXPORT_SYMBOL_GPL(play_idle_precise); void cpu_startup_entry(enum cpuhp_state state) { + current->flags |= PF_IDLE; arch_cpu_idle_prepare(); cpuhp_online_idle(state); while (1) @@ -378,7 +429,7 @@ void cpu_startup_entry(enum cpuhp_state state) #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } @@ -393,41 +444,43 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) +static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) { resched_curr(rq); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + dl_server_update_idle_time(rq, prev); + scx_update_idle(rq, false, true); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { update_idle_core(rq); + scx_update_idle(rq, true, true); schedstat_inc(rq->sched_goidle); + next->se.exec_start = rq_clock_task(rq); } -struct task_struct *pick_next_task_idle(struct rq *rq) +struct task_struct *pick_task_idle(struct rq *rq) { - struct task_struct *next = rq->idle; - - set_next_task_idle(rq, next, true); - - return next; + scx_update_idle(rq, true, false); + return rq->idle; } /* * It is not legal to sleep in the idle task - print a warning * message if some code attempts to do it: */ -static void +static bool dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) { - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); - raw_spin_lock_irq(&rq->lock); + raw_spin_rq_lock_irq(rq); + return true; } /* @@ -453,11 +506,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) BUG(); } -static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_idle(struct rq *rq) { } @@ -465,16 +513,16 @@ static void update_curr_idle(struct rq *rq) /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class = { - /* .next is NULL */ +DEFINE_SCHED_CLASS(idle) = { + /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ .dequeue_task = dequeue_task_idle, - .check_preempt_curr = check_preempt_curr_idle, + .wakeup_preempt = wakeup_preempt_idle, - .pick_next_task = pick_next_task_idle, + .pick_task = pick_task_idle, .put_prev_task = put_prev_task_idle, .set_next_task = set_next_task_idle, @@ -486,8 +534,6 @@ const struct sched_class idle_sched_class = { .task_tick = task_tick_idle, - .get_rr_interval = get_rr_interval_idle, - .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, .update_curr = update_curr_idle, diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 808244f3ddd9..81bc8b329ef1 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -7,140 +7,189 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ -#include "sched.h" + +enum hk_flags { + HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), + HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), + HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE), +}; DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); EXPORT_SYMBOL_GPL(housekeeping_overridden); -static cpumask_var_t housekeeping_mask; -static unsigned int housekeeping_flags; -bool housekeeping_enabled(enum hk_flags flags) +struct housekeeping { + cpumask_var_t cpumasks[HK_TYPE_MAX]; + unsigned long flags; +}; + +static struct housekeeping housekeeping; + +bool housekeeping_enabled(enum hk_type type) { - return !!(housekeeping_flags & flags); + return !!(housekeeping.flags & BIT(type)); } EXPORT_SYMBOL_GPL(housekeeping_enabled); -int housekeeping_any_cpu(enum hk_flags flags) +int housekeeping_any_cpu(enum hk_type type) { int cpu; if (static_branch_unlikely(&housekeeping_overridden)) { - if (housekeeping_flags & flags) { - cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id()); + if (housekeeping.flags & BIT(type)) { + cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id()); if (cpu < nr_cpu_ids) return cpu; - return cpumask_any_and(housekeeping_mask, cpu_online_mask); + cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); + if (likely(cpu < nr_cpu_ids)) + return cpu; + /* + * Unless we have another problem this can only happen + * at boot time before start_secondary() brings the 1st + * housekeeping CPU up. + */ + WARN_ON_ONCE(system_state == SYSTEM_RUNNING || + type != HK_TYPE_TIMER); } } return smp_processor_id(); } EXPORT_SYMBOL_GPL(housekeeping_any_cpu); -const struct cpumask *housekeeping_cpumask(enum hk_flags flags) +const struct cpumask *housekeeping_cpumask(enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - return housekeeping_mask; + if (housekeeping.flags & BIT(type)) + return housekeeping.cpumasks[type]; return cpu_possible_mask; } EXPORT_SYMBOL_GPL(housekeeping_cpumask); -void housekeeping_affine(struct task_struct *t, enum hk_flags flags) +void housekeeping_affine(struct task_struct *t, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - set_cpus_allowed_ptr(t, housekeeping_mask); + if (housekeeping.flags & BIT(type)) + set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]); } EXPORT_SYMBOL_GPL(housekeeping_affine); -bool housekeeping_test_cpu(int cpu, enum hk_flags flags) +bool housekeeping_test_cpu(int cpu, enum hk_type type) { if (static_branch_unlikely(&housekeeping_overridden)) - if (housekeeping_flags & flags) - return cpumask_test_cpu(cpu, housekeeping_mask); + if (housekeeping.flags & BIT(type)) + return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]); return true; } EXPORT_SYMBOL_GPL(housekeeping_test_cpu); void __init housekeeping_init(void) { - if (!housekeeping_flags) + enum hk_type type; + + if (!housekeeping.flags) return; static_branch_enable(&housekeeping_overridden); - if (housekeeping_flags & HK_FLAG_TICK) + if (housekeeping.flags & HK_FLAG_KERNEL_NOISE) sched_tick_offload_init(); - /* We need at least one CPU to handle housekeeping work */ - WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); + for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) { + /* We need at least one CPU to handle housekeeping work */ + WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type])); + } } -static int __init housekeeping_setup(char *str, enum hk_flags flags) +static void __init housekeeping_setup_type(enum hk_type type, + cpumask_var_t housekeeping_staging) { - cpumask_var_t non_housekeeping_mask; - cpumask_var_t tmp; - int err; + + alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]); + cpumask_copy(housekeeping.cpumasks[type], + housekeeping_staging); +} + +static int __init housekeeping_setup(char *str, unsigned long flags) +{ + cpumask_var_t non_housekeeping_mask, housekeeping_staging; + unsigned int first_cpu; + int err = 0; + + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) { + if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { + pr_warn("Housekeeping: nohz unsupported." + " Build with CONFIG_NO_HZ_FULL\n"); + return 0; + } + } alloc_bootmem_cpumask_var(&non_housekeeping_mask); - err = cpulist_parse(str, non_housekeeping_mask); - if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) { + if (cpulist_parse(str, non_housekeeping_mask) < 0) { pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; + goto free_non_housekeeping_mask; } - alloc_bootmem_cpumask_var(&tmp); - if (!housekeeping_flags) { - alloc_bootmem_cpumask_var(&housekeeping_mask); - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, non_housekeeping_mask); + alloc_bootmem_cpumask_var(&housekeeping_staging); + cpumask_andnot(housekeeping_staging, + cpu_possible_mask, non_housekeeping_mask); - cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); - if (cpumask_empty(tmp)) { + first_cpu = cpumask_first_and(cpu_present_mask, housekeeping_staging); + if (first_cpu >= nr_cpu_ids || first_cpu >= setup_max_cpus) { + __cpumask_set_cpu(smp_processor_id(), housekeeping_staging); + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); + if (!housekeeping.flags) { pr_warn("Housekeeping: must include one present CPU, " "using boot CPU:%d\n", smp_processor_id()); - __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); - __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); - } - } else { - cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); - if (cpumask_empty(tmp)) - __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); - cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); - if (!cpumask_equal(tmp, housekeeping_mask)) { - pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); - free_bootmem_cpumask_var(tmp); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; } } - free_bootmem_cpumask_var(tmp); - if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { - if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { - tick_nohz_full_setup(non_housekeeping_mask); - } else { - pr_warn("Housekeeping: nohz unsupported." - " Build with CONFIG_NO_HZ_FULL\n"); - free_bootmem_cpumask_var(non_housekeeping_mask); - return 0; + if (cpumask_empty(non_housekeeping_mask)) + goto free_housekeeping_staging; + + if (!housekeeping.flags) { + /* First setup call ("nohz_full=" or "isolcpus=") */ + enum hk_type type; + + for_each_set_bit(type, &flags, HK_TYPE_MAX) + housekeeping_setup_type(type, housekeeping_staging); + } else { + /* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */ + enum hk_type type; + unsigned long iter_flags = flags & housekeeping.flags; + + for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) { + if (!cpumask_equal(housekeeping_staging, + housekeeping.cpumasks[type])) { + pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); + goto free_housekeeping_staging; + } } + + iter_flags = flags & ~housekeeping.flags; + + for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) + housekeeping_setup_type(type, housekeeping_staging); } - housekeeping_flags |= flags; + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) + tick_nohz_full_setup(non_housekeeping_mask); + housekeeping.flags |= flags; + err = 1; + +free_housekeeping_staging: + free_bootmem_cpumask_var(housekeeping_staging); +free_non_housekeeping_mask: free_bootmem_cpumask_var(non_housekeeping_mask); - return 1; + return err; } static int __init housekeeping_nohz_full_setup(char *str) { - unsigned int flags; + unsigned long flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_KERNEL_NOISE; return housekeeping_setup(str, flags); } @@ -148,15 +197,18 @@ __setup("nohz_full=", housekeeping_nohz_full_setup); static int __init housekeeping_isolcpus_setup(char *str) { - unsigned int flags = 0; + unsigned long flags = 0; bool illegal = false; char *par; int len; while (isalpha(*str)) { + /* + * isolcpus=nohz is equivalent to nohz_full. + */ if (!strncmp(str, "nohz,", 5)) { str += 5; - flags |= HK_FLAG_TICK; + flags |= HK_FLAG_KERNEL_NOISE; continue; } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index de22da666ac7..c48900b856a2 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,7 +6,6 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ -#include "sched.h" /* * Global load-average calculations @@ -46,7 +45,7 @@ * again, being late doesn't loose the delta, just wrecks the sample. * * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because - * this would add another cross-CPU cacheline miss and atomic operation + * this would add another cross-CPU cache-line miss and atomic operation * to the wakeup path. Instead we increment on whatever CPU the task ran * when it went into uninterruptible state and decrement on whatever CPU * did the wakeup. This means that only the sum of nr_uninterruptible over @@ -63,7 +62,7 @@ EXPORT_SYMBOL(avenrun); /* should be removed */ /** * get_avenrun - get the load average array - * @loads: pointer to dest load array + * @loads: pointer to destination load array * @offset: offset to add * @shift: shift count to shift the result left * @@ -81,7 +80,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) long nr_active, delta = 0; nr_active = this_rq->nr_running - adjust; - nr_active += (long)this_rq->nr_uninterruptible; + nr_active += (int)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; @@ -189,7 +188,7 @@ calc_load_n(unsigned long load, unsigned long exp, * w:0 1 1 0 0 1 1 0 0 * * This ensures we'll fold the old NO_HZ contribution in this window while - * accumlating the new one. + * accumulating the new one. * * - When we wake up from NO_HZ during the window, we push up our * contribution, since we effectively move our sample point to a known @@ -347,7 +346,7 @@ static inline void calc_global_nohz(void) { } * * Called from the global timer code. */ -void calc_global_load(unsigned long ticks) +void calc_global_load(void) { unsigned long sample_window; long active, delta; @@ -380,7 +379,7 @@ void calc_global_load(unsigned long ticks) } /* - * Called from scheduler_tick() to periodically update this CPU's + * Called from sched_tick() to periodically update this CPU's * active count. */ void calc_global_load_tick(struct rq *this_rq) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 168479a7d61b..809194cd779f 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -4,7 +4,134 @@ * * membarrier system call */ -#include "sched.h" + +/* + * For documentation purposes, here are some membarrier ordering + * scenarios to keep in mind: + * + * A) Userspace thread execution after IPI vs membarrier's memory + * barrier before sending the IPI + * + * Userspace variables: + * + * int x = 0, y = 0; + * + * The memory barrier at the start of membarrier() on CPU0 is necessary in + * order to enforce the guarantee that any writes occurring on CPU0 before + * the membarrier() is executed will be visible to any code executing on + * CPU1 after the IPI-induced memory barrier: + * + * CPU0 CPU1 + * + * x = 1 + * membarrier(): + * a: smp_mb() + * b: send IPI IPI-induced mb + * c: smp_mb() + * r2 = y + * y = 1 + * barrier() + * r1 = x + * + * BUG_ON(r1 == 0 && r2 == 0) + * + * The write to y and load from x by CPU1 are unordered by the hardware, + * so it's possible to have "r1 = x" reordered before "y = 1" at any + * point after (b). If the memory barrier at (a) is omitted, then "x = 1" + * can be reordered after (a) (although not after (c)), so we get r1 == 0 + * and r2 == 0. This violates the guarantee that membarrier() is + * supposed by provide. + * + * The timing of the memory barrier at (a) has to ensure that it executes + * before the IPI-induced memory barrier on CPU1. + * + * B) Userspace thread execution before IPI vs membarrier's memory + * barrier after completing the IPI + * + * Userspace variables: + * + * int x = 0, y = 0; + * + * The memory barrier at the end of membarrier() on CPU0 is necessary in + * order to enforce the guarantee that any writes occurring on CPU1 before + * the membarrier() is executed will be visible to any code executing on + * CPU0 after the membarrier(): + * + * CPU0 CPU1 + * + * x = 1 + * barrier() + * y = 1 + * r2 = y + * membarrier(): + * a: smp_mb() + * b: send IPI IPI-induced mb + * c: smp_mb() + * r1 = x + * BUG_ON(r1 == 0 && r2 == 1) + * + * The writes to x and y are unordered by the hardware, so it's possible to + * have "r2 = 1" even though the write to x doesn't execute until (b). If + * the memory barrier at (c) is omitted then "r1 = x" can be reordered + * before (b) (although not before (a)), so we get "r1 = 0". This violates + * the guarantee that membarrier() is supposed to provide. + * + * The timing of the memory barrier at (c) has to ensure that it executes + * after the IPI-induced memory barrier on CPU1. + * + * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * d: switch to kthread (includes mb) + * b: read rq->curr->mm == NULL + * e: switch to user (includes mb) + * c: smp_mb() + * + * Using the scenario from (A), we can show that (a) needs to be paired + * with (e). Using the scenario from (B), we can show that (c) needs to + * be paired with (d). + * + * D) exit_mm vs membarrier + * + * Two thread groups are created, A and B. Thread group B is created by + * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD. + * Let's assume we have a single thread within each thread group (Thread A + * and Thread B). Thread A runs on CPU0, Thread B runs on CPU1. + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * exit_mm(): + * d: smp_mb() + * e: current->mm = NULL + * b: read rq->curr->mm == NULL + * c: smp_mb() + * + * Using scenario (B), we can show that (c) needs to be paired with (d). + * + * E) kthread_{use,unuse}_mm vs membarrier + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * kthread_unuse_mm() + * d: smp_mb() + * e: current->mm = NULL + * b: read rq->curr->mm == NULL + * kthread_use_mm() + * f: current->mm = mm + * g: smp_mb() + * c: smp_mb() + * + * Using the scenario from (A), we can show that (a) needs to be paired + * with (g). Using the scenario from (B), we can show that (c) needs to + * be paired with (d). + */ /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, @@ -18,18 +145,61 @@ #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 #endif +#ifdef CONFIG_RSEQ +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ) +#else +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#endif + #define MEMBARRIER_CMD_BITMASK \ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ - | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) + | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ + | MEMBARRIER_CMD_GET_REGISTRATIONS) + +static DEFINE_MUTEX(membarrier_ipi_mutex); +#define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex) static void ipi_mb(void *info) { smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_core(void *info) +{ + /* + * The smp_mb() in membarrier after all the IPIs is supposed to + * ensure that memory on remote CPUs that occur before the IPI + * become visible to membarrier()'s caller -- see scenario B in + * the big comment at the top of this file. + * + * A sync_core() would provide this guarantee, but + * sync_core_before_usermode() might end up being deferred until + * after membarrier()'s smp_mb(). + */ + smp_mb(); /* IPIs should be serializing but paranoid. */ + + sync_core_before_usermode(); +} + +static void ipi_rseq(void *info) +{ + /* + * Ensure that all stores done by the calling thread are visible + * to the current task before the current task resumes. We could + * probably optimize this away on most architectures, but by the + * time we've already sent an IPI, the cost of the extra smp_mb() + * is negligible. + */ + smp_mb(); + rseq_preempt(current); +} + static void ipi_sync_rq_state(void *info) { struct mm_struct *mm = (struct mm_struct *) info; @@ -63,6 +233,18 @@ void membarrier_exec_mmap(struct mm_struct *mm) this_cpu_write(runqueues.membarrier_state, 0); } +void membarrier_update_current_mm(struct mm_struct *next_mm) +{ + struct rq *rq = this_rq(); + int membarrier_state = 0; + + if (next_mm) + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} + static int membarrier_global_expedited(void) { int cpu; @@ -72,7 +254,7 @@ static int membarrier_global_expedited(void) return 0; /* - * Matches memory barriers around rq->curr modification in + * Matches memory barriers after rq->curr modification in * scheduler. */ smp_mb(); /* system call entry is not a mb. */ @@ -80,6 +262,7 @@ static int membarrier_global_expedited(void) if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; + SERIALIZE_IPI(); cpus_read_lock(); rcu_read_lock(); for_each_online_cpu(cpu) { @@ -101,12 +284,11 @@ static int membarrier_global_expedited(void) continue; /* - * Skip the CPU if it runs a kernel thread. The scheduler - * leaves the prior task mm in place as an optimization when - * scheduling a kthread. + * Skip the CPU if it runs a kernel thread which is not using + * a task mm. */ p = rcu_dereference(cpu_rq(cpu)->curr); - if (p->flags & PF_KTHREAD) + if (!p->mm) continue; __cpumask_set_cpu(cpu, tmpmask); @@ -122,74 +304,128 @@ static int membarrier_global_expedited(void) /* * Memory barrier on the caller thread _after_ we finished - * waiting for the last IPI. Matches memory barriers around + * waiting for the last IPI. Matches memory barriers before * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ return 0; } -static int membarrier_private_expedited(int flags) +static int membarrier_private_expedited(int flags, int cpu_id) { - int cpu; cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; + smp_call_func_t ipi_func = ipi_mb; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + ipi_func = ipi_sync_core; + prepare_sync_core_cmd(mm); + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + if (!(atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY)) + return -EPERM; + ipi_func = ipi_rseq; } else { + WARN_ON_ONCE(flags); if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; } - if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) + if (flags != MEMBARRIER_FLAG_SYNC_CORE && + (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)) return 0; /* - * Matches memory barriers around rq->curr modification in + * Matches memory barriers after rq->curr modification in * scheduler. + * + * On RISC-V, this barrier pairing is also needed for the + * SYNC_CORE command when switching between processes, cf. + * the inline comments in membarrier_arch_switch_mm(). */ smp_mb(); /* system call entry is not a mb. */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; + SERIALIZE_IPI(); cpus_read_lock(); - rcu_read_lock(); - for_each_online_cpu(cpu) { + + if (cpu_id >= 0) { struct task_struct *p; + if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) + goto out; + rcu_read_lock(); + p = rcu_dereference(cpu_rq(cpu_id)->curr); + if (!p || p->mm != mm) { + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); + } else { + int cpu; + + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); + } + + if (cpu_id >= 0) { /* - * Skipping the current CPU is OK even through we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration. + * smp_call_function_single() will call ipi_func() if cpu_id + * is the calling CPU. */ - if (cpu == raw_smp_processor_id()) - continue; - p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == mm) - __cpumask_set_cpu(cpu, tmpmask); + smp_call_function_single(cpu_id, ipi_func, NULL, 1); + } else { + /* + * For regular membarrier, we can save a few cycles by + * skipping the current cpu -- we're about to do smp_mb() + * below, and if we migrate to a different cpu, this cpu + * and the new cpu will execute a full barrier in the + * scheduler. + * + * For SYNC_CORE, we do need a barrier on the current cpu -- + * otherwise, if we are migrated and replaced by a different + * task in the same mm just before, during, or after + * membarrier, we will end up with some thread in the mm + * running without a core sync. + * + * For RSEQ, don't rseq_preempt() the caller. User code + * is not supposed to issue syscalls at all from inside an + * rseq critical section. + */ + if (flags != MEMBARRIER_FLAG_SYNC_CORE) { + preempt_disable(); + smp_call_function_many(tmpmask, ipi_func, NULL, true); + preempt_enable(); + } else { + on_each_cpu_mask(tmpmask, ipi_func, NULL, true); + } } - rcu_read_unlock(); - preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); - preempt_enable(); - - free_cpumask_var(tmpmask); +out: + if (cpu_id < 0) + free_cpumask_var(tmpmask); cpus_read_unlock(); /* * Memory barrier on the caller thread _after_ we finished - * waiting for the last IPI. Matches memory barriers around + * waiting for the last IPI. Matches memory barriers before * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ @@ -229,11 +465,12 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) /* * For each cpu runqueue, if the task's mm match @mm, ensure that all - * @mm's membarrier state set bits are also set in in the runqueue's + * @mm's membarrier state set bits are also set in the runqueue's * membarrier state. This ensures that a runqueue scheduling * between threads which are users of @mm has its membarrier state * updated. */ + SERIALIZE_IPI(); cpus_read_lock(); rcu_read_lock(); for_each_online_cpu(cpu) { @@ -246,9 +483,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) } rcu_read_unlock(); - preempt_disable(); - smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); - preempt_enable(); + on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true); free_cpumask_var(tmpmask); cpus_read_unlock(); @@ -283,11 +518,18 @@ static int membarrier_register_private_expedited(int flags) set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, ret; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY; + } else { + WARN_ON_ONCE(flags); } /* @@ -299,6 +541,8 @@ static int membarrier_register_private_expedited(int flags) return 0; if (flags & MEMBARRIER_FLAG_SYNC_CORE) set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + if (flags & MEMBARRIER_FLAG_RSEQ) + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ; atomic_or(set_state, &mm->membarrier_state); ret = sync_runqueues_membarrier_state(mm); if (ret) @@ -308,10 +552,51 @@ static int membarrier_register_private_expedited(int flags) return 0; } +static int membarrier_get_registrations(void) +{ + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + int registrations_mask = 0, membarrier_state, i; + static const int states[] = { + MEMBARRIER_STATE_GLOBAL_EXPEDITED | + MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, + MEMBARRIER_STATE_PRIVATE_EXPEDITED | + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE | + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY, + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ | + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY + }; + static const int registration_cmds[] = { + MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ + }; + BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds)); + + membarrier_state = atomic_read(&mm->membarrier_state); + for (i = 0; i < ARRAY_SIZE(states); ++i) { + if (membarrier_state & states[i]) { + registrations_mask |= registration_cmds[i]; + membarrier_state &= ~states[i]; + } + } + WARN_ON_ONCE(membarrier_state != 0); + return registrations_mask; +} + /** * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0 for all commands other than + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter + * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id + * contains the CPU on which to interrupt (= restart) + * the RSEQ critical section. + * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which + * RSEQ CS should be interrupted (@cmd must be + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ). * * If this system call is not implemented, -ENOSYS is returned. If the * command specified does not exist, not available on the running @@ -337,10 +622,21 @@ static int membarrier_register_private_expedited(int flags) * smp_mb() X O O * sys_membarrier() O O O */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id) { - if (unlikely(flags)) - return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU)) + return -EINVAL; + break; + default: + if (unlikely(flags)) + return -EINVAL; + } + + if (!(flags & MEMBARRIER_CMD_FLAG_CPU)) + cpu_id = -1; + switch (cmd) { case MEMBARRIER_CMD_QUERY: { @@ -362,13 +658,19 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - return membarrier_private_expedited(0); + return membarrier_private_expedited(0, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: return membarrier_register_private_expedited(0); case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: - return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: + return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ); + case MEMBARRIER_CMD_GET_REGISTRATIONS: + return membarrier_get_registrations(); default: return -EINVAL; } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index b4b1ff96642f..7a8534a2deff 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Per Entity Load Tracking + * Per Entity Load Tracking (PELT) * * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * @@ -24,12 +24,6 @@ * Author: Vincent Guittot <vincent.guittot@linaro.org> */ -#include <linux/sched.h> -#include "sched.h" -#include "pelt.h" - -#include <trace/events/sched.h> - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) @@ -83,8 +77,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 @@ -137,7 +129,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa, * runnable = running = 0; * * clause from ___update_load_sum(); this results in - * the below usage of @contrib to dissapear entirely, + * the below usage of @contrib to disappear entirely, * so no point in calculating it. */ contrib = __accumulate_pelt_segments(periods, @@ -216,8 +208,8 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * se has been already dequeued but cfs_rq->curr still points to it. * This means that weight will be 0 but not running for a sched_entity * but also for a cfs_rq if the latter becomes idle. As an example, - * this happens during idle_balance() which calls - * update_blocked_averages(). + * this happens during sched_balance_newidle() which calls + * sched_balance_update_blocked_averages(). * * Also see the comment in accumulate_sum(). */ @@ -264,7 +256,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, static __always_inline void ___update_load_avg(struct sched_avg *sa, unsigned long load) { - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(sa); /* * Step 2: update *_avg. @@ -283,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = grq->h_nr_running + * se_runnable() = grq->h_nr_runnable * * runnable_sum = se_runnable() * runnable = grq->runnable_sum * runnable_avg = runnable_sum @@ -329,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->h_nr_running, + cfs_rq->h_nr_runnable, cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1); @@ -392,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } -#ifdef CONFIG_SCHED_THERMAL_PRESSURE +#ifdef CONFIG_SCHED_HW_PRESSURE /* - * thermal: + * hardware: * * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked * * util_avg and runnable_load_avg are not supported and meaningless. * * Unlike rt/dl utilization tracking that track time spent by a cpu - * running a rt/dl task through util_avg, the average thermal pressure is - * tracked through load_avg. This is because thermal pressure signal is + * running a rt/dl task through util_avg, the average HW pressure is + * tracked through load_avg. This is because HW pressure signal is * time weighted "delta" capacity unlike util_avg which is binary. * "delta capacity" = actual capacity - - * capped capacity a cpu due to a thermal event. + * capped capacity a cpu due to a HW event. */ -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { - if (___update_load_sum(now, &rq->avg_thermal, + if (___update_load_sum(now, &rq->avg_hw, capacity, capacity, capacity)) { - ___update_load_avg(&rq->avg_thermal, 1); - trace_pelt_thermal_tp(rq); + ___update_load_avg(&rq->avg_hw, 1); + trace_pelt_hw_tp(rq); return 1; } @@ -425,7 +417,7 @@ int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* - * irq: + * IRQ: * * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked * util_sum = cpu_scale * load_sum @@ -440,7 +432,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) int ret = 0; /* - * We can't use clock_pelt because irq time is not accounted in + * We can't use clock_pelt because IRQ time is not accounted in * clock_task. Instead we directly scale the running time to * reflect the real amount of computation */ @@ -475,3 +467,23 @@ int update_irq_load_avg(struct rq *rq, u64 running) return ret; } #endif + +/* + * Load avg and utiliztion metrics need to be updated periodically and before + * consumption. This function updates the metrics for all subsystems except for + * the fair class. @rq must be locked and have its clock updated. + */ +bool update_other_load_avgs(struct rq *rq) +{ + u64 now = rq_clock_pelt(rq); + const struct sched_class *curr_class = rq->donor->sched_class; + unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); + + lockdep_assert_rq_held(rq); + + /* hw_pressure doesn't care about invariance */ + return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) | + update_irq_load_avg(rq, 0); +} diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index eb034d9f024d..f4f6a0875c66 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -6,22 +6,23 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); +bool update_other_load_avgs(struct rq *rq); -#ifdef CONFIG_SCHED_THERMAL_PRESSURE -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); +#ifdef CONFIG_SCHED_HW_PRESSURE +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity); -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { - return READ_ONCE(rq->avg_thermal.load_avg); + return READ_ONCE(rq->avg_hw.load_avg); } #else static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } @@ -37,14 +38,12 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif -/* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. - * This flag is used to synchronize util_avg updates with util_est updates. - * We map this information into the LSB bit of the utilization saved at - * dequeue time (i.e. util_est.dequeued). - */ -#define UTIL_AVG_UNCHANGED 0x1 +#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024) + +static inline u32 get_pelt_divider(struct sched_avg *avg) +{ + return PELT_MIN_DIVIDER + avg->period_contrib; +} static inline void cfs_se_util_change(struct sched_avg *avg) { @@ -53,14 +52,33 @@ static inline void cfs_se_util_change(struct sched_avg *avg) if (!sched_feat(UTIL_EST)) return; - /* Avoid store if the flag has been already set */ - enqueued = avg->util_est.enqueued; + /* Avoid store if the flag has been already reset */ + enqueued = avg->util_est; if (!(enqueued & UTIL_AVG_UNCHANGED)) return; /* Reset flag to report util_avg has been updated */ enqueued &= ~UTIL_AVG_UNCHANGED; - WRITE_ONCE(avg->util_est.enqueued, enqueued); + WRITE_ONCE(avg->util_est, enqueued); +} + +static inline u64 rq_clock_pelt(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock_pelt - rq->lost_idle_time; +} + +/* The rq is idle, we can sync to clock_task */ +static inline void _update_idle_rq_clock_pelt(struct rq *rq) +{ + rq->clock_pelt = rq_clock_task(rq); + + u64_u32_store(rq->clock_idle, rq_clock(rq)); + /* Paired with smp_rmb in migrate_se_pelt_lag() */ + smp_wmb(); + u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq)); } /* @@ -78,8 +96,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg) static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) { if (unlikely(is_idle_task(rq->curr))) { - /* The rq is idle, we can sync to clock_task */ - rq->clock_pelt = rq_clock_task(rq); + _update_idle_rq_clock_pelt(rq); return; } @@ -125,33 +142,40 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq) * Reflecting stolen time makes sense only if the idle * phase would be present at max capacity. As soon as the * utilization of a rq has reached the maximum value, it is - * considered as an always runnig rq without idle time to + * considered as an always running rq without idle time to * steal. This potential idle time is considered as lost in * this case. We keep track of this lost idle time compare to * rq's clock_task. */ if (util_sum >= divider) rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; + + _update_idle_rq_clock_pelt(rq); } -static inline u64 rq_clock_pelt(struct rq *rq) +#ifdef CONFIG_CFS_BANDWIDTH +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { - lockdep_assert_held(&rq->lock); - assert_clock_updated(rq); + u64 throttled; - return rq->clock_pelt - rq->lost_idle_time; + if (unlikely(cfs_rq->throttle_count)) + throttled = U64_MAX; + else + throttled = cfs_rq->throttled_clock_pelt_time; + + u64_u32_store(cfs_rq->throttled_pelt_idle, throttled); } -#ifdef CONFIG_CFS_BANDWIDTH /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; - return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } #else +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { return rq_clock_pelt(rq_of(cfs_rq)); @@ -179,12 +203,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) } static inline int -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { return 0; } -static inline u64 thermal_load_avg(struct rq *rq) +static inline u64 hw_load_avg(struct rq *rq) { return 0; } @@ -206,6 +230,7 @@ update_rq_clock_pelt(struct rq *rq, s64 delta) { } static inline void update_idle_rq_clock_pelt(struct rq *rq) { } +static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } #endif diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8f45cdb6463b..bb56805e3d47 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Pressure stall information for CPU, memory and IO * @@ -34,12 +35,21 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * (Naturally, the FULL state doesn't exist for the CPU resource.) - * * SOME = nr_delayed_tasks != 0 - * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0 + * + * What it means for a task to be productive is defined differently + * for each resource. For IO, productive means a running task. For + * memory, productive means a running task that isn't a reclaimer. For + * CPU, productive means an on-CPU task. * - * The percentage of wallclock time spent in those compound stall + * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level. At the cgroup level, + * FULL means all non-idle tasks in the cgroup are delayed on the CPU + * resource which is being used by others outside of the cgroup or + * throttled by the cgroup cpu.max configuration. + * + * The percentage of wall clock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, * where the SOME percentage indicates workload slowdowns and the FULL * percentage indicates reduced CPU utilization: @@ -59,7 +69,7 @@ * states, we would have to conclude a CPU SOME pressure number of * 100%, since *somebody* is waiting on a runqueue at all * times. However, that is clearly not the amount of contention the - * workload is experiencing: only one out of 256 possible exceution + * workload is experiencing: only one out of 256 possible execution * threads will be contended at any given time, or about 0.4%. * * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any @@ -73,18 +83,18 @@ * we have to base our calculation on the number of non-idle tasks in * conjunction with the number of available CPUs, which is the number * of potential execution threads. SOME becomes then the proportion of - * delayed tasks to possibe threads, and FULL is the share of possible + * delayed tasks to possible threads, and FULL is the share of possible * threads that are unproductive due to delays: * * threads = min(nr_nonidle_tasks, nr_cpus) * SOME = min(nr_delayed_tasks / threads, 1) - * FULL = (threads - min(nr_running_tasks, threads)) / threads + * FULL = (threads - min(nr_productive_tasks, threads)) / threads * * For the 257 number crunchers on 256 CPUs, this yields: * * threads = min(257, 256) * SOME = min(1 / 256, 1) = 0.4% - * FULL = (256 - min(257, 256)) / 256 = 0% + * FULL = (256 - min(256, 256)) / 256 = 0% * * For the 1 out of 4 memory-delayed tasks, this yields: * @@ -109,7 +119,7 @@ * For each runqueue, we track: * * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) - * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu]) * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) * * and then periodically aggregate: @@ -127,24 +137,10 @@ * sampling of the aggregate task states would be. */ -#include "../workqueue_internal.h" -#include <linux/sched/loadavg.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> -#include <linux/seqlock.h> -#include <linux/uaccess.h> -#include <linux/cgroup.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/ctype.h> -#include <linux/file.h> -#include <linux/poll.h> -#include <linux/psi.h> -#include "sched.h" - static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); +static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED static bool psi_enable; @@ -164,7 +160,6 @@ __setup("psi=", setup_psi); #define EXP_300s 2034 /* 1/exp(2s/300s) */ /* PSI trigger definitions */ -#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ #define WINDOW_MAX_US 10000000 /* Max window size is 10s */ #define UPDATES_PER_WINDOW 10 /* 10 updates per window */ @@ -179,59 +174,76 @@ struct psi_group psi_system = { static void psi_avgs_work(struct work_struct *work); +static void poll_timer_fn(struct timer_list *t); + static void group_init(struct psi_group *group) { int cpu; + group->enabled = true; for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; - INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); - /* Init trigger-related members */ - atomic_set(&group->poll_scheduled, 0); - mutex_init(&group->trigger_lock); - INIT_LIST_HEAD(&group->triggers); - memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); - group->poll_states = 0; - group->poll_min_period = U32_MAX; - memset(group->polling_total, 0, sizeof(group->polling_total)); - group->polling_next_update = ULLONG_MAX; - group->polling_until = 0; - rcu_assign_pointer(group->poll_kworker, NULL); + + /* Init avg trigger-related members */ + INIT_LIST_HEAD(&group->avg_triggers); + memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers)); + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + + /* Init rtpoll trigger-related members */ + atomic_set(&group->rtpoll_scheduled, 0); + mutex_init(&group->rtpoll_trigger_lock); + INIT_LIST_HEAD(&group->rtpoll_triggers); + group->rtpoll_min_period = U32_MAX; + group->rtpoll_next_update = ULLONG_MAX; + init_waitqueue_head(&group->rtpoll_wait); + timer_setup(&group->rtpoll_timer, poll_timer_fn, 0); + rcu_assign_pointer(group->rtpoll_task, NULL); } void __init psi_init(void) { if (!psi_enable) { static_branch_enable(&psi_disabled); + static_branch_disable(&psi_cgroups_enabled); return; } + if (!cgroup_psi_enabled()) + static_branch_disable(&psi_cgroups_enabled); + psi_period = jiffies_to_nsecs(PSI_FREQ); group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state) +static u32 test_states(unsigned int *tasks, u32 state_mask) { - switch (state) { - case PSI_IO_SOME: - return tasks[NR_IOWAIT]; - case PSI_IO_FULL: - return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; - case PSI_MEM_SOME: - return tasks[NR_MEMSTALL]; - case PSI_MEM_FULL: - return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; - case PSI_CPU_SOME: - return tasks[NR_RUNNING] > tasks[NR_ONCPU]; - case PSI_NONIDLE: - return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || - tasks[NR_RUNNING]; - default: - return false; + const bool oncpu = state_mask & PSI_ONCPU; + + if (tasks[NR_IOWAIT]) { + state_mask |= BIT(PSI_IO_SOME); + if (!tasks[NR_RUNNING]) + state_mask |= BIT(PSI_IO_FULL); } + + if (tasks[NR_MEMSTALL]) { + state_mask |= BIT(PSI_MEM_SOME); + if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]) + state_mask |= BIT(PSI_MEM_FULL); + } + + if (tasks[NR_RUNNING] > oncpu) + state_mask |= BIT(PSI_CPU_SOME); + + if (tasks[NR_RUNNING] && !oncpu) + state_mask |= BIT(PSI_CPU_FULL); + + if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]) + state_mask |= BIT(PSI_NONIDLE); + + return state_mask; } static void get_recent_times(struct psi_group *group, int cpu, @@ -239,6 +251,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + int current_cpu = raw_smp_processor_id(); + unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; enum psi_states s; unsigned int seq; @@ -253,6 +267,8 @@ static void get_recent_times(struct psi_group *group, int cpu, memcpy(times, groupc->times, sizeof(groupc->times)); state_mask = groupc->state_mask; state_start = groupc->state_start; + if (cpu == current_cpu) + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); } while (read_seqcount_retry(&groupc->seq, seq)); /* Calculate state time deltas against the previous snapshot */ @@ -277,6 +293,28 @@ static void get_recent_times(struct psi_group *group, int cpu, if (delta) *pchanged_states |= (1 << s); } + + /* + * When collect_percpu_times() from the avgs_work, we don't want to + * re-arm avgs_work when all CPUs are IDLE. But the current CPU running + * this avgs_work is never IDLE, cause avgs_work can't be shut off. + * So for the current CPU, we need to re-arm avgs_work only when + * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs + * we can just check PSI_NONIDLE delta. + */ + if (current_work() == &group->avgs_work.work) { + bool reschedule; + + if (cpu == current_cpu) + reschedule = tasks[NR_RUNNING] + + tasks[NR_IOWAIT] + + tasks[NR_MEMSTALL] > 1; + else + reschedule = *pchanged_states & (1 << PSI_NONIDLE); + + if (reschedule) + *pchanged_states |= PSI_STATE_RESCHEDULE; + } } static void calc_avgs(unsigned long avg[3], int missed_periods, @@ -311,7 +349,7 @@ static void collect_percpu_times(struct psi_group *group, /* * Collect the per-cpu time buckets and average them into a - * single time sample that is normalized to wallclock time. + * single time sample that is normalized to wall clock time. * * For averaging, each CPU is weighted by its non-idle time in * the sampling period. This eliminates artifacts from uneven @@ -354,6 +392,114 @@ static void collect_percpu_times(struct psi_group *group, *pchanged_states = changed_states; } +/* Trigger tracking window manipulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div64_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void update_triggers(struct psi_group *group, u64 now, + enum psi_aggregators aggregator) +{ + struct psi_trigger *t; + u64 *total = group->total[aggregator]; + struct list_head *triggers; + u64 *aggregator_total; + + if (aggregator == PSI_AVGS) { + triggers = &group->avg_triggers; + aggregator_total = group->avg_total; + } else { + triggers = &group->rtpoll_triggers; + aggregator_total = group->rtpoll_total; + } + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, triggers, node) { + u64 growth; + bool new_stall; + + new_stall = aggregator_total[t->state] != total[t->state]; + + /* Check for stall activity or a previous threshold breach */ + if (!new_stall && !t->pending_event) + continue; + /* + * Check for new stall activity, as well as deferred + * events that occurred in the last window after the + * trigger had already fired (we want to ratelimit + * events without dropping any). + */ + if (new_stall) { + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (!t->pending_event) { + if (growth < t->threshold) + continue; + + t->pending_event = true; + } + } + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) { + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); + } + t->last_event_time = now; + /* Reset threshold breach flag once event got generated */ + t->pending_event = false; + } +} + static u64 update_averages(struct psi_group *group, u64 now) { unsigned long missed_periods = 0; @@ -412,7 +558,6 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; u32 changed_states; - bool nonidle; u64 now; dwork = to_delayed_work(work); @@ -423,7 +568,6 @@ static void psi_avgs_work(struct work_struct *work) now = sched_clock(); collect_percpu_times(group, PSI_AVGS, &changed_states); - nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -431,10 +575,12 @@ static void psi_avgs_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ - if (now >= group->avg_next_update) + if (now >= group->avg_next_update) { + update_triggers(group, now, PSI_AVGS); group->avg_next_update = update_averages(group, now); + } - if (nonidle) { + if (changed_states & PSI_STATE_RESCHEDULE) { schedule_delayed_work(dwork, nsecs_to_jiffies( group->avg_next_update - now) + 1); } @@ -442,194 +588,161 @@ static void psi_avgs_work(struct work_struct *work) mutex_unlock(&group->avgs_lock); } -/* Trigger tracking window manupulations */ -static void window_reset(struct psi_window *win, u64 now, u64 value, - u64 prev_growth) -{ - win->start_time = now; - win->start_value = value; - win->prev_growth = prev_growth; -} - -/* - * PSI growth tracking window update and growth calculation routine. - * - * This approximates a sliding tracking window by interpolating - * partially elapsed windows using historical growth data from the - * previous intervals. This minimizes memory requirements (by not storing - * all the intermediate values in the previous window) and simplifies - * the calculations. It works well because PSI signal changes only in - * positive direction and over relatively small window sizes the growth - * is close to linear. - */ -static u64 window_update(struct psi_window *win, u64 now, u64 value) -{ - u64 elapsed; - u64 growth; - - elapsed = now - win->start_time; - growth = value - win->start_value; - /* - * After each tracking window passes win->start_value and - * win->start_time get reset and win->prev_growth stores - * the average per-window growth of the previous window. - * win->prev_growth is then used to interpolate additional - * growth from the previous window assuming it was linear. - */ - if (elapsed > win->size) - window_reset(win, now, value, growth); - else { - u32 remaining; - - remaining = win->size - elapsed; - growth += div64_u64(win->prev_growth * remaining, win->size); - } - - return growth; -} - -static void init_triggers(struct psi_group *group, u64 now) +static void init_rtpoll_triggers(struct psi_group *group, u64 now) { struct psi_trigger *t; - list_for_each_entry(t, &group->triggers, node) + list_for_each_entry(t, &group->rtpoll_triggers, node) window_reset(&t->win, now, group->total[PSI_POLL][t->state], 0); - memcpy(group->polling_total, group->total[PSI_POLL], - sizeof(group->polling_total)); - group->polling_next_update = now + group->poll_min_period; + memcpy(group->rtpoll_total, group->total[PSI_POLL], + sizeof(group->rtpoll_total)); + group->rtpoll_next_update = now + group->rtpoll_min_period; } -static u64 update_triggers(struct psi_group *group, u64 now) +/* Schedule rtpolling if it's not already scheduled or forced. */ +static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay, + bool force) { - struct psi_trigger *t; - bool new_stall = false; - u64 *total = group->total[PSI_POLL]; + struct task_struct *task; /* - * On subsequent updates, calculate growth deltas and let - * watchers know when their specified thresholds are exceeded. + * atomic_xchg should be called even when !force to provide a + * full memory barrier (see the comment inside psi_rtpoll_work). */ - list_for_each_entry(t, &group->triggers, node) { - u64 growth; - - /* Check for stall activity */ - if (group->polling_total[t->state] == total[t->state]) - continue; - - /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. - */ - new_stall = true; - - /* Calculate growth since last update */ - growth = window_update(&t->win, now, total[t->state]); - if (growth < t->threshold) - continue; - - /* Limit event signaling to once per window */ - if (now < t->last_event_time + t->win.size) - continue; - - /* Generate an event */ - if (cmpxchg(&t->event, 0, 1) == 0) - wake_up_interruptible(&t->event_wait); - t->last_event_time = now; - } - - if (new_stall) - memcpy(group->polling_total, total, - sizeof(group->polling_total)); - - return now + group->poll_min_period; -} - -/* - * Schedule polling if it's not already scheduled. It's safe to call even from - * hotpath because even though kthread_queue_delayed_work takes worker->lock - * spinlock that spinlock is never contended due to poll_scheduled atomic - * preventing such competition. - */ -static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) -{ - struct kthread_worker *kworker; - - /* Do not reschedule if already scheduled */ - if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force) return; rcu_read_lock(); - kworker = rcu_dereference(group->poll_kworker); + task = rcu_dereference(group->rtpoll_task); /* * kworker might be NULL in case psi_trigger_destroy races with * psi_task_change (hotpath) which can't use locks */ - if (likely(kworker)) - kthread_queue_delayed_work(kworker, &group->poll_work, delay); + if (likely(task)) + mod_timer(&group->rtpoll_timer, jiffies + delay); else - atomic_set(&group->poll_scheduled, 0); + atomic_set(&group->rtpoll_scheduled, 0); rcu_read_unlock(); } -static void psi_poll_work(struct kthread_work *work) +static void psi_rtpoll_work(struct psi_group *group) { - struct kthread_delayed_work *dwork; - struct psi_group *group; + bool force_reschedule = false; u32 changed_states; u64 now; - dwork = container_of(work, struct kthread_delayed_work, work); - group = container_of(dwork, struct psi_group, poll_work); + mutex_lock(&group->rtpoll_trigger_lock); - atomic_set(&group->poll_scheduled, 0); + now = sched_clock(); - mutex_lock(&group->trigger_lock); + if (now > group->rtpoll_until) { + /* + * We are either about to start or might stop rtpolling if no + * state change was recorded. Resetting rtpoll_scheduled leaves + * a small window for psi_group_change to sneak in and schedule + * an immediate rtpoll_work before we get to rescheduling. One + * potential extra wakeup at the end of the rtpolling window + * should be negligible and rtpoll_next_update still keeps + * updates correctly on schedule. + */ + atomic_set(&group->rtpoll_scheduled, 0); + /* + * A task change can race with the rtpoll worker that is supposed to + * report on it. To avoid missing events, ensure ordering between + * rtpoll_scheduled and the task state accesses, such that if the + * rtpoll worker misses the state update, the task change is + * guaranteed to reschedule the rtpoll worker: + * + * rtpoll worker: + * atomic_set(rtpoll_scheduled, 0) + * smp_mb() + * LOAD states + * + * task change: + * STORE states + * if atomic_xchg(rtpoll_scheduled, 1) == 0: + * schedule rtpoll worker + * + * The atomic_xchg() implies a full barrier. + */ + smp_mb(); + } else { + /* The rtpolling window is not over, keep rescheduling */ + force_reschedule = true; + } - now = sched_clock(); collect_percpu_times(group, PSI_POLL, &changed_states); - if (changed_states & group->poll_states) { - /* Initialize trigger windows when entering polling mode */ - if (now > group->polling_until) - init_triggers(group, now); + if (changed_states & group->rtpoll_states) { + /* Initialize trigger windows when entering rtpolling mode */ + if (now > group->rtpoll_until) + init_rtpoll_triggers(group, now); /* * Keep the monitor active for at least the duration of the * minimum tracking window as long as monitor states are * changing. */ - group->polling_until = now + - group->poll_min_period * UPDATES_PER_WINDOW; + group->rtpoll_until = now + + group->rtpoll_min_period * UPDATES_PER_WINDOW; } - if (now > group->polling_until) { - group->polling_next_update = ULLONG_MAX; + if (now > group->rtpoll_until) { + group->rtpoll_next_update = ULLONG_MAX; goto out; } - if (now >= group->polling_next_update) - group->polling_next_update = update_triggers(group, now); + if (now >= group->rtpoll_next_update) { + if (changed_states & group->rtpoll_states) { + update_triggers(group, now, PSI_POLL); + memcpy(group->rtpoll_total, group->total[PSI_POLL], + sizeof(group->rtpoll_total)); + } + group->rtpoll_next_update = now + group->rtpoll_min_period; + } - psi_schedule_poll_work(group, - nsecs_to_jiffies(group->polling_next_update - now) + 1); + psi_schedule_rtpoll_work(group, + nsecs_to_jiffies(group->rtpoll_next_update - now) + 1, + force_reschedule); out: - mutex_unlock(&group->trigger_lock); + mutex_unlock(&group->rtpoll_trigger_lock); } -static void record_times(struct psi_group_cpu *groupc, int cpu, - bool memstall_tick) +static int psi_rtpoll_worker(void *data) +{ + struct psi_group *group = (struct psi_group *)data; + + sched_set_fifo_low(current); + + while (true) { + wait_event_interruptible(group->rtpoll_wait, + atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) || + kthread_should_stop()); + if (kthread_should_stop()) + break; + + psi_rtpoll_work(group); + } + return 0; +} + +static void poll_timer_fn(struct timer_list *t) +{ + struct psi_group *group = from_timer(group, t, rtpoll_timer); + + atomic_set(&group->rtpoll_wakeup, 1); + wake_up_interruptible(&group->rtpoll_wait); +} + +static void record_times(struct psi_group_cpu *groupc, u64 now) { u32 delta; - u64 now; - now = cpu_clock(cpu); delta = now - groupc->state_start; groupc->state_start = now; @@ -643,27 +756,13 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_MEM_SOME] += delta; if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; - else if (memstall_tick) { - u32 sample; - /* - * Since we care about lost potential, a - * memstall is FULL when there are no other - * working tasks, but also when the CPU is - * actively reclaiming and nothing productive - * could run even if it were runnable. - * - * When the timer tick sees a reclaiming CPU, - * regardless of runnable tasks, sample a FULL - * tick (or less if it hasn't been a full tick - * since the last state change). - */ - sample = min(delta, (u32)jiffies_to_nsecs(1)); - groupc->times[PSI_MEM_FULL] += sample; - } } - if (groupc->state_mask & (1 << PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) { groupc->times[PSI_CPU_SOME] += delta; + if (groupc->state_mask & (1 << PSI_CPU_FULL)) + groupc->times[PSI_CPU_FULL] += delta; + } if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; @@ -674,78 +773,112 @@ static void psi_group_change(struct psi_group *group, int cpu, bool wake_clock) { struct psi_group_cpu *groupc; - u32 state_mask = 0; unsigned int t, m; - enum psi_states s; + u32 state_mask; + u64 now; + lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we assess the aggregate resource states this CPU's - * tasks have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - * - * Then we update the task counts according to the state + * First we update the task counts according to the state * change requested through the @clear and @set bits. + * + * Then if the cgroup PSI stats accounting enabled, we + * assess the aggregate resource states this CPU's tasks + * have been in since the last change, and account any + * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); - record_times(groupc, cpu, false); + /* + * Start with TSK_ONCPU, which doesn't have a corresponding + * task count - it's just a boolean flag directly encoded in + * the state mask. Clear, set, or carry the current state if + * no changes are requested. + */ + if (unlikely(clear & TSK_ONCPU)) { + state_mask = 0; + clear &= ~TSK_ONCPU; + } else if (unlikely(set & TSK_ONCPU)) { + state_mask = PSI_ONCPU; + set &= ~TSK_ONCPU; + } else { + state_mask = groupc->state_mask & PSI_ONCPU; + } + /* + * The rest of the state mask is calculated based on the task + * counts. Update those first, then construct the mask. + */ for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; - if (groupc->tasks[t] == 0 && !psi_bug) { + if (groupc->tasks[t]) { + groupc->tasks[t]--; + } else if (!psi_bug) { printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], groupc->tasks[3], clear, set); psi_bug = 1; } - groupc->tasks[t]--; } for (t = 0; set; set &= ~(1 << t), t++) if (set & (1 << t)) groupc->tasks[t]++; - /* Calculate state mask representing active states */ - for (s = 0; s < NR_PSI_STATES; s++) { - if (test_state(groupc->tasks, s)) - state_mask |= (1 << s); + if (!group->enabled) { + /* + * On the first group change after disabling PSI, conclude + * the current state and flush its time. This is unlikely + * to matter to the user, but aggregation (get_recent_times) + * may have already incorporated the live state into times_prev; + * avoid a delta sample underflow when PSI is later re-enabled. + */ + if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) + record_times(groupc, now); + + groupc->state_mask = state_mask; + + write_seqcount_end(&groupc->seq); + return; } + + state_mask = test_states(groupc->tasks, state_mask); + + /* + * Since we care about lost potential, a memstall is FULL + * when there are no other working tasks, but also when + * the CPU is actively reclaiming and nothing productive + * could run even if it were runnable. So when the current + * task in a cgroup is in_memstall, the corresponding groupc + * on that cpu is in PSI_MEM_FULL state. + */ + if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall)) + state_mask |= (1 << PSI_MEM_FULL); + + record_times(groupc, now); + groupc->state_mask = state_mask; write_seqcount_end(&groupc->seq); - if (state_mask & group->poll_states) - psi_schedule_poll_work(group, 1); + if (state_mask & group->rtpoll_states) + psi_schedule_rtpoll_work(group, 1, false); if (wake_clock && !delayed_work_pending(&group->avgs_work)) schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +static inline struct psi_group *task_psi_group(struct task_struct *task) { #ifdef CONFIG_CGROUPS - struct cgroup *cgroup = NULL; - - if (!*iter) - cgroup = task->cgroups->dfl_cgrp; - else if (*iter == &psi_system) - return NULL; - else - cgroup = cgroup_parent(*iter); - - if (cgroup && cgroup_parent(cgroup)) { - *iter = cgroup; - return cgroup_psi(cgroup); - } -#else - if (*iter) - return NULL; + if (static_branch_likely(&psi_cgroups_enabled)) + return cgroup_psi(task_dfl_cgroup(task)); #endif - *iter = &psi_system; return &psi_system; } @@ -768,27 +901,16 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - bool wake_clock = true; - void *iter = NULL; if (!task->pid) return; psi_flags_change(task, clear, set); - /* - * Periodic aggregation shuts off if there is a period of no - * task changes, so we wake it back up if necessary. However, - * don't do this if the task change is the aggregation worker - * itself going to sleep, or we'll ping-pong forever. - */ - if (unlikely((clear & TSK_RUNNING) && - (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_avgs_work)) - wake_clock = false; - - while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set, wake_clock); + group = task_psi_group(task); + do { + psi_group_change(group, cpu, clear, set, true); + } while ((group = group->parent)); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -796,59 +918,124 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - void *iter; if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); /* - * When moving state between tasks, the group that - * contains them both does not change: we can stop - * updating the tree once we reach the first common - * ancestor. Iterate @next's ancestors until we - * encounter @prev's state. + * Set TSK_ONCPU on @next's cgroups. If @next shares any + * ancestors with @prev, those will already have @prev's + * TSK_ONCPU bit set, and we can stop the iteration there. */ - iter = NULL; - while ((group = iterate_groups(next, &iter))) { - if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + group = task_psi_group(next); + do { + if (per_cpu_ptr(group->pcpu, cpu)->state_mask & + PSI_ONCPU) { common = group; break; } psi_group_change(group, cpu, 0, TSK_ONCPU, true); - } + } while ((group = group->parent)); } - /* - * If this is a voluntary sleep, dequeue will have taken care - * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We - * only need to deal with it during preemption. - */ - if (sleep) - return; - if (prev->pid) { - psi_flags_change(prev, TSK_ONCPU, 0); + int clear = TSK_ONCPU, set = 0; + bool wake_clock = true; + + /* + * When we're going to sleep, psi_dequeue() lets us + * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and + * TSK_IOWAIT here, where we can combine it with + * TSK_ONCPU and save walking common ancestors twice. + */ + if (sleep) { + clear |= TSK_RUNNING; + if (prev->in_memstall) + clear |= TSK_MEMSTALL_RUNNING; + if (prev->in_iowait) + set |= TSK_IOWAIT; + + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((prev->flags & PF_WQ_WORKER) && + wq_worker_last_func(prev) == psi_avgs_work)) + wake_clock = false; + } - iter = NULL; - while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, TSK_ONCPU, 0, true); + psi_flags_change(prev, clear, set); + + group = task_psi_group(prev); + do { + if (group == common) + break; + psi_group_change(group, cpu, clear, set, wake_clock); + } while ((group = group->parent)); + + /* + * TSK_ONCPU is handled up to the common ancestor. If there are + * any other differences between the two tasks (e.g. prev goes + * to sleep, or only one task is memstall), finish propagating + * those differences all the way up to the root. + */ + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { + clear &= ~TSK_ONCPU; + for (; group; group = group->parent) + psi_group_change(group, cpu, clear, set, wake_clock); + } } } -void psi_memstall_tick(struct task_struct *task, int cpu) +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { + int cpu = task_cpu(curr); struct psi_group *group; - void *iter = NULL; + struct psi_group_cpu *groupc; + s64 delta; + u64 irq; + + if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) + return; - while ((group = iterate_groups(task, &iter))) { - struct psi_group_cpu *groupc; + if (!curr->pid) + return; + + lockdep_assert_rq_held(rq); + group = task_psi_group(curr); + if (prev && task_psi_group(prev) == group) + return; + + irq = irq_time_read(cpu); + delta = (s64)(irq - rq->psi_irq_time); + if (delta < 0) + return; + rq->psi_irq_time = irq; + + do { + u64 now; + + if (!group->enabled) + continue; groupc = per_cpu_ptr(group->pcpu, cpu); + write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu, true); + now = cpu_clock(cpu); + + record_times(groupc, now); + groupc->times[PSI_IRQ_FULL] += delta; + write_seqcount_end(&groupc->seq); - } + + if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) + psi_schedule_rtpoll_work(group, 1, false); + } while ((group = group->parent)); } +#endif /** * psi_memstall_enter - mark the beginning of a memory stall section @@ -876,10 +1063,11 @@ void psi_memstall_enter(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 1; - psi_task_change(current, 0, TSK_MEMSTALL); + psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_enter); /** * psi_memstall_leave - mark the end of an memory stall section @@ -905,33 +1093,42 @@ void psi_memstall_leave(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 0; - psi_task_change(current, TSK_MEMSTALL, 0); + psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0); rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return 0; - cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); - if (!cgroup->psi.pcpu) + cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); + if (!cgroup->psi) + return -ENOMEM; + + cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi->pcpu) { + kfree(cgroup->psi); return -ENOMEM; - group_init(&cgroup->psi); + } + group_init(cgroup->psi); + cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup)); return 0; } void psi_cgroup_free(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return; - cancel_delayed_work_sync(&cgroup->psi.avgs_work); - free_percpu(cgroup->psi.pcpu); + cancel_delayed_work_sync(&cgroup->psi->avgs_work); + free_percpu(cgroup->psi->pcpu); /* All triggers must be removed by now */ - WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); + WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n"); + kfree(cgroup->psi); } /** @@ -948,11 +1145,11 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - unsigned int task_flags = 0; + unsigned int task_flags; struct rq_flags rf; struct rq *rq; - if (static_branch_likely(&psi_disabled)) { + if (!static_branch_likely(&psi_cgroups_enabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -963,15 +1160,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) rq = task_rq_lock(task, &rf); - if (task_on_rq_queued(task)) { - task_flags = TSK_RUNNING; - if (task_current(rq, task)) - task_flags |= TSK_ONCPU; - } else if (task->in_iowait) - task_flags = TSK_IOWAIT; - - if (task->in_memstall) - task_flags |= TSK_MEMSTALL; + /* + * We may race with schedule() dropping the rq lock between + * deactivating prev and switching to next. Because the psi + * updates from the deactivation are deferred to the switch + * callback to save cgroup tree updates, the task's scheduling + * state here is not coherent with its psi state: + * + * schedule() cgroup_move_task() + * rq_lock() + * deactivate_task() + * p->on_rq = 0 + * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates + * pick_next_task() + * rq_unlock() + * rq_lock() + * psi_task_change() // old cgroup + * task->cgroups = to + * psi_task_change() // new cgroup + * rq_unlock() + * rq_lock() + * psi_sched_switch() // does deferred updates in new cgroup + * + * Don't rely on the scheduling state. Use psi_flags instead. + */ + task_flags = task->psi_flags; if (task_flags) psi_task_change(task, task_flags, 0); @@ -984,16 +1197,54 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) task_rq_unlock(rq, task, &rf); } + +void psi_cgroup_restart(struct psi_group *group) +{ + int cpu; + + /* + * After we disable psi_group->enabled, we don't actually + * stop percpu tasks accounting in each psi_group_cpu, + * instead only stop test_states() loop, record_times() + * and averaging worker, see psi_group_change() for details. + * + * When disable cgroup PSI, this function has nothing to sync + * since cgroup pressure files are hidden and percpu psi_group_cpu + * would see !psi_group->enabled and only do task accounting. + * + * When re-enable cgroup PSI, this function use psi_group_change() + * to get correct state mask from test_states() loop on tasks[], + * and restart groupc->state_start from now, use .clear = .set = 0 + * here since no task status really changed. + */ + if (!group->enabled) + return; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + rq_lock_irq(rq, &rf); + psi_group_change(group, cpu, 0, 0, true); + rq_unlock_irq(rq, &rf); + } +} #endif /* CONFIG_CGROUPS */ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { + bool only_full = false; int full; u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (!irqtime_enabled() && res == PSI_IRQ) + return -EOPNOTSUPP; +#endif + /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); @@ -1002,18 +1253,25 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); - for (full = 0; full < 2 - (res == PSI_CPU); full++) { - unsigned long avg[3]; - u64 total; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + only_full = res == PSI_IRQ; +#endif + + for (full = 0; full < 2 - only_full; full++) { + unsigned long avg[3] = { 0, }; + u64 total = 0; int w; - for (w = 0; w < 3; w++) - avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[PSI_AVGS][res * 2 + full], - NSEC_PER_USEC); + /* CPU FULL is undefined at the system level */ + if (!(group == &psi_system && res == PSI_CPU && full)) { + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); + } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", - full ? "full" : "some", + full || only_full ? "full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), @@ -1023,47 +1281,25 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -static int psi_io_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_IO); -} - -static int psi_memory_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_MEM); -} - -static int psi_cpu_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_CPU); -} - -static int psi_io_open(struct inode *inode, struct file *file) -{ - return single_open(file, psi_io_show, NULL); -} - -static int psi_memory_open(struct inode *inode, struct file *file) -{ - return single_open(file, psi_memory_show, NULL); -} - -static int psi_cpu_open(struct inode *inode, struct file *file) -{ - return single_open(file, psi_cpu_show, NULL); -} - -struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res) +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, + enum psi_res res, struct file *file, + struct kernfs_open_file *of) { struct psi_trigger *t; enum psi_states state; u32 threshold_us; + bool privileged; u32 window_us; if (static_branch_likely(&psi_disabled)) return ERR_PTR(-EOPNOTSUPP); + /* + * Checking the privilege here on file->f_cred implies that a privileged user + * could open the file and delegate the write to an unprivileged one. + */ + privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE); + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) state = PSI_IO_SOME + res * 2; else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) @@ -1071,11 +1307,22 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, else return ERR_PTR(-EINVAL); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + return ERR_PTR(-EINVAL); +#endif + if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); - if (window_us < WINDOW_MIN_US || - window_us > WINDOW_MAX_US) + if (window_us == 0 || window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* + * Unprivileged users can only use 2s windows so that averages aggregation + * work is used, and no RT threads need to be spawned. + */ + if (!privileged && window_us % 2000000) return ERR_PTR(-EINVAL); /* Check threshold */ @@ -1090,123 +1337,137 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->state = state; t->threshold = threshold_us * NSEC_PER_USEC; t->win.size = window_us * NSEC_PER_USEC; - window_reset(&t->win, 0, 0, 0); + window_reset(&t->win, sched_clock(), + group->total[PSI_POLL][t->state], 0); t->event = 0; t->last_event_time = 0; - init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); - - mutex_lock(&group->trigger_lock); - - if (!rcu_access_pointer(group->poll_kworker)) { - struct sched_param param = { - .sched_priority = 1, - }; - struct kthread_worker *kworker; - - kworker = kthread_create_worker(0, "psimon"); - if (IS_ERR(kworker)) { - kfree(t); - mutex_unlock(&group->trigger_lock); - return ERR_CAST(kworker); + t->of = of; + if (!of) + init_waitqueue_head(&t->event_wait); + t->pending_event = false; + t->aggregator = privileged ? PSI_POLL : PSI_AVGS; + + if (privileged) { + mutex_lock(&group->rtpoll_trigger_lock); + + if (!rcu_access_pointer(group->rtpoll_task)) { + struct task_struct *task; + + task = kthread_create(psi_rtpoll_worker, group, "psimon"); + if (IS_ERR(task)) { + kfree(t); + mutex_unlock(&group->rtpoll_trigger_lock); + return ERR_CAST(task); + } + atomic_set(&group->rtpoll_wakeup, 0); + wake_up_process(task); + rcu_assign_pointer(group->rtpoll_task, task); } - sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); - kthread_init_delayed_work(&group->poll_work, - psi_poll_work); - rcu_assign_pointer(group->poll_kworker, kworker); - } - list_add(&t->node, &group->triggers); - group->poll_min_period = min(group->poll_min_period, - div_u64(t->win.size, UPDATES_PER_WINDOW)); - group->nr_triggers[t->state]++; - group->poll_states |= (1 << t->state); + list_add(&t->node, &group->rtpoll_triggers); + group->rtpoll_min_period = min(group->rtpoll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->rtpoll_nr_triggers[t->state]++; + group->rtpoll_states |= (1 << t->state); - mutex_unlock(&group->trigger_lock); + mutex_unlock(&group->rtpoll_trigger_lock); + } else { + mutex_lock(&group->avgs_lock); + list_add(&t->node, &group->avg_triggers); + group->avg_nr_triggers[t->state]++; + + mutex_unlock(&group->avgs_lock); + } return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; - struct kthread_worker *kworker_to_destroy = NULL; + struct psi_group *group; + struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* - * Wakeup waiters to stop polling. Can happen if cgroup is deleted - * from under a polling process. + * Wakeup waiters to stop polling and clear the queue to prevent it from + * being accessed later. Can happen if cgroup is deleted from under a + * polling process. */ - wake_up_interruptible(&t->event_wait); - - mutex_lock(&group->trigger_lock); - - if (!list_empty(&t->node)) { - struct psi_trigger *tmp; - u64 period = ULLONG_MAX; - - list_del(&t->node); - group->nr_triggers[t->state]--; - if (!group->nr_triggers[t->state]) - group->poll_states &= ~(1 << t->state); - /* reset min update period for the remaining triggers */ - list_for_each_entry(tmp, &group->triggers, node) - period = min(period, div_u64(tmp->win.size, - UPDATES_PER_WINDOW)); - group->poll_min_period = period; - /* Destroy poll_kworker when the last trigger is destroyed */ - if (group->poll_states == 0) { - group->polling_until = 0; - kworker_to_destroy = rcu_dereference_protected( - group->poll_kworker, - lockdep_is_held(&group->trigger_lock)); - rcu_assign_pointer(group->poll_kworker, NULL); + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); + + if (t->aggregator == PSI_AVGS) { + mutex_lock(&group->avgs_lock); + if (!list_empty(&t->node)) { + list_del(&t->node); + group->avg_nr_triggers[t->state]--; } + mutex_unlock(&group->avgs_lock); + } else { + mutex_lock(&group->rtpoll_trigger_lock); + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->rtpoll_nr_triggers[t->state]--; + if (!group->rtpoll_nr_triggers[t->state]) + group->rtpoll_states &= ~(1 << t->state); + /* + * Reset min update period for the remaining triggers + * iff the destroying trigger had the min window size. + */ + if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) { + list_for_each_entry(tmp, &group->rtpoll_triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->rtpoll_min_period = period; + } + /* Destroy rtpoll_task when the last trigger is destroyed */ + if (group->rtpoll_states == 0) { + group->rtpoll_until = 0; + task_to_destroy = rcu_dereference_protected( + group->rtpoll_task, + lockdep_is_held(&group->rtpoll_trigger_lock)); + rcu_assign_pointer(group->rtpoll_task, NULL); + del_timer(&group->rtpoll_timer); + } + } + mutex_unlock(&group->rtpoll_trigger_lock); } - mutex_unlock(&group->trigger_lock); - /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_kworker RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_kworker + * Wait for psi_schedule_rtpoll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * rtpoll_task. */ synchronize_rcu(); /* - * Destroy the kworker after releasing trigger_lock to prevent a - * deadlock while waiting for psi_poll_work to acquire trigger_lock + * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent + * a deadlock while waiting for psi_rtpoll_work to acquire + * rtpoll_trigger_lock */ - if (kworker_to_destroy) { + if (task_to_destroy) { /* * After the RCU grace period has expired, the worker - * can no longer be found through group->poll_kworker. - * But it might have been already scheduled before - * that - deschedule it cleanly before destroying it. + * can no longer be found through group->rtpoll_task. */ - kthread_cancel_delayed_work_sync(&group->poll_work); - atomic_set(&group->poll_scheduled, 0); - - kthread_destroy_worker(kworker_to_destroy); + kthread_stop(task_to_destroy); + atomic_set(&group->rtpoll_scheduled, 0); } kfree(t); } -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1216,27 +1477,52 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - rcu_read_unlock(); - - poll_wait(file, &t->event_wait, wait); + if (t->of) + kernfs_generic_poll(t->of, wait); + else + poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } +#ifdef CONFIG_PROC_FS +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_io_show, NULL); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_memory_show, NULL); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_cpu_show, NULL); +} + static ssize_t psi_write(struct file *file, const char __user *user_buf, size_t nbytes, enum psi_res res) { @@ -1257,14 +1543,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, res, file, NULL); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1299,7 +1595,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } @@ -1330,14 +1626,46 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, }; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int psi_irq_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IRQ); +} + +static int psi_irq_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_irq_show, NULL); +} + +static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IRQ); +} + +static const struct proc_ops psi_irq_proc_ops = { + .proc_open = psi_irq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_irq_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, +}; +#endif + static int __init psi_proc_init(void) { if (psi_enable) { proc_mkdir("pressure", NULL); - proc_create("pressure/io", 0, NULL, &psi_io_proc_ops); - proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops); - proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops); + proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); + proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); + proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif } return 0; } module_init(psi_proc_init); + +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f395ddb75f38..4b8e33c615b1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -3,18 +3,98 @@ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR * policies) */ -#include "sched.h" - -#include "pelt.h" int sched_rr_timeslice = RR_TIMESLICE; -int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); +/* + * period over which we measure -rt task CPU usage in us. + * default: 1s + */ +int sysctl_sched_rt_period = 1000000; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + +#ifdef CONFIG_SYSCTL +static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ; +static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +static const struct ctl_table sched_rt_sysctls[] = { + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_NEG_ONE, + .extra2 = (void *)&sysctl_sched_rt_period, + }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sysctl_sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, +}; + +static int __init sched_rt_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_rt_sysctls); + return 0; +} +late_initcall(sched_rt_sysctl_init); +#endif + +void init_rt_rq(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); -struct rt_bandwidth def_rt_bandwidth; +#if defined CONFIG_SMP + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + rt_rq->highest_prio.next = MAX_RT_PRIO-1; + rt_rq->overloaded = 0; + plist_head_init(&rt_rq->pushable_tasks); +#endif /* CONFIG_SMP */ + /* We start is dequeued state, because no RT tasks are queued */ + rt_rq->rt_queued = 0; + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + raw_spin_lock_init(&rt_rq->rt_runtime_lock); +#endif +} + +#ifdef CONFIG_RT_GROUP_SCHED + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { @@ -52,11 +132,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.function = sched_rt_period_timer; } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) { - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; @@ -75,36 +152,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -void init_rt_rq(struct rt_rq *rt_rq) +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->highest_prio.next = MAX_RT_PRIO; - rt_rq->rt_nr_migratory = 0; - rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks); -#endif /* CONFIG_SMP */ - /* We start is dequeued state, because no RT tasks are queued */ - rt_rq->rt_queued = 0; + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - raw_spin_lock_init(&rt_rq->rt_runtime_lock); + do_start_rt_bandwidth(rt_b); } -#ifdef CONFIG_RT_GROUP_SCHED static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) { hrtimer_cancel(&rt_b->rt_period_timer); @@ -137,12 +192,15 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) return rt_rq->rq; } -void free_rt_sched_group(struct task_group *tg) +void unregister_rt_sched_group(struct task_group *tg) { - int i; - if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); +} + +void free_rt_sched_group(struct task_group *tg) +{ + int i; for_each_possible_cpu(i) { if (tg->rt_rq) @@ -161,7 +219,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->rt_nr_boosted = 0; rt_rq->rq = rq; rt_rq->tg = tg; @@ -195,8 +253,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!tg->rt_se) goto err; - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); + init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0); for_each_possible_cpu(i) { rt_rq = kzalloc_node(sizeof(struct rt_rq), @@ -250,6 +307,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return &rq->rt; } +void unregister_rt_sched_group(struct task_group *tg) { } + void free_rt_sched_group(struct task_group *tg) { } int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) @@ -260,12 +319,10 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP -static void pull_rt_task(struct rq *this_rq); - static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + return rq->online && rq->rt.highest_prio.curr > prev->prio; } static inline int rt_overloaded(struct rq *rq) @@ -302,60 +359,13 @@ static inline void rt_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } -static void update_rt_migration(struct rt_rq *rt_rq) -{ - if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { - if (!rt_rq->overloaded) { - rt_set_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 1; - } - } else if (rt_rq->overloaded) { - rt_clear_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 0; - } -} - -static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - return; - - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory++; - - update_rt_migration(rt_rq); -} - -static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - return; - - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory--; - - update_rt_migration(rt_rq); -} - static inline int has_pushable_tasks(struct rq *rq) { return !plist_head_empty(&rq->rt.pushable_tasks); } -static DEFINE_PER_CPU(struct callback_head, rt_push_head); -static DEFINE_PER_CPU(struct callback_head, rt_pull_head); +static DEFINE_PER_CPU(struct balance_callback, rt_push_head); +static DEFINE_PER_CPU(struct balance_callback, rt_pull_head); static void push_rt_tasks(struct rq *); static void pull_rt_task(struct rq *); @@ -382,6 +392,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) /* Update the highest prio pushable task */ if (p->prio < rq->rt.highest_prio.next) rq->rt.highest_prio.next = p->prio; + + if (!rq->rt.overloaded) { + rt_set_overload(rq); + rq->rt.overloaded = 1; + } } static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) @@ -393,8 +408,14 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) p = plist_first_entry(&rq->rt.pushable_tasks, struct task_struct, pushable_tasks); rq->rt.highest_prio.next = p->prio; - } else - rq->rt.highest_prio.next = MAX_RT_PRIO; + } else { + rq->rt.highest_prio.next = MAX_RT_PRIO-1; + + if (rq->rt.overloaded) { + rt_clear_overload(rq); + rq->rt.overloaded = 0; + } + } } #else @@ -407,32 +428,13 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) { } -static inline -void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - -static inline -void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - -static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) -{ - return false; -} - -static inline void pull_rt_task(struct rq *this_rq) -{ -} - static inline void rt_queue_push_tasks(struct rq *rq) { } #endif /* CONFIG_SMP */ static void enqueue_top_rt_rq(struct rt_rq *rt_rq); -static void dequeue_top_rt_rq(struct rt_rq *rt_rq); +static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { @@ -461,13 +463,13 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) unsigned int cpu_cap; /* Only heterogeneous systems can benefit from this check */ - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return true; min_cap = uclamp_eff_value(p, UCLAMP_MIN); max_cap = uclamp_eff_value(p, UCLAMP_MAX); - cpu_cap = capacity_orig_of(cpu); + cpu_cap = arch_scale_cpu_capacity(cpu); return cpu_cap >= min(min_cap, max_cap); } @@ -526,7 +528,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor; struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; @@ -540,7 +542,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) else if (!on_rt_rq(rt_se)) enqueue_rt_entity(rt_se, 0); - if (rt_rq->highest_prio.curr < curr->prio) + if (rt_rq->highest_prio.curr < donor->prio) resched_curr(rq); } } @@ -553,7 +555,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; if (!rt_se) { - dequeue_top_rt_rq(rt_rq); + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); } @@ -601,70 +603,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) return &rt_rq->tg->rt_bandwidth; } -#else /* !CONFIG_RT_GROUP_SCHED */ - -static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) -{ - return rt_rq->rt_runtime; -} - -static inline u64 sched_rt_period(struct rt_rq *rt_rq) -{ - return ktime_to_ns(def_rt_bandwidth.rt_period); -} - -typedef struct rt_rq *rt_rq_iter_t; - -#define for_each_rt_rq(rt_rq, iter, rq) \ - for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - -#define for_each_sched_rt_entity(rt_se) \ - for (; rt_se; rt_se = NULL) - -static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) -{ - return NULL; -} - -static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) -{ - struct rq *rq = rq_of_rt_rq(rt_rq); - - if (!rt_rq->rt_nr_running) - return; - - enqueue_top_rt_rq(rt_rq); - resched_curr(rq); -} - -static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -{ - dequeue_top_rt_rq(rt_rq); -} - -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} - -static inline -struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) -{ - return &cpu_rq(cpu)->rt; -} - -static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) -{ - return &def_rt_bandwidth; -} - -#endif /* CONFIG_RT_GROUP_SCHED */ - bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -699,7 +637,7 @@ static void do_balance_runtime(struct rt_rq *rt_rq) /* * Either all rqs have inf runtime and there's nothing to steal * or __disable_runtime() below sets a specific rq to inf to - * indicate its been disabled and disalow stealing. + * indicate its been disabled and disallow stealing. */ if (iter->rt_runtime == RUNTIME_INF) goto next; @@ -795,7 +733,7 @@ static void __disable_runtime(struct rq *rq) * We cannot be left wanting - that would mean some runtime * leaked out of the system. */ - BUG_ON(want); + WARN_ON_ONCE(want); balanced: /* * Disable all the borrow logic by pretending we have inf @@ -856,7 +794,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) const struct cpumask *span; span = sched_rt_period_mask(); -#ifdef CONFIG_RT_GROUP_SCHED + /* * FIXME: isolated CPUs should really leave the root task group, * whether they are isolcpus or were isolated via cpusets, lest @@ -868,11 +806,12 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) */ if (rt_b == &root_task_group.rt_bandwidth) span = cpu_online_mask; -#endif + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + struct rq_flags rf; int skip; /* @@ -887,7 +826,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (skip) continue; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); if (rt_rq->rt_time) { @@ -904,7 +843,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) /* * When we're idle and a woken (rt) task is - * throttled check_preempt_curr() will set + * throttled wakeup_preempt() will set * skip_update and the time between the wakeup * and this unthrottle will get accounted as * 'runtime'. @@ -925,7 +864,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (enqueue) sched_rt_rq_enqueue(rt_rq); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) @@ -934,18 +873,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) return idle; } -static inline int rt_se_prio(struct sched_rt_entity *rt_se) -{ -#ifdef CONFIG_RT_GROUP_SCHED - struct rt_rq *rt_rq = group_rt_rq(rt_se); - - if (rt_rq) - return rt_rq->highest_prio.curr; -#endif - - return rt_task_of(rt_se)->prio; -} - static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -989,52 +916,114 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } +#else /* !CONFIG_RT_GROUP_SCHED */ + +typedef struct rt_rq *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +#define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = NULL) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ + return NULL; +} + +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (!rt_rq->rt_nr_running) + return; + + enqueue_top_rt_rq(rt_rq); + resched_curr(rq); +} + +static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +{ + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return false; +} + +static inline const struct cpumask *sched_rt_period_mask(void) +{ + return cpu_online_mask; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + +#ifdef CONFIG_SMP +static void __enable_runtime(struct rq *rq) { } +static void __disable_runtime(struct rq *rq) { } +#endif + +#endif /* CONFIG_RT_GROUP_SCHED */ + +static inline int rt_se_prio(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_RT_GROUP_SCHED + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq) + return rt_rq->highest_prio.curr; +#endif + + return rt_task_of(rt_se)->prio; +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ static void update_curr_rt(struct rq *rq) { - struct task_struct *curr = rq->curr; - struct sched_rt_entity *rt_se = &curr->rt; - u64 delta_exec; - u64 now; + struct task_struct *donor = rq->donor; + s64 delta_exec; - if (curr->sched_class != &rt_sched_class) + if (donor->sched_class != &rt_sched_class) return; - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + delta_exec = update_curr_common(rq); + if (unlikely(delta_exec <= 0)) return; - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); - - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity *rt_se = &donor->rt; if (!rt_bandwidth_enabled()) return; for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + int exceeded; if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) + exceeded = sched_rt_runtime_exceeded(rt_rq); + if (exceeded) resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (exceeded) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } +#endif } static void -dequeue_top_rt_rq(struct rt_rq *rt_rq) +dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count) { struct rq *rq = rq_of_rt_rq(rt_rq); @@ -1045,7 +1034,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) BUG_ON(!rq->nr_running); - sub_nr_running(rq, rt_rq->rt_nr_running); + sub_nr_running(rq, count); rt_rq->rt_queued = 0; } @@ -1138,7 +1127,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) /* * This may have been our highest task, and therefore - * we may have some recomputation to do + * we may have some re-computation to do */ if (prio == prev_prio) { struct rt_prio_array *array = &rt_rq->active; @@ -1147,8 +1136,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) sched_find_first_bit(array->bitmap); } - } else - rt_rq->highest_prio.curr = MAX_RT_PRIO; + } else { + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + } dec_rt_prio_smp(rt_rq, prio, prev_prio); } @@ -1186,7 +1176,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { - start_rt_bandwidth(&def_rt_bandwidth); } static inline @@ -1229,7 +1218,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); inc_rt_prio(rt_rq, prio); - inc_rt_migration(rt_se, rt_rq); inc_rt_group(rt_se, rt_rq); } @@ -1242,7 +1230,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); - dec_rt_migration(rt_se, rt_rq); dec_rt_group(rt_se, rt_rq); } @@ -1269,6 +1256,112 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr rt_se->on_list = 0; } +static inline struct sched_statistics * +__schedstats_from_rt_se(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_RT_GROUP_SCHED + /* schedstats is not supported for rt group. */ + if (!rt_entity_is_task(rt_se)) + return NULL; +#endif + + return &rt_task_of(rt_se)->stats; +} + +static inline void +update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int flags) +{ + if (!schedstat_enabled()) + return; + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper_rt(rt_rq, rt_se); +} + +static inline void +update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int flags) +{ + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + if ((flags & DEQUEUE_SLEEP) && p) { + unsigned int state; + + state = READ_ONCE(p->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(p->stats.sleep_start, + rq_clock(rq_of_rt_rq(rt_rq))); + + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(p->stats.block_start, + rq_clock(rq_of_rt_rq(rt_rq))); + } +} + static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); @@ -1324,24 +1417,29 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; + unsigned int rt_nr_running; for_each_sched_rt_entity(rt_se) { rt_se->back = back; back = rt_se; } - dequeue_top_rt_rq(rt_rq_of_se(back)); + rt_nr_running = rt_rq_of_se(back)->rt_nr_running; for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) __dequeue_rt_entity(rt_se, flags); } + + dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running); } static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); + update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) __enqueue_rt_entity(rt_se, flags); @@ -1352,6 +1450,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); + update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { @@ -1374,13 +1474,16 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; + check_schedstat_required(); + update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se); + enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } -static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) +static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; @@ -1388,6 +1491,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dequeue_rt_entity(rt_se, flags); dequeue_pushable_task(rq, p); + + return true; } /* @@ -1428,20 +1533,21 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int flags) { - struct task_struct *curr; + struct task_struct *curr, *donor; struct rq *rq; bool test; /* For anything but wake ups, just return the task_cpu */ - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (!(flags & (WF_TTWU | WF_FORK))) goto out; rq = cpu_rq(cpu); rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ + donor = READ_ONCE(rq->donor); /* * If the current task on @p's runqueue is an RT task, then @@ -1459,7 +1565,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * * For equal prio tasks, we just let the scheduler sort it out. * - * Otherwise, just let it ride on the affined RQ and the + * Otherwise, just let it ride on the affine RQ and the * post-schedule router will push the preempted task away * * This test is optimistic, if we get it wrong the load-balancer @@ -1470,8 +1576,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * systems like big.LITTLE. */ test = curr && - unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); + unlikely(rt_task(donor)) && + (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio); if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); @@ -1501,12 +1607,8 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - /* - * Current can't be migrated, useless to reschedule, - * let's hope p can move out. - */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->donor, NULL)) return; /* @@ -1547,9 +1649,11 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) +static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { - if (p->prio < rq->curr->prio) { + struct task_struct *donor = rq->donor; + + if (p->prio < donor->prio) { resched_curr(rq); return; } @@ -1567,14 +1671,19 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag * to move current somewhere else, making room for our non-migratable * task. */ - if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) + if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); #endif } static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) { + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; + p->se.exec_start = rq_clock_task(rq); + if (on_rt_rq(&p->rt)) + update_stats_wait_end_rt(rt_rq, rt_se); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); @@ -1587,14 +1696,13 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f * utilization. We only care of the case where we start to schedule a * rt task */ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->donor->sched_class != &rt_sched_class) update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); rt_queue_push_tasks(rq); } -static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, - struct rt_rq *rt_rq) +static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) { struct rt_prio_array *array = &rt_rq->active; struct sched_rt_entity *next = NULL; @@ -1605,6 +1713,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; + if (SCHED_WARN_ON(list_empty(queue))) + return NULL; next = list_entry(queue->next, struct sched_rt_entity, run_list); return next; @@ -1616,15 +1726,16 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; do { - rt_se = pick_next_rt_entity(rq, rt_rq); - BUG_ON(!rt_se); + rt_se = pick_next_rt_entity(rt_rq); + if (unlikely(!rt_se)) + return NULL; rt_rq = group_rt_rq(rt_se); } while (rt_rq); return rt_task_of(rt_se); } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq) { struct task_struct *p; @@ -1632,12 +1743,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) return NULL; p = _pick_next_task_rt(rq); - set_next_task_rt(rq, p, true); + return p; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next) { + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; + + if (on_rt_rq(&p->rt)) + update_stats_wait_start_rt(rt_rq, rt_se); + update_curr_rt(rq); update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); @@ -1655,15 +1772,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) -{ - if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) - return 1; - - return 0; -} - /* * Return the highest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise @@ -1677,7 +1785,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) return NULL; plist_for_each_entry(p, head, pushable_tasks) { - if (pick_rt_task(rq, p, cpu)) + if (task_is_pushable(rq, p, cpu)) return p; } @@ -1705,7 +1813,7 @@ static int find_lowest_rq(struct task_struct *task) * If we're on asym system ensure we consider the different capacities * of the CPUs when searching for the lowest_mask. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, task, lowest_mask, @@ -1752,8 +1860,8 @@ static int find_lowest_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(lowest_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(lowest_mask, + sched_domain_span(sd)); if (best_cpu < nr_cpu_ids) { rcu_read_unlock(); return best_cpu; @@ -1770,7 +1878,7 @@ static int find_lowest_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(lowest_mask); + cpu = cpumask_any_distribute(lowest_mask); if (cpu < nr_cpu_ids) return cpu; @@ -1809,11 +1917,15 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * the mean time, task could have * migrated already or had its affinity changed. * Also make sure that it wasn't scheduled on its rq. + * It is possible the task was scheduled, set + * "migrate_disabled" and then got preempted, so we must + * check the task migration disable flag here too. */ if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || - task_running(rq, task) || + !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || + task_on_cpu(rq, task) || !rt_task(task) || + is_migration_disabled(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, lowest_rq); @@ -1846,6 +1958,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!task_on_rq_queued(p)); @@ -1859,7 +1972,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) * running task can migrate over to a CPU that is running a task * of lesser priority. */ -static int push_rt_task(struct rq *rq) +static int push_rt_task(struct rq *rq, bool pull) { struct task_struct *next_task; struct rq *lowest_rq; @@ -1873,19 +1986,61 @@ static int push_rt_task(struct rq *rq) return 0; retry: - if (WARN_ON(next_task == rq->curr)) - return 0; - /* * It's possible that the next_task slipped in of * higher priority than current. If that's the case * just reschedule current. */ - if (unlikely(next_task->prio < rq->curr->prio)) { + if (unlikely(next_task->prio < rq->donor->prio)) { resched_curr(rq); return 0; } + if (is_migration_disabled(next_task)) { + struct task_struct *push_task = NULL; + int cpu; + + if (!pull || rq->push_busy) + return 0; + + /* + * Invoking find_lowest_rq() on anything but an RT task doesn't + * make sense. Per the above priority check, curr has to + * be of higher priority than next_task, so no need to + * reschedule when bailing out. + * + * Note that the stoppers are masqueraded as SCHED_FIFO + * (cf. sched_set_stop_task()), so we can't rely on rt_task(). + */ + if (rq->donor->sched_class != &rt_sched_class) + return 0; + + cpu = find_lowest_rq(rq->curr); + if (cpu == -1 || cpu == rq->cpu) + return 0; + + /* + * Given we found a CPU with lower priority than @next_task, + * therefore it should be running. However we cannot migrate it + * to this other CPU, instead attempt to push the current + * running task on this CPU away. + */ + push_task = get_push_task(rq); + if (push_task) { + preempt_disable(); + raw_spin_rq_unlock(rq); + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + push_task, &rq->push_work); + preempt_enable(); + raw_spin_rq_lock(rq); + } + + return 0; + } + + if (WARN_ON(next_task == rq->curr)) + return 0; + /* We might release rq lock */ get_task_struct(next_task); @@ -1924,15 +2079,11 @@ retry: goto retry; } - deactivate_task(rq, next_task, 0); - set_task_cpu(next_task, lowest_rq->cpu); - activate_task(lowest_rq, next_task, 0); - ret = 1; - + move_queued_task_locked(rq, lowest_rq, next_task); resched_curr(lowest_rq); + ret = 1; double_unlock_balance(rq, lowest_rq); - out: put_task_struct(next_task); @@ -1942,7 +2093,7 @@ out: static void push_rt_tasks(struct rq *rq) { /* push_rt_task will return true if it moved an RT */ - while (push_rt_task(rq)) + while (push_rt_task(rq, false)) ; } @@ -1968,14 +2119,14 @@ static void push_rt_tasks(struct rq *rq) * if its the only CPU with multiple RT tasks queued, and a large number * of CPUs scheduling a lower priority task at the same time. * - * Each root domain has its own irq work function that can iterate over + * Each root domain has its own IRQ work function that can iterate over * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT - * tassk must be checked if there's one or many CPUs that are lowering - * their priority, there's a single irq work iterator that will try to + * task must be checked if there's one or many CPUs that are lowering + * their priority, there's a single IRQ work iterator that will try to * push off RT tasks that are waiting to run. * * When a CPU schedules a lower priority task, it will kick off the - * irq work iterator that will jump to each CPU with overloaded RT tasks. + * IRQ work iterator that will jump to each CPU with overloaded RT tasks. * As it only takes the first CPU that schedules a lower priority task * to start the process, the rto_start variable is incremented and if * the atomic result is one, then that CPU will try to take the rto_lock. @@ -1983,7 +2134,7 @@ static void push_rt_tasks(struct rq *rq) * CPUs scheduling lower priority tasks. * * All CPUs that are scheduling a lower priority task will increment the - * rt_loop_next variable. This will make sure that the irq work iterator + * rt_loop_next variable. This will make sure that the IRQ work iterator * checks all RT overloaded CPUs whenever a CPU schedules a new lower * priority task, even if the iterator is in the middle of a scan. Incrementing * the rt_loop_next will cause the iterator to perform another scan. @@ -2063,7 +2214,7 @@ static void tell_cpu_to_push(struct rq *rq) * The rto_cpu is updated under the lock, if it has a valid CPU * then the IPI is still running and will continue due to the * update to loop_next, and nothing needs to be done here. - * Otherwise it is finishing up and an ipi needs to be sent. + * Otherwise it is finishing up and an IPI needs to be sent. */ if (rq->rd->rto_cpu < 0) cpu = rto_next_cpu(rq->rd); @@ -2094,9 +2245,10 @@ void rto_push_irq_work_func(struct irq_work *work) * When it gets updated, a check is made if a push is possible. */ if (has_pushable_tasks(rq)) { - raw_spin_lock(&rq->lock); - push_rt_tasks(rq); - raw_spin_unlock(&rq->lock); + raw_spin_rq_lock(rq); + while (push_rt_task(rq, true)) + ; + raw_spin_rq_unlock(rq); } raw_spin_lock(&rd->rto_lock); @@ -2120,7 +2272,7 @@ static void pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; bool resched = false; - struct task_struct *p; + struct task_struct *p, *push_task; struct rq *src_rq; int rt_overload_count = rt_overloaded(this_rq); @@ -2167,6 +2319,7 @@ static void pull_rt_task(struct rq *this_rq) * double_lock_balance, and another CPU could * alter this_rq */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2186,19 +2339,20 @@ static void pull_rt_task(struct rq *this_rq) /* * There's a chance that p is higher in priority * than what's currently running on its CPU. - * This is just that p is wakeing up and hasn't + * This is just that p is waking up and hasn't * had a chance to schedule. We only pull * p if it is lower in priority than the * current task on the run queue */ - if (p->prio < src_rq->curr->prio) + if (p->prio < src_rq->donor->prio) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + move_queued_task_locked(src_rq, this_rq, p); + resched = true; + } /* * We continue with the search, just in * case there's an even higher prio task @@ -2208,6 +2362,15 @@ static void pull_rt_task(struct rq *this_rq) } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + preempt_disable(); + raw_spin_rq_unlock(this_rq); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + preempt_enable(); + raw_spin_rq_lock(this_rq); + } } if (resched) @@ -2220,12 +2383,12 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - bool need_to_push = !task_running(rq, p) && + bool need_to_push = !task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && + (dl_task(rq->donor) || rt_task(rq->donor)) && (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio); + rq->donor->prio <= p->prio); if (need_to_push) push_rt_tasks(rq); @@ -2291,18 +2454,25 @@ void __init init_sched_rt_class(void) static void switched_to_rt(struct rq *rq, struct task_struct *p) { /* - * If we are already running, then there's nothing - * that needs to be done. But if we are not running - * we may need to preempt the current running task. - * If that current running task is also an RT task + * If we are running, update the avg_rt tracking, as the running time + * will now on be accounted into the latter. + */ + if (task_current(rq, p)) { + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + return; + } + + /* + * If we are not running we may need to preempt the current + * running task. If that current running task is also an RT task * then see if we can move to another run queue. */ - if (task_on_rq_queued(p) && rq->curr != p) { + if (task_on_rq_queued(p)) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) + if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } @@ -2317,7 +2487,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->curr == p) { + if (task_current_donor(rq, p)) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we @@ -2343,7 +2513,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * greater than the current running task * then reschedule. */ - if (p->prio < rq->curr->prio) + if (p->prio < rq->donor->prio) resched_curr(rq); } } @@ -2394,7 +2564,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) watchdog(rq, p); /* - * RR tasks need a special form of timeslice management. + * RR tasks need a special form of time-slice management. * FIFO tasks have no timeslices. */ if (p->policy != SCHED_RR) @@ -2429,15 +2599,30 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -const struct sched_class rt_sched_class = { - .next = &fair_sched_class, +#ifdef CONFIG_SCHED_CORE +static int task_is_throttled_rt(struct task_struct *p, int cpu) +{ + struct rt_rq *rt_rq; + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq = task_group(p)->rt_rq[cpu]; +#else + rt_rq = &cpu_rq(cpu)->rt; +#endif + + return rt_rq_throttled(rt_rq); +} +#endif + +DEFINE_SCHED_CLASS(rt) = { + .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, - .check_preempt_curr = check_preempt_curr_rt, + .wakeup_preempt = wakeup_preempt_rt, - .pick_next_task = pick_next_task_rt, + .pick_task = pick_task_rt, .put_prev_task = put_prev_task_rt, .set_next_task = set_next_task_rt, @@ -2449,6 +2634,7 @@ const struct sched_class rt_sched_class = { .rq_offline = rq_offline_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, + .find_lock_rq = find_lock_lowest_rq, #endif .task_tick = task_tick_rt, @@ -2460,6 +2646,10 @@ const struct sched_class rt_sched_class = { .update_curr = update_curr_rt, +#ifdef CONFIG_SCHED_CORE + .task_is_throttled = task_is_throttled_rt, +#endif + #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif @@ -2664,6 +2854,7 @@ long sched_group_rt_period(struct task_group *tg) return rt_period_us; } +#ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { int ret = 0; @@ -2674,10 +2865,11 @@ static int sched_rt_global_constraints(void) return ret; } +#endif /* CONFIG_SYSCTL */ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { - /* Don't accept realtime tasks when there is no way for them to run */ + /* Don't accept real-time tasks when there is no way for them to run */ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) return 0; @@ -2685,30 +2877,18 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) } #else /* !CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = &cpu_rq(i)->rt; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = global_rt_runtime(); - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return 0; } +#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) { - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - if ((sysctl_sched_rt_runtime != RUNTIME_INF) && ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || ((u64)sysctl_sched_rt_runtime * @@ -2720,11 +2900,9 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); } -int sched_rt_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_period, old_runtime; @@ -2735,7 +2913,7 @@ int sched_rt_handler(struct ctl_table *table, int write, void *buffer, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_validate(); @@ -2763,7 +2941,7 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, void *buffer, +static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2773,17 +2951,21 @@ int sched_rr_handler(struct ctl_table *table, int write, void *buffer, ret = proc_dointvec(table, write, buffer, lenp, ppos); /* * Make sure that internally we keep jiffies. - * Also, writing zero resets the timeslice to default: + * Also, writing zero resets the time-slice to default: */ if (!ret && write) { sched_rr_timeslice = sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : msecs_to_jiffies(sysctl_sched_rr_timeslice); + + if (sysctl_sched_rr_timeslice <= 0) + sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE); } mutex_unlock(&mutex); return ret; } +#endif /* CONFIG_SYSCTL */ #ifdef CONFIG_SCHED_DEBUG void print_rt_stats(struct seq_file *m, int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 877fb08eb1b0..023b844159c9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,88 +2,101 @@ /* * Scheduler internal types and methods: */ -#include <linux/sched.h> +#ifndef _KERNEL_SCHED_SCHED_H +#define _KERNEL_SCHED_SCHED_H +#include <linux/sched/affinity.h> #include <linux/sched/autogroup.h> -#include <linux/sched/clock.h> -#include <linux/sched/coredump.h> #include <linux/sched/cpufreq.h> -#include <linux/sched/cputime.h> #include <linux/sched/deadline.h> -#include <linux/sched/debug.h> -#include <linux/sched/hotplug.h> -#include <linux/sched/idle.h> -#include <linux/sched/init.h> -#include <linux/sched/isolation.h> -#include <linux/sched/jobctl.h> +#include <linux/sched.h> #include <linux/sched/loadavg.h> #include <linux/sched/mm.h> -#include <linux/sched/nohz.h> -#include <linux/sched/numa_balancing.h> -#include <linux/sched/prio.h> -#include <linux/sched/rt.h> +#include <linux/sched/rseq_api.h> #include <linux/sched/signal.h> #include <linux/sched/smt.h> #include <linux/sched/stat.h> #include <linux/sched/sysctl.h> +#include <linux/sched/task_flags.h> #include <linux/sched/task.h> -#include <linux/sched/task_stack.h> #include <linux/sched/topology.h> -#include <linux/sched/user.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/xacct.h> -#include <uapi/linux/sched/types.h> - -#include <linux/binfmts.h> -#include <linux/blkdev.h> -#include <linux/compat.h> +#include <linux/atomic.h> +#include <linux/bitmap.h> +#include <linux/bug.h> +#include <linux/capability.h> +#include <linux/cgroup_api.h> +#include <linux/cgroup.h> #include <linux/context_tracking.h> #include <linux/cpufreq.h> -#include <linux/cpuidle.h> -#include <linux/cpuset.h> +#include <linux/cpumask_api.h> #include <linux/ctype.h> -#include <linux/debugfs.h> -#include <linux/delayacct.h> -#include <linux/energy_model.h> -#include <linux/init_task.h> -#include <linux/kprobes.h> +#include <linux/file.h> +#include <linux/fs_api.h> +#include <linux/hrtimer_api.h> +#include <linux/interrupt.h> +#include <linux/irq_work.h> +#include <linux/jiffies.h> +#include <linux/kref_api.h> #include <linux/kthread.h> -#include <linux/membarrier.h> -#include <linux/migrate.h> -#include <linux/mmu_context.h> -#include <linux/nmi.h> +#include <linux/ktime_api.h> +#include <linux/lockdep_api.h> +#include <linux/lockdep.h> +#include <linux/minmax.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/mutex_api.h> +#include <linux/plist.h> +#include <linux/poll.h> #include <linux/proc_fs.h> -#include <linux/prefetch.h> #include <linux/profile.h> #include <linux/psi.h> -#include <linux/rcupdate_wait.h> -#include <linux/security.h> +#include <linux/rcupdate.h> +#include <linux/seq_file.h> +#include <linux/seqlock.h> +#include <linux/softirq.h> +#include <linux/spinlock_api.h> +#include <linux/static_key.h> #include <linux/stop_machine.h> -#include <linux/suspend.h> -#include <linux/swait.h> +#include <linux/syscalls_api.h> #include <linux/syscalls.h> -#include <linux/task_work.h> -#include <linux/tsacct_kern.h> +#include <linux/tick.h> +#include <linux/topology.h> +#include <linux/types.h> +#include <linux/u64_stats_sync_api.h> +#include <linux/uaccess.h> +#include <linux/wait_api.h> +#include <linux/wait_bit.h> +#include <linux/workqueue_api.h> +#include <linux/delayacct.h> -#include <asm/tlb.h> +#include <trace/events/power.h> +#include <trace/events/sched.h> + +#include "../workqueue_internal.h" + +struct rq; +struct cfs_rq; +struct rt_rq; +struct sched_group; +struct cpuidle_state; #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> +# include <asm/paravirt_api_clock.h> #endif +#include <asm/barrier.h> + #include "cpupri.h" #include "cpudeadline.h" #ifdef CONFIG_SCHED_DEBUG -# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else -# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif -struct rq; -struct cpuidle_state; - /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -96,15 +109,35 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); +extern void call_trace_sched_update_nr_running(struct rq *rq, int count); + +extern int sysctl_sched_rt_period; +extern int sysctl_sched_rt_runtime; +extern int sched_rr_timeslice; + +/* + * Asymmetric CPU capacity bits + */ +struct asym_cap_data { + struct list_head link; + struct rcu_head rcu; + unsigned long capacity; + unsigned long cpus[]; +}; + +extern struct list_head asym_cap_list; + +#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) + /* * Helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +#define NS_TO_JIFFIES(time) ((unsigned long)(time) / (NSEC_PER_SEC/HZ)) /* * Increase resolution of nice-level calculations for 64-bit architectures. * The extra resolution improves shares distribution and load balancing of - * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * low-weight task groups (eg. nice +19 on an autogroup), deeper task-group * hierarchies, especially on larger systems. This is not a user-visible change * and does not change the user-interface for setting shares/weights. * @@ -118,12 +151,13 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust); #ifdef CONFIG_64BIT # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) -# define scale_load_down(w) \ -({ \ - unsigned long __w = (w); \ - if (__w) \ - __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ - __w; \ +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ }) #else # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) @@ -137,7 +171,7 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust); * scale_load() and scale_load_down(w) to convert between them. The * following must be true: * - * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD * */ #define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) @@ -158,9 +192,19 @@ static inline int idle_policy(int policy) { return policy == SCHED_IDLE; } + +static inline int normal_policy(int policy) +{ +#ifdef CONFIG_SCHED_CLASS_EXT + if (policy == SCHED_EXT) + return true; +#endif + return policy == SCHED_NORMAL; +} + static inline int fair_policy(int policy) { - return policy == SCHED_NORMAL || policy == SCHED_BATCH; + return normal_policy(policy) || policy == SCHED_BATCH; } static inline int rt_policy(int policy) @@ -172,6 +216,7 @@ static inline int dl_policy(int policy) { return policy == SCHED_DEADLINE; } + static inline bool valid_policy(int policy) { return idle_policy(policy) || fair_policy(policy) || @@ -193,15 +238,41 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) static inline void update_avg(u64 *avg, u64 sample) { s64 diff = sample - *avg; + *avg += diff / 8; } /* + * Shifting a value by an exponent greater *or equal* to the size of said value + * is UB; cap at size-1. + */ +#define shr_bound(val, shift) \ + (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) + +/* + * cgroup weight knobs should use the common MIN, DFL and MAX values which are + * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it + * maps pretty well onto the shares value used by scheduler and the round-trip + * conversions preserve the original value over the entire range. + */ +static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) +{ + return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); +} + +static inline unsigned long sched_weight_to_cgroup(unsigned long weight) +{ + return clamp_t(unsigned long, + DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + +/* * !! For sched_setattr_nocheck() (kernel) only !! * * This is actually gross. :( @@ -215,7 +286,9 @@ static inline void update_avg(u64 *avg, u64 sample) */ #define SCHED_FLAG_SUGOV 0x10000000 -static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) + +static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se) { #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); @@ -227,8 +300,8 @@ static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) /* * Tells if entity @a should preempt entity @b. */ -static inline bool -dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +static inline bool dl_entity_preempt(const struct sched_dl_entity *a, + const struct sched_dl_entity *b) { return dl_entity_is_special(a) || dl_time_before(a->deadline, b->deadline); @@ -251,13 +324,17 @@ struct rt_bandwidth { unsigned int rt_period_active; }; -void __dl_clear_params(struct task_struct *p); +static inline int dl_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} /* - * To keep the bandwidth of -deadline tasks and groups under control + * To keep the bandwidth of -deadline tasks under control * we need some place where: - * - store the maximum -deadline bandwidth of the system (the group); - * - cache the fraction of that bandwidth that is currently allocated. + * - store the maximum -deadline bandwidth of each cpu; + * - cache the fraction of bandwidth that is currently allocated in + * each root domain; * * This is all done in the data structure below. It is similar to the * one used for RT-throttling (rt_bandwidth), with the main difference @@ -265,58 +342,17 @@ void __dl_clear_params(struct task_struct *p); * do not decrease any runtime while the group "executes", neither we * need a timer to replenish it. * - * With respect to SMP, the bandwidth is given on a per-CPU basis, + * With respect to SMP, bandwidth is given on a per root domain basis, * meaning that: - * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; - * - dl_total_bw array contains, in the i-eth element, the currently - * allocated bandwidth on the i-eth CPU. - * Moreover, groups consume bandwidth on each CPU, while tasks only - * consume bandwidth on the CPU they're running on. - * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw - * that will be shown the next time the proc or cgroup controls will - * be red. It on its turn can be changed by writing on its own - * control. + * - bw (< 100%) is the deadline bandwidth of each CPU; + * - total_bw is the currently allocated bandwidth in each root domain; */ -struct dl_bandwidth { - raw_spinlock_t dl_runtime_lock; - u64 dl_runtime; - u64 dl_period; -}; - -static inline int dl_bandwidth_enabled(void) -{ - return sysctl_sched_rt_runtime >= 0; -} - struct dl_bw { raw_spinlock_t lock; u64 bw; u64 total_bw; }; -static inline void __dl_update(struct dl_bw *dl_b, s64 bw); - -static inline -void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) -{ - dl_b->total_bw -= tsk_bw; - __dl_update(dl_b, (s32)tsk_bw / cpus); -} - -static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) -{ - dl_b->total_bw += tsk_bw; - __dl_update(dl_b, -((s32)tsk_bw / cpus)); -} - -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) -{ - return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; -} - extern void init_dl_bw(struct dl_bw *dl_b); extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); @@ -325,17 +361,49 @@ extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); -extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern bool dl_cpu_busy(unsigned int cpu); +extern int dl_bw_deactivate(int cpu); +extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); +/* + * SCHED_DEADLINE supports servers (nested scheduling) with the following + * interface: + * + * dl_se::rq -- runqueue we belong to. + * + * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the + * server when it runs out of tasks to run. + * + * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this + * returns NULL. + * + * dl_server_update() -- called from update_curr_common(), propagates runtime + * to the server. + * + * dl_server_start() + * dl_server_stop() -- start/stop the server when it has (no) tasks. + * + * dl_server_init() -- initializes the server. + */ +extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); +extern void dl_server_start(struct sched_dl_entity *dl_se); +extern void dl_server_stop(struct sched_dl_entity *dl_se); +extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick_task); -#ifdef CONFIG_CGROUP_SCHED +extern void dl_server_update_idle_time(struct rq *rq, + struct task_struct *p); +extern void fair_server_init(struct rq *rq); +extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); +extern int dl_server_apply_params(struct sched_dl_entity *dl_se, + u64 runtime, u64 period, bool init); -#include <linux/cgroup.h> -#include <linux/psi.h> +static inline bool dl_server_active(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server_active; +} -struct cfs_rq; -struct rt_rq; +#ifdef CONFIG_CGROUP_SCHED extern struct list_head task_groups; @@ -345,6 +413,8 @@ struct cfs_bandwidth { ktime_t period; u64 quota; u64 runtime; + u64 burst; + u64 runtime_snap; s64 hierarchical_quota; u8 idle; @@ -357,7 +427,9 @@ struct cfs_bandwidth { /* Statistics: */ int nr_periods; int nr_throttled; + int nr_burst; u64 throttled_time; + u64 burst_time; #endif }; @@ -365,17 +437,21 @@ struct cfs_bandwidth { struct task_group { struct cgroup_subsys_state css; +#ifdef CONFIG_GROUP_SCHED_WEIGHT + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each CPU */ struct sched_entity **se; /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; - #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put - * it in its own cacheline separated from the fields above which + * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; @@ -389,6 +465,11 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* */ + u32 scx_weight; +#endif + struct rcu_head rcu; struct list_head list; @@ -413,7 +494,7 @@ struct task_group { }; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD /* @@ -444,23 +525,38 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) return walk_tg_tree_from(&root_task_group, down, up, data); } +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + extern int tg_nop(struct task_group *tg, void *data); +#ifdef CONFIG_FAIR_GROUP_SCHED extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); +#else +static inline void free_fair_sched_group(struct task_group *tg) { } +static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} +static inline void online_fair_sched_group(struct task_group *tg) { } +static inline void unregister_fair_sched_group(struct task_group *tg) { } +#endif + extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); -extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); +extern bool cfs_task_bw_constrained(struct task_struct *p); -extern void free_rt_sched_group(struct task_group *tg); -extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); @@ -474,13 +570,15 @@ extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_online_group(struct task_group *tg, struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); -extern void sched_offline_group(struct task_group *tg); +extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +extern void sched_move_task(struct task_struct *tsk, bool for_autogroup); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern int sched_group_set_idle(struct task_group *tg, long idle); + #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); @@ -488,25 +586,82 @@ extern void set_task_rq_fair(struct sched_entity *se, static inline void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { } #endif /* CONFIG_SMP */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ +static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } +static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } #endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; +static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } + #endif /* CONFIG_CGROUP_SCHED */ +extern void unregister_rt_sched_group(struct task_group *tg); +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); + +/* + * u64_u32_load/u64_u32_store + * + * Use a copy of a u64 value to protect against data race. This is only + * applicable for 32-bits architectures. + */ +#ifdef CONFIG_64BIT +# define u64_u32_load_copy(var, copy) var +# define u64_u32_store_copy(var, copy, val) (var = val) +#else +# define u64_u32_load_copy(var, copy) \ +({ \ + u64 __val, __val_copy; \ + do { \ + __val_copy = copy; \ + /* \ + * paired with u64_u32_store_copy(), ordering access \ + * to var and copy. \ + */ \ + smp_rmb(); \ + __val = var; \ + } while (__val != __val_copy); \ + __val; \ +}) +# define u64_u32_store_copy(var, copy, val) \ +do { \ + typeof(val) __val = (val); \ + var = __val; \ + /* \ + * paired with u64_u32_load_copy(), ordering access to var and \ + * copy. \ + */ \ + smp_wmb(); \ + copy = __val; \ +} while (0) +#endif +# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) +# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) + +struct balance_callback { + struct balance_callback *next; + void (*func)(struct rq *rq); +}; + /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned int nr_running; - unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int idle_h_nr_running; /* SCHED_IDLE */ + unsigned int nr_queued; + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ + + s64 avg_vruntime; + u64 avg_load; - u64 exec_clock; u64 min_vruntime; -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; +#ifdef CONFIG_SCHED_CORE + unsigned int forceidle_seq; + u64 min_vruntime_fi; #endif struct rb_root_cached tasks_timeline; @@ -517,12 +672,6 @@ struct cfs_rq { */ struct sched_entity *curr; struct sched_entity *next; - struct sched_entity *last; - struct sched_entity *skip; - -#ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -#endif #ifdef CONFIG_SMP /* @@ -530,7 +679,7 @@ struct cfs_rq { */ struct sched_avg avg; #ifndef CONFIG_64BIT - u64 load_last_update_time_copy; + u64 last_update_time_copy; #endif struct { raw_spinlock_t lock ____cacheline_aligned; @@ -541,6 +690,7 @@ struct cfs_rq { } removed; #ifdef CONFIG_FAIR_GROUP_SCHED + u64 last_update_tg_load_avg; unsigned long tg_load_avg_contrib; long propagate; long prop_runnable_sum; @@ -572,20 +722,71 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + /* Locally cached copy of our task_group's idle value */ + int idle; + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; + u64 throttled_pelt_idle; +#ifndef CONFIG_64BIT + u64 throttled_pelt_idle_copy; +#endif u64 throttled_clock; - u64 throttled_clock_task; - u64 throttled_clock_task_time; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; + u64 throttled_clock_self; + u64 throttled_clock_self_time; int throttled; int throttle_count; struct list_head throttled_list; + struct list_head throttled_csd_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; +#ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + /* + * A hotplugged CPU starts scheduling before rq_online_scx(). Track + * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called + * only while the BPF scheduler considers the CPU to be online. + */ + SCX_RQ_ONLINE = 1 << 0, + SCX_RQ_CAN_STOP_TICK = 1 << 1, + SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ + SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ + SCX_RQ_BYPASSING = 1 << 4, + SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ + + SCX_RQ_IN_WAKEUP = 1 << 16, + SCX_RQ_IN_BALANCE = 1 << 17, +}; + +struct scx_rq { + struct scx_dispatch_q local_dsq; + struct list_head runnable_list; /* runnable tasks on this rq */ + struct list_head ddsp_deferred_locals; /* deferred ddsps from enq */ + unsigned long ops_qseq; + u64 extra_enq_flags; /* see move_task_to_local_dsq() */ + u32 nr_running; + u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ + bool cpu_released; + u32 flags; + u64 clock; /* current per-rq clock -- see scx_bpf_now() */ + cpumask_var_t cpus_to_kick; + cpumask_var_t cpus_to_kick_if_idle; + cpumask_var_t cpus_to_preempt; + cpumask_var_t cpus_to_wait; + unsigned long pnt_seq; + struct balance_callback deferred_bal_cb; + struct irq_work deferred_irq_work; + struct irq_work kick_cpus_irq_work; +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static inline int rt_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -610,22 +811,20 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - unsigned long rt_nr_total; - int overloaded; + bool overloaded; struct plist_head pushable_tasks; #endif /* CONFIG_SMP */ int rt_queued; +#ifdef CONFIG_RT_GROUP_SCHED int rt_throttled; u64 rt_time; u64 rt_runtime; /* Nests inside the rq lock: */ raw_spinlock_t rt_runtime_lock; -#ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; + unsigned int rt_nr_boosted; struct rq *rq; struct task_group *tg; @@ -642,7 +841,7 @@ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ struct rb_root_cached root; - unsigned long dl_nr_running; + unsigned int dl_nr_running; #ifdef CONFIG_SMP /* @@ -656,8 +855,7 @@ struct dl_rq { u64 next; } earliest_dl; - unsigned long dl_nr_migratory; - int overloaded; + bool overloaded; /* * Tasks on this rq that can be pushed away. They are kept in @@ -688,6 +886,12 @@ struct dl_rq { u64 extra_bw; /* + * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM + * tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB). + */ + u64 max_bw; + + /* * Inverse of the fraction of CPU utilization that can be reclaimed * by the GRUB algorithm. */ @@ -695,33 +899,42 @@ struct dl_rq { }; #ifdef CONFIG_FAIR_GROUP_SCHED + /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) static inline void se_update_runnable(struct sched_entity *se) { if (!entity_is_task(se)) - se->runnable_weight = se->my_q->h_nr_running; + se->runnable_weight = se->my_q->h_nr_runnable; } static inline long se_runnable(struct sched_entity *se) { + if (se->sched_delayed) + return false; + if (entity_is_task(se)) return !!se->on_rq; else return se->runnable_weight; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ + #define entity_is_task(se) 1 -static inline void se_update_runnable(struct sched_entity *se) {} +static inline void se_update_runnable(struct sched_entity *se) { } static inline long se_runnable(struct sched_entity *se) { + if (se->sched_delayed) + return false; + return !!se->on_rq; } -#endif + +#endif /* !CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP /* @@ -744,10 +957,6 @@ struct perf_domain { struct rcu_head rcu; }; -/* Scheduling group status flags */ -#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ -#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ - /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -768,10 +977,10 @@ struct root_domain { * - More than one runnable task * - Running task is misfit */ - int overload; + bool overloaded; - /* Indicate one or more cpus over-utilized (tipping point) */ - int overutilized; + /* Indicate one or more CPUs over-utilized (tipping point) */ + bool overutilized; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -782,6 +991,15 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; + /* + * Indicate whether a root_domain's dl_bw has been checked or + * updated. It's monotonously increasing value. + * + * Also, some corner cases, like 'wrap around' is dangerous, but given + * that u64 is 'big enough'. So that shouldn't be a concern. + */ + u64 visit_gen; + #ifdef HAVE_RT_PUSH_IPI /* * For IPI pull requests, loop across the rto_mask. @@ -802,8 +1020,6 @@ struct root_domain { cpumask_var_t rto_mask; struct cpupri cpupri; - unsigned long max_cpu_capacity; - /* * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. Protected by RCU. @@ -817,6 +1033,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd); extern void sched_put_rd(struct root_domain *rd); +static inline int get_rd_overloaded(struct root_domain *rd) +{ + return READ_ONCE(rd->overloaded); +} + +static inline void set_rd_overloaded(struct root_domain *rd, int status) +{ + if (get_rd_overloaded(rd) != status) + WRITE_ONCE(rd->overloaded, status); +} + #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif @@ -862,6 +1089,8 @@ struct uclamp_rq { unsigned int value; struct uclamp_bucket bucket[UCLAMP_BUCKETS]; }; + +DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ /* @@ -873,12 +1102,8 @@ struct uclamp_rq { */ struct rq { /* runqueue lock: */ - raw_spinlock_t lock; + raw_spinlock_t __lock; - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -910,6 +1135,11 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_CLASS_EXT + struct scx_rq scx; +#endif + + struct sched_dl_entity fair_server; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -923,9 +1153,13 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned long nr_uninterruptible; + unsigned int nr_uninterruptible; - struct task_struct __rcu *curr; + union { + struct task_struct __rcu *donor; /* Scheduler context */ + struct task_struct __rcu *curr; /* Execution context */ + }; + struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; @@ -937,9 +1171,20 @@ struct rq { u64 clock_task ____cacheline_aligned; u64 clock_pelt; unsigned long lost_idle_time; + u64 clock_pelt_idle; + u64 clock_idle; +#ifndef CONFIG_64BIT + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; +#endif atomic_t nr_iowait; +#ifdef CONFIG_SCHED_DEBUG + u64 last_seen_need_resched_ns; + int ticks_without_resched; +#endif + #ifdef CONFIG_MEMBARRIER int membarrier_state; #endif @@ -949,9 +1194,8 @@ struct rq { struct sched_domain __rcu *sd; unsigned long cpu_capacity; - unsigned long cpu_capacity_orig; - struct callback_head *balance_callback; + struct balance_callback *balance_callback; unsigned char nohz_idle_balance; unsigned char idle_balance; @@ -974,18 +1218,23 @@ struct rq { #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif -#ifdef CONFIG_SCHED_THERMAL_PRESSURE - struct sched_avg avg_thermal; +#ifdef CONFIG_SCHED_HW_PRESSURE + struct sched_avg avg_hw; #endif u64 idle_stamp; u64 avg_idle; /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; + +#ifdef CONFIG_HOTPLUG_CPU + struct rcuwait hotplug_wait; +#endif #endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; + u64 psi_irq_time; #endif #ifdef CONFIG_PARAVIRT u64 prev_steal_time; @@ -1003,13 +1252,13 @@ struct rq { call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; + ktime_t hrtick_time; #endif #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_count; @@ -1024,9 +1273,42 @@ struct rq { #endif #ifdef CONFIG_CPU_IDLE - /* Must be inspected within a rcu lock section */ + /* Must be inspected within a RCU lock section */ struct cpuidle_state *idle_state; #endif + +#ifdef CONFIG_SMP + unsigned int nr_pinned; +#endif + unsigned int push_busy; + struct cpu_stop_work push_work; + +#ifdef CONFIG_SCHED_CORE + /* per rq */ + struct rq *core; + struct task_struct *core_pick; + struct sched_dl_entity *core_dl_server; + unsigned int core_enabled; + unsigned int core_sched_seq; + struct rb_root core_tree; + + /* shared state -- careful with sched_core_cpu_deactivate() */ + unsigned int core_task_seq; + unsigned int core_pick_seq; + unsigned long core_cookie; + unsigned int core_forceidle_count; + unsigned int core_forceidle_seq; + unsigned int core_forceidle_occupation; + u64 core_forceidle_start; +#endif + + /* Scratch cpumask to be temporarily used under rq_lock */ + cpumask_var_t scratch_mask; + +#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) + call_single_data_t cfsb_csd; + struct list_head cfsb_csd_list; +#endif }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1054,6 +1336,223 @@ static inline int cpu_of(struct rq *rq) #endif } +#define MDF_PUSH 0x01 + +static inline bool is_migration_disabled(struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->migration_disabled; +#else + return false; +#endif +} + +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() this_cpu_ptr(&runqueues) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() raw_cpu_ptr(&runqueues) + +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + /* Do nothing */ +} + +#ifdef CONFIG_SCHED_CORE +static inline struct cpumask *sched_group_span(struct sched_group *sg); + +DECLARE_STATIC_KEY_FALSE(__sched_core_enabled); + +static inline bool sched_core_enabled(struct rq *rq) +{ + return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled; +} + +static inline bool sched_core_disabled(void) +{ + return !static_branch_unlikely(&__sched_core_enabled); +} + +/* + * Be careful with this function; not for general use. The return value isn't + * stable unless you actually hold a relevant rq->__lock. + */ +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + if (sched_core_enabled(rq)) + return &rq->core->__lock; + + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + if (rq->core_enabled) + return &rq->core->__lock; + + return &rq->__lock; +} + +extern bool +cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi); + +extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); + +/* + * Helpers to check if the CPU's core cookie matches with the task's cookie + * when core scheduling is enabled. + * A special case is that the task's cookie always matches with CPU's core + * cookie if the CPU is in an idle core. + */ +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + return rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + bool idle_core = true; + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { + if (!available_idle_cpu(cpu)) { + idle_core = false; + break; + } + } + + /* + * A CPU in an idle core is always the best choice for tasks with + * cookies. + */ + return idle_core || rq->core->core_cookie == p->core_cookie; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + int cpu; + + /* Ignore cookie match if core scheduler is not enabled on the CPU. */ + if (!sched_core_enabled(rq)) + return true; + + for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) { + if (sched_core_cookie_match(cpu_rq(cpu), p)) + return true; + } + return false; +} + +static inline bool sched_core_enqueued(struct task_struct *p) +{ + return !RB_EMPTY_NODE(&p->core_node); +} + +extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); + +extern void sched_core_get(void); +extern void sched_core_put(void); + +#else /* !CONFIG_SCHED_CORE: */ + +static inline bool sched_core_enabled(struct rq *rq) +{ + return false; +} + +static inline bool sched_core_disabled(void) +{ + return true; +} + +static inline raw_spinlock_t *rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline raw_spinlock_t *__rq_lockp(struct rq *rq) +{ + return &rq->__lock; +} + +static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) +{ + return true; +} + +static inline bool sched_group_cookie_match(struct rq *rq, + struct task_struct *p, + struct sched_group *group) +{ + return true; +} + +#endif /* !CONFIG_SCHED_CORE */ + +static inline void lockdep_assert_rq_held(struct rq *rq) +{ + lockdep_assert_held(__rq_lockp(rq)); +} + +extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass); +extern bool raw_spin_rq_trylock(struct rq *rq); +extern void raw_spin_rq_unlock(struct rq *rq); + +static inline void raw_spin_rq_lock(struct rq *rq) +{ + raw_spin_rq_lock_nested(rq, 0); +} + +static inline void raw_spin_rq_lock_irq(struct rq *rq) +{ + local_irq_disable(); + raw_spin_rq_lock(rq); +} + +static inline void raw_spin_rq_unlock_irq(struct rq *rq) +{ + raw_spin_rq_unlock(rq); + local_irq_enable(); +} + +static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) +{ + unsigned long flags; + + local_irq_save(flags); + raw_spin_rq_lock(rq); + + return flags; +} + +static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags) +{ + raw_spin_rq_unlock(rq); + local_irq_restore(flags); +} + +#define raw_spin_rq_lock_irqsave(rq, flags) \ +do { \ + flags = _raw_spin_rq_lock_irqsave(rq); \ +} while (0) #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); @@ -1068,21 +1567,58 @@ static inline void update_idle_core(struct rq *rq) static inline void update_idle_core(struct rq *rq) { } #endif -DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#ifdef CONFIG_FAIR_GROUP_SCHED -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() this_cpu_ptr(&runqueues) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() raw_cpu_ptr(&runqueues) +static inline struct task_struct *task_of(struct sched_entity *se) +{ + SCHED_WARN_ON(!entity_is_task(se)); + return container_of(se, struct task_struct, se); +} -extern void update_rq_clock(struct rq *rq); +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} -static inline u64 __rq_clock_broken(struct rq *rq) +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se) { - return READ_ONCE(rq->clock); + return se->cfs_rq; } +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +#else /* !CONFIG_FAIR_GROUP_SCHED: */ + +#define task_of(_se) container_of(_se, struct task_struct, se) + +static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se) +{ + const struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} + +#endif /* !CONFIG_FAIR_GROUP_SCHED */ + +extern void update_rq_clock(struct rq *rq); + /* * rq::clock_update_flags bits * @@ -1102,7 +1638,7 @@ static inline u64 __rq_clock_broken(struct rq *rq) * * if (rq-clock_update_flags >= RQCF_UPDATED) * - * to check if %RQCF_UPADTED is set. It'll never be shifted more than + * to check if %RQCF_UPDATED is set. It'll never be shifted more than * one position though, because the next rq_unpin_lock() will shift it * back. */ @@ -1121,7 +1657,7 @@ static inline void assert_clock_updated(struct rq *rq) static inline u64 rq_clock(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); assert_clock_updated(rq); return rq->clock; @@ -1129,46 +1665,50 @@ static inline u64 rq_clock(struct rq *rq) static inline u64 rq_clock_task(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); assert_clock_updated(rq); return rq->clock_task; } -/** - * By default the decay is the default pelt decay period. - * The decay shift can change the decay period in - * multiples of 32. - * Decay shift Decay period(ms) - * 0 32 - * 1 64 - * 2 128 - * 3 256 - * 4 512 - */ -extern int sched_thermal_decay_shift; - -static inline u64 rq_clock_thermal(struct rq *rq) -{ - return rq_clock_task(rq) >> sched_thermal_decay_shift; -} - static inline void rq_clock_skip_update(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rq->clock_update_flags |= RQCF_REQ_SKIP; } /* * See rt task throttling, which is the only time a skip - * request is cancelled. + * request is canceled. */ static inline void rq_clock_cancel_skipupdate(struct rq *rq) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); rq->clock_update_flags &= ~RQCF_REQ_SKIP; } +/* + * During cpu offlining and rq wide unthrottling, we can trigger + * an update_rq_clock() for several cfs and rt runqueues (Typically + * when using list_for_each_entry_*) + * rq_clock_start_loop_update() can be called after updating the clock + * once and before iterating over the list to prevent multiple update. + * After the iterative traversal, we need to call rq_clock_stop_loop_update() + * to clear RQCF_ACT_SKIP of rq->clock_update_flags. + */ +static inline void rq_clock_start_loop_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); + rq->clock_update_flags |= RQCF_ACT_SKIP; +} + +static inline void rq_clock_stop_loop_update(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + rq->clock_update_flags &= ~RQCF_ACT_SKIP; +} + struct rq_flags { unsigned long flags; struct pin_cookie cookie; @@ -1182,13 +1722,60 @@ struct rq_flags { #endif }; +extern struct balance_callback balance_push_callback; + +#ifdef CONFIG_SCHED_CLASS_EXT +extern const struct sched_class ext_sched_class; + +DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */ +DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */ + +#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) +#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) + +static inline void scx_rq_clock_update(struct rq *rq, u64 clock) +{ + if (!scx_enabled()) + return; + WRITE_ONCE(rq->scx.clock, clock); + smp_store_release(&rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID); +} + +static inline void scx_rq_clock_invalidate(struct rq *rq) +{ + if (!scx_enabled()) + return; + WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); +} + +#else /* !CONFIG_SCHED_CLASS_EXT */ +#define scx_enabled() false +#define scx_switched_all() false + +static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {} +static inline void scx_rq_clock_invalidate(struct rq *rq) {} +#endif /* !CONFIG_SCHED_CLASS_EXT */ + +/* + * Lockdep annotation that avoids accidental unlocks; it's like a + * sticky/continuous lockdep_assert_held(). + * + * This avoids code that has access to 'struct rq *rq' (basically everything in + * the scheduler) from accidentally unlocking the rq if they do not also have a + * copy of the (on-stack) 'struct rq_flags rf'. + * + * Also see Documentation/locking/lockdep-design.rst. + */ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); #ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; +# ifdef CONFIG_SMP + SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); +# endif #endif } @@ -1198,13 +1785,13 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) if (rq->clock_update_flags > RQCF_ACT_SKIP) rf->clock_update_flags = RQCF_UPDATED; #endif - - lockdep_unpin_lock(&rq->lock, rf->cookie); + scx_rq_clock_invalidate(rq); + lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); } static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { - lockdep_repin_lock(&rq->lock, rf->cookie); + lockdep_repin_lock(__rq_lockp(rq), rf->cookie); #ifdef CONFIG_SCHED_DEBUG /* @@ -1214,9 +1801,11 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) #endif } +extern struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); +extern struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); @@ -1225,7 +1814,7 @@ static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); } static inline void @@ -1234,68 +1823,73 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(p->pi_lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irqsave(&rq->lock, rf->flags); - rq_pin_lock(rq, rf); -} +DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, + _T->rq = task_rq_lock(_T->lock, &_T->rf), + task_rq_unlock(_T->rq, _T->lock, &_T->rf), + struct rq *rq; struct rq_flags rf) -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock_irq(&rq->lock); + raw_spin_rq_lock_irqsave(rq, rf->flags); rq_pin_lock(rq, rf); } -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock(&rq->lock); + raw_spin_rq_lock_irq(rq); rq_pin_lock(rq, rf); } -static inline void -rq_relock(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); + raw_spin_rq_lock(rq); + rq_pin_lock(rq, rf); } -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock_irqrestore(&rq->lock, rf->flags); + raw_spin_rq_unlock_irqrestore(rq, rf->flags); } -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock_irq(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); } -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); } -static inline struct rq * -this_rq_lock_irq(struct rq_flags *rf) +DEFINE_LOCK_GUARD_1(rq_lock, struct rq, + rq_lock(_T->lock, &_T->rf), + rq_unlock(_T->lock, &_T->rf), + struct rq_flags rf) + +DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq, + rq_lock_irq(_T->lock, &_T->rf), + rq_unlock_irq(_T->lock, &_T->rf), + struct rq_flags rf) + +DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq, + rq_lock_irqsave(_T->lock, &_T->rf), + rq_unlock_irqrestore(_T->lock, &_T->rf), + struct rq_flags rf) + +static inline struct rq *this_rq_lock_irq(struct rq_flags *rf) __acquires(rq->lock) { struct rq *rq; @@ -1303,33 +1897,43 @@ this_rq_lock_irq(struct rq_flags *rf) local_irq_disable(); rq = this_rq(); rq_lock(rq, rf); + return rq; } #ifdef CONFIG_NUMA + enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, NUMA_BACKPLANE, }; + extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); -extern void sched_init_numa(void); +extern void sched_init_numa(int offline_node); +extern void sched_update_numa(int cpu, bool online); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -#else -static inline void sched_init_numa(void) { } + +#else /* !CONFIG_NUMA: */ + +static inline void sched_init_numa(int offline_node) { } +static inline void sched_update_numa(int cpu, bool online) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } + static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) { return nr_cpu_ids; } -#endif + +#endif /* !CONFIG_NUMA */ #ifdef CONFIG_NUMA_BALANCING + /* The regions in numa_faults array from task_struct */ enum numa_faults_stats { NUMA_MEM = 0, @@ -1337,38 +1941,46 @@ enum numa_faults_stats { NUMA_MEMBUF, NUMA_CPUBUF }; + extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); -#else + +#else /* !CONFIG_NUMA_BALANCING: */ + static inline void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) { } -#endif /* CONFIG_NUMA_BALANCING */ + +#endif /* !CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SMP static inline void queue_balance_callback(struct rq *rq, - struct callback_head *head, + struct balance_callback *head, void (*func)(struct rq *rq)) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); - if (unlikely(head->next)) + /* + * Don't (re)queue an already queued item; nor queue anything when + * balance_push() is active, see the comment with + * balance_push_callback. + */ + if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; - head->func = (void (*)(struct callback_head *))func; + head->func = func; head->next = rq->balance_callback; rq->balance_callback = head; } #define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) + rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex)) /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -1381,6 +1993,13 @@ queue_balance_callback(struct rq *rq, for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) +/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) | +static const unsigned int SD_SHARED_CHILD_MASK = +#include <linux/sched/sd_flags.h> +0; +#undef SD_FLAG + /** * highest_flag_domain - Return highest sched_domain containing flag. * @cpu: The CPU whose highest level of sched domain is to @@ -1388,16 +2007,25 @@ queue_balance_callback(struct rq *rq, * @flag: The flag to check for the highest sched_domain * for the given CPU. * - * Returns the highest sched_domain of a CPU which contains the given flag. + * Returns the highest sched_domain of a CPU which contains @flag. If @flag has + * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag. */ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) { struct sched_domain *sd, *hsd = NULL; for_each_domain(cpu, sd) { - if (!(sd->flags & flag)) + if (sd->flags & flag) { + hsd = sd; + continue; + } + + /* + * Stop the search if @flag is known to be shared at lower + * levels. It will not be found further up. + */ + if (flag & SD_SHARED_CHILD_MASK) break; - hsd = sd; } return hsd; @@ -1418,11 +2046,19 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(int, sd_share_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + extern struct static_key_false sched_asym_cpucapacity; +extern struct static_key_false sched_cluster_active; + +static __always_inline bool sched_asym_cpucap_active(void) +{ + return static_branch_unlikely(&sched_asym_cpucapacity); +} struct sched_group_capacity { atomic_t ref; @@ -1440,7 +2076,7 @@ struct sched_group_capacity { int id; #endif - unsigned long cpumask[0]; /* Balance mask */ + unsigned long cpumask[]; /* Balance mask */ }; struct sched_group { @@ -1448,8 +2084,10 @@ struct sched_group { atomic_t ref; unsigned int group_weight; + unsigned int cores; struct sched_group_capacity *sgc; int asym_prefer_cpu; /* CPU of highest priority in group */ + int flags; /* * The CPUs this group covers. @@ -1474,41 +2112,26 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) return to_cpumask(sg->sgc->cpumask); } -/** - * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. - * @group: The group whose first CPU is to be returned. - */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ - return cpumask_first(sched_group_span(group)); -} - extern int group_balance_cpu(struct sched_group *sg); -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -void register_sched_domain_sysctl(void); -void dirty_sched_domain_sysctl(int cpu); -void unregister_sched_domain_sysctl(void); +#ifdef CONFIG_SCHED_DEBUG +extern void update_sched_domain_debugfs(void); +extern void dirty_sched_domain_sysctl(int cpu); #else -static inline void register_sched_domain_sysctl(void) -{ -} -static inline void dirty_sched_domain_sysctl(int cpu) -{ -} -static inline void unregister_sched_domain_sysctl(void) -{ -} +static inline void update_sched_domain_debugfs(void) { } +static inline void dirty_sched_domain_sysctl(int cpu) { } #endif -extern void flush_smp_call_function_from_idle(void); +extern int sched_update_scaling(void); -#else /* !CONFIG_SMP: */ -static inline void flush_smp_call_function_from_idle(void) { } -#endif +static inline const struct cpumask *task_user_cpus(struct task_struct *p) +{ + if (!p->user_cpus_ptr) + return cpu_possible_mask; /* &init_task.cpus_mask */ + return p->user_cpus_ptr; +} -#include "stats.h" -#include "autogroup.h" +#endif /* CONFIG_SMP */ #ifdef CONFIG_CGROUP_SCHED @@ -1541,6 +2164,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; + p->se.depth = tg->se[cpu] ? tg->se[cpu]->depth + 1 : 0; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -1549,15 +2173,16 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif } -#else /* CONFIG_CGROUP_SCHED */ +#else /* !CONFIG_CGROUP_SCHED: */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } + static inline struct task_group *task_group(struct task_struct *p) { return NULL; } -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* !CONFIG_CGROUP_SCHED */ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -1569,11 +2194,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. */ smp_wmb(); -#ifdef CONFIG_THREAD_INFO_IN_TASK - WRITE_ONCE(p->cpu, cpu); -#else WRITE_ONCE(task_thread_info(p)->cpu, cpu); -#endif p->wake_cpu = cpu; #endif } @@ -1582,7 +2203,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include <linux/static_key.h> # define const_debug __read_mostly #else # define const_debug const @@ -1598,7 +2218,7 @@ enum { #undef SCHED_FEAT -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * To support run-time toggling of sched features, all the translation units @@ -1606,6 +2226,8 @@ enum { */ extern const_debug unsigned int sysctl_sched_features; +#ifdef CONFIG_JUMP_LABEL + #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -1618,7 +2240,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */ +#else /* !CONFIG_JUMP_LABEL: */ + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* !CONFIG_JUMP_LABEL */ + +#else /* !SCHED_DEBUG: */ /* * Each translation unit has its own copy of sysctl_sched_features to allow @@ -1634,7 +2262,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */ +#endif /* !SCHED_DEBUG */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; @@ -1652,12 +2280,26 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +/* + * Is p the current execution context? + */ static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; } -static inline int task_running(struct rq *rq, struct task_struct *p) +/* + * Is p the current scheduling context? + * + * Note that it might be the current execution context at the same time if + * rq->curr == rq->donor == p. + */ +static inline int task_current_donor(struct rq *rq, struct task_struct *p) +{ + return rq->donor == p; +} + +static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP return p->on_cpu; @@ -1668,7 +2310,7 @@ static inline int task_running(struct rq *rq, struct task_struct *p) static inline int task_on_rq_queued(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_QUEUED; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED; } static inline int task_on_rq_migrating(struct task_struct *p) @@ -1676,13 +2318,21 @@ static inline int task_on_rq_migrating(struct task_struct *p) return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; } -/* - * wake flags - */ -#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* Child wakeup after fork */ -#define WF_MIGRATED 0x04 /* Internal use, task got migrated */ -#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ +/* Wake flags. The first three directly map to some SD flag value */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ + +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ + +#ifdef CONFIG_SMP +static_assert(WF_EXEC == SD_BALANCE_EXEC); +static_assert(WF_FORK == SD_BALANCE_FORK); +static_assert(WF_TTWU == SD_BALANCE_WAKE); +#endif /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1712,16 +2362,24 @@ extern const u32 sched_prio_to_wmult[40]; * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location * in the runqueue. * + * NOCLOCK - skip the update_rq_clock() (avoids double updates) + * + * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) + * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup + * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * */ -#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_SPECIAL 0x10 +#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -1735,40 +2393,63 @@ extern const u32 sched_prio_to_wmult[40]; #else #define ENQUEUE_MIGRATED 0x00 #endif +#define ENQUEUE_INITIAL 0x80 +#define ENQUEUE_MIGRATING 0x100 +#define ENQUEUE_DELAYED 0x200 +#define ENQUEUE_RQ_SELECTED 0x400 #define RETRY_TASK ((void *)-1UL) +struct affinity_context { + const struct cpumask *new_mask; + struct cpumask *user_mask; + unsigned int flags; +}; + +extern s64 update_curr_common(struct rq *rq); + struct sched_class { - const struct sched_class *next; #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); - bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); - void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); - struct task_struct *(*pick_next_task)(struct rq *rq); + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + struct task_struct *(*pick_task)(struct rq *rq); + /* + * Optional! When implemented pick_next_task() should be equivalent to: + * + * next = pick_task(); + * if (next) { + * put_prev_task(prev); + * set_next_task_first(next); + * } + */ + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); - void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); #ifdef CONFIG_SMP - int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); - void (*set_cpus_allowed)(struct task_struct *p, - const struct cpumask *newmask); + void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + + struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); #endif void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); @@ -1777,11 +2458,14 @@ struct sched_class { /* * The switched_from() call is allowed to drop rq->lock, therefore we - * cannot assume the switched_from/switched_to pair is serliazed by + * cannot assume the switched_from/switched_to pair is serialized by * rq->lock. They are however serialized by p->pi_lock. */ + void (*switching_to) (struct rq *this_rq, struct task_struct *task); void (*switched_from)(struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*reweight_task)(struct rq *this_rq, struct task_struct *task, + const struct load_weight *lw); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio); @@ -1790,37 +2474,69 @@ struct sched_class { void (*update_curr)(struct rq *rq); -#define TASK_SET_GROUP 0 -#define TASK_MOVE_GROUP 1 - #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_change_group)(struct task_struct *p, int type); + void (*task_change_group)(struct task_struct *p); +#endif + +#ifdef CONFIG_SCHED_CORE + int (*task_is_throttled)(struct task_struct *p, int cpu); #endif }; static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { - WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev); + WARN_ON_ONCE(rq->donor != prev); + prev->sched_class->put_prev_task(rq, prev, NULL); } static inline void set_next_task(struct rq *rq, struct task_struct *next) { - WARN_ON_ONCE(rq->curr != next); next->sched_class->set_next_task(rq, next, false); } -#ifdef CONFIG_SMP -#define sched_class_highest (&stop_sched_class) -#else -#define sched_class_highest (&dl_sched_class) -#endif +static inline void +__put_prev_set_next_dl_server(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + prev->dl_server = NULL; + next->dl_server = rq->dl_server; + rq->dl_server = NULL; +} -#define for_class_range(class, _from, _to) \ - for (class = (_from); class != (_to); class = class->next) +static inline void put_prev_set_next_task(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + WARN_ON_ONCE(rq->curr != prev); -#define for_each_class(class) \ - for_class_range(class, sched_class_highest, NULL) + __put_prev_set_next_dl_server(rq, prev, next); + + if (next == prev) + return; + + prev->sched_class->put_prev_task(rq, prev, next); + next->sched_class->set_next_task(rq, next, true); +} + +/* + * Helper to define a sched_class instance; each one is placed in a separate + * section which is ordered by the linker script: + * + * include/asm-generic/vmlinux.lds.h + * + * *CAREFUL* they are laid out in *REVERSE* order!!! + * + * Also enforce alignment on the instance, not the type, to guarantee layout. + */ +#define DEFINE_SCHED_CLASS(name) \ +const struct sched_class name##_sched_class \ + __aligned(__alignof__(struct sched_class)) \ + __section("__" #name "_sched_class") + +/* Defined in include/asm-generic/vmlinux.lds.h */ +extern struct sched_class __sched_class_highest[]; +extern struct sched_class __sched_class_lowest[]; extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ -1828,6 +2544,36 @@ extern const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; +/* + * Iterate only active classes. SCX can take over all fair tasks or be + * completely disabled. If the former, skip fair. If the latter, skip SCX. + */ +static inline const struct sched_class *next_active_class(const struct sched_class *class) +{ + class++; +#ifdef CONFIG_SCHED_CLASS_EXT + if (scx_switched_all() && class == &fair_sched_class) + class++; + if (!scx_enabled() && class == &ext_sched_class) + class++; +#endif + return class; +} + +#define for_class_range(class, _from, _to) \ + for (class = (_from); class < (_to); class++) + +#define for_each_class(class) \ + for_class_range(class, __sched_class_highest, __sched_class_lowest) + +#define for_active_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = next_active_class(class)) + +#define for_each_active_class(class) \ + for_active_class_range(class, __sched_class_highest, __sched_class_lowest) + +#define sched_class_above(_a, _b) ((_a) < (_b)) + static inline bool sched_stop_runnable(struct rq *rq) { return rq->stop && task_on_rq_queued(rq->stop); @@ -1845,23 +2591,92 @@ static inline bool sched_rt_runnable(struct rq *rq) static inline bool sched_fair_runnable(struct rq *rq) { - return rq->cfs.nr_running > 0; + return rq->cfs.nr_queued > 0; } extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_next_task_idle(struct rq *rq); +extern struct task_struct *pick_task_idle(struct rq *rq); + +#define SCA_CHECK 0x01 +#define SCA_MIGRATE_DISABLE 0x02 +#define SCA_MIGRATE_ENABLE 0x04 +#define SCA_USER 0x08 #ifdef CONFIG_SMP extern void update_group_capacity(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq); +extern void sched_balance_trigger(struct rq *rq); -extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); +extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx); +extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); -#endif +static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) +{ + /* When not in the task's cpumask, no point in looking further. */ + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + + /* Can @cpu run a user thread? */ + if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p)) + return false; + + return true; +} + +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + /* + * See do_set_cpus_allowed() above for the rcu_head usage. + */ + int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); + + return kmalloc_node(size, GFP_KERNEL, node); +} + +static inline struct task_struct *get_push_task(struct rq *rq) +{ + struct task_struct *p = rq->donor; + + lockdep_assert_rq_held(rq); + + if (rq->push_busy) + return NULL; + + if (p->nr_cpus_allowed == 1) + return NULL; + + if (p->migration_disabled) + return NULL; + + rq->push_busy = true; + return get_task_struct(p); +} + +extern int push_cpu_stop(void *arg); + +#else /* !CONFIG_SMP: */ + +static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) +{ + return true; +} + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + struct affinity_context *ctx) +{ + return set_cpus_allowed_ptr(p, ctx->new_mask); +} + +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + return NULL; +} + +#endif /* !CONFIG_SMP */ #ifdef CONFIG_CPU_IDLE + static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) { @@ -1874,7 +2689,9 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) return rq->idle_state; } -#else + +#else /* !CONFIG_CPU_IDLE: */ + static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) { @@ -1884,9 +2701,11 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } -#endif + +#endif /* !CONFIG_CPU_IDLE */ extern void schedule_idle(void); +asmlinkage void schedule_user(void); extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); @@ -1896,25 +2715,22 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void reweight_task(struct task_struct *p, int prio); - extern void resched_curr(struct rq *rq); +extern void resched_curr_lazy(struct rq *rq); extern void resched_cpu(int cpu); -extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); -extern struct dl_bandwidth def_dl_bandwidth; -extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); -extern void init_dl_task_timer(struct sched_dl_entity *dl_se); -extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_entity(struct sched_dl_entity *dl_se); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 #define MAX_BW_BITS (64 - BW_SHIFT) #define MAX_BW ((1ULL << MAX_BW_BITS) - 1) -unsigned long to_ratio(u64 period, u64 runtime); + +extern unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); extern void post_init_entity_util_avg(struct task_struct *p); @@ -1930,12 +2746,7 @@ extern int __init sched_tick_offload_init(void); */ static inline void sched_update_tick_dependency(struct rq *rq) { - int cpu; - - if (!tick_nohz_full_enabled()) - return; - - cpu = cpu_of(rq); + int cpu = cpu_of(rq); if (!tick_nohz_full_cpu(cpu)) return; @@ -1945,22 +2756,23 @@ static inline void sched_update_tick_dependency(struct rq *rq) else tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } -#else +#else /* !CONFIG_NO_HZ_FULL: */ static inline int sched_tick_offload_init(void) { return 0; } static inline void sched_update_tick_dependency(struct rq *rq) { } -#endif +#endif /* !CONFIG_NO_HZ_FULL */ static inline void add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; rq->nr_running = prev_nr + count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, count); + } #ifdef CONFIG_SMP - if (prev_nr < 2 && rq->nr_running >= 2) { - if (!READ_ONCE(rq->rd->overload)) - WRITE_ONCE(rq->rd->overload, 1); - } + if (prev_nr < 2 && rq->nr_running >= 2) + set_rd_overloaded(rq->rd, 1); #endif sched_update_tick_dependency(rq); @@ -1969,18 +2781,86 @@ static inline void add_nr_running(struct rq *rq, unsigned count) static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + if (trace_sched_update_nr_running_tp_enabled()) { + call_trace_sched_update_nr_running(rq, -count); + } + /* Check if we still need preemption */ sched_update_tick_dependency(rq); } +static inline void __block_task(struct rq *rq, struct task_struct *p) +{ + if (p->sched_contributes_to_load) + rq->nr_uninterruptible++; + + if (p->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + + /* + * The moment this write goes through, ttwu() can swoop in and migrate + * this task, rendering our rq->__lock ineffective. + * + * __schedule() try_to_wake_up() + * LOCK rq->__lock LOCK p->pi_lock + * pick_next_task() + * pick_next_task_fair() + * pick_next_entity() + * dequeue_entities() + * __block_task() + * RELEASE p->on_rq = 0 if (p->on_rq && ...) + * break; + * + * ACQUIRE (after ctrl-dep) + * + * cpu = select_task_rq(); + * set_task_cpu(p, cpu); + * ttwu_queue() + * ttwu_do_activate() + * LOCK rq->__lock + * activate_task() + * STORE p->on_rq = 1 + * UNLOCK rq->__lock + * + * Callers must ensure to not reference @p after this -- we no longer + * own it. + */ + smp_store_release(&p->on_rq, 0); +} + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); -extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + +#ifdef CONFIG_PREEMPT_RT +# define SCHED_NR_MIGRATE_BREAK 8 +#else +# define SCHED_NR_MIGRATE_BREAK 32 +#endif extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; +extern unsigned int sysctl_sched_base_slice; + +#ifdef CONFIG_SCHED_DEBUG +extern int sysctl_resched_latency_warn_ms; +extern int sysctl_resched_latency_warn_once; + +extern unsigned int sysctl_sched_tunable_scaling; + +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_numa_balancing_hot_threshold; +#endif + #ifdef CONFIG_SCHED_HRTICK /* @@ -1990,32 +2870,61 @@ extern const_debug unsigned int sysctl_sched_migration_cost; */ static inline int hrtick_enabled(struct rq *rq) { - if (!sched_feat(HRTICK)) - return 0; if (!cpu_active(cpu_of(rq))) return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } -void hrtick_start(struct rq *rq, u64 delay); +static inline int hrtick_enabled_fair(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtick_enabled(rq); +} -#else +static inline int hrtick_enabled_dl(struct rq *rq) +{ + if (!sched_feat(HRTICK_DL)) + return 0; + return hrtick_enabled(rq); +} -static inline int hrtick_enabled(struct rq *rq) +extern void hrtick_start(struct rq *rq, u64 delay); + +#else /* !CONFIG_SCHED_HRTICK: */ + +static inline int hrtick_enabled_fair(struct rq *rq) { return 0; } -#endif /* CONFIG_SCHED_HRTICK */ +static inline int hrtick_enabled_dl(struct rq *rq) +{ + return 0; +} -#ifndef arch_scale_freq_tick -static __always_inline -void arch_scale_freq_tick(void) +static inline int hrtick_enabled(struct rq *rq) { + return 0; } + +#endif /* !CONFIG_SCHED_HRTICK */ + +#ifndef arch_scale_freq_tick +static __always_inline void arch_scale_freq_tick(void) { } #endif #ifndef arch_scale_freq_capacity +/** + * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. + * @cpu: the CPU in question. + * + * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. + * + * f_curr + * ------ * SCHED_CAPACITY_SCALE + * f_max + */ static __always_inline unsigned long arch_scale_freq_capacity(int cpu) { @@ -2023,10 +2932,62 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +#ifdef CONFIG_SCHED_DEBUG +/* + * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to + * acquire rq lock instead of rq_lock(). So at the end of these two functions + * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of + * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning. + */ +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) +{ + rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPTION + rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); +#endif +} +#else +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { } +#endif -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); +#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ +__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ +static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ +{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ + _lock; return _t; } + +#ifdef CONFIG_SMP + +static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) +{ +#ifdef CONFIG_SCHED_CORE + /* + * In order to not have {0,2},{1,3} turn into into an AB-BA, + * order by core-id first and cpu-id second. + * + * Notably: + * + * double_rq_lock(0,3); will take core-0, core-1 lock + * double_rq_lock(1,2); will take core-1, core-0 lock + * + * when only cpu-id is considered. + */ + if (rq1->core->cpu < rq2->core->cpu) + return true; + if (rq1->core->cpu > rq2->core->cpu) + return false; + + /* + * __sched_core_flip() relies on SMT having cpu-id lock order. + */ +#endif + return rq1->cpu < rq2->cpu; +} + +extern void double_rq_lock(struct rq *rq1, struct rq *rq2); + +#ifdef CONFIG_PREEMPTION /* * fair double_lock_balance: Safely acquires both rq->locks in a fair @@ -2041,13 +3002,13 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - raw_spin_unlock(&this_rq->lock); + raw_spin_rq_unlock(this_rq); double_rq_lock(this_rq, busiest); return 1; } -#else +#else /* !CONFIG_PREEMPTION: */ /* * Unfair double_lock_balance: Optimizes throughput at the expense of * latency by eliminating extra atomic operations when the locks are @@ -2060,34 +3021,32 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - int ret = 0; - - if (unlikely(!raw_spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - raw_spin_unlock(&this_rq->lock); - raw_spin_lock(&busiest->lock); - raw_spin_lock_nested(&this_rq->lock, - SINGLE_DEPTH_NESTING); - ret = 1; - } else - raw_spin_lock_nested(&busiest->lock, - SINGLE_DEPTH_NESTING); + if (__rq_lockp(this_rq) == __rq_lockp(busiest) || + likely(raw_spin_rq_trylock(busiest))) { + double_rq_clock_clear_update(this_rq, busiest); + return 0; } - return ret; + + if (rq_order_less(this_rq, busiest)) { + raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); + double_rq_clock_clear_update(this_rq, busiest); + return 0; + } + + raw_spin_rq_unlock(this_rq); + double_rq_lock(this_rq, busiest); + + return 1; } -#endif /* CONFIG_PREEMPTION */ +#endif /* !CONFIG_PREEMPTION */ /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. */ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) { - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work well under rq->lock */ - raw_spin_unlock(&this_rq->lock); - BUG_ON(1); - } + lockdep_assert_irqs_disabled(); return _double_lock_balance(this_rq, busiest); } @@ -2095,8 +3054,9 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - raw_spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); + if (__rq_lockp(this_rq) != __rq_lockp(busiest)) + raw_spin_rq_unlock(busiest); + lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_); } static inline void double_lock(spinlock_t *l1, spinlock_t *l2) @@ -2126,31 +3086,16 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); } -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) +static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2) { - BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - raw_spin_lock(&rq1->lock); - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); - } else { - raw_spin_lock(&rq2->lock); - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); - } - } + raw_spin_unlock(l1); + raw_spin_unlock(l2); } +DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t, + double_raw_lock(_T->lock, _T->lock2), + double_raw_unlock(_T->lock, _T->lock2)) + /* * double_rq_unlock - safely unlock two runqueues * @@ -2161,18 +3106,19 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - raw_spin_unlock(&rq1->lock); - if (rq1 != rq2) - raw_spin_unlock(&rq2->lock); + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_unlock(rq2); else __release(rq2->lock); + raw_spin_rq_unlock(rq1); } extern void set_rq_online (struct rq *rq); extern void set_rq_offline(struct rq *rq); + extern bool sched_smp_initialized; -#else /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ /* * double_rq_lock - safely lock two runqueues @@ -2184,10 +3130,11 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { - BUG_ON(!irqs_disabled()); - BUG_ON(rq1 != rq2); - raw_spin_lock(&rq1->lock); + WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE(rq1 != rq2); + raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ + double_rq_clock_clear_update(rq1, rq2); } /* @@ -2200,18 +3147,23 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - BUG_ON(rq1 != rq2); - raw_spin_unlock(&rq1->lock); + WARN_ON_ONCE(rq1 != rq2); + raw_spin_rq_unlock(rq1); __release(rq2->lock); } -#endif +#endif /* !CONFIG_SMP */ +DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, + double_rq_lock(_T->lock, _T->lock2), + double_rq_unlock(_T->lock, _T->lock2)) + +extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); #ifdef CONFIG_SCHED_DEBUG -extern bool sched_debug_enabled; +extern bool sched_debug_verbose; extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); @@ -2219,14 +3171,17 @@ extern void print_dl_stats(struct seq_file *m, int cpu); extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); -#ifdef CONFIG_NUMA_BALANCING -extern void -show_numa_stats(struct task_struct *p, struct seq_file *m); + +extern void resched_latency_warn(int cpu, u64 latency); +# ifdef CONFIG_NUMA_BALANCING +extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, - unsigned long tpf, unsigned long gsf, unsigned long gpf); -#endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ + unsigned long tpf, unsigned long gsf, unsigned long gpf); +# endif /* CONFIG_NUMA_BALANCING */ +#else /* !CONFIG_SCHED_DEBUG: */ +static inline void resched_latency_warn(int cpu, u64 latency) { } +#endif /* !CONFIG_SCHED_DEBUG */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); @@ -2236,49 +3191,66 @@ extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON + #define NOHZ_BALANCE_KICK_BIT 0 #define NOHZ_STATS_KICK_BIT 1 +#define NOHZ_NEWILB_KICK_BIT 2 +#define NOHZ_NEXT_KICK_BIT 3 +/* Run sched_balance_domains() */ #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) +/* Update blocked load */ #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) +/* Update blocked load when entering idle */ +#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT) +/* Update nohz.next_balance */ +#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT) -#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) -#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) extern void nohz_balance_exit_idle(struct rq *rq); -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline void nohz_balance_exit_idle(struct rq *rq) { } +#endif /* !CONFIG_NO_HZ_COMMON */ + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void nohz_run_idle_balance(int cpu); +#else +static inline void nohz_run_idle_balance(int cpu) { } #endif +#include "stats.h" -#ifdef CONFIG_SMP -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); - int i; +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - for_each_cpu_and(i, rd->span, cpu_active_mask) { - struct rq *rq = cpu_rq(i); +extern void __sched_core_account_forceidle(struct rq *rq); - rq->dl.extra_bw += bw; - } -} -#else -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) +static inline void sched_core_account_forceidle(struct rq *rq) { - struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); +} - dl->extra_bw += bw; +extern void __sched_core_tick(struct rq *rq); + +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); } -#endif +#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ + +static inline void sched_core_account_forceidle(struct rq *rq) { } + +static inline void sched_core_tick(struct rq *rq) { } + +#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING + struct irqtime { u64 total; u64 tick_delta; @@ -2287,10 +3259,16 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); +extern int sched_clock_irqtime; + +static inline int irqtime_enabled(void) +{ + return sched_clock_irqtime; +} /* * Returns the irqtime minus the softirq time computed by ksoftirqd. - * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime + * Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime * and never move forward. */ static inline u64 irq_time_read(int cpu) @@ -2306,9 +3284,18 @@ static inline u64 irq_time_read(int cpu) return total; } + +#else + +static inline int irqtime_enabled(void) +{ + return 0; +} + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ + DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** @@ -2342,119 +3329,189 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) if (data) data->func(data, rq_clock(rq), flags); } +#else /* !CONFIG_CPU_FREQ: */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } +#endif /* !CONFIG_CPU_FREQ */ + +#ifdef arch_scale_freq_capacity +# ifndef arch_scale_freq_invariant +# define arch_scale_freq_invariant() true +# endif #else -static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -#endif /* CONFIG_CPU_FREQ */ +# define arch_scale_freq_invariant() false +#endif -#ifdef CONFIG_UCLAMP_TASK -unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +#ifdef CONFIG_SMP -static __always_inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long *min, + unsigned long *max); + +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long max); + + +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the original capacity of @cpu is + * greater than or equal to task's deadline density right shifted by + * (BW_SHIFT - SCHED_CAPACITY_SHIFT) and false otherwise. + */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) { - unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); - unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + unsigned long cap = arch_scale_cpu_capacity(cpu); - if (p) { - min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); - max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX)); - } + return cap >= p->dl.dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT); +} - /* - * Since CPU's {min,max}_util clamps are MAX aggregated considering - * RUNNABLE tasks with _different_ clamps, we can end up with an - * inversion. Fix it now when the clamps are applied. - */ - if (unlikely(min_util >= max_util)) - return min_util; +static inline unsigned long cpu_bw_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} - return clamp(util, min_util, max_util); +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); } -#else /* CONFIG_UCLAMP_TASK */ -static inline -unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util, - struct task_struct *p) + + +extern unsigned long cpu_util_cfs(int cpu); +extern unsigned long cpu_util_cfs_boost(int cpu); + +static inline unsigned long cpu_util_rt(struct rq *rq) { - return util; + return READ_ONCE(rq->avg_rt.util_avg); } -#endif /* CONFIG_UCLAMP_TASK */ -#ifdef arch_scale_freq_capacity -# ifndef arch_scale_freq_invariant -# define arch_scale_freq_invariant() true -# endif -#else -# define arch_scale_freq_invariant() false -#endif +#else /* !CONFIG_SMP */ +static inline bool update_other_load_avgs(struct rq *rq) { return false; } +#endif /* CONFIG_SMP */ -#ifdef CONFIG_SMP -static inline unsigned long capacity_orig_of(int cpu) +#ifdef CONFIG_UCLAMP_TASK + +unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); + +static inline unsigned long uclamp_rq_get(struct rq *rq, + enum uclamp_id clamp_id) { - return cpu_rq(cpu)->cpu_capacity_orig; + return READ_ONCE(rq->uclamp[clamp_id].value); } -#endif -/** - * enum schedutil_type - CPU utilization type - * @FREQUENCY_UTIL: Utilization used to select frequency - * @ENERGY_UTIL: Utilization used during energy calculation +static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, + unsigned int value) +{ + WRITE_ONCE(rq->uclamp[clamp_id].value, value); +} + +static inline bool uclamp_rq_is_idle(struct rq *rq) +{ + return rq->uclamp_flags & UCLAMP_FLAG_IDLE; +} + +/* Is the rq being capped/throttled by uclamp_max? */ +static inline bool uclamp_rq_is_capped(struct rq *rq) +{ + unsigned long rq_util; + unsigned long max_util; + + if (!static_branch_likely(&sched_uclamp_used)) + return false; + + rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); + max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; +} + +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. * - * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time - * need to be aggregated differently depending on the usage made of them. This - * enum is used within schedutil_freq_util() to differentiate the types of - * utilization expected by the callers, and adjust the aggregation accordingly. + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. */ -enum schedutil_type { - FREQUENCY_UTIL, - ENERGY_UTIL, -}; +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +#define for_each_clamp_id(clamp_id) \ + for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p); +extern unsigned int sysctl_sched_uclamp_util_min_rt_default; -static inline unsigned long cpu_bw_dl(struct rq *rq) + +static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { - return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; + if (clamp_id == UCLAMP_MIN) + return 0; + return SCHED_CAPACITY_SCALE; } -static inline unsigned long cpu_util_dl(struct rq *rq) +/* Integer rounded range for each bucket */ +#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) + +static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) { - return READ_ONCE(rq->avg_dl.util_avg); + return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); } -static inline unsigned long cpu_util_cfs(struct rq *rq) +static inline void +uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined) { - unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + uc_se->value = value; + uc_se->bucket_id = uclamp_bucket_id(value); + uc_se->user_defined = user_defined; +} - if (sched_feat(UTIL_EST)) { - util = max_t(unsigned long, util, - READ_ONCE(rq->cfs.avg.util_est.enqueued)); - } +#else /* !CONFIG_UCLAMP_TASK: */ - return util; +static inline unsigned long +uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) +{ + if (clamp_id == UCLAMP_MIN) + return 0; + + return SCHED_CAPACITY_SCALE; } -static inline unsigned long cpu_util_rt(struct rq *rq) +static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; } + +static inline bool uclamp_is_used(void) { - return READ_ONCE(rq->avg_rt.util_avg); + return false; } -#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) + +static inline unsigned long +uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { - return 0; + if (clamp_id == UCLAMP_MIN) + return 0; + + return SCHED_CAPACITY_SCALE; +} + +static inline void +uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, unsigned int value) +{ +} + +static inline bool uclamp_rq_is_idle(struct rq *rq) +{ + return false; } -#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ + +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + static inline unsigned long cpu_util_irq(struct rq *rq) { - return rq->avg_irq.util_avg; + return READ_ONCE(rq->avg_irq.util_avg); } static inline @@ -2466,7 +3523,9 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned return util; } -#else + +#else /* !CONFIG_HAVE_SCHED_AVG_IRQ: */ + static inline unsigned long cpu_util_irq(struct rq *rq) { return 0; @@ -2477,7 +3536,10 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned { return util; } -#endif + +#endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */ + +extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -2490,14 +3552,18 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } +extern struct cpufreq_governor schedutil_gov; + #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #define perf_domain_span(pd) NULL + static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ #ifdef CONFIG_MEMBARRIER + /* * The scheduler provides memory barriers required by membarrier between: * - prior user-space memory accesses and store to rq->membarrier_state, @@ -2519,13 +3585,16 @@ static inline void membarrier_switch_mm(struct rq *rq, WRITE_ONCE(rq->membarrier_state, membarrier_state); } -#else + +#else /* !CONFIG_MEMBARRIER :*/ + static inline void membarrier_switch_mm(struct rq *rq, struct mm_struct *prev_mm, struct mm_struct *next_mm) { } -#endif + +#endif /* !CONFIG_MEMBARRIER */ #ifdef CONFIG_SMP static inline bool is_per_cpu_kthread(struct task_struct *p) @@ -2540,5 +3609,398 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) } #endif -void swake_up_all_locked(struct swait_queue_head *q); -void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern void swake_up_all_locked(struct swait_queue_head *q); +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); + +extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags); + +#ifdef CONFIG_PREEMPT_DYNAMIC +extern int preempt_dynamic_mode; +extern int sched_dynamic_mode(const char *str); +extern void sched_dynamic_update(int mode); +#endif + +#ifdef CONFIG_SCHED_MM_CID + +#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ +#define MM_CID_SCAN_DELAY 100 /* 100ms */ + +extern raw_spinlock_t cid_lock; +extern int use_cid_lock; + +extern void sched_mm_cid_migrate_from(struct task_struct *t); +extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); +extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); +extern void init_sched_mm_cid(struct task_struct *t); + +static inline void __mm_cid_put(struct mm_struct *mm, int cid) +{ + if (cid < 0) + return; + cpumask_clear_cpu(cid, mm_cidmask(mm)); +} + +/* + * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to + * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to + * be held to transition to other states. + * + * State transitions synchronized with cmpxchg or try_cmpxchg need to be + * consistent across CPUs, which prevents use of this_cpu_cmpxchg. + */ +static inline void mm_cid_put_lazy(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid; + + lockdep_assert_irqs_disabled(); + cid = __this_cpu_read(pcpu_cid->cid); + if (!mm_cid_is_lazy_put(cid) || + !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) + return; + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +} + +static inline int mm_cid_pcpu_unset(struct mm_struct *mm) +{ + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid, res; + + lockdep_assert_irqs_disabled(); + cid = __this_cpu_read(pcpu_cid->cid); + for (;;) { + if (mm_cid_is_unset(cid)) + return MM_CID_UNSET; + /* + * Attempt transition from valid or lazy-put to unset. + */ + res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); + if (res == cid) + break; + cid = res; + } + return cid; +} + +static inline void mm_cid_put(struct mm_struct *mm) +{ + int cid; + + lockdep_assert_irqs_disabled(); + cid = mm_cid_pcpu_unset(mm); + if (cid == MM_CID_UNSET) + return; + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +} + +static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) +{ + struct cpumask *cidmask = mm_cidmask(mm); + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid, max_nr_cid, allowed_max_nr_cid; + + /* + * After shrinking the number of threads or reducing the number + * of allowed cpus, reduce the value of max_nr_cid so expansion + * of cid allocation will preserve cache locality if the number + * of threads or allowed cpus increase again. + */ + max_nr_cid = atomic_read(&mm->max_nr_cid); + while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), + atomic_read(&mm->mm_users))), + max_nr_cid > allowed_max_nr_cid) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ + if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { + max_nr_cid = allowed_max_nr_cid; + break; + } + } + /* Try to re-use recent cid. This improves cache locality. */ + cid = __this_cpu_read(pcpu_cid->recent_cid); + if (!mm_cid_is_unset(cid) && cid < max_nr_cid && + !cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + /* + * Expand cid allocation if the maximum number of concurrency + * IDs allocated (max_nr_cid) is below the number cpus allowed + * and number of threads. Expanding cid allocation as much as + * possible improves cache locality. + */ + cid = max_nr_cid; + while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ + if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) + continue; + if (!cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + } + /* + * Find the first available concurrency id. + * Retry finding first zero bit if the mask is temporarily + * filled. This only happens during concurrent remote-clear + * which owns a cid without holding a rq lock. + */ + for (;;) { + cid = cpumask_first_zero(cidmask); + if (cid < READ_ONCE(mm->nr_cpus_allowed)) + break; + cpu_relax(); + } + if (cpumask_test_and_set_cpu(cid, cidmask)) + return -1; + + return cid; +} + +/* + * Save a snapshot of the current runqueue time of this cpu + * with the per-cpu cid value, allowing to estimate how recently it was used. + */ +static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) +{ + struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); + + lockdep_assert_rq_held(rq); + WRITE_ONCE(pcpu_cid->time, rq->clock); +} + +static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) +{ + int cid; + + /* + * All allocations (even those using the cid_lock) are lock-free. If + * use_cid_lock is set, hold the cid_lock to perform cid allocation to + * guarantee forward progress. + */ + if (!READ_ONCE(use_cid_lock)) { + cid = __mm_cid_try_get(t, mm); + if (cid >= 0) + goto end; + raw_spin_lock(&cid_lock); + } else { + raw_spin_lock(&cid_lock); + cid = __mm_cid_try_get(t, mm); + if (cid >= 0) + goto unlock; + } + + /* + * cid concurrently allocated. Retry while forcing following + * allocations to use the cid_lock to ensure forward progress. + */ + WRITE_ONCE(use_cid_lock, 1); + /* + * Set use_cid_lock before allocation. Only care about program order + * because this is only required for forward progress. + */ + barrier(); + /* + * Retry until it succeeds. It is guaranteed to eventually succeed once + * all newcoming allocations observe the use_cid_lock flag set. + */ + do { + cid = __mm_cid_try_get(t, mm); + cpu_relax(); + } while (cid < 0); + /* + * Allocate before clearing use_cid_lock. Only care about + * program order because this is for forward progress. + */ + barrier(); + WRITE_ONCE(use_cid_lock, 0); +unlock: + raw_spin_unlock(&cid_lock); +end: + mm_cid_snapshot_time(rq, mm); + + return cid; +} + +static inline int mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) +{ + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + struct cpumask *cpumask; + int cid; + + lockdep_assert_rq_held(rq); + cpumask = mm_cidmask(mm); + cid = __this_cpu_read(pcpu_cid->cid); + if (mm_cid_is_valid(cid)) { + mm_cid_snapshot_time(rq, mm); + return cid; + } + if (mm_cid_is_lazy_put(cid)) { + if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) + __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + } + cid = __mm_cid_get(rq, t, mm); + __this_cpu_write(pcpu_cid->cid, cid); + __this_cpu_write(pcpu_cid->recent_cid, cid); + + return cid; +} + +static inline void switch_mm_cid(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + /* + * Provide a memory barrier between rq->curr store and load of + * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. + * + * Should be adapted if context_switch() is modified. + */ + if (!next->mm) { // to kernel + /* + * user -> kernel transition does not guarantee a barrier, but + * we can use the fact that it performs an atomic operation in + * mmgrab(). + */ + if (prev->mm) // from user + smp_mb__after_mmgrab(); + /* + * kernel -> kernel transition does not change rq->curr->mm + * state. It stays NULL. + */ + } else { // to user + /* + * kernel -> user transition does not provide a barrier + * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. + * Provide it here. + */ + if (!prev->mm) { // from kernel + smp_mb(); + } else { // from user + /* + * user->user transition relies on an implicit + * memory barrier in switch_mm() when + * current->mm changes. If the architecture + * switch_mm() does not have an implicit memory + * barrier, it is emitted here. If current->mm + * is unchanged, no barrier is needed. + */ + smp_mb__after_switch_mm(); + } + } + if (prev->mm_cid_active) { + mm_cid_snapshot_time(rq, prev->mm); + mm_cid_put_lazy(prev); + prev->mm_cid = -1; + } + if (next->mm_cid_active) + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); +} + +#else /* !CONFIG_SCHED_MM_CID: */ +static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } +static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } +static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } +static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } +static inline void init_sched_mm_cid(struct task_struct *t) { } +#endif /* !CONFIG_SCHED_MM_CID */ + +extern u64 avg_vruntime(struct cfs_rq *cfs_rq); +extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SMP +static inline +void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) +{ + lockdep_assert_rq_held(src_rq); + lockdep_assert_rq_held(dst_rq); + + deactivate_task(src_rq, task, 0); + set_task_cpu(task, dst_rq->cpu); + activate_task(dst_rq, task, 0); +} + +static inline +bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_on_cpu(rq, p) && + cpumask_test_cpu(cpu, &p->cpus_mask)) + return true; + + return false; +} +#endif + +#ifdef CONFIG_RT_MUTEXES + +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) +{ + if (pi_task) + prio = min(prio, pi_task->prio); + + return prio; +} + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + return __rt_effective_prio(pi_task, prio); +} + +#else /* !CONFIG_RT_MUTEXES: */ + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + return prio; +} + +#endif /* !CONFIG_RT_MUTEXES */ + +extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); +extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); +extern const struct sched_class *__setscheduler_class(int policy, int prio); +extern void set_load_weight(struct task_struct *p, bool update_load); +extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class); +extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); + +#ifdef CONFIG_SMP +extern struct balance_callback *splice_balance_callbacks(struct rq *rq); +extern void balance_callbacks(struct rq *rq, struct balance_callback *head); +#else + +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) +{ + return NULL; +} + +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) +{ +} + +#endif + +#ifdef CONFIG_SCHED_CLASS_EXT +/* + * Used by SCX in the enable/disable paths to move tasks between sched_classes + * and establish invariants. + */ +struct sched_enq_and_set_ctx { + struct task_struct *p; + int queue_flags; + bool queued; + bool running; +}; + +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx); +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#include "ext.h" + +#endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 9620e323162c..21ac44428bb0 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -6,4 +6,10 @@ extern void sched_ttwu_pending(void *arg); -extern void send_call_function_single_ipi(int cpu); +extern bool call_function_single_prep_ipi(int cpu); + +#ifdef CONFIG_SMP +extern void flush_smp_call_function_queue(void); +#else +static inline void flush_smp_call_function_queue(void) { } +#endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 750fb3c67eed..4346fd81c31f 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -2,7 +2,100 @@ /* * /proc/schedstat implementation */ -#include "sched.h" + +void __update_stats_wait_start(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats) +{ + u64 wait_start, prev_wait_start; + + wait_start = rq_clock(rq); + prev_wait_start = schedstat_val(stats->wait_start); + + if (p && likely(wait_start > prev_wait_start)) + wait_start -= prev_wait_start; + + __schedstat_set(stats->wait_start, wait_start); +} + +void __update_stats_wait_end(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats) +{ + u64 delta = rq_clock(rq) - schedstat_val(stats->wait_start); + + if (p) { + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. + */ + __schedstat_set(stats->wait_start, delta); + + return; + } + + trace_sched_stat_wait(p, delta); + } + + __schedstat_set(stats->wait_max, + max(schedstat_val(stats->wait_max), delta)); + __schedstat_inc(stats->wait_count); + __schedstat_add(stats->wait_sum, delta); + __schedstat_set(stats->wait_start, 0); +} + +void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats) +{ + u64 sleep_start, block_start; + + sleep_start = schedstat_val(stats->sleep_start); + block_start = schedstat_val(stats->block_start); + + if (sleep_start) { + u64 delta = rq_clock(rq) - sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(stats->sleep_max))) + __schedstat_set(stats->sleep_max, delta); + + __schedstat_set(stats->sleep_start, 0); + __schedstat_add(stats->sum_sleep_runtime, delta); + + if (p) { + account_scheduler_latency(p, delta >> 10, 1); + trace_sched_stat_sleep(p, delta); + } + } + + if (block_start) { + u64 delta = rq_clock(rq) - block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(stats->block_max))) + __schedstat_set(stats->block_max, delta); + + __schedstat_set(stats->block_start, 0); + __schedstat_add(stats->sum_sleep_runtime, delta); + __schedstat_add(stats->sum_block_runtime, delta); + + if (p) { + if (p->in_iowait) { + __schedstat_add(stats->iowait_sum, delta); + __schedstat_inc(stats->iowait_count); + trace_sched_stat_iowait(p, delta); + } + + trace_sched_stat_blocked(p, delta); + + account_scheduler_latency(p, delta >> 10, 0); + } + } +} /* * Current schedstat API version. @@ -10,7 +103,7 @@ * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 15 +#define SCHEDSTAT_VERSION 17 static int show_schedstat(struct seq_file *seq, void *v) { @@ -45,15 +138,17 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - seq_printf(seq, "domain%d %*pb", dcount++, + seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name, cpumask_pr_args(sched_domain_span(sd))); - for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %u %u %u %u %u %u %u %u", + for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { + seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u", sd->lb_count[itype], sd->lb_balanced[itype], sd->lb_failed[itype], - sd->lb_imbalance[itype], + sd->lb_imbalance_load[itype], + sd->lb_imbalance_util[itype], + sd->lb_imbalance_task[itype], + sd->lb_imbalance_misfit[itype], sd->lb_gained[itype], sd->lb_hot_gained[itype], sd->lb_nobusyq[itype], @@ -74,7 +169,7 @@ static int show_schedstat(struct seq_file *seq, void *v) } /* - * This itererator needs some explanation. + * This iterator needs some explanation. * It returns 1 for the header position. * This means 2 is cpu 0. * In a hotplugged system some CPUs, including cpu 0, may be missing so we have diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 33d0daf83842..19cdbe96f93d 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -1,7 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_STATS_H +#define _KERNEL_STATS_H #ifdef CONFIG_SCHEDSTATS +extern struct static_key_false sched_schedstats; + /* * Expects runqueue lock to be held for atomicity of update */ @@ -25,7 +29,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) } static inline void -rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { if (rq) rq->rq_sched_info.run_delay += delta; @@ -40,9 +44,33 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) +void __update_stats_wait_start(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); + +void __update_stats_wait_end(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); +void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); + +static inline void +check_schedstat_required(void) +{ + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n"); +} + #else /* !CONFIG_SCHEDSTATS: */ + static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } -static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } +static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { } static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } # define schedstat_enabled() 0 # define __schedstat_inc(var) do { } while (0) @@ -53,59 +81,114 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_set(var, val) do { } while (0) # define schedstat_val(var) 0 # define schedstat_val_or_zero(var) 0 + +# define __update_stats_wait_start(rq, p, stats) do { } while (0) +# define __update_stats_wait_end(rq, p, stats) do { } while (0) +# define __update_stats_enqueue_sleeper(rq, p, stats) do { } while (0) +# define check_schedstat_required() do { } while (0) + #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_FAIR_GROUP_SCHED +struct sched_entity_stats { + struct sched_entity se; + struct sched_statistics stats; +} __no_randomize_layout; +#endif + +static inline struct sched_statistics * +__schedstats_from_se(struct sched_entity *se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + if (!entity_is_task(se)) + return &container_of(se, struct sched_entity_stats, se)->stats; +#endif + return &task_of(se)->stats; +} + #ifdef CONFIG_PSI +void psi_task_change(struct task_struct *task, int clear, int set); +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); +#else +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} +#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, - * where a task's runnable state changes, and requeues, where a task - * and its state are being moved between CPUs and runqueues. + * where a task's runnable state changes, and migrations, where a task + * and its runnable state are being moved between CPUs and runqueues. + * + * A notable case is a task whose dequeue is delayed. PSI considers + * those sleeping, but because they are still on the runqueue they can + * go through migration requeues. In this case, *sleeping* states need + * to be transferred. */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) +static inline void psi_enqueue(struct task_struct *p, int flags) { - int clear = 0, set = TSK_RUNNING; + int clear = 0, set = 0; if (static_branch_likely(&psi_disabled)) return; - if (!wakeup || p->sched_psi_wake_requeue) { + /* Same runqueue, nothing changed for psi */ + if (flags & ENQUEUE_RESTORE) + return; + + /* psi_sched_switch() will handle the flags */ + if (task_on_cpu(task_rq(p), p)) + return; + + if (p->se.sched_delayed) { + /* CPU migration of "sleeping" task */ + SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); if (p->in_memstall) set |= TSK_MEMSTALL; - if (p->sched_psi_wake_requeue) - p->sched_psi_wake_requeue = 0; + if (p->in_iowait) + set |= TSK_IOWAIT; + } else if (flags & ENQUEUE_MIGRATED) { + /* CPU migration of runnable task */ + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING; } else { + /* Wakeup of new or sleeping task */ if (p->in_iowait) clear |= TSK_IOWAIT; + set = TSK_RUNNING; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; } psi_task_change(p, clear, set); } -static inline void psi_dequeue(struct task_struct *p, bool sleep) +static inline void psi_dequeue(struct task_struct *p, int flags) { - int clear = TSK_RUNNING, set = 0; - if (static_branch_likely(&psi_disabled)) return; - if (!sleep) { - if (p->in_memstall) - clear |= TSK_MEMSTALL; - } else { - /* - * When a task sleeps, schedule() dequeues it before - * switching to the next one. Merge the clearing of - * TSK_RUNNING and TSK_ONCPU to save an unnecessary - * psi_task_change() call in psi_sched_switch(). - */ - clear |= TSK_ONCPU; + /* Same runqueue, nothing changed for psi */ + if (flags & DEQUEUE_SAVE) + return; - if (p->in_iowait) - set |= TSK_IOWAIT; - } + /* + * A voluntary sleep is a dequeue followed by a task switch. To + * avoid walking all ancestors twice, psi_task_switch() handles + * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. + * Do nothing here. + */ + if (flags & DEQUEUE_SLEEP) + return; - psi_task_change(p, clear, set); + /* + * When migrating a task to another CPU, clear all psi + * state. The enqueue callback above will work it out. + */ + psi_task_change(p, p->psi_flags, 0); } static inline void psi_ttwu_dequeue(struct task_struct *p) @@ -117,19 +200,12 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) * deregister its sleep-persistent psi states from the old * queue, and let psi_enqueue() know it has to requeue. */ - if (unlikely(p->in_iowait || p->in_memstall)) { + if (unlikely(p->psi_flags)) { struct rq_flags rf; struct rq *rq; - int clear = 0; - - if (p->in_iowait) - clear |= TSK_IOWAIT; - if (p->in_memstall) - clear |= TSK_MEMSTALL; rq = __task_rq_lock(p, &rf); - psi_task_change(p, clear, 0); - p->sched_psi_wake_requeue = 1; + psi_task_change(p, p->psi_flags, 0); __task_rq_unlock(rq, &rf); } } @@ -144,65 +220,63 @@ static inline void psi_sched_switch(struct task_struct *prev, psi_task_switch(prev, next, sleep); } -static inline void psi_task_tick(struct rq *rq) -{ - if (static_branch_likely(&psi_disabled)) - return; - - if (unlikely(rq->curr->in_memstall)) - psi_memstall_tick(rq->curr, cpu_of(rq)); -} #else /* CONFIG_PSI */ -static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} -static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_enqueue(struct task_struct *p, bool migrate) {} +static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} -static inline void psi_task_tick(struct rq *rq) {} +static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, + struct task_struct *prev) {} #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO -static inline void sched_info_reset_dequeued(struct task_struct *t) -{ - t->sched_info.last_queued = 0; -} - /* * We are interested in knowing how long it was from the *first* time a * task was queued to the time that it finally hit a CPU, we call this routine * from dequeue_task() to account for possible rq->clock skew across CPUs. The * delta taken on each CPU would annul the skew. */ -static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) +static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(rq), delta = 0; + unsigned long long delta = 0; - if (sched_info_on()) { - if (t->sched_info.last_queued) - delta = now - t->sched_info.last_queued; - } - sched_info_reset_dequeued(t); - t->sched_info.run_delay += delta; + if (!t->sched_info.last_queued) + return; - rq_sched_info_dequeued(rq, delta); + delta = rq_clock(rq) - t->sched_info.last_queued; + t->sched_info.last_queued = 0; + t->sched_info.run_delay += delta; + if (delta > t->sched_info.max_run_delay) + t->sched_info.max_run_delay = delta; + if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) + t->sched_info.min_run_delay = delta; + rq_sched_info_dequeue(rq, delta); } /* * Called when a task finally hits the CPU. We can now calculate how * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. + * can keep stats on how long its time-slice is. */ static void sched_info_arrive(struct rq *rq, struct task_struct *t) { - unsigned long long now = rq_clock(rq), delta = 0; + unsigned long long now, delta = 0; + + if (!t->sched_info.last_queued) + return; - if (t->sched_info.last_queued) - delta = now - t->sched_info.last_queued; - sched_info_reset_dequeued(t); + now = rq_clock(rq); + delta = now - t->sched_info.last_queued; + t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; + if (delta > t->sched_info.max_run_delay) + t->sched_info.max_run_delay = delta; + if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) + t->sched_info.min_run_delay = delta; rq_sched_info_arrive(rq, delta); } @@ -210,14 +284,12 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) /* * This function is only called from enqueue_task(), but also only updates * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. + * sched_info_dequeue() will clear that stamp when appropriate. */ -static inline void sched_info_queued(struct rq *rq, struct task_struct *t) +static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t) { - if (sched_info_on()) { - if (!t->sched_info.last_queued) - t->sched_info.last_queued = rq_clock(rq); - } + if (!t->sched_info.last_queued) + t->sched_info.last_queued = rq_clock(rq); } /* @@ -225,7 +297,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) * due, typically, to expiring its time slice (this may also be called when * switching to the idle task). Now we can calculate how long we ran. * Also, if the process is still in the TASK_RUNNING state, call - * sched_info_queued() to mark that it has now again started waiting on + * sched_info_enqueue() to mark that it has now again started waiting on * the runqueue. */ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) @@ -234,8 +306,8 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) rq_sched_info_depart(rq, delta); - if (t->state == TASK_RUNNING) - sched_info_queued(rq, t); + if (task_is_running(t)) + sched_info_enqueue(rq, t); } /* @@ -244,7 +316,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { /* * prev now departs the CPU. It's not interesting to record @@ -258,18 +330,10 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct sched_info_arrive(rq, next); } -static inline void -sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) -{ - if (sched_info_on()) - __sched_info_switch(rq, prev, next); -} - #else /* !CONFIG_SCHED_INFO: */ -# define sched_info_queued(rq, t) do { } while (0) -# define sched_info_reset_dequeued(t) do { } while (0) -# define sched_info_dequeued(rq, t) do { } while (0) -# define sched_info_depart(rq, t) do { } while (0) -# define sched_info_arrive(rq, next) do { } while (0) +# define sched_info_enqueue(rq, t) do { } while (0) +# define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) #endif /* CONFIG_SCHED_INFO */ + +#endif /* _KERNEL_STATS_H */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4c9e9975684f..058dd42e3d9b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,11 +7,10 @@ * * See kernel/stop_machine.c */ -#include "sched.h" #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } @@ -24,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #endif /* CONFIG_SMP */ static void -check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) { /* we're never preempted */ } @@ -34,12 +33,11 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir stop->se.exec_start = rq_clock_task(rq); } -static struct task_struct *pick_next_task_stop(struct rq *rq) +static struct task_struct *pick_task_stop(struct rq *rq) { if (!sched_stop_runnable(rq)) return NULL; - set_next_task_stop(rq, rq->stop, true); return rq->stop; } @@ -49,10 +47,11 @@ enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) add_nr_running(rq, 1); } -static void +static bool dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + return true; } static void yield_task_stop(struct rq *rq) @@ -60,23 +59,9 @@ static void yield_task_stop(struct rq *rq) BUG(); /* the stop task should never yield, its pointless. */ } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - struct task_struct *curr = rq->curr; - u64 delta_exec; - - delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; - - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); - - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = rq_clock_task(rq); - cgroup_account_cputime(curr, delta_exec); + update_curr_common(rq); } /* @@ -102,12 +87,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) BUG(); /* how!?, what priority? */ } -static unsigned int -get_rr_interval_stop(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_stop(struct rq *rq) { } @@ -115,16 +94,15 @@ static void update_curr_stop(struct rq *rq) /* * Simple, special scheduling class for the per-CPU stop tasks: */ -const struct sched_class stop_sched_class = { - .next = &dl_sched_class, +DEFINE_SCHED_CLASS(stop) = { .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, - .check_preempt_curr = check_preempt_curr_stop, + .wakeup_preempt = wakeup_preempt_stop, - .pick_next_task = pick_next_task_stop, + .pick_task = pick_task_stop, .put_prev_task = put_prev_task_stop, .set_next_task = set_next_task_stop, @@ -136,8 +114,6 @@ const struct sched_class stop_sched_class = { .task_tick = task_tick_stop, - .get_rr_interval = get_rr_interval_stop, - .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e1c655f928c7..72505cd3b60a 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,7 +2,6 @@ /* * <linux/swait.h> (simple wait queues ) implementation: */ -#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) @@ -19,7 +18,7 @@ EXPORT_SYMBOL(__init_swait_queue_head); * If for some reason it would return 0, that means the previously waiting * task is already running, so it will observe condition true (or has already). */ -void swake_up_locked(struct swait_queue_head *q) +void swake_up_locked(struct swait_queue_head *q, int wake_flags) { struct swait_queue *curr; @@ -27,7 +26,7 @@ void swake_up_locked(struct swait_queue_head *q) return; curr = list_first_entry(&q->task_list, typeof(*curr), task_list); - wake_up_process(curr->task); + try_to_wake_up(curr->task, TASK_NORMAL, wake_flags); list_del_init(&curr->task_list); } EXPORT_SYMBOL(swake_up_locked); @@ -42,7 +41,7 @@ EXPORT_SYMBOL(swake_up_locked); void swake_up_all_locked(struct swait_queue_head *q) { while (!list_empty(&q->task_list)) - swake_up_locked(q); + swake_up_locked(q, 0); } void swake_up_one(struct swait_queue_head *q) @@ -50,7 +49,7 @@ void swake_up_one(struct swait_queue_head *q) unsigned long flags; raw_spin_lock_irqsave(&q->lock, flags); - swake_up_locked(q); + swake_up_locked(q, 0); raw_spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(swake_up_one); diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c new file mode 100644 index 000000000000..456d339be98f --- /dev/null +++ b/kernel/sched/syscalls.c @@ -0,0 +1,1594 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kernel/sched/syscalls.c + * + * Core kernel scheduler syscalls related code + * + * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 1998-2024 Ingo Molnar, Red Hat + */ +#include <linux/sched.h> +#include <linux/cpuset.h> +#include <linux/sched/debug.h> + +#include <uapi/linux/sched/types.h> + +#include "sched.h" +#include "autogroup.h" + +static inline int __normal_prio(int policy, int rt_prio, int nice) +{ + int prio; + + if (dl_policy(policy)) + prio = MAX_DL_PRIO - 1; + else if (rt_policy(policy)) + prio = MAX_RT_PRIO - 1 - rt_prio; + else + prio = NICE_TO_PRIO(nice); + + return prio; +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(struct task_struct *p) +{ + return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio)); +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(struct task_struct *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_or_dl_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +void set_user_nice(struct task_struct *p, long nice) +{ + bool queued, running; + struct rq *rq; + int old_prio; + + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + CLASS(task_rq_lock, rq_guard)(p); + rq = rq_guard.rq; + + update_rq_clock(rq); + + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it won't have any effect on scheduling until the task is + * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: + */ + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { + p->static_prio = NICE_TO_PRIO(nice); + return; + } + + queued = task_on_rq_queued(p); + running = task_current_donor(rq, p); + if (queued) + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + if (running) + put_prev_task(rq, p); + + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + if (running) + set_next_task(rq, p); + + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + p->sched_class->prio_changed(rq, p, old_prio); +} +EXPORT_SYMBOL(set_user_nice); + +/* + * is_nice_reduction - check if nice value is an actual reduction + * + * Similar to can_nice() but does not perform a capability check. + * + * @p: task + * @nice: nice value + */ +static bool is_nice_reduction(const struct task_struct *p, const int nice) +{ + /* Convert nice value [19,-20] to rlimit style value [1,40]: */ + int nice_rlim = nice_to_rlimit(nice); + + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); +} + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ + return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE); +} + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +SYSCALL_DEFINE1(nice, int, increment) +{ + long nice, retval; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); + nice = task_nice(current) + increment; + + nice = clamp_val(nice, MIN_NICE, MAX_NICE); + if (increment < 0 && !can_nice(current, nice)) + return -EPERM; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * Return: The priority value as seen by users in /proc. + * + * sched policy return value kernel prio user prio/nice + * + * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19] + * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99] + * deadline -101 -1 0 + */ +int task_prio(const struct task_struct *p) +{ + return p->prio - MAX_RT_PRIO; +} + +/** + * idle_cpu - is a given CPU idle currently? + * @cpu: the processor in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle) + return 0; + + if (rq->nr_running) + return 0; + +#ifdef CONFIG_SMP + if (rq->ttwu_pending) + return 0; +#endif + + return 1; +} + +/** + * available_idle_cpu - is a given CPU idle for enqueuing work. + * @cpu: the CPU in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int available_idle_cpu(int cpu) +{ + if (!idle_cpu(cpu)) + return 0; + + if (vcpu_is_preempted(cpu)) + return 0; + + return 1; +} + +/** + * idle_task - return the idle task for a given CPU. + * @cpu: the processor in question. + * + * Return: The idle task for the CPU @cpu. + */ +struct task_struct *idle_task(int cpu) +{ + return cpu_rq(cpu)->idle; +} + +#ifdef CONFIG_SCHED_CORE +int sched_core_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (sched_core_enabled(rq) && rq->curr == rq->idle) + return 1; + + return idle_cpu(cpu); +} + +#endif + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + * + * The task of @pid, if found. %NULL otherwise. + */ +static struct task_struct *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_vpid(pid) : current; +} + +static struct task_struct *find_get_task(pid_t pid) +{ + struct task_struct *p; + guard(rcu)(); + + p = find_process_by_pid(pid); + if (likely(p)) + get_task_struct(p); + + return p; +} + +DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T), + find_get_task(pid), pid_t pid) + +/* + * sched_setparam() passes in -1 for its policy, to let the functions + * it calls know not to change it. + */ +#define SETPARAM_POLICY -1 + +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) +{ + int policy = attr->sched_policy; + + if (policy == SETPARAM_POLICY) + policy = p->policy; + + p->policy = policy; + + if (dl_policy(policy)) + __setparam_dl(p, attr); + else if (fair_policy(policy)) + __setparam_fair(p, attr); + + /* rt-policy tasks do not have a timerslack */ + if (rt_or_dl_task_policy(p)) { + p->timer_slack_ns = 0; + } else if (p->timer_slack_ns == 0) { + /* when switching back to non-rt policy, restore timerslack */ + p->timer_slack_ns = p->default_timer_slack_ns; + } + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when + * !rt_policy. Always setting this ensures that things like + * getparam()/getattr() don't report silly values for !rt tasks. + */ + p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); + set_load_weight(p, true); +} + +/* + * Check the target process has a UID that matches the current process's: + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + guard(rcu)(); + + pcred = __task_cred(p); + return (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); +} + +#ifdef CONFIG_UCLAMP_TASK + +static int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + int util_min = p->uclamp_req[UCLAMP_MIN].value; + int util_max = p->uclamp_req[UCLAMP_MAX].value; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + util_min = attr->sched_util_min; + + if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + util_max = attr->sched_util_max; + + if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (util_min != -1 && util_max != -1 && util_min > util_max) + return -EINVAL; + + /* + * We have valid uclamp attributes; make sure uclamp is enabled. + * + * We need to do that here, because enabling static branches is a + * blocking operation which obviously cannot be done while holding + * scheduler locks. + */ + static_branch_enable(&sched_uclamp_used); + + return 0; +} + +static bool uclamp_reset(const struct sched_attr *attr, + enum uclamp_id clamp_id, + struct uclamp_se *uc_se) +{ + /* Reset on sched class change for a non user-defined clamp value. */ + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && + !uc_se->user_defined) + return true; + + /* Reset on sched_util_{min,max} == -1. */ + if (clamp_id == UCLAMP_MIN && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min == -1) { + return true; + } + + if (clamp_id == UCLAMP_MAX && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max == -1) { + return true; + } + + return false; +} + +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) +{ + enum uclamp_id clamp_id; + + for_each_clamp_id(clamp_id) { + struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; + unsigned int value; + + if (!uclamp_reset(attr, clamp_id, uc_se)) + continue; + + /* + * RT by default have a 100% boost value that could be modified + * at runtime. + */ + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) + value = sysctl_sched_uclamp_util_min_rt_default; + else + value = uclamp_none(clamp_id); + + uclamp_se_set(uc_se, value, false); + + } + + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) + return; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min != -1) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], + attr->sched_util_min, true); + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max != -1) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], + attr->sched_util_max, true); + } +} + +#else /* !CONFIG_UCLAMP_TASK: */ + +static inline int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + return -EOPNOTSUPP; +} +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) { } +#endif + +/* + * Allow unprivileged RT tasks to decrease priority. + * Only issue a capable test if needed and only once to avoid an audit + * event on permitted non-privileged operations: + */ +static int user_check_sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + int policy, int reset_on_fork) +{ + if (fair_policy(policy)) { + if (attr->sched_nice < task_nice(p) && + !is_nice_reduction(p, attr->sched_nice)) + goto req_priv; + } + + if (rt_policy(policy)) { + unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); + + /* Can't set/change the rt policy: */ + if (policy != p->policy && !rlim_rtprio) + goto req_priv; + + /* Can't increase priority: */ + if (attr->sched_priority > p->rt_priority && + attr->sched_priority > rlim_rtprio) + goto req_priv; + } + + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + goto req_priv; + + /* + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. + */ + if (task_has_idle_policy(p) && !idle_policy(policy)) { + if (!is_nice_reduction(p, task_nice(p))) + goto req_priv; + } + + /* Can't change other user's priorities: */ + if (!check_same_owner(p)) + goto req_priv; + + /* Normal users shall not reset the sched_reset_on_fork flag: */ + if (p->sched_reset_on_fork && !reset_on_fork) + goto req_priv; + + return 0; + +req_priv: + if (!capable(CAP_SYS_NICE)) + return -EPERM; + + return 0; +} + +int __sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + bool user, bool pi) +{ + int oldpolicy = -1, policy = attr->sched_policy; + int retval, oldprio, newprio, queued, running; + const struct sched_class *prev_class, *next_class; + struct balance_callback *head; + struct rq_flags rf; + int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + struct rq *rq; + bool cpuset_locked = false; + + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); +recheck: + /* Double check policy once rq lock held: */ + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; + policy = oldpolicy = p->policy; + } else { + reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); + + if (!valid_policy(policy)) + return -EINVAL; + } + + if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) + return -EINVAL; + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLE is 0. + */ + if (attr->sched_priority > MAX_RT_PRIO-1) + return -EINVAL; + if ((dl_policy(policy) && !__checkparam_dl(attr)) || + (rt_policy(policy) != (attr->sched_priority != 0))) + return -EINVAL; + + if (user) { + retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork); + if (retval) + return retval; + + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return -EINVAL; + + retval = security_task_setscheduler(p); + if (retval) + return retval; + } + + /* Update task specific "requested" clamps */ + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { + retval = uclamp_validate(p, attr); + if (retval) + return retval; + } + + /* + * SCHED_DEADLINE bandwidth accounting relies on stable cpusets + * information. + */ + if (dl_policy(policy) || dl_policy(p->policy)) { + cpuset_locked = true; + cpuset_lock(); + } + + /* + * Make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + * + * To be able to change p->policy safely, the appropriate + * runqueue lock must be held. + */ + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + + /* + * Changing the policy of the stop threads its a very bad idea: + */ + if (p == rq->stop) { + retval = -EINVAL; + goto unlock; + } + + retval = scx_check_setscheduler(p, policy); + if (retval) + goto unlock; + + /* + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork. + */ + if (unlikely(policy == p->policy)) { + if (fair_policy(policy) && + (attr->sched_nice != task_nice(p) || + (attr->sched_runtime != p->se.slice))) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; + if (dl_policy(policy) && dl_param_changed(p, attr)) + goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; + + p->sched_reset_on_fork = reset_on_fork; + retval = 0; + goto unlock; + } +change: + + if (user) { +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Do not allow real-time tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { + retval = -EPERM; + goto unlock; + } +#endif +#ifdef CONFIG_SMP + if (dl_bandwidth_enabled() && dl_policy(policy) && + !(attr->sched_flags & SCHED_FLAG_SUGOV)) { + cpumask_t *span = rq->rd->span; + + /* + * Don't allow tasks with an affinity mask smaller than + * the entire root_domain to become SCHED_DEADLINE. We + * will also fail if there's no bandwidth available. + */ + if (!cpumask_subset(span, p->cpus_ptr) || + rq->rd->dl_bw.bw == 0) { + retval = -EPERM; + goto unlock; + } + } +#endif + } + + /* Re-check policy now with rq lock held: */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, p, &rf); + if (cpuset_locked) + cpuset_unlock(); + goto recheck; + } + + /* + * If setscheduling to SCHED_DEADLINE (or changing the parameters + * of a SCHED_DEADLINE task) we need to check if enough bandwidth + * is available. + */ + if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { + retval = -EBUSY; + goto unlock; + } + + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice); + if (pi) { + /* + * Take priority boosted tasks into account. If the new + * effective priority is unchanged, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + newprio = rt_effective_prio(p, newprio); + if (newprio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; + } + + prev_class = p->sched_class; + next_class = __setscheduler_class(policy, newprio); + + if (prev_class != next_class && p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); + + queued = task_on_rq_queued(p); + running = task_current_donor(rq, p); + if (queued) + dequeue_task(rq, p, queue_flags); + if (running) + put_prev_task(rq, p); + + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + p->sched_class = next_class; + p->prio = newprio; + } + __setscheduler_uclamp(p, attr); + check_class_changing(rq, p, prev_class); + + if (queued) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; + + enqueue_task(rq, p, queue_flags); + } + if (running) + set_next_task(rq, p); + + check_class_changed(rq, p, prev_class, oldprio); + + /* Avoid rq from going away on us: */ + preempt_disable(); + head = splice_balance_callbacks(rq); + task_rq_unlock(rq, p, &rf); + + if (pi) { + if (cpuset_locked) + cpuset_unlock(); + rt_mutex_adjust_pi(p); + } + + /* Run balance callbacks after we've adjusted the PI chain: */ + balance_callbacks(rq, head); + preempt_enable(); + + return 0; + +unlock: + task_rq_unlock(rq, p, &rf); + if (cpuset_locked) + cpuset_unlock(); + return retval; +} + +static int _sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool check) +{ + struct sched_attr attr = { + .sched_policy = policy, + .sched_priority = param->sched_priority, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + + if (p->se.custom_slice) + attr.sched_runtime = p->se.slice; + + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + policy &= ~SCHED_RESET_ON_FORK; + attr.sched_policy = policy; + } + + return __sched_setscheduler(p, &attr, check, true); +} +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Use sched_set_fifo(), read its comment. + * + * Return: 0 on success. An error code otherwise. + * + * NOTE that the task may be already dead. + */ +int sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param) +{ + return _sched_setscheduler(p, policy, param, true); +} + +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, true, true); +} + +int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, false, true); +} +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); + +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. + * + * Return: 0 on success. An error code otherwise. + */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + const struct sched_param *param) +{ + return _sched_setscheduler(p, policy, param, false); +} + +/* + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally + * incapable of resource management, which is the one thing an OS really should + * be doing. + * + * This is of course the reason it is limited to privileged users only. + * + * Worse still; it is fundamentally impossible to compose static priority + * workloads. You cannot take two correctly working static prio workloads + * and smash them together and still expect them to work. + * + * For this reason 'all' FIFO tasks the kernel creates are basically at: + * + * MAX_RT_PRIO / 2 + * + * The administrator _MUST_ configure the system, the kernel simply doesn't + * know enough information to make a sensible choice. + */ +void sched_set_fifo(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo); + +/* + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. + */ +void sched_set_fifo_low(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo_low); + +void sched_set_normal(struct task_struct *p, int nice) +{ + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + .sched_nice = nice, + }; + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_normal); + +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + struct sched_param lparam; + + if (!param || pid < 0) + return -EINVAL; + if (copy_from_user(&lparam, param, sizeof(struct sched_param))) + return -EFAULT; + + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; + + return sched_setscheduler(p, policy, &lparam); +} + +/* + * Mimics kernel/events/core.c perf_copy_attr(). + */ +static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) +{ + u32 size; + int ret; + + /* Zero the full structure, so that a short copy will be nice: */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + /* ABI compatibility quirk: */ + if (!size) + size = SCHED_ATTR_SIZE_VER0; + if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) + goto err_size; + + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; + } + + if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + + /* + * XXX: Do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? + */ + attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); + + return 0; + +err_size: + put_user(sizeof(*attr), &uattr->size); + return -E2BIG; +} + +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ + if (task_has_dl_policy(p)) { + __getparam_dl(p, attr); + } else if (task_has_rt_policy(p)) { + attr->sched_priority = p->rt_priority; + } else { + attr->sched_nice = task_nice(p); + attr->sched_runtime = p->se.slice; + } +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) +{ + if (policy < 0) + return -EINVAL; + + return do_sched_setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) +{ + return do_sched_setscheduler(pid, SETPARAM_POLICY, param); +} + +/** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @flags: for future extension. + */ +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) +{ + struct sched_attr attr; + int retval; + + if (!uattr || pid < 0 || flags) + return -EINVAL; + + retval = sched_copy_attr(uattr, &attr); + if (retval) + return retval; + + if ((int)attr.sched_policy < 0) + return -EINVAL; + if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) + attr.sched_policy = SETPARAM_POLICY; + + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; + + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); + + return sched_setattr(p, &attr); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + * + * Return: On success, the policy of the thread. Otherwise, a negative error + * code. + */ +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) +{ + struct task_struct *p; + int retval; + + if (pid < 0) + return -EINVAL; + + guard(rcu)(); + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (!retval) { + retval = p->policy; + if (p->sched_reset_on_fork) + retval |= SCHED_RESET_ON_FORK; + } + return retval; +} + +/** + * sys_sched_getparam - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + * + * Return: On success, 0 and the RT priority is in @param. Otherwise, an error + * code. + */ +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) +{ + struct sched_param lp = { .sched_priority = 0 }; + struct task_struct *p; + int retval; + + if (!param || pid < 0) + return -EINVAL; + + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; + } + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @usize: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. + */ +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, usize, unsigned int, flags) +{ + struct sched_attr kattr = { }; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || usize > PAGE_SIZE || + usize < SCHED_ATTR_SIZE_VER0 || flags) + return -EINVAL; + + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + kattr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; + +#ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ + kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; +#endif + } + + kattr.size = min(usize, sizeof(kattr)); + return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); +} + +#ifdef CONFIG_SMP +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) +{ + /* + * If the task isn't a deadline task or admission control is + * disabled then we don't care about affinity changes. + */ + if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) + return 0; + + /* + * The special/sugov task isn't part of regular bandwidth/admission + * control so let userspace change affinities. + */ + if (dl_entity_is_special(&p->dl)) + return 0; + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ + guard(rcu)(); + if (!cpumask_subset(task_rq(p)->rd->span, mask)) + return -EBUSY; + + return 0; +} +#endif /* CONFIG_SMP */ + +int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) +{ + int retval; + cpumask_var_t cpus_allowed, new_mask; + + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) + return -ENOMEM; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, ctx->new_mask, cpus_allowed); + + ctx->new_mask = new_mask; + ctx->flags |= SCA_CHECK; + + retval = dl_task_check_affinity(p, new_mask); + if (retval) + goto out_free_new_mask; + + retval = __set_cpus_allowed_ptr(p, ctx); + if (retval) + goto out_free_new_mask; + + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset update. + * Just reset the cpumask to the cpuset's cpus_allowed. + */ + cpumask_copy(new_mask, cpus_allowed); + + /* + * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr() + * will restore the previous user_cpus_ptr value. + * + * In the unlikely event a previous user_cpus_ptr exists, + * we need to further restrict the mask to what is allowed + * by that old user_cpus_ptr. + */ + if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) { + bool empty = !cpumask_and(new_mask, new_mask, + ctx->user_mask); + + if (empty) + cpumask_copy(new_mask, cpus_allowed); + } + __set_cpus_allowed_ptr(p, ctx); + retval = -EINVAL; + } + +out_free_new_mask: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + struct affinity_context ac; + struct cpumask *user_mask; + int retval; + + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; + + if (p->flags & PF_NO_SETAFFINITY) + return -EINVAL; + + if (!check_same_owner(p)) { + guard(rcu)(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) + return -EPERM; + } + + retval = security_task_setscheduler(p); + if (retval) + return retval; + + /* + * With non-SMP configs, user_cpus_ptr/user_mask isn't used and + * alloc_user_cpus_ptr() returns NULL. + */ + user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); + if (user_mask) { + cpumask_copy(user_mask, in_mask); + } else if (IS_ENABLED(CONFIG_SMP)) { + return -ENOMEM; + } + + ac = (struct affinity_context){ + .new_mask = in_mask, + .user_mask = user_mask, + .flags = SCA_USER, + }; + + retval = __sched_setaffinity(p, &ac); + kfree(ac.user_mask); + + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + struct cpumask *new_mask) +{ + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); + + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the CPU affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new CPU mask + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + cpumask_var_t new_mask; + int retval; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; +} + +long sched_getaffinity(pid_t pid, struct cpumask *mask) +{ + struct task_struct *p; + int retval; + + guard(rcu)(); + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + guard(raw_spinlock_irqsave)(&p->pi_lock); + cpumask_and(mask, &p->cpus_mask, cpu_active_mask); + + return 0; +} + +/** + * sys_sched_getaffinity - get the CPU affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current CPU mask + * + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise. + */ +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + int ret; + cpumask_var_t mask; + + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) + return -EINVAL; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + unsigned int retlen = min(len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen)) + ret = -EFAULT; + else + ret = retlen; + } + free_cpumask_var(mask); + + return ret; +} + +static void do_sched_yield(void) +{ + struct rq_flags rf; + struct rq *rq; + + rq = this_rq_lock_irq(&rf); + + schedstat_inc(rq->yld_count); + current->sched_class->yield_task(rq); + + preempt_disable(); + rq_unlock_irq(rq, &rf); + sched_preempt_enable_no_resched(); + + schedule(); +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. + * + * Return: 0. + */ +SYSCALL_DEFINE0(sched_yield) +{ + do_sched_yield(); + return 0; +} + +/** + * yield - yield the current processor to other threads. + * + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, it's already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + do_sched_yield(); +} +EXPORT_SYMBOL(yield); + +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Return: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. + */ +int __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *curr = current; + struct rq *rq, *p_rq; + int yielded = 0; + + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { + rq = this_rq(); + +again: + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) + return -ESRCH; + + guard(double_rq_lock)(rq, p_rq); + if (task_rq(p) != p_rq) + goto again; + + if (!curr->sched_class->yield_to_task) + return 0; + + if (curr->sched_class != p->sched_class) + return 0; + + if (task_on_cpu(p_rq, p) || !task_is_running(p)) + return 0; + + yielded = curr->sched_class->yield_to_task(rq, p); + if (yielded) { + schedstat_inc(rq->yld_count); + /* + * Make p's CPU reschedule; pick_next_entity + * takes care of fairness. + */ + if (preempt && rq != p_rq) + resched_curr(p_rq); + } + } + + if (yielded) + schedule(); + + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * Return: On success, this syscall returns the maximum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. + */ +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_RT_PRIO-1; + break; + case SCHED_DEADLINE: + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + case SCHED_EXT: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * Return: On success, this syscall returns the minimum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. + */ +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_DEADLINE: + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + case SCHED_EXT: + ret = 0; + } + return ret; +} + +static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) +{ + unsigned int time_slice = 0; + int retval; + + if (pid < 0) + return -EINVAL; + + scoped_guard (rcu) { + struct task_struct *p = find_process_by_pid(pid); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (retval) + return retval; + + scoped_guard (task_rq_lock, p) { + struct rq *rq = scope.rq; + if (p->sched_class->get_rr_interval) + time_slice = p->sched_class->get_rr_interval(rq, p); + } + } + + jiffies_to_timespec64(time_slice, t); + return 0; +} + +/** + * sys_sched_rr_get_interval - return the default time-slice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the time-slice value. + * + * this syscall writes the default time-slice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the time-slice is in @interval. Otherwise, + * an error code. + */ +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct __kernel_timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = put_timespec64(&t, interval); + + return retval; +} + +#ifdef CONFIG_COMPAT_32BIT_TIME +SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, + struct old_timespec32 __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = put_old_timespec32(&t, interval); + return retval; +} +#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ba81187bb7af..c49aea8c1025 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2,7 +2,8 @@ /* * Scheduler topology setup/handling methods */ -#include "sched.h" + +#include <linux/bsearch.h> DEFINE_MUTEX(sched_domains_mutex); @@ -14,21 +15,29 @@ static cpumask_var_t sched_domains_tmpmask2; static int __init sched_debug_setup(char *str) { - sched_debug_enabled = true; + sched_debug_verbose = true; return 0; } -early_param("sched_debug", sched_debug_setup); +early_param("sched_verbose", sched_debug_setup); static inline bool sched_debug(void) { - return sched_debug_enabled; + return sched_debug_verbose; } +#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, +const struct sd_flag_debug sd_flag_debug[] = { +#include <linux/sched/sd_flags.h> +}; +#undef SD_FLAG + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { struct sched_group *group = sd->groups; + unsigned long flags = sd->flags; + unsigned int idx; cpumask_clear(groupmask); @@ -43,6 +52,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + unsigned int flag = BIT(idx); + unsigned int meta_flags = sd_flag_debug[idx].meta_flags; + + if ((meta_flags & SDF_SHARED_CHILD) && sd->child && + !(sd->child->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in child\n", + sd_flag_debug[idx].name); + + if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && + !(sd->parent->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", + sd_flag_debug[idx].name); + } + printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { @@ -51,7 +75,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpumask_weight(sched_group_span(group))) { + if (cpumask_empty(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; @@ -108,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_debug_enabled) + if (!sched_debug_verbose) return; if (!sd) { @@ -129,7 +153,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ -# define sched_debug_enabled 0 +# define sched_debug_verbose 0 # define sched_domain_debug(sd, cpu) do { } while (0) static inline bool sched_debug(void) { @@ -137,22 +161,22 @@ static inline bool sched_debug(void) } #endif /* CONFIG_SCHED_DEBUG */ +/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | +static const unsigned int SD_DEGENERATE_GROUPS_MASK = +#include <linux/sched/sd_flags.h> +0; +#undef SD_FLAG + static int sd_degenerate(struct sched_domain *sd) { if (cpumask_weight(sched_domain_span(sd)) == 1) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUCAPACITY | - SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { - if (sd->groups != sd->groups->next) - return 0; - } + if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && + (sd->groups != sd->groups->next)) + return 0; /* Following flags don't use groups */ if (sd->flags & (SD_WAKE_AFFINE)) @@ -173,18 +197,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 0; /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { - pflags &= ~(SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } + if (parent->groups == parent->groups->next) + pflags &= ~SD_DEGENERATE_GROUPS_MASK; + if (~cflags & pflags) return 0; @@ -193,12 +208,84 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_STATIC_KEY_FALSE(sched_energy_present); -unsigned int sysctl_sched_energy_aware = 1; -DEFINE_MUTEX(sched_energy_mutex); -bool sched_energy_update; +static unsigned int sysctl_sched_energy_aware = 1; +static DEFINE_MUTEX(sched_energy_mutex); +static bool sched_energy_update; + +static bool sched_is_eas_possible(const struct cpumask *cpu_mask) +{ + bool any_asym_capacity = false; + struct cpufreq_policy *policy; + struct cpufreq_governor *gov; + int i; + + /* EAS is enabled for asymmetric CPU capacity topologies. */ + for_each_cpu(i, cpu_mask) { + if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) { + any_asym_capacity = true; + break; + } + } + if (!any_asym_capacity) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + /* EAS definitely does *not* handle SMT */ + if (sched_smt_active()) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, SMT is not supported\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + if (!arch_scale_freq_invariant()) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + /* Do not attempt EAS if schedutil is not being used. */ + for_each_cpu(i, cpu_mask) { + policy = cpufreq_cpu_get(i); + if (!policy) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d", + cpumask_pr_args(cpu_mask), i); + } + return false; + } + gov = policy->governor; + cpufreq_cpu_put(policy); + if (gov != &schedutil_gov) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + } + + return true; +} + +void rebuild_sched_domains_energy(void) +{ + mutex_lock(&sched_energy_mutex); + sched_energy_update = true; + rebuild_sched_domains(); + sched_energy_update = false; + mutex_unlock(&sched_energy_mutex); +} #ifdef CONFIG_PROC_SYSCTL -int sched_energy_aware_handler(struct ctl_table *table, int write, +static int sched_energy_aware_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; @@ -206,20 +293,44 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (!sched_is_eas_possible(cpu_active_mask)) { + if (write) { + return -EOPNOTSUPP; + } else { + *lenp = 0; + return 0; + } + } + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); - if (state != sysctl_sched_energy_aware) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = 1; - rebuild_sched_domains(); - sched_energy_update = 0; - mutex_unlock(&sched_energy_mutex); - } + if (state != sysctl_sched_energy_aware) + rebuild_sched_domains_energy(); } return ret; } + +static const struct ctl_table sched_energy_aware_sysctls[] = { + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init sched_energy_aware_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_energy_aware_sysctls); + return 0; +} + +late_initcall(sched_energy_aware_sysctl_init); #endif static void free_pd(struct perf_domain *pd) @@ -272,10 +383,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map, printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); while (pd) { - printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }", + printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", cpumask_first(perf_domain_span(pd)), cpumask_pr_args(perf_domain_span(pd)), - em_pd_nr_cap_states(pd->em_pd)); + em_pd_nr_perf_states(pd->em_pd)); pd = pd->next; } @@ -308,94 +419,33 @@ static void sched_energy_set(bool has_eas) * 1. an Energy Model (EM) is available; * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. * 3. no SMT is detected. - * 4. the EM complexity is low enough to keep scheduling overheads low; - * 5. schedutil is driving the frequency of all CPUs of the rd; - * - * The complexity of the Energy Model is defined as: - * - * C = nr_pd * (nr_cpus + nr_cs) - * - * with parameters defined as: - * - nr_pd: the number of performance domains - * - nr_cpus: the number of CPUs - * - nr_cs: the sum of the number of capacity states of all performance - * domains (for example, on a system with 2 performance domains, - * with 10 capacity states each, nr_cs = 2 * 10 = 20). - * - * It is generally not a good idea to use such a model in the wake-up path on - * very complex platforms because of the associated scheduling overheads. The - * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs - * with per-CPU DVFS and less than 8 capacity states each, for example. + * 4. schedutil is driving the frequency of all CPUs of the rd; + * 5. frequency invariance support is present; */ -#define EM_MAX_COMPLEXITY 2048 - -extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { - int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); + int i; struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; - struct cpufreq_policy *policy; - struct cpufreq_governor *gov; if (!sysctl_sched_energy_aware) goto free; - /* EAS is enabled for asymmetric CPU capacity topologies. */ - if (!per_cpu(sd_asym_cpucapacity, cpu)) { - if (sched_debug()) { - pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", - cpumask_pr_args(cpu_map)); - } + if (!sched_is_eas_possible(cpu_map)) goto free; - } - - /* EAS definitely does *not* handle SMT */ - if (sched_smt_active()) { - pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", - cpumask_pr_args(cpu_map)); - goto free; - } for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ if (find_pd(pd, i)) continue; - /* Do not attempt EAS if schedutil is not being used. */ - policy = cpufreq_cpu_get(i); - if (!policy) - goto free; - gov = policy->governor; - cpufreq_cpu_put(policy); - if (gov != &schedutil_gov) { - if (rd->pd) - pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", - cpumask_pr_args(cpu_map)); - goto free; - } - /* Create the new pd and add it to the local list. */ tmp = pd_init(i); if (!tmp) goto free; tmp->next = pd; pd = tmp; - - /* - * Count performance domains and capacity states for the - * complexity check. - */ - nr_pd++; - nr_cs += em_pd_nr_cap_states(pd->em_pd); - } - - /* Bail out if the Energy Model complexity is too high. */ - if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) { - WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", - cpumask_pr_args(cpu_map)); - goto free; } perf_domain_debug(cpu_map, pd); @@ -438,9 +488,9 @@ static void free_rootdomain(struct rcu_head *rcu) void rq_attach_root(struct rq *rq, struct root_domain *rd) { struct root_domain *old_rd = NULL; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); if (rq->rd) { old_rd = rq->rd; @@ -451,7 +501,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_clear_cpu(rq->cpu, old_rd->span); /* - * If we dont want to free the old_rd yet then + * If we don't want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function: */ @@ -466,7 +516,15 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + /* + * Because the rq is not a task, dl_add_task_root_domain() did not + * move the fair server bw to the rd if it already started. + * Add it now. + */ + if (rq->fair_server.dl_server) + __dl_server_attach_root(&rq->fair_server, rq); + + rq_unlock_irqrestore(rq, &rf); if (old_rd) call_rcu(&old_rd->rcu, free_rootdomain); @@ -499,9 +557,10 @@ static int init_rootdomain(struct root_domain *rd) #ifdef HAVE_RT_PUSH_IPI rd->rto_cpu = -1; raw_spin_lock_init(&rd->rto_lock); - init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); + rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif + rd->visit_gen = 0; init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; @@ -530,7 +589,7 @@ out: */ struct root_domain def_root_domain; -void init_defrootdomain(void) +void __init init_defrootdomain(void) { init_rootdomain(&def_root_domain); @@ -605,22 +664,25 @@ static void destroy_sched_domains(struct sched_domain *sd) } /* - * Keep a special pointer to the highest sched_domain that has - * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this - * allows us to avoid some pointer chasing select_idle_sibling(). + * Keep a special pointer to the highest sched_domain that has SD_SHARE_LLC set + * (Last Level Cache Domain) for this allows us to avoid some pointer chasing + * select_idle_sibling(). * - * Also keep a unique ID per domain (we use the first CPU number in - * the cpumask of the domain), this allows us to quickly tell if - * two CPUs are in the same cache domain, see cpus_share_cache(). + * Also keep a unique ID per domain (we use the first CPU number in the cpumask + * of the domain), this allows us to quickly tell if two CPUs are in the same + * cache domain, see cpus_share_cache(). */ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); static void update_top_cache_domain(int cpu) { @@ -629,7 +691,7 @@ static void update_top_cache_domain(int cpu) int id = cpu; int size = 1; - sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); + sd = highest_flag_domain(cpu, SD_SHARE_LLC); if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); @@ -641,13 +703,24 @@ static void update_top_cache_domain(int cpu) per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + + /* + * This assignment should be placed after the sd_llc_id as + * we want this id equals to cluster id on cluster machines + * but equals to LLC id on non-Cluster machines. + */ + per_cpu(sd_share_id, cpu) = id; + sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); - sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY); + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); } @@ -669,8 +742,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; - if (parent->parent) + + if (parent->parent) { parent->parent->child = tmp; + parent->parent->groups->flags = tmp->flags; + } + /* * Transfer SD_PREFER_SIBLING down in case of a * degenerate parent; the spans match for this @@ -687,8 +764,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp = sd; sd = sd->parent; destroy_sched_domain(tmp); - if (sd) + if (sd) { + struct sched_group *sg = sd->groups; + + /* + * sched groups hold the flags of the child sched + * domain for convenience. Clear such flags since + * the child is being destroyed. + */ + do { + sg->flags = 0; + } while (sg != sd->groups); + sd->child = NULL; + } } sched_domain_debug(sd, cpu); @@ -884,10 +973,12 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) return NULL; sg_span = sched_group_span(sg); - if (sd->child) + if (sd->child) { cpumask_copy(sg_span, sched_domain_span(sd->child)); - else + sg->flags = sd->child->flags; + } else { cpumask_copy(sg_span, sched_domain_span(sd)); + } atomic_inc(&sg->ref); return sg; @@ -902,7 +993,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, int cpu; build_balance_mask(sd, sg, mask); - cpu = cpumask_first_and(sched_group_span(sg), mask); + cpu = cpumask_first(mask); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); if (atomic_inc_return(&sg->sgc->ref) == 1) @@ -921,6 +1012,31 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; } +static struct sched_domain * +find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling) +{ + /* + * The proper descendant would be the one whose child won't span out + * of sd + */ + while (sibling->child && + !cpumask_subset(sched_domain_span(sibling->child), + sched_domain_span(sd))) + sibling = sibling->child; + + /* + * As we are referencing sgc across different topology level, we need + * to go down to skip those sched_domains which don't contribute to + * scheduling because they will be degenerated in cpu_attach_domain + */ + while (sibling->child && + cpumask_equal(sched_domain_span(sibling->child), + sched_domain_span(sibling))) + sibling = sibling->child; + + return sibling; +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -954,6 +1070,41 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; + /* + * Usually we build sched_group by sibling's child sched_domain + * But for machines whose NUMA diameter are 3 or above, we move + * to build sched_group by sibling's proper descendant's child + * domain because sibling's child sched_domain will span out of + * the sched_domain being built as below. + * + * Smallest diameter=3 topology is: + * + * node 0 1 2 3 + * 0: 10 20 30 40 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 40 30 20 10 + * + * 0 --- 1 --- 2 --- 3 + * + * NUMA-3 0-3 N/A N/A 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-2 0-2 0-3 0-3 1-3 + * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} + * + * NUMA-1 0-1 0-2 1-3 2-3 + * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} + * + * NUMA-0 0 1 2 3 + * + * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the + * group span isn't a subset of the domain span. + */ + if (sibling->child && + !cpumask_subset(sched_domain_span(sibling->child), span)) + sibling = find_descended_sibling(sd, sibling); + sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; @@ -961,7 +1112,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); - init_overlap_sched_group(sd, sg); + init_overlap_sched_group(sibling, sg); if (!first) first = sg; @@ -989,7 +1140,7 @@ fail: * * - Simultaneous multithreading (SMT) * - Multi-Core Cache (MC) - * - Package (DIE) + * - Package (PKG) * * Where the last one more or less denotes everything up to a NUMA node. * @@ -1011,13 +1162,13 @@ fail: * * CPU 0 1 2 3 4 5 6 7 * - * DIE [ ] + * PKG [ ] * MC [ ] [ ] * SMT [ ] [ ] [ ] [ ] * * - or - * - * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 + * PKG 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 * @@ -1033,7 +1184,7 @@ fail: * uniquely identify each group (for a given domain): * * - The first is the balance_cpu (see should_we_balance() and the - * load-balance blub in fair.c); for each group we only want 1 CPU to + * load-balance blurb in fair.c); for each group we only want 1 CPU to * continue balancing at a higher domain. * * - The second is the sched_group_capacity; we want all identical groups @@ -1077,6 +1228,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); + sg->flags = child->flags; } else { cpumask_set_cpu(cpu, sched_group_span(sg)); cpumask_set_cpu(cpu, group_balance_mask(sg)); @@ -1145,14 +1297,24 @@ build_sched_groups(struct sched_domain *sd, int cpu) static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; + struct cpumask *mask = sched_domains_tmpmask2; WARN_ON(!sg); do { - int cpu, max_cpu = -1; + int cpu, cores = 0, max_cpu = -1; sg->group_weight = cpumask_weight(sched_group_span(sg)); + cpumask_copy(mask, sched_group_span(sg)); + for_each_cpu(cpu, mask) { + cores++; +#ifdef CONFIG_SCHED_SMT + cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); +#endif + } + sg->cores = cores; + if (!(sd->flags & SD_ASYM_PACKING)) goto next; @@ -1175,6 +1337,123 @@ next: } /* + * Set of available CPUs grouped by their corresponding capacities + * Each list entry contains a CPU mask reflecting CPUs that share the same + * capacity. + * The lifespan of data is unlimited. + */ +LIST_HEAD(asym_cap_list); + +/* + * Verify whether there is any CPU capacity asymmetry in a given sched domain. + * Provides sd_flags reflecting the asymmetry scope. + */ +static inline int +asym_cpu_capacity_classify(const struct cpumask *sd_span, + const struct cpumask *cpu_map) +{ + struct asym_cap_data *entry; + int count = 0, miss = 0; + + /* + * Count how many unique CPU capacities this domain spans across + * (compare sched_domain CPUs mask with ones representing available + * CPUs capacities). Take into account CPUs that might be offline: + * skip those. + */ + list_for_each_entry(entry, &asym_cap_list, link) { + if (cpumask_intersects(sd_span, cpu_capacity_span(entry))) + ++count; + else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry))) + ++miss; + } + + WARN_ON_ONCE(!count && !list_empty(&asym_cap_list)); + + /* No asymmetry detected */ + if (count < 2) + return 0; + /* Some of the available CPU capacity values have not been detected */ + if (miss) + return SD_ASYM_CPUCAPACITY; + + /* Full asymmetry */ + return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL; + +} + +static void free_asym_cap_entry(struct rcu_head *head) +{ + struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu); + kfree(entry); +} + +static inline void asym_cpu_capacity_update_data(int cpu) +{ + unsigned long capacity = arch_scale_cpu_capacity(cpu); + struct asym_cap_data *insert_entry = NULL; + struct asym_cap_data *entry; + + /* + * Search if capacity already exits. If not, track which the entry + * where we should insert to keep the list ordered descending. + */ + list_for_each_entry(entry, &asym_cap_list, link) { + if (capacity == entry->capacity) + goto done; + else if (!insert_entry && capacity > entry->capacity) + insert_entry = list_prev_entry(entry, link); + } + + entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL); + if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n")) + return; + entry->capacity = capacity; + + /* If NULL then the new capacity is the smallest, add last. */ + if (!insert_entry) + list_add_tail_rcu(&entry->link, &asym_cap_list); + else + list_add_rcu(&entry->link, &insert_entry->link); +done: + __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); +} + +/* + * Build-up/update list of CPUs grouped by their capacities + * An update requires explicit request to rebuild sched domains + * with state indicating CPU topology changes. + */ +static void asym_cpu_capacity_scan(void) +{ + struct asym_cap_data *entry, *next; + int cpu; + + list_for_each_entry(entry, &asym_cap_list, link) + cpumask_clear(cpu_capacity_span(entry)); + + for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) + asym_cpu_capacity_update_data(cpu); + + list_for_each_entry_safe(entry, next, &asym_cap_list, link) { + if (cpumask_empty(cpu_capacity_span(entry))) { + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); + } + } + + /* + * Only one capacity value has been detected i.e. this system is symmetric. + * No need to keep this data around. + */ + if (list_is_singular(&asym_cap_list)) { + entry = list_first_entry(&asym_cap_list, typeof(*entry), link); + list_del_rcu(&entry->link); + call_rcu(&entry->rcu, free_asym_cap_entry); + } +} + +/* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ @@ -1203,7 +1482,7 @@ static void set_domain_attribute(struct sched_domain *sd, } else request = attr->relax_domain_level; - if (sd->level > request) { + if (sd->level >= request) { /* Turn off idle balance on this domain: */ sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } @@ -1219,13 +1498,13 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, case sa_rootdomain: if (!atomic_read(&d->rd->refcount)) free_rootdomain(&d->rd->rcu); - /* Fall through */ + fallthrough; case sa_sd: free_percpu(d->sd); - /* Fall through */ + fallthrough; case sa_sd_storage: __sdt_free(cpu_map); - /* Fall through */ + fallthrough; case sa_none: break; } @@ -1279,7 +1558,6 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; -int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; #endif /* @@ -1287,12 +1565,12 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; * * These flags are purely descriptive of the topology and do not prescribe * behaviour. Behaviour is artificial and mapped in the below sd_init() - * function: + * function. For details, see include/linux/sched/sd_flags.h. * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_SHARE_CPUCAPACITY + * SD_SHARE_LLC + * SD_CLUSTER + * SD_NUMA * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1301,19 +1579,20 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ - SD_SHARE_PKG_RESOURCES | \ + SD_CLUSTER | \ + SD_SHARE_LLC | \ SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) + SD_ASYM_PACKING) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, - struct sched_domain *child, int dflags, int cpu) + struct sched_domain *child, int cpu) { struct sd_data *sdd = &tl->data; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); int sd_id, sd_weight, sd_flags = 0; + struct cpumask *sd_span; #ifdef CONFIG_NUMA /* @@ -1328,16 +1607,13 @@ sd_init(struct sched_domain_topology_level *tl, sd_flags = (*tl->sd_flags)(); if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; - - /* Apply detected topology flags */ - sd_flags |= dflags; + sd_flags &= TOPOLOGY_SD_FLAGS; *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, - .busy_factor = 32, - .imbalance_pct = 125, + .busy_factor = 16, + .imbalance_pct = 117, .cache_nice_tries = 0, @@ -1347,7 +1623,7 @@ sd_init(struct sched_domain_topology_level *tl, | 0*SD_BALANCE_WAKE | 1*SD_WAKE_AFFINE | 0*SD_SHARE_CPUCAPACITY - | 0*SD_SHARE_PKG_RESOURCES + | 0*SD_SHARE_LLC | 0*SD_SERIALIZE | 1*SD_PREFER_SIBLING | 0*SD_NUMA @@ -1357,20 +1633,24 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, .max_newidle_lb_cost = 0, - .next_decay_max_lb_cost = jiffies, + .last_decay_max_lb_cost = jiffies, .child = child, -#ifdef CONFIG_SCHED_DEBUG .name = tl->name, -#endif }; - cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - sd_id = cpumask_first(sched_domain_span(sd)); + sd_span = sched_domain_span(sd); + cpumask_and(sd_span, cpu_map, tl->mask(cpu)); + sd_id = cpumask_first(sd_span); + + sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); + + WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) == + (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), + "CPU capacity asymmetry not supported on SMT\n"); /* * Convert topological properties into behaviour. */ - /* Don't attempt to spread across CPUs of different capacities. */ if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) sd->child->flags &= ~SD_PREFER_SIBLING; @@ -1378,7 +1658,7 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->imbalance_pct = 110; - } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + } else if (sd->flags & SD_SHARE_LLC) { sd->imbalance_pct = 117; sd->cache_nice_tries = 1; @@ -1403,7 +1683,7 @@ sd_init(struct sched_domain_topology_level *tl, * For all levels sharing cache; connect a sched_domain_shared * instance. */ - if (sd->flags & SD_SHARE_PKG_RESOURCES) { + if (sd->flags & SD_SHARE_LLC) { sd->shared = *per_cpu_ptr(sdd->sds, sd_id); atomic_inc(&sd->shared->ref); atomic_set(&sd->shared->nr_busy_cpus, sd_weight); @@ -1421,25 +1701,32 @@ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif + +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, +#endif + #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, { NULL, }, }; static struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology_saved; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) -void set_sched_topology(struct sched_domain_topology_level *tl) +void __init set_sched_topology(struct sched_domain_topology_level *tl) { if (WARN_ON_ONCE(sched_smp_initialized)) return; sched_domain_topology = tl; + sched_domain_topology_saved = NULL; } #ifdef CONFIG_NUMA @@ -1463,8 +1750,12 @@ static void sched_numa_warn(const char *str) for (i = 0; i < nr_node_ids; i++) { printk(KERN_WARNING " "); - for (j = 0; j < nr_node_ids; j++) - printk(KERN_CONT "%02d ", node_distance(i,j)); + for (j = 0; j < nr_node_ids; j++) { + if (!node_state(i, N_CPU) || !node_state(j, N_CPU)) + printk(KERN_CONT "(%02d) ", node_distance(i,j)); + else + printk(KERN_CONT " %02d ", node_distance(i,j)); + } printk(KERN_CONT "\n"); } printk(KERN_WARNING "\n"); @@ -1472,19 +1763,34 @@ static void sched_numa_warn(const char *str) bool find_numa_distance(int distance) { - int i; + bool found = false; + int i, *distances; if (distance == node_distance(0, 0)) return true; + rcu_read_lock(); + distances = rcu_dereference(sched_domains_numa_distance); + if (!distances) + goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { - if (sched_domains_numa_distance[i] == distance) - return true; + if (distances[i] == distance) { + found = true; + break; + } } +unlock: + rcu_read_unlock(); - return false; + return found; } +#define for_each_cpu_node_but(n, nbut) \ + for_each_node_state(n, N_CPU) \ + if (n == nbut) \ + continue; \ + else + /* * A system can have three types of NUMA topology: * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system @@ -1504,7 +1810,7 @@ bool find_numa_distance(int distance) * there is an intermediary node C, which is < N hops away from both * nodes A and B, the system is a glueless mesh. */ -static void init_numa_topology_type(void) +static void init_numa_topology_type(int offline_node) { int a, b, c, n; @@ -1515,14 +1821,14 @@ static void init_numa_topology_type(void) return; } - for_each_online_node(a) { - for_each_online_node(b) { + for_each_cpu_node_but(a, offline_node) { + for_each_cpu_node_but(b, offline_node) { /* Find two nodes furthest removed from each other. */ if (node_distance(a, b) < n) continue; /* Is there an intermediary node between a and b? */ - for_each_online_node(c) { + for_each_cpu_node_but(c, offline_node) { if (node_distance(a, c) < n && node_distance(b, c) < n) { sched_numa_topology_type = @@ -1535,68 +1841,67 @@ static void init_numa_topology_type(void) return; } } + + pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n"); + sched_numa_topology_type = NUMA_DIRECT; } -void sched_init_numa(void) -{ - int next_distance, curr_distance = node_distance(0, 0); - struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); - if (!sched_domains_numa_distance) - return; +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) - /* Includes NUMA identity node at level 0. */ - sched_domains_numa_distance[level++] = curr_distance; - sched_domains_numa_levels = level; +void sched_init_numa(int offline_node) +{ + struct sched_domain_topology_level *tl; + unsigned long *distance_map; + int nr_levels = 0; + int i, j; + int *distances; + struct cpumask ***masks; /* - * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * O(nr_nodes^2) de-duplicating selection sort -- in order to find the * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. */ - next_distance = curr_distance; - for (i = 0; i < nr_node_ids; i++) { - for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); + if (!distance_map) + return; - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); + bitmap_zero(distance_map, NR_DISTANCE_VALUES); + for_each_cpu_node_but(i, offline_node) { + for_each_cpu_node_but(j, offline_node) { + int distance = node_distance(i, j); - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { + sched_numa_warn("Invalid distance value range"); + bitmap_free(distance_map); + return; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + bitmap_set(distance_map, distance, 1); } + } + /* + * We can now figure out how many unique distance values there are and + * allocate memory accordingly. + */ + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; + distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); + if (!distances) { + bitmap_free(distance_map); + return; + } + + for (i = 0, j = 0; i < nr_levels; i++, j++) { + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); + distances[i] = j; } + rcu_assign_pointer(sched_domains_numa_distance, distances); + + bitmap_free(distance_map); /* - * 'level' contains the number of unique distances + * 'nr_levels' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers. @@ -1605,36 +1910,40 @@ void sched_init_numa(void) /* * Here, we should temporarily reset sched_domains_numa_levels to 0. * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be + * the array will contain less then 'nr_levels' members. This could be * dangerous when we use it to iterate array sched_domains_numa_masks[][] * in other functions. * - * We reset it to 'level' at the end of this function. + * We reset it to 'nr_levels' at the end of this function. */ sched_domains_numa_levels = 0; - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); - if (!sched_domains_numa_masks) + masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); + if (!masks) return; /* * Now for each level, construct a mask per node which contains all * CPUs of nodes that are that many hops away from us. */ - for (i = 0; i < level; i++) { - sched_domains_numa_masks[i] = - kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); - if (!sched_domains_numa_masks[i]) + for (i = 0; i < nr_levels; i++) { + masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!masks[i]) return; - for (j = 0; j < nr_node_ids; j++) { + for_each_cpu_node_but(j, offline_node) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + int k; + if (!mask) return; - sched_domains_numa_masks[i][j] = mask; + masks[i][j] = mask; + + for_each_cpu_node_but(k, offline_node) { + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + sched_numa_warn("Node-distance not symmetric"); - for_each_node(k) { if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -1642,11 +1951,12 @@ void sched_init_numa(void) } } } + rcu_assign_pointer(sched_domains_numa_masks, masks); /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level + 1) * + tl = kzalloc((i + nr_levels + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -1669,7 +1979,7 @@ void sched_init_numa(void) /* * .. and append 'j' levels of NUMA goodness. */ - for (j = 1; j < level; i++, j++) { + for (j = 1; j < nr_levels; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, @@ -1679,12 +1989,67 @@ void sched_init_numa(void) }; } + sched_domain_topology_saved = sched_domain_topology; sched_domain_topology = tl; - sched_domains_numa_levels = level; - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + sched_domains_numa_levels = nr_levels; + WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); + + init_numa_topology_type(offline_node); +} + + +static void sched_reset_numa(void) +{ + int nr_levels, *distances; + struct cpumask ***masks; + + nr_levels = sched_domains_numa_levels; + sched_domains_numa_levels = 0; + sched_max_numa_distance = 0; + sched_numa_topology_type = NUMA_DIRECT; + distances = sched_domains_numa_distance; + rcu_assign_pointer(sched_domains_numa_distance, NULL); + masks = sched_domains_numa_masks; + rcu_assign_pointer(sched_domains_numa_masks, NULL); + if (distances || masks) { + int i, j; + + synchronize_rcu(); + kfree(distances); + for (i = 0; i < nr_levels && masks; i++) { + if (!masks[i]) + continue; + for_each_node(j) + kfree(masks[i][j]); + kfree(masks[i]); + } + kfree(masks); + } + if (sched_domain_topology_saved) { + kfree(sched_domain_topology); + sched_domain_topology = sched_domain_topology_saved; + sched_domain_topology_saved = NULL; + } +} + +/* + * Call with hotplug lock held + */ +void sched_update_numa(int cpu, bool online) +{ + int node; + + node = cpu_to_node(cpu); + /* + * Scheduler NUMA topology is updated when the first CPU of a + * node is onlined or the last CPU of a node is offlined. + */ + if (cpumask_weight(cpumask_of_node(node)) != 1) + return; - init_numa_topology_type(); + sched_reset_numa(); + sched_init_numa(online ? NUMA_NO_NODE : node); } void sched_domains_numa_masks_set(unsigned int cpu) @@ -1694,6 +2059,10 @@ void sched_domains_numa_masks_set(unsigned int cpu) for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { + if (!node_state(j, N_CPU)) + continue; + + /* Set ourselves in the remote node's masks */ if (node_distance(j, node) <= sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } @@ -1705,8 +2074,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu) int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { - for (j = 0; j < nr_node_ids; j++) - cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + for (j = 0; j < nr_node_ids; j++) { + if (sched_domains_numa_masks[i][j]) + cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); + } } } @@ -1720,15 +2091,129 @@ void sched_domains_numa_masks_clear(unsigned int cpu) */ int sched_numa_find_closest(const struct cpumask *cpus, int cpu) { - int i, j = cpu_to_node(cpu); + int i, j = cpu_to_node(cpu), found = nr_cpu_ids; + struct cpumask ***masks; + rcu_read_lock(); + masks = rcu_dereference(sched_domains_numa_masks); + if (!masks) + goto unlock; for (i = 0; i < sched_domains_numa_levels; i++) { - cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]); - if (cpu < nr_cpu_ids) - return cpu; + if (!masks[i][j]) + break; + cpu = cpumask_any_and(cpus, masks[i][j]); + if (cpu < nr_cpu_ids) { + found = cpu; + break; + } + } +unlock: + rcu_read_unlock(); + + return found; +} + +struct __cmp_key { + const struct cpumask *cpus; + struct cpumask ***masks; + int node; + int cpu; + int w; +}; + +static int hop_cmp(const void *a, const void *b) +{ + struct cpumask **prev_hop, **cur_hop = *(struct cpumask ***)b; + struct __cmp_key *k = (struct __cmp_key *)a; + + if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu) + return 1; + + if (b == k->masks) { + k->w = 0; + return 0; } - return nr_cpu_ids; + + prev_hop = *((struct cpumask ***)b - 1); + k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]); + if (k->w <= k->cpu) + return 0; + + return -1; +} + +/** + * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU + * from @cpus to @cpu, taking into account distance + * from a given @node. + * @cpus: cpumask to find a cpu from + * @cpu: CPU to start searching + * @node: NUMA node to order CPUs by distance + * + * Return: cpu, or nr_cpu_ids when nothing found. + */ +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) +{ + struct __cmp_key k = { .cpus = cpus, .cpu = cpu }; + struct cpumask ***hop_masks; + int hop, ret = nr_cpu_ids; + + if (node == NUMA_NO_NODE) + return cpumask_nth_and(cpu, cpus, cpu_online_mask); + + rcu_read_lock(); + + /* CPU-less node entries are uninitialized in sched_domains_numa_masks */ + node = numa_nearest_node(node, N_CPU); + k.node = node; + + k.masks = rcu_dereference(sched_domains_numa_masks); + if (!k.masks) + goto unlock; + + hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp); + hop = hop_masks - k.masks; + + ret = hop ? + cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) : + cpumask_nth_and(cpu, cpus, k.masks[0][node]); +unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu); + +/** + * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from + * @node + * @node: The node to count hops from. + * @hops: Include CPUs up to that many hops away. 0 means local node. + * + * Return: On success, a pointer to a cpumask of CPUs at most @hops away from + * @node, an error value otherwise. + * + * Requires rcu_lock to be held. Returned cpumask is only valid within that + * read-side section, copy it if required beyond that. + * + * Note that not all hops are equal in distance; see sched_init_numa() for how + * distances and masks are handled. + * Also note that this is a reflection of sched_domains_numa_masks, which may change + * during the lifetime of the system (offline nodes are taken out of the masks). + */ +const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops) +{ + struct cpumask ***masks; + + if (node >= nr_node_ids || hops >= sched_domains_numa_levels) + return ERR_PTR(-EINVAL); + + masks = rcu_dereference(sched_domains_numa_masks); + if (!masks) + return ERR_PTR(-EBUSY); + + return masks[hops][node]; } +EXPORT_SYMBOL_GPL(sched_numa_hop_mask); #endif /* CONFIG_NUMA */ @@ -1839,9 +2324,9 @@ static void __sdt_free(const struct cpumask *cpu_map) static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int dflags, int cpu) + struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu); + struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); if (child) { sd->level = child->level + 1; @@ -1851,10 +2336,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); -#endif /* Fixup, ensure @sd has at least @child CPUs. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), @@ -1874,7 +2357,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve static bool topology_span_sane(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, int cpu) { - int i; + int i = cpu + 1; /* NUMA levels are allowed to overlap */ if (tl->flags & SDTL_OVERLAP) @@ -1886,9 +2369,7 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl, * breaking the sched_group lists - i.e. a later get_group() pass * breaks the linking done for an earlier span. */ - for_each_cpu(i, cpu_map) { - if (i == cpu) - continue; + for_each_cpu_from(i, cpu_map) { /* * We should 'and' all those masks with 'cpu_map' to exactly * match the topology we're about to build, but that can only @@ -1904,65 +2385,6 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl, } /* - * Find the sched_domain_topology_level where all CPU capacities are visible - * for all CPUs. - */ -static struct sched_domain_topology_level -*asym_cpu_capacity_level(const struct cpumask *cpu_map) -{ - int i, j, asym_level = 0; - bool asym = false; - struct sched_domain_topology_level *tl, *asym_tl = NULL; - unsigned long cap; - - /* Is there any asymmetry? */ - cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); - - for_each_cpu(i, cpu_map) { - if (arch_scale_cpu_capacity(i) != cap) { - asym = true; - break; - } - } - - if (!asym) - return NULL; - - /* - * Examine topology from all CPU's point of views to detect the lowest - * sched_domain_topology_level where a highest capacity CPU is visible - * to everyone. - */ - for_each_cpu(i, cpu_map) { - unsigned long max_capacity = arch_scale_cpu_capacity(i); - int tl_id = 0; - - for_each_sd_topology(tl) { - if (tl_id < asym_level) - goto next_level; - - for_each_cpu_and(j, tl->mask(i), cpu_map) { - unsigned long capacity; - - capacity = arch_scale_cpu_capacity(j); - - if (capacity <= max_capacity) - continue; - - max_capacity = capacity; - asym_level = tl_id; - asym_tl = tl; - } -next_level: - tl_id++; - } - } - - return asym_tl; -} - - -/* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs */ @@ -1974,8 +2396,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; - struct sched_domain_topology_level *tl_asym; bool has_asym = false; + bool has_cluster = false; if (WARN_ON(cpumask_empty(cpu_map))) goto error; @@ -1984,25 +2406,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (alloc_state != sa_rootdomain) goto error; - tl_asym = asym_cpu_capacity_level(cpu_map); - /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; sd = NULL; for_each_sd_topology(tl) { - int dflags = 0; - - if (tl == tl_asym) { - dflags |= SD_ASYM_CPUCAPACITY; - has_asym = true; - } if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) goto error; - sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); + sd = build_sched_domain(tl, cpu_map, attr, sd, i); + + has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; @@ -2027,6 +2443,64 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + /* + * Calculate an allowed NUMA imbalance such that LLCs do not get + * imbalanced. + */ + for_each_cpu(i, cpu_map) { + unsigned int imb = 0; + unsigned int imb_span = 1; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + struct sched_domain *child = sd->child; + + if (!(sd->flags & SD_SHARE_LLC) && child && + (child->flags & SD_SHARE_LLC)) { + struct sched_domain __rcu *top_p; + unsigned int nr_llcs; + + /* + * For a single LLC per node, allow an + * imbalance up to 12.5% of the node. This is + * arbitrary cutoff based two factors -- SMT and + * memory channels. For SMT-2, the intent is to + * avoid premature sharing of HT resources but + * SMT-4 or SMT-8 *may* benefit from a different + * cutoff. For memory channels, this is a very + * rough estimate of how many channels may be + * active and is based on recent CPUs with + * many cores. + * + * For multiple LLCs, allow an imbalance + * until multiple tasks would share an LLC + * on one node while LLCs on another node + * remain idle. This assumes that there are + * enough logical CPUs per LLC to avoid SMT + * factors and that there is a correlation + * between LLCs and memory channels. + */ + nr_llcs = sd->span_weight / child->span_weight; + if (nr_llcs == 1) + imb = sd->span_weight >> 3; + else + imb = nr_llcs; + imb = max(1U, imb); + sd->imb_numa_nr = imb; + + /* Set span based on the first NUMA domain. */ + top_p = sd->parent; + while (top_p && !(top_p->flags & SD_NUMA)) { + top_p = top_p->parent; + } + imb_span = top_p ? top_p->span_weight : sd->span_weight; + } else { + int factor = max(1U, (sd->span_weight / imb_span)); + + sd->imb_numa_nr = imb * factor; + } + } + } + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) @@ -2044,21 +2518,21 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); - /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); - cpu_attach_domain(sd, d.rd, i); + + if (lowest_flag_domain(i, SD_CLUSTER)) + has_cluster = true; } rcu_read_unlock(); if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); - if (rq && sched_debug_enabled) { - pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", - cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); - } + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); ret = 0; error: @@ -2073,7 +2547,7 @@ static cpumask_var_t *doms_cur; /* Number of sched domains in 'doms_cur': */ static int ndoms_cur; -/* Attribues of custom domains in 'doms_cur' */ +/* Attributes of custom domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* @@ -2122,7 +2596,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * Set up scheduler domains and groups. For now this just excludes isolated * CPUs, but could be used to exclude other special cases in the future. */ -int sched_init_domains(const struct cpumask *cpu_map) +int __init sched_init_domains(const struct cpumask *cpu_map) { int err; @@ -2131,13 +2605,13 @@ int sched_init_domains(const struct cpumask *cpu_map) zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); arch_update_cpu_topology(); + asym_cpu_capacity_scan(); ndoms_cur = 1; doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; - cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN)); err = build_sched_domains(doms_cur[0], NULL); - register_sched_domain_sysctl(); return err; } @@ -2154,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + if (static_branch_unlikely(&sched_cluster_active)) + static_branch_dec_cpuslocked(&sched_cluster_active); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); @@ -2212,11 +2689,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], lockdep_assert_held(&sched_domains_mutex); - /* Always unregister in case we don't destroy any domains: */ - unregister_sched_domain_sysctl(); - /* Let the architecture update CPU core mappings: */ new_topology = arch_update_cpu_topology(); + /* Trigger rebuilding CPU capacity asymmetry data */ + if (new_topology) + asym_cpu_capacity_scan(); if (!doms_new) { WARN_ON_ONCE(dattr_new); @@ -2225,7 +2702,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], if (doms_new) { n = 1; cpumask_and(doms_new[0], cpu_active_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); } } else { n = ndoms_new; @@ -2240,9 +2717,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], /* * This domain won't be destroyed and as such - * its dl_bw->total_bw needs to be cleared. It - * will be recomputed in function - * update_tasks_root_domain(). + * its dl_bw->total_bw needs to be cleared. + * Tasks contribution will be then recomputed + * in function dl_update_tasks_root_domain(), + * dl_servers contribution in function + * dl_restore_server_root_domain(). */ rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; dl_clear_root_domain(rd); @@ -2260,7 +2739,7 @@ match1: n = 0; doms_new = &fallback_doms; cpumask_and(doms_new[0], cpu_active_mask, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); } /* Build new domains: */ @@ -2277,7 +2756,7 @@ match2: } #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - /* Build perf. domains: */ + /* Build perf domains: */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !sched_energy_update; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && @@ -2286,7 +2765,7 @@ match2: goto match3; } } - /* No match - add perf. domains for a new rd */ + /* No match - add perf domains for a new rd */ has_eas |= build_perf_domains(doms_new[i]); match3: ; @@ -2303,7 +2782,7 @@ match3: dattr_cur = dattr_new; ndoms_cur = ndoms_new; - register_sched_domain_sysctl(); + update_sched_domain_debugfs(); } /* diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index ba059fbfc53a..51e38f5f4701 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,7 +4,6 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ -#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { @@ -37,6 +36,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue } EXPORT_SYMBOL(add_wait_queue_exclusive); +void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL_GPL(add_wait_queue_priority); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -48,37 +58,26 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry EXPORT_SYMBOL(remove_wait_queue); /* - * Scan threshold to break wait queue walk. - * This allows a waker to take a break from holding the - * wait queue lock during the wait queue walk. - */ -#define WAITQUEUE_WALK_BREAK_CNT 64 - -/* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. + * number) then we wake that number of exclusive tasks, and potentially all + * the non-exclusive tasks. Normally, exclusive tasks will be at the end of + * the list and any non-exclusive tasks will be woken first. A priority task + * may be at the head of the list, and can consume the event without any other + * tasks being woken. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, int wake_flags, void *key, - wait_queue_entry_t *bookmark) + int nr_exclusive, int wake_flags, void *key) { wait_queue_entry_t *curr, *next; - int cnt = 0; lockdep_assert_held(&wq_head->lock); - if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { - curr = list_next_entry(bookmark, entry); - - list_del(&bookmark->entry); - bookmark->flags = 0; - } else - curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); + curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); if (&curr->entry == &wq_head->head) return nr_exclusive; @@ -87,43 +86,28 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, unsigned flags = curr->flags; int ret; - if (flags & WQ_FLAG_BOOKMARK) - continue; - ret = curr->func(curr, mode, wake_flags, key); if (ret < 0) break; if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; - - if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) && - (&next->entry != &wq_head->head)) { - bookmark->flags = WQ_FLAG_BOOKMARK; - list_add_tail(&bookmark->entry, &next->entry); - break; - } } return nr_exclusive; } -static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode, +static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, int wake_flags, void *key) { unsigned long flags; - wait_queue_entry_t bookmark; + int remaining; - bookmark.flags = 0; - bookmark.private = NULL; - bookmark.func = NULL; - INIT_LIST_HEAD(&bookmark.entry); + spin_lock_irqsave(&wq_head->lock, flags); + remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, + key); + spin_unlock_irqrestore(&wq_head->lock, flags); - do { - spin_lock_irqsave(&wq_head->lock, flags); - nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, - wake_flags, key, &bookmark); - spin_unlock_irqrestore(&wq_head->lock, flags); - } while (bookmark.flags & WQ_FLAG_BOOKMARK); + return nr_exclusive - remaining; } /** @@ -133,38 +117,37 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function * - * If this function wakes up a task, it executes a full memory barrier before - * accessing the task state. + * If this function wakes up a task, it executes a full memory barrier + * before accessing the task state. Returns the number of exclusive + * tasks that were awaken. */ -void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, void *key) +int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, void *key) { - __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key); + return __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key); } EXPORT_SYMBOL(__wake_up); +void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key) +{ + __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key); +} + /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(wq_head, mode, nr, 0, NULL, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, 0, key, NULL); + __wake_up_common(wq_head, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); -void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, - unsigned int mode, void *key, wait_queue_entry_t *bookmark) -{ - __wake_up_common(wq_head, mode, 1, 0, key, bookmark); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); - /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue @@ -210,7 +193,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL); + __wake_up_common(wq_head, mode, 1, WF_SYNC, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key); @@ -223,6 +206,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +void __wake_up_pollfree(struct wait_queue_head *wq_head) +{ + __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE)); + /* POLLFREE must have cleared the queue. */ + WARN_ON_ONCE(waitqueue_active(wq_head)); +} + /* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any @@ -249,17 +239,22 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent } EXPORT_SYMBOL(prepare_to_wait); -void +/* Returns true if we are the first waiter in the queue, false otherwise. */ +bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; + bool was_empty = false; wq_entry->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - if (list_empty(&wq_entry->entry)) + if (list_empty(&wq_entry->entry)) { + was_empty = list_empty(&wq_head->head); __add_wait_queue_entry_tail(wq_head, wq_entry); + } set_current_state(state); spin_unlock_irqrestore(&wq_head->lock, flags); + return was_empty; } EXPORT_SYMBOL(prepare_to_wait_exclusive); @@ -389,17 +384,12 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wq_entry->entry); + list_del_init_careful(&wq_entry->entry); return ret; } EXPORT_SYMBOL(autoremove_wake_function); -static inline bool is_kthread_should_stop(void) -{ - return (current->flags & PF_KTHREAD) && kthread_should_stop(); -} - /* * DEFINE_WAIT_FUNC(wait, woken_wake_func); * @@ -429,7 +419,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) * or woken_wake_function() sees our store to current->state. */ set_current_state(mode); /* A */ - if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 02ce292b9bc0..b410b61cec95 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,15 +1,15 @@ // SPDX-License-Identifier: GPL-2.0-only + /* * The implementation of the wait_bit*() and related waiting APIs: */ -#include "sched.h" #define WAIT_TABLE_BITS 8 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; -wait_queue_head_t *bit_waitqueue(void *word, int bit) +wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit) { const int shift = BITS_PER_LONG == 32 ? 5 : 6; unsigned long val = (unsigned long)word << shift | bit; @@ -33,7 +33,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync EXPORT_SYMBOL(wake_bit_function); /* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * To allow interruptible waiting and asynchronous (i.e. non-blocking) * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are * permitted return codes. Nonzero return codes halt waiting and return. */ @@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) ret = (*action)(&wbq_entry->key, mode); - } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); finish_wait(wq_head, &wbq_entry->wq_entry); @@ -55,7 +55,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ } EXPORT_SYMBOL(__wait_on_bit); -int __sched out_of_line_wait_on_bit(void *word, int bit, +int __sched out_of_line_wait_on_bit(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -66,7 +66,7 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, EXPORT_SYMBOL(out_of_line_wait_on_bit); int __sched out_of_line_wait_on_bit_timeout( - void *word, int bit, wait_bit_action_f *action, + unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode, unsigned long timeout) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -108,7 +108,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry } EXPORT_SYMBOL(__wait_on_bit_lock); -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, +int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); @@ -118,7 +118,7 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); -void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) +void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); @@ -128,23 +128,31 @@ void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) EXPORT_SYMBOL(__wake_up_bit); /** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on + * wake_up_bit - wake up waiters on a bit + * @word: the address containing the bit being waited on + * @bit: the bit at that address being waited on + * + * Wake up any process waiting in wait_on_bit() or similar for the + * given bit to be cleared. + * + * The wake-up is sent to tasks in a waitqueue selected by hash from a + * shared pool. Only those tasks on that queue which have requested + * wake_up on this specific address and bit will be woken, and only if the + * bit is clear. * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. + * In order for this to function properly there must be a full memory + * barrier after the bit is cleared and before this function is called. + * If the bit was cleared atomically, such as a by clear_bit() then + * smb_mb__after_atomic() can be used, othwewise smb_mb() is needed. + * If the bit was cleared with a fully-ordered operation, no further + * barrier is required. * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_atomic(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. + * Normally the bit should be cleared by an operation with RELEASE + * semantics so that any changes to memory made before the bit is + * cleared are guaranteed to be visible after the matching wait_on_bit() + * completes. */ -void wake_up_bit(void *word, int bit) +void wake_up_bit(unsigned long *word, int bit) { __wake_up_bit(bit_waitqueue(word, bit), word, bit); } @@ -188,6 +196,36 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int } EXPORT_SYMBOL(init_wait_var_entry); +/** + * wake_up_var - wake up waiters on a variable (kernel address) + * @var: the address of the variable being waited on + * + * Wake up any process waiting in wait_var_event() or similar for the + * given variable to change. wait_var_event() can be waiting for an + * arbitrary condition to be true and associates that condition with an + * address. Calling wake_up_var() suggests that the condition has been + * made true, but does not strictly require the condtion to use the + * address given. + * + * The wake-up is sent to tasks in a waitqueue selected by hash from a + * shared pool. Only those tasks on that queue which have requested + * wake_up on this specific address will be woken. + * + * In order for this to function properly there must be a full memory + * barrier after the variable is updated (or more accurately, after the + * condition waited on has been made to be true) and before this function + * is called. If the variable was updated atomically, such as a by + * atomic_dec() then smb_mb__after_atomic() can be used. If the + * variable was updated by a fully ordered operation such as + * atomic_dec_and_test() then no extra barrier is required. Otherwise + * smb_mb() is needed. + * + * Normally the variable should be updated (the condition should be made + * to be true) by an operation with RELEASE semantics such as + * smp_store_release() so that any changes to memory made before the + * variable was updated are guaranteed to be visible after the matching + * wait_var_event() completes. + */ void wake_up_var(void *var) { __wake_up_bit(__var_waitqueue(var), var, -1); @@ -228,20 +266,6 @@ __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - io_schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_io_timeout); - void __init wait_bit_init(void) { int i; |