summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r-- kernel/sched/Makefile 38
-rw-r--r-- kernel/sched/autogroup.c 34
-rw-r--r-- kernel/sched/autogroup.h 6
-rw-r--r-- kernel/sched/build_policy.c 66
-rw-r--r-- kernel/sched/build_utility.c 109
-rw-r--r-- kernel/sched/clock.c 81
-rw-r--r-- kernel/sched/completion.c 40
-rw-r--r-- kernel/sched/core.c 8259
-rw-r--r-- kernel/sched/core_sched.c 300
-rw-r--r-- kernel/sched/cpuacct.c 127
-rw-r--r-- kernel/sched/cpudeadline.c 31
-rw-r--r-- kernel/sched/cpufreq.c 3
-rw-r--r-- kernel/sched/cpufreq_schedutil.c 476
-rw-r--r-- kernel/sched/cpupri.c 60
-rw-r--r-- kernel/sched/cpupri.h 8
-rw-r--r-- kernel/sched/cputime.c 163
-rw-r--r-- kernel/sched/deadline.c 1979
-rw-r--r-- kernel/sched/debug.c 860
-rw-r--r-- kernel/sched/ext.c 7870
-rw-r--r-- kernel/sched/ext.h 91
-rw-r--r-- kernel/sched/fair.c 7304
-rw-r--r-- kernel/sched/features.h 67
-rw-r--r-- kernel/sched/idle.c 172
-rw-r--r-- kernel/sched/isolation.c 182
-rw-r--r-- kernel/sched/loadavg.c 13
-rw-r--r-- kernel/sched/membarrier.c 394
-rw-r--r-- kernel/sched/pelt.c 64
-rw-r--r-- kernel/sched/pelt.h 83
-rw-r--r-- kernel/sched/psi.c 1246
-rw-r--r-- kernel/sched/rt.c 856
-rw-r--r-- kernel/sched/sched.h 2532
-rw-r--r-- kernel/sched/smp.h 8
-rw-r--r-- kernel/sched/stats.c 111
-rw-r--r-- kernel/sched/stats.h 240
-rw-r--r-- kernel/sched/stop_task.c 44
-rw-r--r-- kernel/sched/swait.c 9
-rw-r--r-- kernel/sched/syscalls.c 1594
-rw-r--r-- kernel/sched/topology.c 1163
-rw-r--r-- kernel/sched/wait.c 120
-rw-r--r-- kernel/sched/wait_bit.c 96
40 files changed, 28056 insertions, 8843 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5fc9c9b70862..976092b7bd45 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,17 +1,17 @@
# SPDX-License-Identifier: GPL-2.0
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
-endif
+
+# The compilers are complaining about unused variables inside an if(0) scope
+# block. This is daft, shut them up.
+ccflags-y += $(call cc-disable-warning, unused-but-set-variable)
# These files are disabled because they produce non-interesting flaky coverage
# that is not a function of syscall inputs. E.g. involuntary context switches.
KCOV_INSTRUMENT := n
-# There are numerous data races here, however, most of them are due to plain accesses.
-# This would make it even harder for syzbot to find reproducers, because these
-# bugs trigger without specific input. Disable by default, but should re-enable
-# eventually.
+# Disable KCSAN to avoid excessive noise and performance degradation. To avoid
+# false positives ensure barriers implied by sched functions are instrumented.
KCSAN_SANITIZE := n
+KCSAN_INSTRUMENT_BARRIERS := y
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -22,17 +22,13 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o
-
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
-obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
-obj-$(CONFIG_SCHED_DEBUG) += debug.o
-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
-obj-$(CONFIG_CPU_FREQ) += cpufreq.o
-obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
-obj-$(CONFIG_CPU_ISOLATION) += isolation.o
-obj-$(CONFIG_PSI) += psi.o
+#
+# Build efficiency:
+#
+# These compilation units have roughly the same size and complexity - so their
+# build parallelizes well and finishes roughly at once:
+#
+obj-y += core.o
+obj-y += fair.o
+obj-y += build_policy.o
+obj-y += build_utility.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2067080bb235..2b331822c7e7 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,20 +1,41 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* Auto-group scheduling implementation:
*/
-#include <linux/nospec.h>
-#include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table sched_autogroup_sysctls[] = {
+ {
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static void __init sched_autogroup_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_autogroup_sysctls);
+}
+#else
+#define sched_autogroup_sysctl_init() do { } while (0)
+#endif
+
void __init autogroup_init(struct task_struct *init_task)
{
autogroup_default.tg = &root_task_group;
kref_init(&autogroup_default.kref);
init_rwsem(&autogroup_default.lock);
init_task->signal->autogroup = &autogroup_default;
+ sched_autogroup_sysctl_init();
}
void autogroup_free(struct task_group *tg)
@@ -31,7 +52,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
- sched_offline_group(ag->tg);
+ sched_release_group(ag->tg);
sched_destroy_group(ag->tg);
}
@@ -129,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p)
* see this thread after that: we can no longer use signal->autogroup.
* See the PF_EXITING check in task_wants_autogroup().
*/
- sched_move_task(p);
+ sched_move_task(p, true);
}
static void
@@ -139,7 +160,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
struct task_struct *t;
unsigned long flags;
- BUG_ON(!lock_task_sighand(p, &flags));
+ if (WARN_ON_ONCE(!lock_task_sighand(p, &flags)))
+ return;
prev = p->signal->autogroup;
if (prev == ag) {
@@ -160,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
* sched_autogroup_exit_task().
*/
for_each_thread(p, t)
- sched_move_task(t);
+ sched_move_task(t, true);
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index b96419974a1f..90d69f2c5eaf 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_SCHED_AUTOGROUP_H
+#define _KERNEL_SCHED_AUTOGROUP_H
+
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
@@ -27,6 +30,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
+ extern unsigned int sysctl_sched_autogroup_enabled;
int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
@@ -58,3 +62,5 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
}
#endif /* CONFIG_SCHED_AUTOGROUP */
+
+#endif /* _KERNEL_SCHED_AUTOGROUP_H */
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
new file mode 100644
index 000000000000..fae1f5c921eb
--- /dev/null
+++ b/kernel/sched/build_policy.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are the scheduling policy related scheduler files, built
+ * in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c and fair.c, the other two big
+ * compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ *
+ * core.c and fair.c are built separately.
+ */
+
+/* Headers: */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/posix-timers.h>
+#include <linux/sched/rt.h>
+
+#include <linux/cpuidle.h>
+#include <linux/jiffies.h>
+#include <linux/kobject.h>
+#include <linux/livepatch.h>
+#include <linux/pm.h>
+#include <linux/psi.h>
+#include <linux/rhashtable.h>
+#include <linux/seq_buf.h>
+#include <linux/seqlock_api.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/tsacct_kern.h>
+#include <linux/vtime.h>
+#include <linux/sysrq.h>
+#include <linux/percpu-rwsem.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include "sched.h"
+#include "smp.h"
+
+#include "autogroup.h"
+#include "stats.h"
+#include "pelt.h"
+
+/* Source code modules: */
+
+#include "idle.c"
+
+#include "rt.c"
+
+#ifdef CONFIG_SMP
+# include "cpudeadline.c"
+# include "pelt.c"
+#endif
+
+#include "cputime.c"
+#include "deadline.c"
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext.c"
+#endif
+
+#include "syscalls.c"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
new file mode 100644
index 000000000000..80a3df49ab47
--- /dev/null
+++ b/kernel/sched/build_utility.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are various utility functions of the scheduler,
+ * built in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c, fair.c and build_policy.c, the other
+ * big compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/task_stack.h>
+
+#include <linux/cpufreq.h>
+#include <linux/cpumask_api.h>
+#include <linux/cpuset.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/energy_model.h>
+#include <linux/hashtable_api.h>
+#include <linux/irq.h>
+#include <linux/kobject_api.h>
+#include <linux/membarrier.h>
+#include <linux/mempolicy.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/proc_fs.h>
+#include <linux/psi.h>
+#include <linux/ptrace_api.h>
+#include <linux/sched_clock.h>
+#include <linux/security.h>
+#include <linux/spinlock_api.h>
+#include <linux/swait_api.h>
+#include <linux/timex.h>
+#include <linux/utsname.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#include <uapi/linux/prctl.h>
+#include <uapi/linux/sched/types.h>
+
+#include <asm/switch_to.h>
+
+#include "sched.h"
+#include "sched-pelt.h"
+#include "stats.h"
+#include "autogroup.h"
+
+#include "clock.c"
+
+#ifdef CONFIG_CGROUP_CPUACCT
+# include "cpuacct.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ
+# include "cpufreq.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+# include "cpufreq_schedutil.c"
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+# include "debug.c"
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+# include "stats.c"
+#endif
+
+#include "loadavg.c"
+#include "completion.c"
+#include "swait.c"
+#include "wait_bit.c"
+#include "wait.c"
+
+#ifdef CONFIG_SMP
+# include "cpupri.c"
+# include "stop_task.c"
+# include "topology.c"
+#endif
+
+#ifdef CONFIG_SCHED_CORE
+# include "core_sched.c"
+#endif
+
+#ifdef CONFIG_PSI
+# include "psi.c"
+#endif
+
+#ifdef CONFIG_MEMBARRIER
+# include "membarrier.c"
+#endif
+
+#ifdef CONFIG_CPU_ISOLATION
+# include "isolation.c"
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+# include "autogroup.c"
+#endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 12bca64dff73..a09655b48140 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -41,7 +41,7 @@
* Otherwise it tries to create a semi stable clock from a mixture of other
* clocks, including:
*
- * - GTOD (clock monotomic)
+ * - GTOD (clock monotonic)
* - sched_clock()
* - explicit idle events
*
@@ -53,15 +53,13 @@
* that is otherwise invisible (TSC gets stopped).
*
*/
-#include "sched.h"
-#include <linux/sched_clock.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
-unsigned long long __weak sched_clock(void)
+notrace unsigned long long __weak sched_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
@@ -95,28 +93,28 @@ struct sched_clock_data {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-static inline struct sched_clock_data *this_scd(void)
+static __always_inline struct sched_clock_data *this_scd(void)
{
return this_cpu_ptr(&sched_clock_data);
}
-static inline struct sched_clock_data *cpu_sdc(int cpu)
+notrace static inline struct sched_clock_data *cpu_sdc(int cpu)
{
return &per_cpu(sched_clock_data, cpu);
}
-int sched_clock_stable(void)
+notrace int sched_clock_stable(void)
{
return static_branch_likely(&__sched_clock_stable);
}
-static void __scd_stamp(struct sched_clock_data *scd)
+notrace static void __scd_stamp(struct sched_clock_data *scd)
{
scd->tick_gtod = ktime_get_ns();
scd->tick_raw = sched_clock();
}
-static void __set_sched_clock_stable(void)
+notrace static void __set_sched_clock_stable(void)
{
struct sched_clock_data *scd;
@@ -151,7 +149,7 @@ static void __set_sched_clock_stable(void)
* The only way to fully avoid random clock jumps is to boot with:
* "tsc=unstable".
*/
-static void __sched_clock_work(struct work_struct *work)
+notrace static void __sched_clock_work(struct work_struct *work)
{
struct sched_clock_data *scd;
int cpu;
@@ -177,7 +175,7 @@ static void __sched_clock_work(struct work_struct *work)
static DECLARE_WORK(sched_clock_work, __sched_clock_work);
-static void __clear_sched_clock_stable(void)
+notrace static void __clear_sched_clock_stable(void)
{
if (!sched_clock_stable())
return;
@@ -186,7 +184,7 @@ static void __clear_sched_clock_stable(void)
schedule_work(&sched_clock_work);
}
-void clear_sched_clock_stable(void)
+notrace void clear_sched_clock_stable(void)
{
__sched_clock_stable_early = 0;
@@ -196,7 +194,7 @@ void clear_sched_clock_stable(void)
__clear_sched_clock_stable();
}
-static void __sched_clock_gtod_offset(void)
+notrace static void __sched_clock_gtod_offset(void)
{
struct sched_clock_data *scd = this_scd();
@@ -246,12 +244,12 @@ late_initcall(sched_clock_init_late);
* min, max except they take wrapping into account
*/
-static inline u64 wrap_min(u64 x, u64 y)
+static __always_inline u64 wrap_min(u64 x, u64 y)
{
return (s64)(x - y) < 0 ? x : y;
}
-static inline u64 wrap_max(u64 x, u64 y)
+static __always_inline u64 wrap_max(u64 x, u64 y)
{
return (s64)(x - y) > 0 ? x : y;
}
@@ -262,13 +260,13 @@ static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
-static u64 sched_clock_local(struct sched_clock_data *scd)
+static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
{
u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
again:
- now = sched_clock();
+ now = sched_clock_noinstr();
delta = now - scd->tick_raw;
if (unlikely(delta < 0))
delta = 0;
@@ -289,13 +287,38 @@ again:
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
+ if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
}
-static u64 sched_clock_remote(struct sched_clock_data *scd)
+noinstr u64 local_clock_noinstr(void)
+{
+ u64 clock;
+
+ if (static_branch_likely(&__sched_clock_stable))
+ return sched_clock_noinstr() + __sched_clock_offset;
+
+ if (!static_branch_likely(&sched_clock_running))
+ return sched_clock_noinstr();
+
+ clock = sched_clock_local(this_scd());
+
+ return clock;
+}
+
+u64 local_clock(void)
+{
+ u64 now;
+ preempt_disable_notrace();
+ now = local_clock_noinstr();
+ preempt_enable_notrace();
+ return now;
+}
+EXPORT_SYMBOL_GPL(local_clock);
+
+static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
{
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
@@ -317,7 +340,7 @@ again:
this_clock = sched_clock_local(my_scd);
/*
* We must enforce atomic readout on 32-bit, otherwise the
- * update on the remote CPU can hit inbetween the readout of
+ * update on the remote CPU can hit in between the readout of
* the low 32-bit and the high 32-bit portion.
*/
remote_clock = cmpxchg64(&scd->clock, 0, 0);
@@ -351,7 +374,7 @@ again:
val = remote_clock;
}
- if (cmpxchg64(ptr, old_val, val) != old_val)
+ if (!try_cmpxchg64(ptr, &old_val, val))
goto again;
return val;
@@ -362,7 +385,7 @@ again:
*
* See cpu_clock().
*/
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd;
u64 clock;
@@ -386,7 +409,7 @@ u64 sched_clock_cpu(int cpu)
}
EXPORT_SYMBOL_GPL(sched_clock_cpu);
-void sched_clock_tick(void)
+notrace void sched_clock_tick(void)
{
struct sched_clock_data *scd;
@@ -403,7 +426,7 @@ void sched_clock_tick(void)
sched_clock_local(scd);
}
-void sched_clock_tick_stable(void)
+notrace void sched_clock_tick_stable(void)
{
if (!sched_clock_stable())
return;
@@ -421,9 +444,9 @@ void sched_clock_tick_stable(void)
}
/*
- * We are going deep-idle (irqs are disabled):
+ * We are going deep-idle (IRQs are disabled):
*/
-void sched_clock_idle_sleep_event(void)
+notrace void sched_clock_idle_sleep_event(void)
{
sched_clock_cpu(smp_processor_id());
}
@@ -432,7 +455,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
* We just idled; resync with ktime.
*/
-void sched_clock_idle_wakeup_event(void)
+notrace void sched_clock_idle_wakeup_event(void)
{
unsigned long flags;
@@ -458,7 +481,7 @@ void __init sched_clock_init(void)
local_irq_enable();
}
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
{
if (!static_branch_likely(&sched_clock_running))
return 0;
@@ -476,7 +499,7 @@ u64 sched_clock_cpu(int cpu)
* On bare metal this function should return the same as local_clock.
* Architectures and sub-architectures can override this.
*/
-u64 __weak running_clock(void)
+notrace u64 __weak running_clock(void)
{
return local_clock();
}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a778554f9dad..3561ab533dd4 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* Generic wait-for-completion handler;
*
@@ -11,7 +12,23 @@
* typically be used for exclusion which gives rise to priority inversion.
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
-#include "sched.h"
+
+static void complete_with_flags(struct completion *x, int wake_flags)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&x->wait.lock, flags);
+
+ if (x->done != UINT_MAX)
+ x->done++;
+ swake_up_locked(&x->wait, wake_flags);
+ raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+
+void complete_on_current_cpu(struct completion *x)
+{
+ return complete_with_flags(x, WF_CURRENT_CPU);
+}
/**
* complete: - signals a single thread waiting on this completion
@@ -27,14 +44,7 @@
*/
void complete(struct completion *x)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&x->wait.lock, flags);
-
- if (x->done != UINT_MAX)
- x->done++;
- swake_up_locked(&x->wait);
- raw_spin_unlock_irqrestore(&x->wait.lock, flags);
+ complete_with_flags(x, 0);
}
EXPORT_SYMBOL(complete);
@@ -204,6 +214,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+
if (t == -ERESTARTSYS)
return t;
return 0;
@@ -241,12 +252,23 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+
if (t == -ERESTARTSYS)
return t;
return 0;
}
EXPORT_SYMBOL(wait_for_completion_killable);
+int __sched wait_for_completion_state(struct completion *x, unsigned int state)
+{
+ long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state);
+
+ if (t == -ERESTARTSYS)
+ return t;
+ return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_state);
+
/**
* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
* @x: holds the state of this particular completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2142c6767682..042351c7afce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2,29 +2,103 @@
/*
* kernel/sched/core.c
*
- * Core kernel scheduler code and related syscalls
+ * Core kernel CPU scheduler code
*
* Copyright (C) 1991-2002 Linus Torvalds
- */
-#include "sched.h"
-
-#include <linux/nospec.h>
-
+ * Copyright (C) 1998-2024 Ingo Molnar, Red Hat
+ */
+#include <linux/highmem.h>
+#include <linux/hrtimer_api.h>
+#include <linux/ktime_api.h>
+#include <linux/sched/signal.h>
+#include <linux/syscalls_api.h>
+#include <linux/debug_locks.h>
+#include <linux/prefetch.h>
+#include <linux/capability.h>
+#include <linux/pgtable_api.h>
+#include <linux/wait_bit.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/hardirq.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/init.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/rt.h>
+
+#include <linux/blkdev.h>
+#include <linux/context_tracking.h>
+#include <linux/cpuset.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/interrupt.h>
+#include <linux/ioprio.h>
+#include <linux/kallsyms.h>
#include <linux/kcov.h>
+#include <linux/kprobes.h>
+#include <linux/llist_api.h>
+#include <linux/mmu_context.h>
+#include <linux/mmzone.h>
+#include <linux/mutex_api.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/perf_event_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/rcuwait_api.h>
+#include <linux/rseq.h>
+#include <linux/sched/wake_q.h>
#include <linux/scs.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/vtime.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# ifdef CONFIG_GENERIC_ENTRY
+# include <linux/entry-common.h>
+# endif
+#endif
+#include <uapi/linux/sched/types.h>
+
+#include <asm/irq_regs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
-#include "../workqueue_internal.h"
-#include "../../fs/io-wq.h"
-#include "../smpboot.h"
+#define CREATE_TRACE_POINTS
+#include <linux/sched/rseq_api.h>
+#include <trace/events/sched.h>
+#include <trace/events/ipi.h>
+#undef CREATE_TRACE_POINTS
+#include "sched.h"
+#include "stats.h"
+
+#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
+#include "stats.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
+#include "../workqueue_internal.h"
+#include "../../io_uring/io-wq.h"
+#include "../smpboot.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -35,11 +109,17 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
+#ifdef CONFIG_SCHED_DEBUG
/*
* Debugging: various feature bits
*
@@ -53,27 +133,533 @@ const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
-#endif
+
+/*
+ * Print a warning if need_resched is set for the given duration (if
+ * LATENCY_WARN is enabled).
+ *
+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
+ * per boot.
+ */
+__read_mostly int sysctl_resched_latency_warn_ms = 100;
+__read_mostly int sysctl_resched_latency_warn_once = 1;
+#endif /* CONFIG_SCHED_DEBUG */
/*
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
-const_debug unsigned int sysctl_sched_nr_migrate = 32;
+const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
+
+__read_mostly int scheduler_running;
+
+#ifdef CONFIG_SCHED_CORE
+
+DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+/* kernel prio, less is more */
+static inline int __task_prio(const struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class) /* trumps deadline */
+ return -2;
+
+ if (p->dl_server)
+ return -1; /* deadline */
+
+ if (rt_or_dl_prio(p->prio))
+ return p->prio; /* [-1, 99] */
+
+ if (p->sched_class == &idle_sched_class)
+ return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
+
+ if (task_on_scx(p))
+ return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */
+
+ return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */
+}
/*
- * period over which we measure -rt task CPU usage in us.
- * default: 1s
+ * l(a,b)
+ * le(a,b) := !l(b,a)
+ * g(a,b) := l(b,a)
+ * ge(a,b) := !l(a,b)
*/
-unsigned int sysctl_sched_rt_period = 1000000;
-__read_mostly int scheduler_running;
+/* real prio, less is less */
+static inline bool prio_less(const struct task_struct *a,
+ const struct task_struct *b, bool in_fi)
+{
+
+ int pa = __task_prio(a), pb = __task_prio(b);
+
+ if (-pa < -pb)
+ return true;
+
+ if (-pb < -pa)
+ return false;
+
+ if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */
+ const struct sched_dl_entity *a_dl, *b_dl;
+
+ a_dl = &a->dl;
+ /*
+ * Since 'a' and 'b' can be CFS tasks served by a DL server,
+ * __task_prio() can return -1 (for DL) even for those. In that
+ * case, use the dl_server's DL entity instead of the task's own.
+ */
+ if (a->dl_server)
+ a_dl = a->dl_server;
+
+ b_dl = &b->dl;
+ if (b->dl_server)
+ b_dl = b->dl_server;
+
+ return !dl_time_before(a_dl->deadline, b_dl->deadline);
+ }
+
+ if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+ return cfs_prio_less(a, b, in_fi);
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */
+ return scx_prio_less(a, b, in_fi);
+#endif
+
+ return false;
+}
+
+static inline bool __sched_core_less(const struct task_struct *a,
+ const struct task_struct *b)
+{
+ if (a->core_cookie < b->core_cookie)
+ return true;
+
+ if (a->core_cookie > b->core_cookie)
+ return false;
+
+ /* flip prio, so high prio is leftmost */
+ if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
+ return true;
+
+ return false;
+}
+
+#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
+
+static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
+{
+ return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
+}
+
+static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
+{
+ const struct task_struct *p = __node_2_sc(node);
+ unsigned long cookie = (unsigned long)key;
+
+ if (cookie < p->core_cookie)
+ return -1;
+
+ if (cookie > p->core_cookie)
+ return 1;
+
+ return 0;
+}
+
+void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+{
+ if (p->se.sched_delayed)
+ return;
+
+ rq->core->core_task_seq++;
+
+ if (!p->core_cookie)
+ return;
+
+ rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
+}
+
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (p->se.sched_delayed)
+ return;
+
+ rq->core->core_task_seq++;
+
+ if (sched_core_enqueued(p)) {
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+ }
+
+ /*
+ * Migrating the last task off the CPU while the CPU is in forced idle
+ * state: reschedule to create an accounting edge for forced idle,
+ * and re-examine whether the core is still in forced idle state.
+ */
+ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+ rq->core->core_forceidle_count && rq->curr == rq->idle)
+ resched_curr(rq);
+}
+
+static int sched_task_is_throttled(struct task_struct *p, int cpu)
+{
+ if (p->sched_class->task_is_throttled)
+ return p->sched_class->task_is_throttled(p, cpu);
+
+ return 0;
+}
+
+static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
+{
+ struct rb_node *node = &p->core_node;
+ int cpu = task_cpu(p);
+
+ do {
+ node = rb_next(node);
+ if (!node)
+ return NULL;
+
+ p = __node_2_sc(node);
+ if (p->core_cookie != cookie)
+ return NULL;
+
+ } while (sched_task_is_throttled(p, cpu));
+
+ return p;
+}
+
+/*
+ * Find the left-most (i.e. highest priority) unthrottled task matching @cookie.
+ * If no suitable task is found, NULL will be returned.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+ struct task_struct *p;
+ struct rb_node *node;
+
+ node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
+ if (!node)
+ return NULL;
+
+ p = __node_2_sc(node);
+ if (!sched_task_is_throttled(p, rq->cpu))
+ return p;
+
+ return sched_core_next(p, cookie);
+}
+
+/*
+ * Magic required such that:
+ *
+ * raw_spin_rq_lock(rq);
+ * ...
+ * raw_spin_rq_unlock(rq);
+ *
+ * ends up locking and unlocking the _same_ lock, and all CPUs
+ * always agree on what rq has what lock.
+ *
+ * XXX entirely possible to selectively enable cores, don't bother for now.
+ */
+
+static DEFINE_MUTEX(sched_core_mutex);
+static atomic_t sched_core_count;
+static struct cpumask sched_core_mask;
+
+static void sched_core_lock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t, i = 0;
+
+ local_irq_save(*flags);
+ for_each_cpu(t, smt_mask)
+ raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+}
+
+static void sched_core_unlock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t;
+
+ for_each_cpu(t, smt_mask)
+ raw_spin_unlock(&cpu_rq(t)->__lock);
+ local_irq_restore(*flags);
+}
+
+static void __sched_core_flip(bool enabled)
+{
+ unsigned long flags;
+ int cpu, t;
+
+ cpus_read_lock();
+
+ /*
+ * Toggle the online cores, one by one.
+ */
+ cpumask_copy(&sched_core_mask, cpu_online_mask);
+ for_each_cpu(cpu, &sched_core_mask) {
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+ sched_core_lock(cpu, &flags);
+
+ for_each_cpu(t, smt_mask)
+ cpu_rq(t)->core_enabled = enabled;
+
+ cpu_rq(cpu)->core->core_forceidle_start = 0;
+
+ sched_core_unlock(cpu, &flags);
+
+ cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
+ }
+
+ /*
+ * Toggle the offline CPUs.
+ */
+ for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
+ cpu_rq(cpu)->core_enabled = enabled;
+
+ cpus_read_unlock();
+}
+
+static void sched_core_assert_empty(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
+}
+
+static void __sched_core_enable(void)
+{
+ static_branch_enable(&__sched_core_enabled);
+ /*
+ * Ensure all previous instances of raw_spin_rq_*lock() have finished
+ * and future ones will observe !sched_core_disabled().
+ */
+ synchronize_rcu();
+ __sched_core_flip(true);
+ sched_core_assert_empty();
+}
+
+static void __sched_core_disable(void)
+{
+ sched_core_assert_empty();
+ __sched_core_flip(false);
+ static_branch_disable(&__sched_core_enabled);
+}
+
+void sched_core_get(void)
+{
+ if (atomic_inc_not_zero(&sched_core_count))
+ return;
+
+ mutex_lock(&sched_core_mutex);
+ if (!atomic_read(&sched_core_count))
+ __sched_core_enable();
+
+ smp_mb__before_atomic();
+ atomic_inc(&sched_core_count);
+ mutex_unlock(&sched_core_mutex);
+}
+
+static void __sched_core_put(struct work_struct *work)
+{
+ if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
+ __sched_core_disable();
+ mutex_unlock(&sched_core_mutex);
+ }
+}
+
+void sched_core_put(void)
+{
+ static DECLARE_WORK(_work, __sched_core_put);
+
+ /*
+ * "There can be only one"
+ *
+ * Either this is the last one, or we don't actually need to do any
+ * 'work'. If it is the last *again*, we rely on
+ * WORK_STRUCT_PENDING_BIT.
+ */
+ if (!atomic_add_unless(&sched_core_count, -1, 1))
+ schedule_work(&_work);
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
+
+#endif /* CONFIG_SCHED_CORE */
+
+/*
+ * Serialization rules:
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
+ *
+ * rq1->lock
+ * rq2->lock where: rq1 < rq2
+ *
+ * Regular state:
+ *
+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the
+ * local CPU's rq->lock, it optionally removes the task from the runqueue and
+ * always looks at the local rq data structures to find the most eligible task
+ * to run next.
+ *
+ * Task enqueue is also under rq->lock, possibly taken from another CPU.
+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
+ * the local CPU to avoid bouncing the runqueue state around [ see
+ * ttwu_queue_wakelist() ]
+ *
+ * Task wakeups, specifically those that involve migration, are horribly
+ * complicated in order to avoid having to take two rq->locks.
+ *
+ * Special state:
+ *
+ * System-calls and anything external will use task_rq_lock() which acquires
+ * both p->pi_lock and rq->lock. As a consequence the state they change is
+ * stable while holding either lock:
+ *
+ * - sched_setaffinity()/
+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
+ * - set_user_nice(): p->se.load, p->*prio
+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
+ * p->se.load, p->rt_priority,
+ * p->dl.dl_{runtime, deadline, period, flags, bw, density}
+ * - sched_setnuma(): p->numa_preferred_nid
+ * - sched_move_task(): p->sched_task_group
+ * - uclamp_update_active(): p->uclamp*
+ *
+ * p->state <- TASK_*:
+ *
+ * is changed locklessly using set_current_state(), __set_current_state() or
+ * set_special_state(), see their respective comments, or by
+ * try_to_wake_up(). The latter uses p->pi_lock to serialize against
+ * concurrent invocations of itself.
+ *
+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
+ *
+ * is set by activate_task() and cleared by deactivate_task(), under
+ * rq->lock. Non-zero indicates the task is runnable, the special
+ * ON_RQ_MIGRATING state is used for migration without holding both
+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
+ *
+ * Additionally it is possible to be ->on_rq but still be considered not
+ * runnable when p->se.sched_delayed is true. These tasks are on the runqueue
+ * but will be dequeued as soon as they get picked again. See the
+ * task_is_runnable() helper.
+ *
+ * p->on_cpu <- { 0, 1 }:
+ *
+ * is set by prepare_task() and cleared by finish_task() such that it will be
+ * set before p is scheduled-in and cleared after p is scheduled-out, both
+ * under rq->lock. Non-zero indicates the task is running on its CPU.
+ *
+ * [ The astute reader will observe that it is possible for two tasks on one
+ * CPU to have ->on_cpu = 1 at the same time. ]
+ *
+ * task_cpu(p): is changed by set_task_cpu(), the rules are:
+ *
+ * - Don't call set_task_cpu() on a blocked task:
+ *
+ * We don't care what CPU we're not running on, this simplifies hotplug,
+ * the CPU assignment of blocked tasks isn't required to be valid.
+ *
+ * - for try_to_wake_up(), called under p->pi_lock:
+ *
+ * This allows try_to_wake_up() to only take one rq->lock, see its comment.
+ *
+ * - for migration called under rq->lock:
+ * [ see task_on_rq_migrating() in task_rq_lock() ]
+ *
+ * o move_queued_task()
+ * o detach_task()
+ *
+ * - for migration called under double_rq_lock():
+ *
+ * o __migrate_swap_task()
+ * o push_rt_task() / pull_rt_task()
+ * o push_dl_task() / pull_dl_task()
+ * o dl_task_offline_migration()
+ *
+ */
+
+void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
+{
+ raw_spinlock_t *lock;
+
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
+ if (sched_core_disabled()) {
+ raw_spin_lock_nested(&rq->__lock, subclass);
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
+ return;
+ }
+
+ for (;;) {
+ lock = __rq_lockp(rq);
+ raw_spin_lock_nested(lock, subclass);
+ if (likely(lock == __rq_lockp(rq))) {
+ /* preempt_count *MUST* be > 1 */
+ preempt_enable_no_resched();
+ return;
+ }
+ raw_spin_unlock(lock);
+ }
+}
+
+bool raw_spin_rq_trylock(struct rq *rq)
+{
+ raw_spinlock_t *lock;
+ bool ret;
+
+ /* Matches synchronize_rcu() in __sched_core_enable() */
+ preempt_disable();
+ if (sched_core_disabled()) {
+ ret = raw_spin_trylock(&rq->__lock);
+ preempt_enable();
+ return ret;
+ }
+
+ for (;;) {
+ lock = __rq_lockp(rq);
+ ret = raw_spin_trylock(lock);
+ if (!ret || (likely(lock == __rq_lockp(rq)))) {
+ preempt_enable();
+ return ret;
+ }
+ raw_spin_unlock(lock);
+ }
+}
+void raw_spin_rq_unlock(struct rq *rq)
+{
+ raw_spin_unlock(rq_lockp(rq));
+}
+
+#ifdef CONFIG_SMP
/*
- * part of the period that we allow rt tasks to run in us.
- * default: 0.95s
+ * double_rq_lock - safely lock two runqueues
*/
-int sysctl_sched_rt_runtime = 950000;
+void double_rq_lock(struct rq *rq1, struct rq *rq2)
+{
+ lockdep_assert_irqs_disabled();
+
+ if (rq_order_less(rq2, rq1))
+ swap(rq1, rq2);
+
+ raw_spin_rq_lock(rq1);
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+
+ double_rq_clock_clear_update(rq1, rq2);
+}
+#endif
/*
* __task_rq_lock - lock the rq @p resides on.
@@ -87,12 +673,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
for (;;) {
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
@@ -111,7 +697,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
/*
* move_queued_task() task_rq_lock()
*
@@ -133,7 +719,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
while (unlikely(task_on_rq_migrating(p)))
@@ -154,38 +740,43 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ if (irqtime_enabled()) {
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
- /*
- * Since irq_time is only updated on {soft,}irq_exit, we might run into
- * this case when a previous update_rq_clock() happened inside a
- * {soft,}irq region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight miss-attribution of {soft,}irq
- * time, a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
- if (irq_delta > delta)
- irq_delta = delta;
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}IRQ region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight misattribution of {soft,}IRQ
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
- rq->prev_irq_time += irq_delta;
- delta -= irq_delta;
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ delayacct_irq(rq->curr, irq_delta);
+ }
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- rq->prev_steal_time_rq += steal;
+ rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
@@ -202,8 +793,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
void update_rq_clock(struct rq *rq)
{
s64 delta;
+ u64 clock;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
@@ -213,20 +805,15 @@ void update_rq_clock(struct rq *rq)
SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
#endif
+ clock = sched_clock_cpu(cpu_of(rq));
+ scx_rq_clock_update(rq, clock);
- delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ delta = clock - rq->clock;
if (delta < 0)
return;
rq->clock += delta;
- update_rq_clock_task(rq, delta);
-}
-static inline void
-rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
-{
- csd->flags = 0;
- csd->func = func;
- csd->info = rq;
+ update_rq_clock_task(rq, delta);
}
#ifdef CONFIG_SCHED_HRTICK
@@ -253,7 +840,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
rq_lock(rq, &rf);
update_rq_clock(rq);
- rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+ rq->donor->sched_class->task_tick(rq, rq->curr, 1);
rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
@@ -264,8 +851,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = rq->hrtick_time;
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
@@ -284,12 +872,11 @@ static void __hrtick_start(void *arg)
/*
* Called to set the hrtick timer state.
*
- * called with rq->lock held and irqs disabled
+ * called with rq->lock held and IRQs disabled
*/
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time;
s64 delta;
/*
@@ -297,9 +884,7 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
- time = ktime_add_ns(timer->base->get_time(), delta);
-
- hrtimer_set_expires(timer, time);
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
if (rq == this_rq())
__hrtick_restart(rq);
@@ -311,7 +896,7 @@ void hrtick_start(struct rq *rq, u64 delay)
/*
* Called to set the hrtick timer state.
*
- * called with rq->lock held and irqs disabled
+ * called with rq->lock held and IRQs disabled
*/
void hrtick_start(struct rq *rq, u64 delay)
{
@@ -329,7 +914,7 @@ void hrtick_start(struct rq *rq, u64 delay)
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
- rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
+ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
@@ -345,21 +930,17 @@ static inline void hrtick_rq_init(struct rq *rq)
#endif /* CONFIG_SCHED_HRTICK */
/*
- * cmpxchg based fetch_or, macro so it works for different integer types
+ * try_cmpxchg based fetch_or() macro so it works for different integer types:
*/
#define fetch_or(ptr, mask) \
({ \
typeof(ptr) _ptr = (ptr); \
typeof(mask) _mask = (mask); \
- typeof(*_ptr) _old, _val = *_ptr; \
+ typeof(*_ptr) _val = *_ptr; \
\
- for (;;) { \
- _old = cmpxchg(_ptr, _val, _val | _mask); \
- if (_old == _val) \
- break; \
- _val = _old; \
- } \
- _old; \
+ do { \
+ } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
+ _val; \
})
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
@@ -368,10 +949,9 @@ static inline void hrtick_rq_init(struct rq *rq)
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- struct thread_info *ti = task_thread_info(p);
- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+ return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
}
/*
@@ -383,30 +963,27 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+ typeof(ti->flags) val = READ_ONCE(ti->flags);
- for (;;) {
+ do {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
- if (old == val)
- break;
- val = old;
- }
+ } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
+
return true;
}
#else
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- set_tsk_need_resched(p);
+ set_ti_thread_flag(ti, tif);
return true;
}
#ifdef CONFIG_SMP
-static bool set_nr_if_polling(struct task_struct *p)
+static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
@@ -419,7 +996,7 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
/*
* Atomically grab the task, if ->wake_q is !nil already it means
- * its already queued (either by us or someone else) and will get the
+ * it's already queued (either by us or someone else) and will get the
* wakeup due to that.
*
* In order to ensure that a pending wakeup will observe our pending
@@ -486,10 +1063,10 @@ void wake_up_q(struct wake_q_head *head)
struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
- BUG_ON(!task);
- /* Task can safely be re-inserted now: */
node = node->next;
- task->wake_q.next = NULL;
+ /* pairs with cmpxchg_relaxed() in __wake_q_add() */
+ WRITE_ONCE(task->wake_q.next, NULL);
+ /* Task can safely be re-inserted now. */
/*
* wake_up_process() executes a full barrier, which pairs with
@@ -507,28 +1084,70 @@ void wake_up_q(struct wake_q_head *head)
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-void resched_curr(struct rq *rq)
+static void __resched_curr(struct rq *rq, int tif)
{
struct task_struct *curr = rq->curr;
+ struct thread_info *cti = task_thread_info(curr);
int cpu;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Always immediately preempt the idle task; no point in delaying doing
+ * actual work.
+ */
+ if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
+ tif = TIF_NEED_RESCHED;
- if (test_tsk_need_resched(curr))
+ if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
return;
cpu = cpu_of(rq);
if (cpu == smp_processor_id()) {
- set_tsk_need_resched(curr);
- set_preempt_need_resched();
+ set_ti_thread_flag(cti, tif);
+ if (tif == TIF_NEED_RESCHED)
+ set_preempt_need_resched();
return;
}
- if (set_nr_and_not_polling(curr))
- smp_send_reschedule(cpu);
- else
+ if (set_nr_and_not_polling(cti, tif)) {
+ if (tif == TIF_NEED_RESCHED)
+ smp_send_reschedule(cpu);
+ } else {
trace_sched_wake_idle_without_ipi(cpu);
+ }
+}
+
+void resched_curr(struct rq *rq)
+{
+ __resched_curr(rq, TIF_NEED_RESCHED);
+}
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+ return static_branch_unlikely(&sk_dynamic_preempt_lazy);
+}
+#else
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+ return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+#endif
+
+static __always_inline int get_lazy_tif_bit(void)
+{
+ if (dynamic_preempt_lazy())
+ return TIF_NEED_RESCHED_LAZY;
+
+ return TIF_NEED_RESCHED;
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+ __resched_curr(rq, get_lazy_tif_bit());
}
void resched_cpu(int cpu)
@@ -536,10 +1155,10 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_rq_lock_irqsave(rq, flags);
if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
}
#ifdef CONFIG_SMP
@@ -550,39 +1169,38 @@ void resched_cpu(int cpu)
*
* We don't do similar optimization for completely idle system, as
* selecting an idle CPU will add more delays to the timers than intended
- * (as that CPU's timer base may not be uptodate wrt jiffies etc).
+ * (as that CPU's timer base may not be up to date wrt jiffies etc).
*/
int get_nohz_timer_target(void)
{
int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
+ const struct cpumask *hk_mask;
- if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
- rcu_read_lock();
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
+
+ guard(rcu)();
+
for_each_domain(cpu, sd) {
- for_each_cpu_and(i, sched_domain_span(sd),
- housekeeping_cpumask(HK_FLAG_TIMER)) {
+ for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
- if (!idle_cpu(i)) {
- cpu = i;
- goto unlock;
- }
+ if (!idle_cpu(i))
+ return i;
}
}
if (default_cpu == -1)
- default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
- cpu = default_cpu;
-unlock:
- rcu_read_unlock();
- return cpu;
+ default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
+
+ return default_cpu;
}
/*
@@ -602,7 +1220,29 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
- if (set_nr_and_not_polling(rq->idle))
+ /*
+ * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
+ * part of the idle loop. This forces an exit from the idle loop
+ * and a round trip to schedule(). Now this could be optimized
+ * because a simple new idle loop iteration is enough to
+ * re-evaluate the next tick, provided some re-ordering of the tick
+ * nohz functions, which would need to follow the clearing of
+ * TIF_POLLING_NRFLAG:
+ *
+ * - On most architectures, a simple fetch_or on ti::flags with a
+ * "0" value would be enough to know if an IPI needs to be sent.
+ *
+ * - x86 needs to perform a last need_resched() check between
+ * monitor and mwait which doesn't take timers into account.
+ * There a dedicated TIF_TIMER flag would be required to
+ * fetch_or here and be checked along with TIF_NEED_RESCHED
+ * before mwait().
+ *
+ * However, remote timer enqueue is not such a frequent event
+ * and testing of the above solutions didn't appear to show
+ * much benefit.
+ */
+ if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
@@ -648,19 +1288,33 @@ static void nohz_csd_func(void *info)
/*
* Release the rq::nohz_csd.
*/
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
WARN_ON(!(flags & NOHZ_KICK_MASK));
rq->idle_balance = idle_cpu(cpu);
- if (rq->idle_balance && !need_resched()) {
+ if (rq->idle_balance) {
rq->nohz_idle_balance = flags;
- raise_softirq_irqoff(SCHED_SOFTIRQ);
+ __raise_softirq_irqoff(SCHED_SOFTIRQ);
}
}
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
+static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
+{
+ if (rq->nr_running != 1)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ if (!task_on_rq_queued(p))
+ return false;
+
+ return true;
+}
+
bool sched_can_stop_tick(struct rq *rq)
{
int fifo_nr_running;
@@ -670,7 +1324,7 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
/*
- * If there are more than one RR tasks, we need the tick to effect the
+ * If there is more than one RR task, we need the tick to affect the
* actual RR behaviour.
*/
if (rq->rt.rr_nr_running) {
@@ -689,13 +1343,28 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
/*
- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
- * if there's more than one we need the tick for involuntary
- * preemption.
+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks
+ * left. For CFS, if there's more than one we need the tick for
+ * involuntary preemption. For SCX, ask.
*/
- if (rq->nr_running > 1)
+ if (scx_enabled() && !scx_can_stop_tick(rq))
return false;
+ if (rq->cfs.h_nr_queued > 1)
+ return false;
+
+ /*
+ * If there is one task and it has CFS runtime bandwidth constraints
+ * and it's on the CPU now, we don't want to stop the tick.
+ * This check prevents clearing the bit if a newly enqueued task here is
+ * dequeued by migrating while the constrained task continues to run.
+ * E.g. going from 2->1 without going through pick_next_task().
+ */
+ if (__need_bw_check(rq, rq->curr)) {
+ if (cfs_task_bw_constrained(rq->curr))
+ return false;
+ }
+
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -746,30 +1415,27 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-static void set_load_weight(struct task_struct *p, bool update_load)
+void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
- struct load_weight *load = &p->se.load;
+ struct load_weight lw;
- /*
- * SCHED_IDLE tasks get minimal weight:
- */
if (task_has_idle_policy(p)) {
- load->weight = scale_load(WEIGHT_IDLEPRIO);
- load->inv_weight = WMULT_IDLEPRIO;
- return;
+ lw.weight = scale_load(WEIGHT_IDLEPRIO);
+ lw.inv_weight = WMULT_IDLEPRIO;
+ } else {
+ lw.weight = scale_load(sched_prio_to_weight[prio]);
+ lw.inv_weight = sched_prio_to_wmult[prio];
}
/*
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
- if (update_load && p->sched_class == &fair_sched_class) {
- reweight_task(p, prio);
- } else {
- load->weight = scale_load(sched_prio_to_weight[prio]);
- load->inv_weight = sched_prio_to_wmult[prio];
- }
+ if (update_load && p->sched_class->reweight_task)
+ p->sched_class->reweight_task(task_rq(p), p, &lw);
+ else
+ p->se.load = lw;
}
#ifdef CONFIG_UCLAMP_TASK
@@ -783,47 +1449,53 @@ static void set_load_weight(struct task_struct *p, bool update_load)
* requests are serialized using a mutex to reduce the risk of conflicting
* updates or API abuses.
*/
-static DEFINE_MUTEX(uclamp_mutex);
+static __maybe_unused DEFINE_MUTEX(uclamp_mutex);
/* Max allowed minimum utilization */
-unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/* Max allowed maximum utilization */
-unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+
+/*
+ * By default RT tasks run at the maximum performance point/capacity of the
+ * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
+ * SCHED_CAPACITY_SCALE.
+ *
+ * This knob allows admins to change the default behavior when uclamp is being
+ * used. In battery powered devices, particularly, running at the maximum
+ * capacity and frequency will increase energy consumption and shorten the
+ * battery life.
+ *
+ * This knob only affects RT tasks whose uclamp_se->user_defined == false.
+ *
+ * This knob will not override the system default sched_util_clamp_min defined
+ * above.
+ */
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
-/* Integer rounded range for each bucket */
-#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
-
-#define for_each_clamp_id(clamp_id) \
- for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
-
-static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
-{
- return clamp_value / UCLAMP_BUCKET_DELTA;
-}
-
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
-{
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
-}
-
-static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
-{
- if (clamp_id == UCLAMP_MIN)
- return 0;
- return SCHED_CAPACITY_SCALE;
-}
-
-static inline void uclamp_se_set(struct uclamp_se *uc_se,
- unsigned int value, bool user_defined)
-{
- uc_se->value = value;
- uc_se->bucket_id = uclamp_bucket_id(value);
- uc_se->user_defined = user_defined;
-}
+/*
+ * This static key is used to reduce the uclamp overhead in the fast path. It
+ * primarily disables the call to uclamp_rq_{inc, dec}() in
+ * enqueue/dequeue_task().
+ *
+ * This allows users to continue to enable uclamp in their kernel config with
+ * minimum uclamp overhead in the fast path.
+ *
+ * As soon as userspace modifies any of the uclamp knobs, the static key is
+ * enabled, since we then have actual users that make use of uclamp
+ * functionality.
+ *
+ * The knobs that would enable this static key are:
+ *
+ * * A task modifying its uclamp value with sched_setattr().
+ * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
+ * * An admin modifying the cgroup cpu.uclamp.{min, max}
+ */
+DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
static inline unsigned int
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
@@ -849,7 +1521,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
return;
- WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
+ uclamp_rq_set(rq, clamp_id, clamp_value);
}
static inline
@@ -873,12 +1545,40 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
+static void __uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ unsigned int default_util_min;
+ struct uclamp_se *uc_se;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ uc_se = &p->uclamp_req[UCLAMP_MIN];
+
+ /* Only sync if user didn't override the default */
+ if (uc_se->user_defined)
+ return;
+
+ default_util_min = sysctl_sched_uclamp_util_min_rt_default;
+ uclamp_se_set(uc_se, default_util_min, false);
+}
+
+static void uclamp_update_util_min_rt_default(struct task_struct *p)
+{
+ if (!rt_task(p))
+ return;
+
+ /* Protect updates to p->uclamp_* */
+ guard(task_rq_lock)(p);
+ __uclamp_update_util_min_rt_default(p);
+}
+
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
+ /* Copy by value as we could modify it */
struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
- struct uclamp_se uc_max;
+ unsigned int tg_min, tg_max, value;
/*
* Tasks in autogroups or root task group will be
@@ -889,9 +1589,11 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
if (task_group(p) == &root_task_group)
return uc_req;
- uc_max = task_group(p)->uclamp[clamp_id];
- if (uc_req.value > uc_max.value || !uc_req.user_defined)
- return uc_max;
+ tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
+ tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
+ value = uc_req.value;
+ value = clamp(value, tg_min, tg_max);
+ uclamp_se_set(&uc_req, value, false);
#endif
return uc_req;
@@ -948,7 +1650,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
/* Update task effective clamp */
p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
@@ -966,8 +1668,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
if (bucket->tasks == 1 || uc_se->value > bucket->value)
bucket->value = uc_se->value;
- if (uc_se->value > READ_ONCE(uc_rq->value))
- WRITE_ONCE(uc_rq->value, uc_se->value);
+ if (uc_se->value > uclamp_rq_get(rq, clamp_id))
+ uclamp_rq_set(rq, clamp_id, uc_se->value);
}
/*
@@ -988,12 +1690,40 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
unsigned int bkt_clamp;
unsigned int rq_clamp;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * If sched_uclamp_used was enabled after task @p was enqueued,
+ * we could end up with an unbalanced call to uclamp_rq_dec_id().
+ *
+ * In this case the uc_se->active flag should be false since no uclamp
+ * accounting was performed at enqueue time and we can just return
+ * here.
+ *
+ * We need to be careful of the following enqueue/dequeue ordering
+ * problem too:
+ *
+ * enqueue(taskA)
+ * // sched_uclamp_used gets enabled
+ * enqueue(taskB)
+ * dequeue(taskA)
+ * // Must not decrement bucket->tasks here
+ * dequeue(taskB)
+ *
+ * where we could end up with stale data in uc_se and
+ * bucket[uc_se->bucket_id].
+ *
+ * The following check here eliminates the possibility of such a race.
+ */
+ if (unlikely(!uc_se->active))
+ return;
bucket = &uc_rq->bucket[uc_se->bucket_id];
+
SCHED_WARN_ON(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
+
uc_se->active = false;
/*
@@ -1005,15 +1735,15 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
if (likely(bucket->tasks))
return;
- rq_clamp = READ_ONCE(uc_rq->value);
+ rq_clamp = uclamp_rq_get(rq, clamp_id);
/*
* Defensive programming: this should never happen. If it happens,
- * e.g. due to future modification, warn and fixup the expected value.
+ * e.g. due to future modification, warn and fix up the expected value.
*/
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
- WRITE_ONCE(uc_rq->value, bkt_clamp);
+ uclamp_rq_set(rq, clamp_id, bkt_clamp);
}
}
@@ -1021,9 +1751,21 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
+ /*
+ * Avoid any overhead until uclamp is actually used by the userspace.
+ *
+ * The condition is constructed such that a NOP is generated when
+ * sched_uclamp_used is disabled.
+ */
+ if (!static_branch_unlikely(&sched_uclamp_used))
+ return;
+
if (unlikely(!p->sched_class->uclamp_enabled))
return;
+ if (p->se.sched_delayed)
+ return;
+
for_each_clamp_id(clamp_id)
uclamp_rq_inc_id(rq, p, clamp_id);
@@ -1036,16 +1778,46 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
+ /*
+ * Avoid any overhead until uclamp is actually used by the userspace.
+ *
+ * The condition is constructed such that a NOP is generated when
+ * sched_uclamp_used is disabled.
+ */
+ if (!static_branch_unlikely(&sched_uclamp_used))
+ return;
+
if (unlikely(!p->sched_class->uclamp_enabled))
return;
+ if (p->se.sched_delayed)
+ return;
+
for_each_clamp_id(clamp_id)
uclamp_rq_dec_id(rq, p, clamp_id);
}
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (!p->uclamp[clamp_id].active)
+ return;
+
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /*
+ * Make sure to clear the idle flag if we've transiently reached 0
+ * active tasks on rq.
+ */
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
static inline void
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+uclamp_update_active(struct task_struct *p)
{
+ enum uclamp_id clamp_id;
struct rq_flags rf;
struct rq *rq;
@@ -1065,34 +1837,30 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
* affecting a valid clamp bucket, the next time it's enqueued,
* it will already see the updated clamp bucket value.
*/
- if (p->uclamp[clamp_id].active) {
- uclamp_rq_dec_id(rq, p, clamp_id);
- uclamp_rq_inc_id(rq, p, clamp_id);
- }
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_reinc_id(rq, p, clamp_id);
task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_UCLAMP_TASK_GROUP
static inline void
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
- unsigned int clamps)
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
{
- enum uclamp_id clamp_id;
struct css_task_iter it;
struct task_struct *p;
css_task_iter_start(css, 0, &it);
- while ((p = css_task_iter_next(&it))) {
- for_each_clamp_id(clamp_id) {
- if ((0x1 << clamp_id) & clamps)
- uclamp_update_active(p, clamp_id);
- }
- }
+ while ((p = css_task_iter_next(&it)))
+ uclamp_update_active(p);
css_task_iter_end(&it);
}
static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+#endif
+
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_UCLAMP_TASK_GROUP
static void uclamp_update_root_tg(void)
{
struct task_group *tg = &root_task_group;
@@ -1102,33 +1870,62 @@ static void uclamp_update_root_tg(void)
uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
- rcu_read_lock();
+ guard(rcu)();
cpu_util_update_eff(&root_task_group.css);
- rcu_read_unlock();
}
#else
static void uclamp_update_root_tg(void) { }
#endif
-int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+static void uclamp_sync_util_min_rt_default(void)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * copy_process() sysctl_uclamp
+ * uclamp_min_rt = X;
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
+ * // link thread smp_mb__after_spinlock()
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
+ * sched_post_fork() for_each_process_thread()
+ * __uclamp_sync_rt() __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
+ */
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);
+
+ guard(rcu)();
+ for_each_process_thread(g, p)
+ uclamp_update_util_min_rt_default(p);
+}
+
+static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
- int old_min, old_max;
+ int old_min, old_max, old_min_rt;
int result;
- mutex_lock(&uclamp_mutex);
+ guard(mutex)(&uclamp_mutex);
+
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
+ old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
result = proc_dointvec(table, write, buffer, lenp, ppos);
if (result)
goto undo;
if (!write)
- goto done;
+ return 0;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
- sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
+ sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
+
result = -EINVAL;
goto undo;
}
@@ -1144,97 +1941,68 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
update_root_tg = true;
}
- if (update_root_tg)
+ if (update_root_tg) {
+ static_branch_enable(&sched_uclamp_used);
uclamp_update_root_tg();
+ }
+
+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
+ static_branch_enable(&sched_uclamp_used);
+ uclamp_sync_util_min_rt_default();
+ }
/*
* We update all RUNNABLE tasks only when task groups are in use.
* Otherwise, keep it simple and do just a lazy update at each next
* task enqueue time.
*/
-
- goto done;
+ return 0;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
-done:
- mutex_unlock(&uclamp_mutex);
-
+ sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
return result;
}
+#endif
-static int uclamp_validate(struct task_struct *p,
- const struct sched_attr *attr)
-{
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
-
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
- lower_bound = attr->sched_util_min;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
- upper_bound = attr->sched_util_max;
-
- if (lower_bound > upper_bound)
- return -EINVAL;
- if (upper_bound > SCHED_CAPACITY_SCALE)
- return -EINVAL;
-
- return 0;
-}
-
-static void __setscheduler_uclamp(struct task_struct *p,
- const struct sched_attr *attr)
+static void uclamp_fork(struct task_struct *p)
{
enum uclamp_id clamp_id;
/*
- * On scheduling class change, reset to default clamps for tasks
- * without a task-specific value.
+ * We don't need to hold task_rq_lock() when updating p->uclamp_* here
+ * as the task is still at its early fork stages.
*/
- for_each_clamp_id(clamp_id) {
- struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
- unsigned int clamp_value = uclamp_none(clamp_id);
-
- /* Keep using defined clamps across class changes */
- if (uc_se->user_defined)
- continue;
-
- /* By default, RT tasks always get 100% boost */
- if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- clamp_value = uclamp_none(UCLAMP_MAX);
-
- uclamp_se_set(uc_se, clamp_value, false);
- }
+ for_each_clamp_id(clamp_id)
+ p->uclamp[clamp_id].active = false;
- if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
+ if (likely(!p->sched_reset_on_fork))
return;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
- uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
- attr->sched_util_min, true);
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&p->uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
}
+}
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
- uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
- attr->sched_util_max, true);
- }
+static void uclamp_post_fork(struct task_struct *p)
+{
+ uclamp_update_util_min_rt_default(p);
}
-static void uclamp_fork(struct task_struct *p)
+static void __init init_uclamp_rq(struct rq *rq)
{
enum uclamp_id clamp_id;
-
- for_each_clamp_id(clamp_id)
- p->uclamp[clamp_id].active = false;
-
- if (likely(!p->sched_reset_on_fork))
- return;
+ struct uclamp_rq *uc_rq = rq->uclamp;
for_each_clamp_id(clamp_id) {
- uclamp_se_set(&p->uclamp_req[clamp_id],
- uclamp_none(clamp_id), false);
+ uc_rq[clamp_id] = (struct uclamp_rq) {
+ .value = uclamp_none(clamp_id)
+ };
}
+
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
}
static void __init init_uclamp(void)
@@ -1243,13 +2011,8 @@ static void __init init_uclamp(void)
enum uclamp_id clamp_id;
int cpu;
- mutex_init(&uclamp_mutex);
-
- for_each_possible_cpu(cpu) {
- memset(&cpu_rq(cpu)->uclamp, 0,
- sizeof(struct uclamp_rq)*UCLAMP_CNT);
- cpu_rq(cpu)->uclamp_flags = 0;
- }
+ for_each_possible_cpu(cpu)
+ init_uclamp_rq(cpu_rq(cpu));
for_each_clamp_id(clamp_id) {
uclamp_se_set(&init_task.uclamp_req[clamp_id],
@@ -1267,108 +2030,115 @@ static void __init init_uclamp(void)
}
}
-#else /* CONFIG_UCLAMP_TASK */
+#else /* !CONFIG_UCLAMP_TASK */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
-static inline int uclamp_validate(struct task_struct *p,
- const struct sched_attr *attr)
-{
- return -EOPNOTSUPP;
-}
-static void __setscheduler_uclamp(struct task_struct *p,
- const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
+static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
-static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+bool sched_task_on_rq(struct task_struct *p)
+{
+ return task_on_rq_queued(p);
+}
+
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long ip = 0;
+ unsigned int state;
+
+ if (!p || p == current)
+ return 0;
+
+ /* Only get wchan if task is blocked and we can keep it that way. */
+ raw_spin_lock_irq(&p->pi_lock);
+ state = READ_ONCE(p->__state);
+ smp_rmb(); /* see try_to_wake_up() */
+ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
+ ip = __get_wchan(p);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return ip;
+}
+
+void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & ENQUEUE_RESTORE)) {
- sched_info_queued(rq, p);
- psi_enqueue(p, flags & ENQUEUE_WAKEUP);
- }
-
- uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+ /*
+ * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
+ * ->sched_delayed.
+ */
+ uclamp_rq_inc(rq, p);
+
+ psi_enqueue(p, flags);
+
+ if (!(flags & ENQUEUE_RESTORE))
+ sched_info_enqueue(rq, p);
+
+ if (sched_core_enabled(rq))
+ sched_core_enqueue(rq, p);
}
-static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+/*
+ * Must only return false when DEQUEUE_SLEEP.
+ */
+inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (sched_core_enabled(rq))
+ sched_core_dequeue(rq, p, flags);
+
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & DEQUEUE_SAVE)) {
- sched_info_dequeued(rq, p);
- psi_dequeue(p, flags & DEQUEUE_SLEEP);
- }
+ if (!(flags & DEQUEUE_SAVE))
+ sched_info_dequeue(rq, p);
+
+ psi_dequeue(p, flags);
+ /*
+ * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
+ * and mark the task ->sched_delayed.
+ */
uclamp_rq_dec(rq, p);
- p->sched_class->dequeue_task(rq, p, flags);
+ return p->sched_class->dequeue_task(rq, p, flags);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (task_on_rq_migrating(p))
+ flags |= ENQUEUE_MIGRATED;
+ if (flags & ENQUEUE_MIGRATED)
+ sched_mm_cid_migrate_to(rq, p);
+
enqueue_task(rq, p, flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+ SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
- dequeue_task(rq, p, flags);
-}
-
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
-{
- return p->static_prio;
-}
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
-/*
- * Calculate the expected normal priority: i.e. priority
- * without taking RT-inheritance into account. Might be
- * boosted by interactivity modifiers. Changes upon fork,
- * setprio syscalls, and whenever the interactivity
- * estimator recalculates.
- */
-static inline int normal_prio(struct task_struct *p)
-{
- int prio;
+ /*
+ * Code explicitly relies on TASK_ON_RQ_MIGRATING being set *before*
+ * dequeue_task() and cleared *after* enqueue_task().
+ */
- if (task_has_dl_policy(p))
- prio = MAX_DL_PRIO-1;
- else if (task_has_rt_policy(p))
- prio = MAX_RT_PRIO-1 - p->rt_priority;
- else
- prio = __normal_prio(p);
- return prio;
+ dequeue_task(rq, p, flags);
}
-/*
- * Calculate the current priority, i.e. the priority
- * taken into account by the scheduler. This value might
- * be boosted by RT tasks, or might be boosted by
- * interactivity modifiers. Will be RT if the task got
- * RT-boosted. If not then it returns p->normal_prio.
- */
-static int effective_prio(struct task_struct *p)
+static void block_task(struct rq *rq, struct task_struct *p, int flags)
{
- p->normal_prio = normal_prio(p);
- /*
- * If we are RT tasks or we were boosted to RT priority,
- * keep the priority unchanged. Otherwise, update priority
- * to the normal priority:
- */
- if (!rt_prio(p->prio))
- return p->normal_prio;
- return p->prio;
+ if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags))
+ __block_task(rq, p);
}
/**
@@ -1383,15 +2153,26 @@ inline int task_curr(const struct task_struct *p)
}
/*
+ * ->switching_to() is called with the pi_lock and rq_lock held and must not
+ * mess with locking.
+ */
+void check_class_changing(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class)
+{
+ if (prev_class != p->sched_class && p->sched_class->switching_to)
+ p->sched_class->switching_to(rq, p);
+}
+
+/*
* switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
* use the balance_callback list if you want balancing.
*
* this means any call to check_class_changed() must be followed by a call to
* balance_callback().
*/
-static inline void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
+void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio)
{
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
@@ -1402,46 +2183,279 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
p->sched_class->prio_changed(rq, p, oldprio);
}
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
- const struct sched_class *class;
+ struct task_struct *donor = rq->donor;
- if (p->sched_class == rq->curr->sched_class) {
- rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- } else {
- for_each_class(class) {
- if (class == rq->curr->sched_class)
- break;
- if (class == p->sched_class) {
- resched_curr(rq);
- break;
- }
- }
- }
+ if (p->sched_class == donor->sched_class)
+ donor->sched_class->wakeup_preempt(rq, p, flags);
+ else if (sched_class_above(p->sched_class, donor->sched_class))
+ resched_curr(rq);
/*
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
+ if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr))
rq_clock_skip_update(rq);
}
+static __always_inline
+int __task_state_match(struct task_struct *p, unsigned int state)
+{
+ if (READ_ONCE(p->__state) & state)
+ return 1;
+
+ if (READ_ONCE(p->saved_state) & state)
+ return -1;
+
+ return 0;
+}
+
+static __always_inline
+int task_state_match(struct task_struct *p, unsigned int state)
+{
+ /*
+ * Serialize against current_save_and_set_rtlock_wait_state(),
+ * current_restore_rtlock_saved_state(), and __refrigerator().
+ */
+ guard(raw_spinlock_irq)(&p->pi_lock);
+ return __task_state_match(p, state);
+}
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
+{
+ int running, queued, match;
+ struct rq_flags rf;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_on_cpu()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_on_cpu(rq, p)) {
+ if (!task_state_match(p, match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &rf);
+ trace_sched_wait_task(p);
+ running = task_on_cpu(rq, p);
+ queued = task_on_rq_queued(p);
+ ncsw = 0;
+ if ((match = __task_state_match(p, match_state))) {
+ /*
+ * When matching on p->saved_state, consider this task
+ * still queued so it will wait.
+ */
+ if (match < 0)
+ queued = 1;
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(queued)) {
+ ktime_t to = NSEC_PER_SEC / HZ;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
#ifdef CONFIG_SMP
+static void
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+ struct affinity_context ac = {
+ .new_mask = cpumask_of(rq->cpu),
+ .flags = SCA_MIGRATE_DISABLE,
+ };
+
+ if (likely(!p->migration_disabled))
+ return;
+
+ if (p->cpus_ptr != &p->cpus_mask)
+ return;
+
+ /*
+ * Violates locking rules! See comment in __do_set_cpus_allowed().
+ */
+ __do_set_cpus_allowed(p, &ac);
+}
+
+void migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Warn about overflow half-way through the range.
+ */
+ WARN_ON_ONCE((s16)p->migration_disabled < 0);
+#endif
+ p->migration_disabled++;
+ return;
+ }
+
+ guard(preempt)();
+ this_rq()->nr_pinned++;
+ p->migration_disabled = 1;
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+ struct task_struct *p = current;
+ struct affinity_context ac = {
+ .new_mask = &p->cpus_mask,
+ .flags = SCA_MIGRATE_ENABLE,
+ };
+
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Check both overflow from migrate_disable() and superfluous
+ * migrate_enable().
+ */
+ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
+ return;
+#endif
+
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
+ }
+
+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ guard(preempt)();
+ if (p->cpus_ptr != &p->cpus_mask)
+ __set_cpus_allowed_ptr(p, &ac);
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
+ barrier();
+ p->migration_disabled = 0;
+ this_rq()->nr_pinned--;
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return rq->nr_pinned;
+}
+
/*
* Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ /* When not in the task's cpumask, no point in looking further. */
+ if (!task_allowed_on_cpu(p, cpu))
return false;
- if (is_per_cpu_kthread(p))
+ /* migrate_disabled() must be allowed to finish. */
+ if (is_migration_disabled(p))
+ return cpu_online(cpu);
+
+ /* Non kernel threads are not allowed during either online or offline. */
+ if (!(p->flags & PF_KTHREAD))
+ return cpu_active(cpu);
+
+ /* KTHREAD_IS_PER_CPU is always allowed. */
+ if (kthread_is_per_cpu(p))
return cpu_online(cpu);
- return cpu_active(cpu);
+ /* Regular kernel threads don't get to stay during offline. */
+ if (cpu_dying(cpu))
+ return false;
+
+ /* But are allowed during online. */
+ return cpu_online(cpu);
}
/*
@@ -1466,27 +2480,38 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int new_cpu)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
- WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
- dequeue_task(rq, p, DEQUEUE_NOCLOCK);
+ deactivate_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
rq = cpu_rq(new_cpu);
rq_lock(rq, rf);
- BUG_ON(task_cpu(p) != new_cpu);
- enqueue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_QUEUED;
- check_preempt_curr(rq, p, 0);
+ WARN_ON_ONCE(task_cpu(p) != new_cpu);
+ activate_task(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
return rq;
}
struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
+ struct task_struct *task;
+ int dest_cpu;
+ struct set_affinity_pending *pending;
+};
+
+/*
+ * @refs: number of wait_for_completion()
+ * @stop_pending: is @stop_work in use
+ */
+struct set_affinity_pending {
+ refcount_t refs;
+ unsigned int stop_pending;
+ struct completion done;
+ struct cpu_stop_work stop_work;
+ struct migration_arg arg;
};
/*
@@ -1505,53 +2530,160 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
if (!is_cpu_allowed(p, dest_cpu))
return rq;
- update_rq_clock(rq);
rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
}
/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * migration_cpu_stop - this will be executed by a high-prio stopper thread
* and performs thread migration by bumping thread off CPU then
* 'pushing' onto another runqueue.
*/
static int migration_cpu_stop(void *data)
{
struct migration_arg *arg = data;
+ struct set_affinity_pending *pending = arg->pending;
struct task_struct *p = arg->task;
struct rq *rq = this_rq();
+ bool complete = false;
struct rq_flags rf;
/*
* The original target CPU might have gone down and we might
* be on another CPU but it doesn't matter.
*/
- local_irq_disable();
+ local_irq_save(rf.flags);
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
+
+ /*
+ * If we were passed a pending, then ->stop_pending was set, thus
+ * p->migration_pending must have remained stable.
+ */
+ WARN_ON_ONCE(pending && pending != p->migration_pending);
+
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
if (task_rq(p) == rq) {
- if (task_on_rq_queued(p))
+ if (is_migration_disabled(p))
+ goto out;
+
+ if (pending) {
+ p->migration_pending = NULL;
+ complete = true;
+
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
+ goto out;
+ }
+
+ if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
- else
+ } else {
p->wake_cpu = arg->dest_cpu;
+ }
+
+ /*
+ * XXX __migrate_task() can fail, at which point we might end
+ * up running on a dodgy CPU, AFAICT this can only happen
+ * during CPU hotplug, at which point we'll get pushed out
+ * anyway, so it's probably not a big deal.
+ */
+
+ } else if (pending) {
+ /*
+ * This happens when we get migrated between migrate_enable()'s
+ * preempt_enable() and scheduling the stopper task. At that
+ * point we're a regular task again and not current anymore.
+ *
+ * A !PREEMPT kernel has a giant hole here, which makes it far
+ * more likely.
+ */
+
+ /*
+ * The task moved before the stopper got to run. We're holding
+ * ->pi_lock, so the allowed mask is stable - if it got
+ * somewhere allowed, we're done.
+ */
+ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+ p->migration_pending = NULL;
+ complete = true;
+ goto out;
+ }
+
+ /*
+ * When migrate_enable() hits a rq mis-match we can't reliably
+ * determine is_migration_disabled() and so have to chase after
+ * it.
+ */
+ WARN_ON_ONCE(!pending->stop_pending);
+ preempt_disable();
+ task_rq_unlock(rq, p, &rf);
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ preempt_enable();
+ return 0;
}
- rq_unlock(rq, &rf);
- raw_spin_unlock(&p->pi_lock);
+out:
+ if (pending)
+ pending->stop_pending = false;
+ task_rq_unlock(rq, p, &rf);
+
+ if (complete)
+ complete_all(&pending->done);
- local_irq_enable();
+ return 0;
+}
+
+int push_cpu_stop(void *arg)
+{
+ struct rq *lowest_rq = NULL, *rq = this_rq();
+ struct task_struct *p = arg;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ raw_spin_rq_lock(rq);
+
+ if (task_rq(p) != rq)
+ goto out_unlock;
+
+ if (is_migration_disabled(p)) {
+ p->migration_flags |= MDF_PUSH;
+ goto out_unlock;
+ }
+
+ p->migration_flags &= ~MDF_PUSH;
+
+ if (p->sched_class->find_lock_rq)
+ lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+ if (!lowest_rq)
+ goto out_unlock;
+
+ // XXX validate p is still the highest prio task
+ if (task_rq(p) == rq) {
+ move_queued_task_locked(rq, lowest_rq, p);
+ resched_curr(lowest_rq);
+ }
+
+ double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+ rq->push_busy = false;
+ raw_spin_rq_unlock(rq);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
return 0;
}
@@ -1559,34 +2691,62 @@ static int migration_cpu_stop(void *data)
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
*/
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
{
- cpumask_copy(&p->cpus_mask, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
+ if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+ p->cpus_ptr = ctx->new_mask;
+ return;
+ }
+
+ cpumask_copy(&p->cpus_mask, ctx->new_mask);
+ p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+
+ /*
+ * Swap in a new user_cpus_ptr if the SCA_USER flag is set.
+ */
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
}
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
struct rq *rq = task_rq(p);
bool queued, running;
- lockdep_assert_held(&p->pi_lock);
+ /*
+ * This here violates the locking rules for affinity, since we're only
+ * supposed to change these variables while holding both rq->lock and
+ * p->pi_lock.
+ *
+ * HOWEVER, it magically works, because ttwu() is the only code that
+ * accesses these variables under p->pi_lock and only does so after
+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+ * before finish_task().
+ *
+ * XXX do further audits, this smells like something putrid.
+ */
+ if (ctx->flags & SCA_MIGRATE_DISABLE)
+ SCHED_WARN_ON(!p->on_cpu);
+ else
+ lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued) {
/*
* Because __kthread_bind() calls this on blocked tasks without
* holding rq->lock.
*/
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
put_prev_task(rq, p);
- p->sched_class->set_cpus_allowed(p, new_mask);
+ p->sched_class->set_cpus_allowed(p, ctx);
+ mm_set_cpus_allowed(p->mm, ctx->new_mask);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1595,113 +2755,558 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
}
/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
+ * Used for kthread_bind() and select_fallback_rq(), in both cases the user
+ * affinity (if any) should be destroyed too.
+ */
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .user_mask = NULL,
+ .flags = SCA_USER, /* clear the user requested mask */
+ };
+ union cpumask_rcuhead {
+ cpumask_t cpumask;
+ struct rcu_head rcu;
+ };
+
+ __do_set_cpus_allowed(p, &ac);
+
+ /*
+ * Because this is called with p->pi_lock held, it is not possible
+ * to use kfree() here (when PREEMPT_RT=y), therefore punt to using
+ * kfree_rcu().
+ */
+ kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
+}
+
+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+ int node)
+{
+ cpumask_t *user_mask;
+ unsigned long flags;
+
+ /*
+ * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
+ * may differ by now due to racing.
+ */
+ dst->user_cpus_ptr = NULL;
+
+ /*
+ * This check is racy and losing the race is a valid situation.
+ * It is not worth the extra overhead of taking the pi_lock on
+ * every fork/clone.
+ */
+ if (data_race(!src->user_cpus_ptr))
+ return 0;
+
+ user_mask = alloc_user_cpus_ptr(node);
+ if (!user_mask)
+ return -ENOMEM;
+
+ /*
+ * Use pi_lock to protect content of user_cpus_ptr
+ *
+ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
+ * do_set_cpus_allowed().
+ */
+ raw_spin_lock_irqsave(&src->pi_lock, flags);
+ if (src->user_cpus_ptr) {
+ swap(dst->user_cpus_ptr, user_mask);
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ }
+ raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+
+ if (unlikely(user_mask))
+ kfree(user_mask);
+
+ return 0;
+}
+
+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+{
+ struct cpumask *user_mask = NULL;
+
+ swap(p->user_cpus_ptr, user_mask);
+
+ return user_mask;
+}
+
+void release_user_cpus_ptr(struct task_struct *p)
+{
+ kfree(clear_user_cpus_ptr(p));
+}
+
+/*
+ * This function is wildly self concurrent; here be dragons.
+ *
+ *
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
+ * designated task is enqueued on an allowed CPU. If that task is currently
+ * running, we have to kick it out using the CPU stopper.
+ *
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
+ * Consider:
+ *
+ * Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ * P0@CPU0 P1
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ *
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
+ * This means we need the following scheme:
+ *
+ * P0@CPU0 P1
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ * <blocks>
+ * <resumes>
+ * migrate_enable();
+ * __set_cpus_allowed_ptr();
+ * <wakes local stopper>
+ * `--> <woken on migration completion>
+ *
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
+ * should come into effect at the end of the Migrate-Disable region is the last
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
+ * but we still need to properly signal those waiting tasks at the appropriate
+ * moment.
+ *
+ * This is implemented using struct set_affinity_pending. The first
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
+ * set up an instance of that struct and install it on the targeted task_struct.
+ * Any and all further callers will reuse that instance. Those then wait for
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
+ *
+ *
+ * (1) In the cases covered above. There is one more where the completion is
+ * signaled within affine_move_task() itself: when a subsequent affinity request
+ * occurs after the stopper bailed out due to the targeted task still being
+ * Migrate-Disable. Consider:
+ *
+ * Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ * CPU0 P1 P2
+ * <P0>
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ * <blocks>
+ * <migration/0>
+ * migration_cpu_stop()
+ * is_migration_disabled()
+ * <bails>
+ * set_cpus_allowed_ptr(P0, [0, 1]);
+ * <signal completion>
+ * <awakes>
+ *
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
+ * pending affinity completion is preceded by an uninstallation of
+ * p->migration_pending done with p->pi_lock held.
+ */
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+ int dest_cpu, unsigned int flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ struct set_affinity_pending my_pending = { }, *pending = NULL;
+ bool stop_pending, complete = false;
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ struct task_struct *push_task = NULL;
+
+ if ((flags & SCA_MIGRATE_ENABLE) &&
+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+ rq->push_busy = true;
+ push_task = get_task_struct(p);
+ }
+
+ /*
+ * If there are pending waiters, but no pending stop_work,
+ * then complete now.
+ */
+ pending = p->migration_pending;
+ if (pending && !pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
+
+ preempt_disable();
+ task_rq_unlock(rq, p, rf);
+ if (push_task) {
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ p, &rq->push_work);
+ }
+ preempt_enable();
+
+ if (complete)
+ complete_all(&pending->done);
+
+ return 0;
+ }
+
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
+ /* serialized by p->pi_lock */
+ if (!p->migration_pending) {
+ /* Install the request */
+ refcount_set(&my_pending.refs, 1);
+ init_completion(&my_pending.done);
+ my_pending.arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = dest_cpu,
+ .pending = &my_pending,
+ };
+
+ p->migration_pending = &my_pending;
+ } else {
+ pending = p->migration_pending;
+ refcount_inc(&pending->refs);
+ /*
+ * Affinity has changed, but we've already installed a
+ * pending. migration_cpu_stop() *must* see this, else
+ * we risk a completion of the pending despite having a
+ * task on a disallowed CPU.
+ *
+ * Serialized by p->pi_lock, so this is safe.
+ */
+ pending->arg.dest_cpu = dest_cpu;
+ }
+ }
+ pending = p->migration_pending;
+ /*
+ * - !MIGRATE_ENABLE:
+ * we'll have installed a pending if there wasn't one already.
+ *
+ * - MIGRATE_ENABLE:
+ * we're here because the current CPU isn't matching anymore,
+ * the only way that can happen is because of a concurrent
+ * set_cpus_allowed_ptr() call, which should then still be
+ * pending completion.
+ *
+ * Either way, we really should have a @pending here.
+ */
+ if (WARN_ON_ONCE(!pending)) {
+ task_rq_unlock(rq, p, rf);
+ return -EINVAL;
+ }
+
+ if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
+ /*
+ * MIGRATE_ENABLE gets here because 'p == current', but for
+ * anything else we cannot do is_migration_disabled(), punt
+ * and have the stopper function handle it all race-free.
+ */
+ stop_pending = pending->stop_pending;
+ if (!stop_pending)
+ pending->stop_pending = true;
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ p->migration_flags &= ~MDF_PUSH;
+
+ preempt_disable();
+ task_rq_unlock(rq, p, rf);
+ if (!stop_pending) {
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ }
+ preempt_enable();
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ return 0;
+ } else {
+
+ if (!is_migration_disabled(p)) {
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(rq, rf, p, dest_cpu);
+
+ if (!pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
+ }
+ task_rq_unlock(rq, p, rf);
+
+ if (complete)
+ complete_all(&pending->done);
+ }
+
+ wait_for_completion(&pending->done);
+
+ if (refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs); /* No UaF, just an address */
+
+ /*
+ * Block the original owner of &pending until all subsequent callers
+ * have seen the completion and decremented the refcount
+ */
+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+ /* ARGH */
+ WARN_ON_ONCE(my_pending.stop_pending);
+
+ return 0;
+}
+
+/*
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
*/
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+ struct affinity_context *ctx,
+ struct rq *rq,
+ struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
{
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ bool kthread = p->flags & PF_KTHREAD;
unsigned int dest_cpu;
- struct rq_flags rf;
- struct rq *rq;
int ret = 0;
- rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
- if (p->flags & PF_KTHREAD) {
+ if (kthread || is_migration_disabled(p)) {
/*
- * Kernel threads are allowed on online && !active CPUs
+ * Kernel threads are allowed on online && !active CPUs,
+ * however, during cpu-hot-unplug, even these might get pushed
+ * away if not KTHREAD_IS_PER_CPU.
+ *
+ * Specifically, migration_disabled() tasks must not fail the
+ * cpumask_any_and_distribute() pick below, esp. so on
+ * SCA_MIGRATE_ENABLE, otherwise we'll not call
+ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
*/
cpu_valid_mask = cpu_online_mask;
}
+ if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
/*
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
*/
- if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
- if (cpumask_equal(&p->cpus_mask, new_mask))
- goto out;
+ if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
+ if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
+ if (ctx->flags & SCA_USER)
+ swap(p->user_cpus_ptr, ctx->user_mask);
+ goto out;
+ }
+
+ if (WARN_ON_ONCE(p == current &&
+ is_migration_disabled(p) &&
+ !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
/*
* Picking a ~random cpu helps in cases where we are changing affinity
* for groups of tasks (ie. cpuset), so that load balancing is not
* immediately required to distribute the tasks within their new mask.
*/
- dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
- do_set_cpus_allowed(p, new_mask);
-
- if (p->flags & PF_KTHREAD) {
- /*
- * For kernel threads that do indeed end up on online &&
- * !active we want to ensure they are strict per-CPU threads.
- */
- WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
- !cpumask_intersects(new_mask, cpu_active_mask) &&
- p->nr_cpus_allowed != 1);
- }
+ __do_set_cpus_allowed(p, ctx);
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
+ return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- return 0;
- } else if (task_on_rq_queued(p)) {
- /*
- * OK, since we're going to drop the lock immediately
- * afterwards anyway.
- */
- rq = move_queued_task(rq, &rf, p, dest_cpu);
- }
out:
- task_rq_unlock(rq, p, &rf);
+ task_rq_unlock(rq, p, rf);
return ret;
}
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ /*
+ * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
+ * flags are set.
+ */
+ if (p->user_cpus_ptr &&
+ !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
+ cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
+ ctx->new_mask = rq->scratch_mask;
+
+ return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
+}
+
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
- return __set_cpus_allowed_ptr(p, new_mask, false);
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
+
+ return __set_cpus_allowed_ptr(p, &ac);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+/*
+ * Change a given task's CPU affinity to the intersection of its current
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
+ * If user_cpus_ptr is defined, use it as the basis for restricting CPU
+ * affinity or use cpu_online_mask instead.
+ *
+ * If the resulting mask is empty, leave the affinity unchanged and return
+ * -EINVAL.
+ */
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
+ struct cpumask *new_mask,
+ const struct cpumask *subset_mask)
+{
+ struct affinity_context ac = {
+ .new_mask = new_mask,
+ .flags = 0,
+ };
+ struct rq_flags rf;
+ struct rq *rq;
+ int err;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Forcefully restricting the affinity of a deadline task is
+ * likely to cause problems, so fail and noisily override the
+ * mask entirely.
+ */
+ if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ err = -EPERM;
+ goto err_unlock;
+ }
+
+ if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
+ return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
+
+err_unlock:
+ task_rq_unlock(rq, p, &rf);
+ return err;
+}
+
+/*
+ * Restrict the CPU affinity of task @p so that it is a subset of
+ * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
+ * old affinity mask. If the resulting mask is empty, we warn and walk
+ * up the cpuset hierarchy until we find a suitable mask.
+ */
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ cpumask_var_t new_mask;
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
+
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
+
+ /*
+ * __migrate_task() can fail silently in the face of concurrent
+ * offlining of the chosen destination CPU, so take the hotplug
+ * lock to ensure that the migration succeeds.
+ */
+ cpus_read_lock();
+ if (!cpumask_available(new_mask))
+ goto out_set_mask;
+
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+ goto out_free_mask;
+
+ /*
+ * We failed to find a valid subset of the affinity mask for the
+ * task, so override it based on its cpuset hierarchy.
+ */
+ cpuset_cpus_allowed(p, new_mask);
+ override_mask = new_mask;
+
+out_set_mask:
+ if (printk_ratelimit()) {
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+ task_pid_nr(p), p->comm,
+ cpumask_pr_args(override_mask));
+ }
+
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+out_free_mask:
+ cpus_read_unlock();
+ free_cpumask_var(new_mask);
+}
+
+/*
+ * Restore the affinity of a task @p which was previously restricted by a
+ * call to force_compatible_cpus_allowed_ptr().
+ *
+ * It is the caller's responsibility to serialise this with any calls to
+ * force_compatible_cpus_allowed_ptr(@p).
+ */
+void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ struct affinity_context ac = {
+ .new_mask = task_user_cpus(p),
+ .flags = 0,
+ };
+ int ret;
+
+ /*
+ * Try to restore the old affinity mask with __sched_setaffinity().
+ * Cpuset masking will be done there too.
+ */
+ ret = __sched_setaffinity(p, &ac);
+ WARN_ON_ONCE(ret);
+}
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
+ unsigned int state = READ_ONCE(p->__state);
+
/*
* We should never call set_task_cpu() on a blocked task,
* ttwu() will sort out the placement.
*/
- WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !p->on_rq);
+ WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
/*
* Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
* because schedstat_wait_{start,end} rebase migrating task's wait_start
* time relying on p->on_rq.
*/
- WARN_ON_ONCE(p->state == TASK_RUNNING &&
+ WARN_ON_ONCE(state == TASK_RUNNING &&
p->sched_class == &fair_sched_class &&
(p->on_rq && !task_on_rq_migrating(p)));
@@ -1717,12 +3322,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* task_rq_lock().
*/
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock)));
+ lockdep_is_held(__rq_lockp(task_rq(p)))));
#endif
/*
* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
*/
WARN_ON_ONCE(!cpu_online(new_cpu));
+
+ WARN_ON_ONCE(is_migration_disabled(p));
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -1732,6 +3339,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
+ sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -1751,10 +3359,8 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
rq_pin_lock(src_rq, &srf);
rq_pin_lock(dst_rq, &drf);
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, cpu);
- activate_task(dst_rq, p, 0);
- check_preempt_curr(dst_rq, p, 0);
+ move_queued_task_locked(src_rq, dst_rq, p);
+ wakeup_preempt(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
rq_unpin_lock(src_rq, &srf);
@@ -1778,7 +3384,6 @@ static int migrate_swap_stop(void *data)
{
struct migration_swap_arg *arg = data;
struct rq *src_rq, *dst_rq;
- int ret = -EAGAIN;
if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
return -EAGAIN;
@@ -1786,33 +3391,25 @@ static int migrate_swap_stop(void *data)
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
- double_raw_lock(&arg->src_task->pi_lock,
- &arg->dst_task->pi_lock);
- double_rq_lock(src_rq, dst_rq);
+ guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
+ guard(double_rq_lock)(src_rq, dst_rq);
if (task_cpu(arg->dst_task) != arg->dst_cpu)
- goto unlock;
+ return -EAGAIN;
if (task_cpu(arg->src_task) != arg->src_cpu)
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
- goto unlock;
+ return -EAGAIN;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
__migrate_swap_task(arg->dst_task, arg->src_cpu);
- ret = 0;
-
-unlock:
- double_rq_unlock(src_rq, dst_rq);
- raw_spin_unlock(&arg->dst_task->pi_lock);
- raw_spin_unlock(&arg->src_task->pi_lock);
-
- return ret;
+ return 0;
}
/*
@@ -1855,114 +3452,6 @@ out:
}
#endif /* CONFIG_NUMA_BALANCING */
-/*
- * wait_task_inactive - wait for a thread to unschedule.
- *
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change. If it changes, i.e. @p might have woken up,
- * then return zero. When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count). If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
- *
- * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time. This function can't
- * be called with interrupts off, or it may introduce deadlock with
- * smp_call_function() if an IPI is sent by the same process we are
- * waiting to become inactive.
- */
-unsigned long wait_task_inactive(struct task_struct *p, long match_state)
-{
- int running, queued;
- struct rq_flags rf;
- unsigned long ncsw;
- struct rq *rq;
-
- for (;;) {
- /*
- * We do the initial early heuristics without holding
- * any task-queue locks at all. We'll only try to get
- * the runqueue lock when things look like they will
- * work out!
- */
- rq = task_rq(p);
-
- /*
- * If the task is actively running on another CPU
- * still, just relax and busy-wait without holding
- * any locks.
- *
- * NOTE! Since we don't hold any locks, it's not
- * even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_running()" will
- * return false if the runqueue has changed and p
- * is actually now running somewhere else!
- */
- while (task_running(rq, p)) {
- if (match_state && unlikely(p->state != match_state))
- return 0;
- cpu_relax();
- }
-
- /*
- * Ok, time to look more closely! We need the rq
- * lock now, to be *sure*. If we're wrong, we'll
- * just go back and repeat.
- */
- rq = task_rq_lock(p, &rf);
- trace_sched_wait_task(p);
- running = task_running(rq, p);
- queued = task_on_rq_queued(p);
- ncsw = 0;
- if (!match_state || p->state == match_state)
- ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, p, &rf);
-
- /*
- * If it changed from the expected state, bail out now.
- */
- if (unlikely(!ncsw))
- break;
-
- /*
- * Was it really running after all now that we
- * checked with the proper locks actually held?
- *
- * Oops. Go back and try again..
- */
- if (unlikely(running)) {
- cpu_relax();
- continue;
- }
-
- /*
- * It's not enough that it's not actively running,
- * it must be off the runqueue _entirely_, and not
- * preempted!
- *
- * So if it was still runnable (but just not actively
- * running right now), it's preempted, and we should
- * yield - it could be a while.
- */
- if (unlikely(queued)) {
- ktime_t to = NSEC_PER_SEC / HZ;
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_hrtimeout(&to, HRTIMER_MODE_REL);
- continue;
- }
-
- /*
- * Ahh, all good. It wasn't running, and it wasn't
- * runnable, which means that it will never become
- * running in the future either. We're all done!
- */
- break;
- }
-
- return ncsw;
-}
-
/***
* kick_process - kick a running thread to enter/exit the kernel
* @p: the to-be-kicked thread
@@ -1978,13 +3467,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
*/
void kick_process(struct task_struct *p)
{
- int cpu;
+ guard(preempt)();
+ int cpu = task_cpu(p);
- preempt_disable();
- cpu = task_cpu(p);
if ((cpu != smp_processor_id()) && task_curr(p))
smp_send_reschedule(cpu);
- preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
@@ -2027,9 +3514,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+ if (is_cpu_allowed(p, dest_cpu))
return dest_cpu;
}
}
@@ -2046,17 +3531,21 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* No more Mr. Nice Guy. */
switch (state) {
case cpuset:
- if (IS_ENABLED(CONFIG_CPUSETS)) {
- cpuset_cpus_allowed_fallback(p);
+ if (cpuset_cpus_allowed_fallback(p)) {
state = possible;
break;
}
- /* Fall-through */
+ fallthrough;
case possible:
- do_set_cpus_allowed(p, cpu_possible_mask);
+ /*
+ * XXX When called from select_task_rq() we only
+ * hold p->pi_lock and again violate locking order.
+ *
+ * More yuck to audit.
+ */
+ do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
state = fail;
break;
-
case fail:
BUG();
break;
@@ -2083,14 +3572,16 @@ out:
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int *wake_flags)
{
lockdep_assert_held(&p->pi_lock);
- if (p->nr_cpus_allowed > 1)
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
- else
+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) {
+ cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
+ *wake_flags |= WF_RQ_SELECTED;
+ } else {
cpu = cpumask_any(p->cpus_ptr);
+ }
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -2110,6 +3601,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
+ static struct lock_class_key stop_pi_lock;
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
struct task_struct *old_stop = cpu_rq(cpu)->stop;
@@ -2125,6 +3617,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
stop->sched_class = &stop_sched_class;
+
+ /*
+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to
+ * adjust the effective priority of a task. As a result,
+ * rt_mutex_setprio() can trigger (RT) balancing operations,
+ * which can then trigger wakeups of the stop thread to push
+ * around the current task.
+ *
+ * The stop task itself will never be part of the PI-chain, it
+ * never blocks, therefore that ->pi_lock recursion is safe.
+ * Tell lockdep about this by placing the stop->pi_lock in its
+ * own class.
+ */
+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
}
cpu_rq(cpu)->stop = stop;
@@ -2138,15 +3644,16 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
}
}
-#else
+#else /* CONFIG_SMP */
+
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
-static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
+static inline bool rq_has_pinned_tasks(struct rq *rq)
{
- return set_cpus_allowed_ptr(p, new_mask);
+ return false;
}
-#endif /* CONFIG_SMP */
+#endif /* !CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -2161,46 +3668,73 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
__schedstat_inc(rq->ttwu_local);
- __schedstat_inc(p->se.statistics.nr_wakeups_local);
+ __schedstat_inc(p->stats.nr_wakeups_local);
} else {
struct sched_domain *sd;
- __schedstat_inc(p->se.statistics.nr_wakeups_remote);
- rcu_read_lock();
+ __schedstat_inc(p->stats.nr_wakeups_remote);
+
+ guard(rcu)();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
__schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
- rcu_read_unlock();
}
if (wake_flags & WF_MIGRATED)
- __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+ __schedstat_inc(p->stats.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
__schedstat_inc(rq->ttwu_count);
- __schedstat_inc(p->se.statistics.nr_wakeups);
+ __schedstat_inc(p->stats.nr_wakeups);
if (wake_flags & WF_SYNC)
- __schedstat_inc(p->se.statistics.nr_wakeups_sync);
+ __schedstat_inc(p->stats.nr_wakeups_sync);
}
/*
- * Mark the task runnable and perform wakeup-preemption.
+ * Mark the task runnable.
*/
-static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
+static inline void ttwu_do_wakeup(struct task_struct *p)
{
- check_preempt_curr(rq, p, wake_flags);
- p->state = TASK_RUNNING;
+ WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct rq_flags *rf)
+{
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
+
+ lockdep_assert_rq_held(rq);
+
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+
+#ifdef CONFIG_SMP
+ if (wake_flags & WF_RQ_SELECTED)
+ en_flags |= ENQUEUE_RQ_SELECTED;
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
+ else
+#endif
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ activate_task(rq, p, en_flags);
+ wakeup_preempt(rq, p, wake_flags);
+
+ ttwu_do_wakeup(p);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
- * Our task @p is fully woken up and running; so its safe to
+ * Our task @p is fully woken up and running; so it's safe to
* drop the rq->lock, hereafter rq is only used for statistics.
*/
rq_unpin_lock(rq, rf);
@@ -2222,33 +3756,32 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
}
-static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
-{
- int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
-
- lockdep_assert_held(&rq->lock);
-
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
-
-#ifdef CONFIG_SMP
- if (wake_flags & WF_MIGRATED)
- en_flags |= ENQUEUE_MIGRATED;
-#endif
-
- activate_task(rq, p, en_flags);
- ttwu_do_wakeup(rq, p, wake_flags, rf);
-}
-
/*
- * Called in case the task @p isn't fully descheduled from its runqueue,
- * in this case we must do a remote wakeup. Its a 'light' wakeup though,
- * since all we need to do is flip p->state to TASK_RUNNING, since
- * the task is still ->on_rq.
+ * Consider @p being inside a wait loop:
+ *
+ * for (;;) {
+ * set_current_state(TASK_UNINTERRUPTIBLE);
+ *
+ * if (CONDITION)
+ * break;
+ *
+ * schedule();
+ * }
+ * __set_current_state(TASK_RUNNING);
+ *
+ * between set_current_state() and schedule(). In this case @p is still
+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
+ * an atomic manner.
+ *
+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
+ * then schedule() must still happen and p->state can be changed to
+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
+ * need to do a full wakeup with enqueue.
+ *
+ * Returns: %true when the wakeup is done,
+ * %false otherwise.
*/
-static int ttwu_remote(struct task_struct *p, int wake_flags)
+static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
struct rq_flags rf;
struct rq *rq;
@@ -2256,9 +3789,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
- /* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags, &rf);
+ if (p->se.sched_delayed)
+ enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
+ if (!task_on_cpu(rq, p)) {
+ /*
+ * When on_rq && !on_cpu the task is preempted, see if
+ * it should preempt the task that is current now.
+ */
+ wakeup_preempt(rq, p, wake_flags);
+ }
+ ttwu_do_wakeup(p);
ret = 1;
}
__task_rq_unlock(rq, &rf);
@@ -2277,13 +3818,6 @@ void sched_ttwu_pending(void *arg)
if (!llist)
return;
- /*
- * rq::ttwu_pending racy indication of out-standing wakeups.
- * Races such that false-negatives are possible, since they
- * are shorter lived that false-positives would be.
- */
- WRITE_ONCE(rq->ttwu_pending, 0);
-
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -2297,17 +3831,34 @@ void sched_ttwu_pending(void *arg)
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
}
+ /*
+ * Must be after enqueueing at least one task such that
+ * idle_cpu() does not observe a false-negative -- if it does,
+ * it is possible for select_idle_siblings() to stack a number
+ * of tasks on this CPU during that window.
+ *
+ * It is OK to clear ttwu_pending when another task is pending.
+ * We will receive an IPI after local IRQs are enabled and then enqueue it.
+ * Since now nr_running > 0, idle_cpu() will always get correct result.
+ */
+ WRITE_ONCE(rq->ttwu_pending, 0);
rq_unlock_irqrestore(rq, &rf);
}
-void send_call_function_single_ipi(int cpu)
+/*
+ * Prepare the scene for sending an IPI for a remote smp_call
+ *
+ * Returns true if the caller can proceed with sending the IPI.
+ * Returns false otherwise.
+ */
+bool call_function_single_prep_ipi(int cpu)
{
- struct rq *rq = cpu_rq(cpu);
-
- if (!set_nr_if_polling(rq->idle))
- arch_send_call_function_single_ipi(cpu);
- else
+ if (set_nr_if_polling(cpu_rq(cpu)->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
+ return false;
+ }
+
+ return true;
}
/*
@@ -2329,48 +3880,90 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
- rcu_read_lock();
-
- if (!is_idle_task(rcu_dereference(rq->curr)))
- goto out;
-
- if (set_nr_if_polling(rq->idle)) {
- trace_sched_wake_idle_without_ipi(cpu);
- } else {
- rq_lock_irqsave(rq, &rf);
+ guard(rcu)();
+ if (is_idle_task(rcu_dereference(rq->curr))) {
+ guard(rq_lock_irqsave)(rq);
if (is_idle_task(rq->curr))
- smp_send_reschedule(cpu);
- /* Else CPU is not idle, do nothing here: */
- rq_unlock_irqrestore(rq, &rf);
+ resched_curr(rq);
}
+}
-out:
- rcu_read_unlock();
+bool cpus_equal_capacity(int this_cpu, int that_cpu)
+{
+ if (!sched_asym_cpucap_active())
+ return true;
+
+ if (this_cpu == that_cpu)
+ return true;
+
+ return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu);
}
bool cpus_share_cache(int this_cpu, int that_cpu)
{
+ if (this_cpu == that_cpu)
+ return true;
+
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+/*
+ * Whether CPUs share cache resources, which means LLC on non-cluster
+ * machines and LLC tag or L2 on machines with clusters.
+ */
+bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+ if (this_cpu == that_cpu)
+ return true;
+
+ return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
+}
+
+static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
/*
+ * The BPF scheduler may depend on select_task_rq() being invoked during
+ * wakeups. In addition, @p may end up executing on a different CPU
+ * regardless of what happens in the wakeup path making the ttwu_queue
+ * optimization less meaningful. Skip if on SCX.
+ */
+ if (task_on_scx(p))
+ return false;
+
+ /*
+ * Do not complicate things with the async wake_list while the CPU is
+ * in hotplug state.
+ */
+ if (!cpu_active(cpu))
+ return false;
+
+ /* Ensure the task will still be allowed to run on the CPU. */
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
+
+ /*
* If the CPU does not share cache, then queue the task on the
* remote rqs wakelist to avoid accessing remote data.
*/
if (!cpus_share_cache(smp_processor_id(), cpu))
return true;
+ if (cpu == smp_processor_id())
+ return false;
+
/*
- * If the task is descheduling and the only running task on the
- * CPU then use the wakelist to offload the task activation to
- * the soon-to-be-idle CPU as the current CPU is likely busy.
- * nr_running is checked to avoid unnecessary task stacking.
+ * If the wakee cpu is idle, or the task is descheduling and the
+ * only running task on the CPU, then use the wakelist to offload
+ * the task activation to the idle (or soon-to-be-idle) CPU as
+ * the current CPU is likely busy. nr_running is checked to
+ * avoid unnecessary task stacking.
+ *
+ * Note that we can only get here with (wakee) p->on_rq=0,
+ * p->on_cpu can be whatever, we've done the dequeue, so
+ * the wakee has been accounted out of ->nr_running.
*/
- if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+ if (!cpu_rq(cpu)->nr_running)
return true;
return false;
@@ -2378,10 +3971,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
- if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
- if (WARN_ON_ONCE(cpu == smp_processor_id()))
- return false;
-
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
__ttwu_queue_wakelist(p, cpu, wake_flags);
return true;
@@ -2389,6 +3979,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
return false;
}
+
+#else /* !CONFIG_SMP */
+
+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ return false;
+}
+
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2396,10 +3994,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
-#if defined(CONFIG_SMP)
if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
-#endif
rq_lock(rq, &rf);
update_rq_clock(rq);
@@ -2408,6 +4004,56 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
}
/*
+ * Invoked from try_to_wake_up() to check whether the task can be woken up.
+ *
+ * The caller holds p::pi_lock if p != current or has preemption
+ * disabled when p == current.
+ *
+ * The rules of saved_state:
+ *
+ * The related locking code always holds p::pi_lock when updating
+ * p::saved_state, which means the code is fully serialized in both cases.
+ *
+ * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
+ * No other bits set. This allows to distinguish all wakeup scenarios.
+ *
+ * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This
+ * allows us to prevent early wakeup of tasks before they can be run on
+ * asymmetric ISA architectures (eg ARMv9).
+ */
+static __always_inline
+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+{
+ int match;
+
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+ state != TASK_RTLOCK_WAIT);
+ }
+
+ *success = !!(match = __task_state_match(p, state));
+
+ /*
+ * Saved state preserves the task state across blocking on
+ * an RT lock or TASK_FREEZABLE tasks. If the state matches,
+ * set p::saved_state to TASK_RUNNING, but do not wake the task
+ * because it waits for a lock wakeup or __thaw_task(). Also
+ * indicate success because from the regular waker's point of
+ * view this has succeeded.
+ *
+ * After acquiring the lock the task will restore p::__state
+ * from p::saved_state which ensures that the regular
+ * wakeup is not lost. The restore will also set
+ * p::saved_state to TASK_RUNNING so any further tests will
+ * not result in false positives vs. @success
+ */
+ if (match < 0)
+ p->saved_state = TASK_RUNNING;
+
+ return match > 0;
+}
+
+/*
* Notes on Program-Order guarantees on SMP systems.
*
* MIGRATION
@@ -2455,8 +4101,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* migration. However the means are completely different as there is no lock
* chain to provide order. Instead we do:
*
- * 1) smp_store_release(X->on_cpu, 0)
- * 2) smp_cond_load_acquire(!X->on_cpu)
+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
*
* Example:
*
@@ -2496,231 +4142,312 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
- * If (@state & @p->state) @p->state = TASK_RUNNING.
+ * Conceptually does:
+ *
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
*
* If the task was not queued/runnable, also place it back on a runqueue.
*
- * Atomic against schedule() which would dequeue a task, also see
- * set_current_state().
+ * This function is atomic against schedule() which would dequeue the task.
*
- * This function executes a full memory barrier before accessing the task
- * state; see set_current_state().
+ * It issues a full memory barrier before accessing @p->state, see the comment
+ * with set_current_state().
+ *
+ * Uses p->pi_lock to serialize against concurrent wake-ups.
+ *
+ * Relies on p->pi_lock stabilizing:
+ * - p->sched_class
+ * - p->cpus_ptr
+ * - p->sched_task_group
+ * in order to do migration, see its use of select_task_rq()/set_task_cpu().
+ *
+ * Tries really hard to only take one task_rq(p)->lock for performance.
+ * Takes rq->lock in:
+ * - ttwu_runnable() -- old rq, unavoidable, see comment there;
+ * - ttwu_queue() -- new rq, for enqueue of the task;
+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
+ *
+ * As a consequence we race really badly with just about everything. See the
+ * many memory barriers and their comments for details.
*
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
*/
-static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- unsigned long flags;
+ guard(preempt)();
int cpu, success = 0;
- preempt_disable();
+ wake_flags |= WF_TTWU;
+
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
* == smp_processor_id()'. Together this means we can special
- * case the whole 'p->on_rq && ttwu_remote()' case below
+ * case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
+ * Specifically, given current runs ttwu() we must be before
+ * schedule()'s block_task(), as such this must not observe
+ * sched_delayed.
+ *
* In particular:
* - we rely on Program-Order guarantees for all the ordering,
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
- if (!(p->state & state))
+ SCHED_WARN_ON(p->se.sched_delayed);
+ if (!ttwu_state_match(p, state, &success))
goto out;
- success = 1;
trace_sched_waking(p);
- p->state = TASK_RUNNING;
- trace_sched_wakeup(p);
+ ttwu_do_wakeup(p);
goto out;
}
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
- * reordered with p->state check below. This pairs with mb() in
- * set_current_state() the waiting thread does.
+ * reordered with p->state check below. This pairs with smp_store_mb()
+ * in set_current_state() that the waiting thread does.
*/
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- smp_mb__after_spinlock();
- if (!(p->state & state))
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ smp_mb__after_spinlock();
+ if (!ttwu_state_match(p, state, &success))
+ break;
- trace_sched_waking(p);
+ trace_sched_waking(p);
- /* We're going to change ->state: */
- success = 1;
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * STORE p->on_rq = 1 LOAD p->state
+ * UNLOCK rq->lock
+ *
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * A similar smp_rmb() lives in __task_needs_rq_lock().
+ */
+ smp_rmb();
+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
+ break;
- /*
- * Ensure we load p->on_rq _after_ p->state, otherwise it would
- * be possible to, falsely, observe p->on_rq == 0 and get stuck
- * in smp_cond_load_acquire() below.
- *
- * sched_ttwu_pending() try_to_wake_up()
- * STORE p->on_rq = 1 LOAD p->state
- * UNLOCK rq->lock
- *
- * __schedule() (switch to task 'p')
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * UNLOCK rq->lock
- *
- * [task p]
- * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
- */
- smp_rmb();
- if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
- goto unlock;
+#ifdef CONFIG_SMP
+ /*
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * __schedule() (switch to task 'p') try_to_wake_up()
+ * STORE p->on_cpu = 1 LOAD p->on_rq
+ * UNLOCK rq->lock
+ *
+ * __schedule() (put 'p' to sleep)
+ * LOCK rq->lock smp_rmb();
+ * smp_mb__after_spinlock();
+ * STORE p->on_rq = 0 LOAD p->on_cpu
+ *
+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
+ * __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+ * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * care about its own p->state. See the comment in __schedule().
+ */
+ smp_acquire__after_ctrl_dep();
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
+ /*
+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+ * == 0), which means we need to do an enqueue, change p->state to
+ * TASK_WAKING such that we can unlock p->pi_lock before doing the
+ * enqueue, such as ttwu_queue_wakelist().
+ */
+ WRITE_ONCE(p->__state, TASK_WAKING);
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, consider queueing p on the remote CPU's wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ *
+ * Ensure we load task_cpu(p) after p->on_cpu:
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spinlock() smp_cond_load_acquire(&p->on_cpu)
+ * STORE p->on_cpu = 1 LOAD p->cpu
+ *
+ * to ensure we observe the correct CPU on which the task is currently
+ * scheduling.
+ */
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
+ break;
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, wait until it's done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_task().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
+ */
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
+ if (task_cpu(p) != cpu) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ wake_flags |= WF_MIGRATED;
+ psi_ttwu_dequeue(p);
+ set_task_cpu(p, cpu);
+ }
+#else
+ cpu = task_cpu(p);
+#endif /* CONFIG_SMP */
+
+ ttwu_queue(p, cpu, wake_flags);
}
+out:
+ if (success)
+ ttwu_stat(p, task_cpu(p), wake_flags);
-#ifdef CONFIG_SMP
- /*
- * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
- * possible to, falsely, observe p->on_cpu == 0.
- *
- * One must be running (->on_cpu == 1) in order to remove oneself
- * from the runqueue.
- *
- * __schedule() (switch to task 'p') try_to_wake_up()
- * STORE p->on_cpu = 1 LOAD p->on_rq
- * UNLOCK rq->lock
- *
- * __schedule() (put 'p' to sleep)
- * LOCK rq->lock smp_rmb();
- * smp_mb__after_spinlock();
- * STORE p->on_rq = 0 LOAD p->on_cpu
- *
- * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
- * __schedule(). See the comment for smp_mb__after_spinlock().
- *
- * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
- * care about it's own p->state. See the comment in __schedule().
- */
- smp_acquire__after_ctrl_dep();
+ return success;
+}
+
+static bool __task_needs_rq_lock(struct task_struct *p)
+{
+ unsigned int state = READ_ONCE(p->__state);
/*
- * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
- * == 0), which means we need to do an enqueue, change p->state to
- * TASK_WAKING such that we can unlock p->pi_lock before doing the
- * enqueue, such as ttwu_queue_wakelist().
+ * Since p->pi_lock blocks try_to_wake_up(), we don't need rq->lock when
+ * the task is blocked. Make sure to check @state since ttwu() can drop
+ * locks at the end, see ttwu_queue_wakelist().
*/
- p->state = TASK_WAKING;
+ if (state == TASK_RUNNING || state == TASK_WAKING)
+ return true;
/*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, considering queueing p on the remote CPUs wake_list
- * which potentially sends an IPI instead of spinning on p->on_cpu to
- * let the waker make forward progress. This is safe because IRQs are
- * disabled and the IPI will deliver after on_cpu is cleared.
- *
- * Ensure we load task_cpu(p) after p->on_cpu:
- *
- * set_task_cpu(p, cpu);
- * STORE p->cpu = @cpu
- * __schedule() (switch to task 'p')
- * LOCK rq->lock
- * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
- * STORE p->on_cpu = 1 LOAD p->cpu
+ * Ensure we load p->on_rq after p->__state, otherwise it would be
+ * possible to, falsely, observe p->on_rq == 0.
*
- * to ensure we observe the correct CPU on which the task is currently
- * scheduling.
+ * See try_to_wake_up() for a longer comment.
*/
- if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
- goto unlock;
+ smp_rmb();
+ if (p->on_rq)
+ return true;
+#ifdef CONFIG_SMP
/*
- * If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, wait until its done referencing the task.
- *
- * Pairs with the smp_store_release() in finish_task().
- *
- * This ensures that tasks getting woken will be fully ordered against
- * their previous state and preserve Program Order.
+ * Ensure the task has finished __schedule() and will not be referenced
+ * anymore. Again, see try_to_wake_up() for a longer comment.
*/
+ smp_rmb();
smp_cond_load_acquire(&p->on_cpu, !VAL);
+#endif
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
- if (task_cpu(p) != cpu) {
- wake_flags |= WF_MIGRATED;
- psi_ttwu_dequeue(p);
- set_task_cpu(p, cpu);
- }
-#else
- cpu = task_cpu(p);
-#endif /* CONFIG_SMP */
-
- ttwu_queue(p, cpu, wake_flags);
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-out:
- if (success)
- ttwu_stat(p, task_cpu(p), wake_flags);
- preempt_enable();
-
- return success;
+ return false;
}
/**
- * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
- * @p: Process for which the function is to be invoked.
+ * task_call_func - Invoke a function on task in fixed state
+ * @p: Process for which the function is to be invoked, can be @current.
* @func: Function to invoke.
* @arg: Argument to function.
*
- * If the specified task can be quickly locked into a definite state
- * (either sleeping or on a given runqueue), arrange to keep it in that
- * state while invoking @func(@arg). This function can use ->on_rq and
- * task_curr() to work out what the state is, if required. Given that
- * @func can be invoked with a runqueue lock held, it had better be quite
+ * Fix the task in its current state by avoiding wakeups and/or rq operations
+ * and call @func(@arg) on it. This function can use task_is_runnable() and
+ * task_curr() to work out what the state is, if required. Given that @func
+ * can be invoked with a runqueue lock held, it had better be quite
* lightweight.
*
* Returns:
- * @false if the task slipped out from under the locks.
- * @true if the task was locked onto a runqueue or is sleeping.
- * However, @func can override this by returning @false.
+ * Whatever @func returns
*/
-bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
+int task_call_func(struct task_struct *p, task_call_f func, void *arg)
{
- bool ret = false;
+ struct rq *rq = NULL;
struct rq_flags rf;
- struct rq *rq;
+ int ret;
- lockdep_assert_irqs_enabled();
- raw_spin_lock_irq(&p->pi_lock);
- if (p->on_rq) {
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+
+ if (__task_needs_rq_lock(p))
rq = __task_rq_lock(p, &rf);
- if (task_rq(p) == rq)
- ret = func(p, arg);
+
+ /*
+ * At this point the task is pinned; either:
+ * - blocked and we're holding off wakeups (p->pi_lock)
+ * - woken, and we're holding off enqueue (rq->lock)
+ * - queued, and we're holding off schedule (rq->lock)
+ * - running, and we're holding off de-schedule (rq->lock)
+ *
+ * The called function (@func) can use: task_curr(), p->on_rq and
+ * p->__state to differentiate between these states.
+ */
+ ret = func(p, arg);
+
+ if (rq)
rq_unlock(rq, &rf);
- } else {
- switch (p->state) {
- case TASK_RUNNING:
- case TASK_WAKING:
- break;
- default:
- smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
- if (!p->on_rq)
- ret = func(p, arg);
- }
- }
- raw_spin_unlock_irq(&p->pi_lock);
+
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
}
/**
+ * cpu_curr_snapshot - Return a snapshot of the currently running task
+ * @cpu: The CPU on which to snapshot the task.
+ *
+ * Returns the task_struct pointer of the task "currently" running on
+ * the specified CPU.
+ *
+ * If the specified CPU was offline, the return value is whatever it
+ * is, perhaps a pointer to the task_struct structure of that CPU's idle
+ * task, but there is no guarantee. Callers wishing a useful return
+ * value must take some action to ensure that the specified CPU remains
+ * online throughout.
+ *
+ * This function executes full memory barriers before and after fetching
+ * the pointer, which permits the caller to confine this function's fetch
+ * with respect to the caller's accesses to other shared variables.
+ */
+struct task_struct *cpu_curr_snapshot(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *t;
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */
+ t = rcu_dereference(cpu_curr(cpu));
+ rq_unlock_irqrestore(rq, &rf);
+ smp_mb(); /* Pairing determined by caller's synchronization design. */
+
+ return t;
+}
+
+/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -2746,7 +4473,8 @@ int wake_up_state(struct task_struct *p, unsigned int state)
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*
- * __sched_fork() is basic setup used by init_idle() too:
+ * __sched_fork() is basic setup which is also used by sched_init() to
+ * initialize the boot CPU's idle task.
*/
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
@@ -2758,21 +4486,22 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+ p->se.vlag = 0;
INIT_LIST_HEAD(&p->se.group_node);
+ /* A delayed task cannot be in clone(). */
+ SCHED_WARN_ON(p->se.sched_delayed);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
#ifdef CONFIG_SCHEDSTATS
/* Even if schedstat is disabled, there should not be garbage */
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+ memset(&p->stats, 0, sizeof(p->stats));
#endif
- RB_CLEAR_NODE(&p->dl.rb_node);
- init_dl_task_timer(&p->dl);
- init_dl_inactive_task_timer(&p->dl);
- __dl_clear_params(p);
+ init_dl_entity(&p->dl);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
@@ -2780,6 +4509,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->rt.on_rq = 0;
p->rt.on_list = 0;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ init_scx_entity(&p->scx);
+#endif
+
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
@@ -2790,14 +4523,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
+ p->migration_pending = NULL;
#endif
+ init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
#ifdef CONFIG_NUMA_BALANCING
-void set_numabalancing_state(bool enabled)
+int sysctl_numa_balancing_mode;
+
+static void __set_numabalancing_state(bool enabled)
{
if (enabled)
static_branch_enable(&sched_numa_balancing);
@@ -2805,13 +4542,33 @@ void set_numabalancing_state(bool enabled)
static_branch_disable(&sched_numa_balancing);
}
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
+ else
+ sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
+ __set_numabalancing_state(enabled);
+}
+
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_numa_balancing(struct ctl_table *table, int write,
+static void reset_memory_tiering(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ pgdat->nbp_threshold = 0;
+ pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+ }
+}
+
+static int sysctl_numa_balancing(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
- int state = static_branch_likely(&sched_numa_balancing);
+ int state = sysctl_numa_balancing_mode;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2821,8 +4578,13 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
- if (write)
- set_numabalancing_state(state);
+ if (write) {
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ (state & NUMA_BALANCING_MEMORY_TIERING))
+ reset_memory_tiering();
+ sysctl_numa_balancing_mode = state;
+ __set_numabalancing_state(state);
+ }
return err;
}
#endif
@@ -2831,7 +4593,6 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#ifdef CONFIG_SCHEDSTATS
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
-static bool __initdata __sched_schedstats = false;
static void set_schedstats(bool enabled)
{
@@ -2855,16 +4616,11 @@ static int __init setup_schedstats(char *str)
if (!str)
goto out;
- /*
- * This code is called before jump labels have been set up, so we can't
- * change the static branch directly just yet. Instead set a temporary
- * variable so init_schedstats() can do it later.
- */
if (!strcmp(str, "enable")) {
- __sched_schedstats = true;
+ set_schedstats(true);
ret = 1;
} else if (!strcmp(str, "disable")) {
- __sched_schedstats = false;
+ set_schedstats(false);
ret = 1;
}
out:
@@ -2875,13 +4631,8 @@ out:
}
__setup("schedstats=", setup_schedstats);
-static void __init init_schedstats(void)
-{
- set_schedstats(__sched_schedstats);
-}
-
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+static int sysctl_schedstats(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -2901,24 +4652,76 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
return err;
}
#endif /* CONFIG_PROC_SYSCTL */
-#else /* !CONFIG_SCHEDSTATS */
-static inline void init_schedstats(void) {}
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table sched_core_sysctls[] = {
+#ifdef CONFIG_SCHEDSTATS
+ {
+ .procname = "sched_schedstats",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_schedstats,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_UCLAMP_TASK
+ {
+ .procname = "sched_util_clamp_min",
+ .data = &sysctl_sched_uclamp_util_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_max",
+ .data = &sysctl_sched_uclamp_util_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_min_rt_default",
+ .data = &sysctl_sched_uclamp_util_min_rt_default,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+#endif /* CONFIG_UCLAMP_TASK */
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_numa_balancing,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_FOUR,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+};
+static int __init sched_core_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_core_sysctls);
+ return 0;
+}
+late_initcall(sched_core_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+
/*
* fork()/clone()-time setup:
*/
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
- unsigned long flags;
-
__sched_fork(clone_flags, p);
/*
* We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_NEW;
+ p->__state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2938,8 +4741,10 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
- p->prio = p->normal_prio = __normal_prio(p);
+ p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;
/*
* We don't need the reset flag anymore after the fork. It has
@@ -2950,30 +4755,21 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
if (dl_prio(p->prio))
return -EAGAIN;
- else if (rt_prio(p->prio))
+
+ scx_pre_fork(p);
+
+ if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class;
- else
+#ifdef CONFIG_SCHED_CLASS_EXT
+ } else if (task_should_scx(p->policy)) {
+ p->sched_class = &ext_sched_class;
+#endif
+ } else {
p->sched_class = &fair_sched_class;
+ }
init_entity_runnable_average(&p->se);
- /*
- * The child is not yet in the pid-hash so no cgroup attach races,
- * and the cgroup is pinned to this child due to cgroup_fork()
- * is ran before sched_fork().
- *
- * Silence PROVE_RCU.
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- rseq_migrate(p);
- /*
- * We're setting the CPU for the first time, we don't migrate,
- * so use __set_task_cpu().
- */
- __set_task_cpu(p, smp_processor_id());
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
@@ -2990,6 +4786,48 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
+int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+{
+ unsigned long flags;
+
+ /*
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifdef CONFIG_CGROUP_SCHED
+ if (1) {
+ struct task_group *tg;
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ tg = autogroup_task_group(p, tg);
+ p->sched_task_group = tg;
+ }
+#endif
+ rseq_migrate(p);
+ /*
+ * We're setting the CPU for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, smp_processor_id());
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return scx_fork(p);
+}
+
+void sched_cancel_fork(struct task_struct *p)
+{
+ scx_cancel_fork(p);
+}
+
+void sched_post_fork(struct task_struct *p)
+{
+ uclamp_post_fork(p);
+ scx_post_fork(p);
+}
+
unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
@@ -3017,9 +4855,10 @@ void wake_up_new_task(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
+ int wake_flags = WF_FORK;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- p->state = TASK_RUNNING;
+ WRITE_ONCE(p->__state, TASK_RUNNING);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
@@ -3031,19 +4870,19 @@ void wake_up_new_task(struct task_struct *p)
*/
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(p);
- activate_task(rq, p, ENQUEUE_NOCLOCK);
+ activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
trace_sched_wakeup_new(p);
- check_preempt_curr(rq, p, WF_FORK);
+ wakeup_preempt(rq, p, wake_flags);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
- * Nothing relies on rq->lock after this, so its fine to
+ * Nothing relies on rq->lock after this, so it's fine to
* drop it.
*/
rq_unpin_lock(rq, &rf);
@@ -3147,8 +4986,11 @@ static inline void prepare_task(struct task_struct *next)
/*
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
+ *
+ * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+ * its ordering comment.
*/
- next->on_cpu = 1;
+ WRITE_ONCE(next->on_cpu, 1);
#endif
}
@@ -3156,8 +4998,9 @@ static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
+ * This must be the very last reference to @prev from this CPU. After
+ * p->on_cpu is cleared, the task can be moved to a different CPU. We
+ * must ensure this doesn't happen until the switch is completely
* finished.
*
* In particular, the load of prev->state in finish_task_switch() must
@@ -3169,6 +5012,97 @@ static inline void finish_task(struct task_struct *prev)
#endif
}
+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
+{
+ void (*func)(struct rq *rq);
+ struct balance_callback *next;
+
+ lockdep_assert_rq_held(rq);
+
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
+
+ func(rq);
+ }
+}
+
+static void balance_push(struct rq *rq);
+
+/*
+ * balance_push_callback is a right abuse of the callback interface and plays
+ * by significantly different rules.
+ *
+ * Where the normal balance_callback's purpose is to be run in the same context
+ * that queued it (only later, when it's safe to drop rq->lock again),
+ * balance_push_callback is specifically targeted at __schedule().
+ *
+ * This abuse is tolerated because it places all the unlikely/odd cases behind
+ * a single test, namely: rq->balance_callback == NULL.
+ */
+struct balance_callback balance_push_callback = {
+ .next = NULL,
+ .func = balance_push,
+};
+
+static inline struct balance_callback *
+__splice_balance_callbacks(struct rq *rq, bool split)
+{
+ struct balance_callback *head = rq->balance_callback;
+
+ if (likely(!head))
+ return NULL;
+
+ lockdep_assert_rq_held(rq);
+ /*
+ * Must not take balance_push_callback off the list when
+ * splice_balance_callbacks() and balance_callbacks() are not
+ * in the same rq->lock section.
+ *
+ * In that case it would be possible for __schedule() to interleave
+ * and observe the list empty.
+ */
+ if (split && head == &balance_push_callback)
+ head = NULL;
+ else
+ rq->balance_callback = NULL;
+
+ return head;
+}
+
+struct balance_callback *splice_balance_callbacks(struct rq *rq)
+{
+ return __splice_balance_callbacks(rq, true);
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+ do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
+}
+
+void balance_callbacks(struct rq *rq, struct balance_callback *head)
+{
+ unsigned long flags;
+
+ if (unlikely(head)) {
+ raw_spin_rq_lock_irqsave(rq, flags);
+ do_balance_callbacks(rq, head);
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+ }
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+#endif
+
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -3179,10 +5113,10 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
* do an early lockdep release here:
*/
rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, _THIS_IP_);
+ spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
- rq->lock.owner = next;
+ rq_lockp(rq)->owner = next;
#endif
}
@@ -3193,8 +5127,9 @@ static inline void finish_lock_switch(struct rq *rq)
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
- raw_spin_unlock_irq(&rq->lock);
+ spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
+ __balance_callbacks(rq);
+ raw_spin_rq_unlock_irq(rq);
}
/*
@@ -3209,6 +5144,22 @@ static inline void finish_lock_switch(struct rq *rq)
# define finish_arch_post_lock_switch() do { } while (0)
#endif
+static inline void kmap_local_sched_out(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+ if (unlikely(current->kmap_ctrl.idx))
+ __kmap_local_sched_out();
+#endif
+}
+
+static inline void kmap_local_sched_in(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+ if (unlikely(current->kmap_ctrl.idx))
+ __kmap_local_sched_in();
+#endif
+}
+
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
@@ -3231,6 +5182,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
+ kmap_local_sched_out();
prepare_task(next);
prepare_arch_switch(next);
}
@@ -3251,7 +5203,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
*
* The context switch has flipped the stack from under us and restored the
* local variables which were saved when this task called schedule() in the
- * past. prev == current is still correct but we need to recalculate this_rq
+ * past. 'prev == current' is still correct but we need to recalculate this_rq
* because prev may have moved to another CPU.
*/
static struct rq *finish_task_switch(struct task_struct *prev)
@@ -3259,7 +5211,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
- long prev_state;
+ unsigned int prev_state;
/*
* The previous task will have left us with a preempt_count of 2
@@ -3290,13 +5242,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* running on another CPU and we could race with its RUNNING -> DEAD
* transition, resulting in a double drop.
*/
- prev_state = prev->state;
+ prev_state = READ_ONCE(prev->__state);
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
finish_task(prev);
+ tick_nohz_task_switch();
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
+ /*
+ * kmap_local_sched_out() is invoked with rq::lock held and
+ * interrupts disabled. There is no requirement for that, but the
+ * sched out code does not have an interrupt enabled section.
+ * Restoring the maps on sched in does not require interrupts being
+ * disabled either.
+ */
+ kmap_local_sched_in();
fire_sched_in_preempt_notifiers(current);
/*
@@ -3308,70 +5269,27 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* rq->curr, before returning to userspace, so provide them here:
*
* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
- * provided by mmdrop(),
+ * provided by mmdrop_lazy_tlb(),
* - a sync_core for SYNC_CORE.
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop(mm);
+ mmdrop_lazy_tlb_sched(mm);
}
+
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
- /*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
-
/* Task is done with its stack. */
put_task_stack(prev);
put_task_struct_rcu_user(prev);
}
- tick_nohz_task_switch();
return rq;
}
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
- struct callback_head *head, *next;
- void (*func)(struct rq *rq);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- head = rq->balance_callback;
- rq->balance_callback = NULL;
- while (head) {
- func = (void (*)(struct rq *))head->func;
- next = head->next;
- head->next = NULL;
- head = next;
-
- func(rq);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
- if (unlikely(rq->balance_callback))
- __balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
@@ -3379,8 +5297,6 @@ static inline void balance_callback(struct rq *rq)
asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq;
-
/*
* New tasks start with FORK_PREEMPT_COUNT, see there and
* finish_task_switch() for details.
@@ -3390,8 +5306,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
* PREEMPT_COUNT kernels).
*/
- rq = finish_task_switch(prev);
- balance_callback(rq);
+ finish_task_switch(prev);
preempt_enable();
if (current->set_child_tid)
@@ -3418,17 +5333,20 @@ context_switch(struct rq *rq, struct task_struct *prev,
/*
* kernel -> kernel lazy + transfer active
- * user -> kernel lazy + mmgrab() active
+ * user -> kernel lazy + mmgrab_lazy_tlb() active
*
- * kernel -> user switch + mmdrop() active
+ * kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
+ *
+ * switch_mm_cid() needs to be updated if the barriers provided
+ * by context_switch() are modified.
*/
if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
if (prev->mm) // from user
- mmgrab(prev->active_mm);
+ mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
@@ -3442,15 +5360,17 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop_lazy_tlb().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
+ /* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
- rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+ /* switch_mm_cid() requires the memory barriers above. */
+ switch_mm_cid(rq, prev, next);
prepare_lock_switch(rq, next, rf);
@@ -3467,9 +5387,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
* externally visible scheduler statistics: current number of runnable
* threads, total number of context switches performed since bootup.
*/
-unsigned long nr_running(void)
+unsigned int nr_running(void)
{
- unsigned long i, sum = 0;
+ unsigned int i, sum = 0;
for_each_online_cpu(i)
sum += cpu_rq(i)->nr_running;
@@ -3496,6 +5416,11 @@ bool single_task_running(void)
}
EXPORT_SYMBOL(single_task_running);
+unsigned long long nr_context_switches_cpu(int cpu)
+{
+ return cpu_rq(cpu)->nr_switches;
+}
+
unsigned long long nr_context_switches(void)
{
int i;
@@ -3514,13 +5439,13 @@ unsigned long long nr_context_switches(void)
* it does become runnable.
*/
-unsigned long nr_iowait_cpu(int cpu)
+unsigned int nr_iowait_cpu(int cpu)
{
return atomic_read(&cpu_rq(cpu)->nr_iowait);
}
/*
- * IO-wait accounting, and how its mostly bollocks (on SMP).
+ * IO-wait accounting, and how it's mostly bollocks (on SMP).
*
* The idea behind IO-wait accounting is to account the idle time that we could
* have spent running if it were not for IO. That is, if we were to improve the
@@ -3549,9 +5474,9 @@ unsigned long nr_iowait_cpu(int cpu)
* Task CPU affinities can make all that even more 'interesting'.
*/
-unsigned long nr_iowait(void)
+unsigned int nr_iowait(void)
{
- unsigned long i, sum = 0;
+ unsigned int i, sum = 0;
for_each_possible_cpu(i)
sum += nr_iowait_cpu(i);
@@ -3568,23 +5493,20 @@ unsigned long nr_iowait(void)
void sched_exec(void)
{
struct task_struct *p = current;
- unsigned long flags;
+ struct migration_arg arg;
int dest_cpu;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
- if (dest_cpu == smp_processor_id())
- goto unlock;
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
+ if (dest_cpu == smp_processor_id())
+ return;
- if (likely(cpu_active(dest_cpu))) {
- struct migration_arg arg = { p, dest_cpu };
+ if (unlikely(!cpu_active(dest_cpu)))
+ return;
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
- return;
+ arg = (struct migration_arg){ p, dest_cpu };
}
-unlock:
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
#endif
@@ -3604,9 +5526,9 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
static inline void prefetch_curr_exec_start(struct task_struct *p)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
- struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+ struct sched_entity *curr = p->se.cfs_rq->curr;
#else
- struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+ struct sched_entity *curr = task_rq(p)->cfs.curr;
#endif
prefetch(curr);
prefetch(&curr->exec_start);
@@ -3627,7 +5549,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
/*
* 64-bit doesn't need locks to atomically read a 64-bit value.
* So we have an optimization chance when the task's delta_exec is 0.
- * Reading ->on_cpu is racy, but this is ok.
+ * Reading ->on_cpu is racy, but this is OK.
*
* If we race with it leaving CPU, we'll take a lock. So we're correct.
* If we race with it entering CPU, unaccounted time is 0. This is
@@ -3645,7 +5567,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* project cycles that may never be accounted to this
* thread, breaking clock_gettime().
*/
- if (task_current(rq, p) && task_on_rq_queued(p)) {
+ if (task_current_donor(rq, p) && task_on_rq_queued(p)) {
prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
@@ -3656,48 +5578,109 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
-DEFINE_PER_CPU(unsigned long, thermal_pressure);
+#ifdef CONFIG_SCHED_DEBUG
+static u64 cpu_resched_latency(struct rq *rq)
+{
+ int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+ u64 resched_latency, now = rq_clock(rq);
+ static bool warned_once;
+
+ if (sysctl_resched_latency_warn_once && warned_once)
+ return 0;
+
+ if (!need_resched() || !latency_warn_ms)
+ return 0;
+
+ if (system_state == SYSTEM_BOOTING)
+ return 0;
+
+ if (!rq->last_seen_need_resched_ns) {
+ rq->last_seen_need_resched_ns = now;
+ rq->ticks_without_resched = 0;
+ return 0;
+ }
-void arch_set_thermal_pressure(struct cpumask *cpus,
- unsigned long th_pressure)
+ rq->ticks_without_resched++;
+ resched_latency = now - rq->last_seen_need_resched_ns;
+ if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+ return 0;
+
+ warned_once = true;
+
+ return resched_latency;
+}
+
+static int __init setup_resched_latency_warn_ms(char *str)
{
- int cpu;
+ long val;
- for_each_cpu(cpu, cpus)
- WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
+ if ((kstrtol(str, 0, &val))) {
+ pr_warn("Unable to set resched_latency_warn_ms\n");
+ return 1;
+ }
+
+ sysctl_resched_latency_warn_ms = val;
+ return 1;
}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
+#endif /* CONFIG_SCHED_DEBUG */
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
-void scheduler_tick(void)
+void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr = rq->curr;
+ /* accounting goes to the donor task */
+ struct task_struct *donor;
struct rq_flags rf;
- unsigned long thermal_pressure;
+ unsigned long hw_pressure;
+ u64 resched_latency;
+
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
+ arch_scale_freq_tick();
- arch_scale_freq_tick();
sched_clock_tick();
rq_lock(rq, &rf);
+ donor = rq->donor;
+
+ psi_account_irqtime(rq, donor, NULL);
update_rq_clock(rq);
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
- curr->sched_class->task_tick(rq, curr, 0);
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
+
+ if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+ resched_curr(rq);
+
+ donor->sched_class->task_tick(rq, donor, 0);
+ if (sched_feat(LATENCY_WARN))
+ resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
- psi_task_tick(rq);
+ sched_core_tick(rq);
+ task_tick_mm_cid(rq, donor);
+ scx_tick(rq);
rq_unlock(rq, &rf);
+ if (sched_feat(LATENCY_WARN) && resched_latency)
+ resched_latency_warn(cpu, resched_latency);
+
perf_event_task_tick();
+ if (donor->flags & PF_WQ_WORKER)
+ wq_worker_tick(donor);
+
#ifdef CONFIG_SMP
- rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq);
+ if (!scx_switched_all()) {
+ rq->idle_balance = idle_cpu(cpu);
+ sched_balance_trigger(rq);
+ }
#endif
}
@@ -3744,9 +5727,6 @@ static void sched_tick_remote(struct work_struct *work)
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr;
- struct rq_flags rf;
- u64 delta;
int os;
/*
@@ -3756,30 +5736,32 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
- if (!tick_nohz_tick_stopped_cpu(cpu))
- goto out_requeue;
-
- rq_lock_irq(rq, &rf);
- curr = rq->curr;
- if (cpu_is_offline(cpu))
- goto out_unlock;
+ if (tick_nohz_tick_stopped_cpu(cpu)) {
+ guard(rq_lock_irq)(rq);
+ struct task_struct *curr = rq->curr;
- update_rq_clock(rq);
+ if (cpu_online(cpu)) {
+ /*
+ * Since this is a remote tick for full dynticks mode,
+ * we are always sure that there is no proxy (only a
+ * single task is running).
+ */
+ SCHED_WARN_ON(rq->curr != rq->donor);
+ update_rq_clock(rq);
+
+ if (!is_idle_task(curr)) {
+ /*
+ * Make sure the next tick runs within a
+ * reasonable amount of time.
+ */
+ u64 delta = rq_clock_task(rq) - curr->se.exec_start;
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ }
+ curr->sched_class->task_tick(rq, curr, 0);
- if (!is_idle_task(curr)) {
- /*
- * Make sure the next tick runs within a reasonable
- * amount of time.
- */
- delta = rq_clock_task(rq) - curr->se.exec_start;
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ calc_load_nohz_remote(rq);
+ }
}
- curr->sched_class->task_tick(rq, curr, 0);
-
- calc_load_nohz_remote(rq);
-out_unlock:
- rq_unlock_irq(rq, &rf);
-out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
@@ -3798,7 +5780,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -3819,7 +5801,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -3947,13 +5929,11 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
- && in_atomic_preempt_off()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
- if (panic_on_warn)
- panic("scheduling while atomic\n");
+ check_panic_on_warn("scheduling while atomic");
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
@@ -3973,7 +5953,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
- if (!preempt && prev->state && prev->non_block_count) {
+ if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
prev->comm, prev->pid, prev->non_block_count);
dump_stack();
@@ -3986,17 +5966,32 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
+ SCHED_WARN_ON(ct_state() == CT_STATE_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
schedstat_inc(this_rq()->sched_count);
}
-static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf)
+static void prev_balance(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
{
-#ifdef CONFIG_SMP
+ const struct sched_class *start_class = prev->sched_class;
const struct sched_class *class;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ /*
+ * SCX requires a balance() call before every pick_task() including when
+ * waking up from SCHED_IDLE. If @start_class is below SCX, start from
+ * SCX instead. Also, set a flag to detect missing balance() call.
+ */
+ if (scx_enabled()) {
+ rq->scx.flags |= SCX_RQ_BAL_PENDING;
+ if (sched_class_above(&ext_sched_class, start_class))
+ start_class = &ext_sched_class;
+ }
+#endif
+
/*
* We must do the balancing pass before put_prev_task(), such
* that when we release the rq->lock the task is in the same
@@ -4005,58 +6000,607 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
* We can terminate the balance pass as soon as we know there is
* a runnable task of @class priority or higher.
*/
- for_class_range(class, prev->sched_class, &idle_sched_class) {
- if (class->balance(rq, prev, rf))
+ for_active_class_range(class, start_class, &idle_sched_class) {
+ if (class->balance && class->balance(rq, prev, rf))
break;
}
-#endif
-
- put_prev_task(rq, prev);
}
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
+ rq->dl_server = NULL;
+
+ if (scx_enabled())
+ goto restart;
+
/*
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
- * higher scheduling class, because otherwise those loose the
+ * higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely((prev->sched_class == &idle_sched_class ||
- prev->sched_class == &fair_sched_class) &&
- rq->nr_running == rq->cfs.h_nr_running)) {
+ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
+ rq->nr_running == rq->cfs.h_nr_queued)) {
p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
- /* Assumes fair_sched_class->next == idle_sched_class */
+ /* Assume the next prioritized class is idle_sched_class */
if (!p) {
- put_prev_task(rq, prev);
- p = pick_next_task_idle(rq);
+ p = pick_task_idle(rq);
+ put_prev_set_next_task(rq, prev, p);
}
return p;
}
restart:
- put_prev_task_balance(rq, prev, rf);
+ prev_balance(rq, prev, rf);
+
+ for_each_active_class(class) {
+ if (class->pick_next_task) {
+ p = class->pick_next_task(rq, prev);
+ if (p)
+ return p;
+ } else {
+ p = class->pick_task(rq);
+ if (p) {
+ put_prev_set_next_task(rq, prev, p);
+ return p;
+ }
+ }
+ }
+
+ BUG(); /* The idle class should always have a runnable task. */
+}
+
+#ifdef CONFIG_SCHED_CORE
+static inline bool is_task_rq_idle(struct task_struct *t)
+{
+ return (task_rq(t)->idle == t);
+}
- for_each_class(class) {
- p = class->pick_next_task(rq);
+static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
+{
+ return is_task_rq_idle(a) || (a->core_cookie == cookie);
+}
+
+static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
+{
+ if (is_task_rq_idle(a) || is_task_rq_idle(b))
+ return true;
+
+ return a->core_cookie == b->core_cookie;
+}
+
+static inline struct task_struct *pick_task(struct rq *rq)
+{
+ const struct sched_class *class;
+ struct task_struct *p;
+
+ rq->dl_server = NULL;
+
+ for_each_active_class(class) {
+ p = class->pick_task(rq);
if (p)
return p;
}
- /* The idle class should always have a runnable task: */
- BUG();
+ BUG(); /* The idle class should always have a runnable task. */
+}
+
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
+static void queue_core_balance(struct rq *rq);
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *next, *p, *max = NULL;
+ const struct cpumask *smt_mask;
+ bool fi_before = false;
+ bool core_clock_updated = (rq == rq->core);
+ unsigned long cookie;
+ int i, cpu, occ = 0;
+ struct rq *rq_i;
+ bool need_sync;
+
+ if (!sched_core_enabled(rq))
+ return __pick_next_task(rq, prev, rf);
+
+ cpu = cpu_of(rq);
+
+ /* Stopper task is switching into idle, no need for core-wide selection. */
+ if (cpu_is_offline(cpu)) {
+ /*
+ * Reset core_pick so that we don't enter the fastpath when
+ * coming online. core_pick would already be migrated to
+ * another cpu during offline.
+ */
+ rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
+ return __pick_next_task(rq, prev, rf);
+ }
+
+ /*
+ * If there were no {en,de}queues since we picked (IOW, the task
+ * pointers are all still valid), and we haven't scheduled the last
+ * pick yet, do so now.
+ *
+ * rq->core_pick can be NULL if no selection was made for a CPU because
+ * it was either offline or went offline during a sibling's core-wide
+ * selection. In this case, do a core-wide selection.
+ */
+ if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+ rq->core->core_pick_seq != rq->core_sched_seq &&
+ rq->core_pick) {
+ WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+ next = rq->core_pick;
+ rq->dl_server = rq->core_dl_server;
+ rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
+ goto out_set_next;
+ }
+
+ prev_balance(rq, prev, rf);
+
+ smt_mask = cpu_smt_mask(cpu);
+ need_sync = !!rq->core->core_cookie;
+
+ /* reset state */
+ rq->core->core_cookie = 0UL;
+ if (rq->core->core_forceidle_count) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ /* reset after accounting force idle */
+ rq->core->core_forceidle_start = 0;
+ rq->core->core_forceidle_count = 0;
+ rq->core->core_forceidle_occupation = 0;
+ need_sync = true;
+ fi_before = true;
+ }
+
+ /*
+ * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
+ *
+ * @task_seq guards the task state ({en,de}queues)
+ * @pick_seq is the @task_seq we did a selection on
+ * @sched_seq is the @pick_seq we scheduled
+ *
+ * However, preemptions can cause multiple picks on the same task set.
+ * 'Fix' this by also increasing @task_seq for every pick.
+ */
+ rq->core->core_task_seq++;
+
+ /*
+ * Optimize for common case where this CPU has no cookies
+ * and there are no cookied tasks running on siblings.
+ */
+ if (!need_sync) {
+ next = pick_task(rq);
+ if (!next->core_cookie) {
+ rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
+ /*
+ * For robustness, update the min_vruntime_fi for
+ * unconstrained picks as well.
+ */
+ WARN_ON_ONCE(fi_before);
+ task_vruntime_update(rq, next, false);
+ goto out_set_next;
+ }
+ }
+
+ /*
+ * For each thread: do the regular task pick and find the max prio task
+ * amongst them.
+ *
+ * Tie-break prio towards the current CPU
+ */
+ for_each_cpu_wrap(i, smt_mask, cpu) {
+ rq_i = cpu_rq(i);
+
+ /*
+ * Current cpu always has its clock updated on entrance to
+ * pick_next_task(). If the current cpu is not the core,
+ * the core may also have been updated above.
+ */
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
+ update_rq_clock(rq_i);
+
+ rq_i->core_pick = p = pick_task(rq_i);
+ rq_i->core_dl_server = rq_i->dl_server;
+
+ if (!max || prio_less(max, p, fi_before))
+ max = p;
+ }
+
+ cookie = rq->core->core_cookie = max->core_cookie;
+
+ /*
+ * For each thread: try and find a runnable task that matches @max or
+ * force idle.
+ */
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick;
+
+ if (!cookie_equals(p, cookie)) {
+ p = NULL;
+ if (cookie)
+ p = sched_core_find(rq_i, cookie);
+ if (!p)
+ p = idle_sched_class.pick_task(rq_i);
+ }
+
+ rq_i->core_pick = p;
+ rq_i->core_dl_server = NULL;
+
+ if (p == rq_i->idle) {
+ if (rq_i->nr_running) {
+ rq->core->core_forceidle_count++;
+ if (!fi_before)
+ rq->core->core_forceidle_seq++;
+ }
+ } else {
+ occ++;
+ }
+ }
+
+ if (schedstat_enabled() && rq->core->core_forceidle_count) {
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_occupation = occ;
+ }
+
+ rq->core->core_pick_seq = rq->core->core_task_seq;
+ next = rq->core_pick;
+ rq->core_sched_seq = rq->core->core_pick_seq;
+
+ /* Something should have been selected for current CPU */
+ WARN_ON_ONCE(!next);
+
+ /*
+ * Reschedule siblings
+ *
+ * NOTE: L1TF -- at this point we're no longer running the old task and
+ * sending an IPI (below) ensures the sibling will no longer be running
+ * their task. This ensures there is no inter-sibling overlap between
+ * non-matching user state.
+ */
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+
+ /*
+ * An online sibling might have gone offline before a task
+ * could be picked for it, or it might be offline but later
+ * happen to come online, but it's too late and nothing was
+ * picked for it. That's OK - it will pick tasks for itself,
+ * so ignore it.
+ */
+ if (!rq_i->core_pick)
+ continue;
+
+ /*
+ * Update for new !FI->FI transitions, or if continuing to be in !FI:
+ * fi_before fi update?
+ * 0 0 1
+ * 0 1 1
+ * 1 0 1
+ * 1 1 0
+ */
+ if (!(fi_before && rq->core->core_forceidle_count))
+ task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
+
+ rq_i->core_pick->core_occupation = occ;
+
+ if (i == cpu) {
+ rq_i->core_pick = NULL;
+ rq_i->core_dl_server = NULL;
+ continue;
+ }
+
+ /* Did we break L1TF mitigation requirements? */
+ WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
+
+ if (rq_i->curr == rq_i->core_pick) {
+ rq_i->core_pick = NULL;
+ rq_i->core_dl_server = NULL;
+ continue;
+ }
+
+ resched_curr(rq_i);
+ }
+
+out_set_next:
+ put_prev_set_next_task(rq, prev, next);
+ if (rq->core->core_forceidle_count && next == rq->idle)
+ queue_core_balance(rq);
+
+ return next;
+}
+/*
+ * try_steal_cookie - try to move one task carrying the core cookie of CPU
+ * @this from the runqueue of CPU @that onto the runqueue of CPU @this.
+ *
+ * Runs under guard(irq) and guard(double_rq_lock) on both runqueues. Gives
+ * up right away when the destination core has no cookie, when the
+ * destination is not currently running its idle task, or when
+ * sched_core_find() returns no candidate on the source runqueue.
+ *
+ * Candidates are walked with sched_core_next(); the first one that passes
+ * all checks is moved with move_queued_task_locked() and the destination is
+ * asked to reschedule.
+ *
+ * Return: true if a task was moved, false otherwise.
+ */
+static bool try_steal_cookie(int this, int that)
+{
+ struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
+ struct task_struct *p;
+ unsigned long cookie;
+ bool success = false;
+
+ guard(irq)();
+ guard(double_rq_lock)(dst, src);
+
+ cookie = dst->core->core_cookie;
+ if (!cookie)
+ return false;
+
+ if (dst->curr != dst->idle)
+ return false;
+
+ p = sched_core_find(src, cookie);
+ if (!p)
+ return false;
+
+ do {
+ if (p == src->core_pick || p == src->curr)
+ goto next; /* in use by the source: already picked or running */
+
+ if (!is_cpu_allowed(p, this))
+ goto next;
+
+ if (p->core_occupation > dst->idle->core_occupation)
+ goto next;
+ /*
+ * sched_core_find() and sched_core_next() will ensure
+ * that task @p is not throttled now, we also need to
+ * check whether the runqueue of the destination CPU is
+ * being throttled.
+ */
+ if (sched_task_is_throttled(p, this))
+ goto next;
+
+ move_queued_task_locked(src, dst, p);
+ resched_curr(dst);
+
+ success = true;
+ break;
+
+next:
+ p = sched_core_next(p, cookie);
+ } while (p);
+
+ return success;
+}
+/*
+ * steal_cookie_task - run try_steal_cookie() against the other CPUs in the
+ * span of @sd, iterating with for_each_cpu_wrap() from @cpu + 1.
+ *
+ * @cpu itself is skipped, and the walk is abandoned as soon as
+ * need_resched() becomes true.
+ *
+ * Return: true on the first successful steal, false if none succeeded.
+ */
+static bool steal_cookie_task(int cpu, struct sched_domain *sd)
+{
+ int i;
+
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
+ if (i == cpu)
+ continue;
+
+ if (need_resched())
+ break;
+
+ if (try_steal_cookie(cpu, i))
+ return true;
+ }
+
+ return false;
+}
+/*
+ * sched_core_balance - balance callback installed by queue_core_balance().
+ *
+ * Walks the sched domains of @rq's CPU under guard(preempt) and guard(rcu),
+ * calling steal_cookie_task() on each one until a steal succeeds or
+ * need_resched() becomes true. The @rq lock is released for the walk and
+ * re-acquired before returning.
+ */
+static void sched_core_balance(struct rq *rq)
+{
+ struct sched_domain *sd;
+ int cpu = cpu_of(rq);
+
+ guard(preempt)();
+ guard(rcu)();
+ /* try_steal_cookie() takes both runqueue locks itself, so drop ours first. */
+ raw_spin_rq_unlock_irq(rq);
+ for_each_domain(cpu, sd) {
+ if (need_resched())
+ break;
+
+ if (steal_cookie_task(cpu, sd))
+ break;
+ }
+ raw_spin_rq_lock_irq(rq);
+}
+/* Per-CPU callback head used to queue sched_core_balance() on a runqueue. */
+static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
+/*
+ * queue_core_balance - queue sched_core_balance() as a balance callback on
+ * @rq, using the core_balance_head of @rq's CPU.
+ *
+ * Nothing is queued unless core scheduling is enabled for @rq, the core has
+ * a cookie set, and @rq->nr_running is non-zero.
+ */
+static void queue_core_balance(struct rq *rq)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ if (!rq->core->core_cookie)
+ return;
+
+ if (!rq->nr_running) /* not forced idle */
+ return;
+
+ queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
+}
+/*
+ * Lock guard "core_lock", set up on a pointer to a CPU number as in
+ * guard(core_lock)(&cpu): the first expression calls sched_core_lock() and
+ * the second calls sched_core_unlock(), both on the CPU number and on the
+ * 'flags' word that is stored inside the guard.
+ */
+DEFINE_LOCK_GUARD_1(core_lock, int,
+ sched_core_lock(*_T->lock, &_T->flags),
+ sched_core_unlock(*_T->lock, &_T->flags),
+ unsigned long flags)
+/*
+ * sched_core_cpu_starting - attach the runqueue of @cpu to the core leader
+ * of its SMT mask.
+ *
+ * Runs under guard(core_lock). With a single CPU in the SMT mask the
+ * runqueue remains its own leader. Otherwise the leader is the sibling
+ * runqueue whose ->core points at itself; the ->core of @cpu's runqueue is
+ * set to it, and every runqueue in the mask is checked to agree on that
+ * leader. Warns if @cpu's runqueue is not its own leader on entry or if no
+ * leader is found among the siblings.
+ */
+static void sched_core_cpu_starting(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ int t;
+
+ guard(core_lock)(&cpu);
+
+ WARN_ON_ONCE(rq->core != rq);
+
+ /* if we're the first, we'll be our own leader */
+ if (cpumask_weight(smt_mask) == 1)
+ return;
+
+ /* find the leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ rq = cpu_rq(t);
+ if (rq->core == rq) {
+ core_rq = rq;
+ break;
+ }
+ }
+
+ if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
+ return;
+
+ /* install and validate core_rq */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
+
+ if (t == cpu)
+ rq->core = core_rq;
+
+ WARN_ON_ONCE(rq->core != core_rq);
+ }
+}
+/*
+ * sched_core_cpu_deactivate - hand core leadership away from @cpu.
+ *
+ * Runs under guard(core_lock). Nothing is done when @cpu is alone in its SMT
+ * mask or when its runqueue is not the current leader. Otherwise the first
+ * other CPU in the mask becomes the leader: the shared core state is copied
+ * to its runqueue, its core_forceidle_start is reset to 0, and the ->core of
+ * every runqueue in the mask (including @cpu's) is pointed at it.
+ */
+static void sched_core_cpu_deactivate(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ int t;
+
+ guard(core_lock)(&cpu);
+
+ /* if we're the last man standing, nothing to do */
+ if (cpumask_weight(smt_mask) == 1) {
+ WARN_ON_ONCE(rq->core != rq);
+ return;
+ }
+
+ /* if we're not the leader, nothing to do */
+ if (rq->core != rq)
+ return;
+
+ /* find a new leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ core_rq = cpu_rq(t);
+ break;
+ }
+
+ if (WARN_ON_ONCE(!core_rq)) /* impossible */
+ return;
+
+ /* copy the shared state to the new leader */
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle_count = rq->core_forceidle_count;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+ /*
+ * Accounting edge for forced idle is handled in pick_next_task().
+ * Don't need another one here, since the hotplug thread shouldn't
+ * have a cookie.
+ */
+ core_rq->core_forceidle_start = 0;
+
+ /* install new leader */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
+ rq->core = core_rq;
+ }
+}
+/* sched_core_cpu_dying - point the ->core of @cpu's runqueue at itself. */
+static inline void sched_core_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->core != rq)
+ rq->core = rq;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+/* Without CONFIG_SCHED_CORE the sched_core_cpu_*() hooks are empty. */
+static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
+static inline void sched_core_cpu_dying(unsigned int cpu) {}
+/* Without CONFIG_SCHED_CORE, picking is a direct call to __pick_next_task(). */
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return __pick_next_task(rq, prev, rf);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
+/*
+ * Constants for the sched_mode argument of __schedule().
+ *
+ * The mode argument allows RT enabled kernels to differentiate a
+ * preemption from blocking on a 'sleeping' spin/rwlock.
+ *
+ * SM_IDLE is passed by schedule_idle().
+ * SM_NONE is passed by schedule(), rt_mutex_schedule() and do_task_dead().
+ * SM_PREEMPT is passed by the preempt_schedule*() entry points.
+ * SM_RTLOCK_WAIT is passed by schedule_rtlock() (CONFIG_PREEMPT_RT only).
+ *
+ * __schedule() initially treats every mode above SM_NONE as a preemption,
+ * but only SM_PREEMPT when deciding whether to block the previous task.
+ */
+#define SM_IDLE (-1)
+#define SM_NONE 0
+#define SM_PREEMPT 1
+#define SM_RTLOCK_WAIT 2
+
+/*
+ * Helper function for __schedule()
+ *
+ * If @p has a signal pending for @task_state, its __state is set back to
+ * TASK_RUNNING and nothing else is done. Otherwise
+ * sched_contributes_to_load is derived from @task_state and @p is handed
+ * to block_task(), with DEQUEUE_SPECIAL added for special task states.
+ *
+ * Return: true if block_task() was called for @p, false if @p was marked
+ * TASK_RUNNING instead.
+ */
+static bool try_to_block_task(struct rq *rq, struct task_struct *p,
+ unsigned long task_state)
+{
+ int flags = DEQUEUE_NOCLOCK;
+
+ if (signal_pending_state(task_state, p)) {
+ WRITE_ONCE(p->__state, TASK_RUNNING);
+ return false;
+ }
+
+ p->sched_contributes_to_load =
+ (task_state & TASK_UNINTERRUPTIBLE) &&
+ !(task_state & TASK_NOLOAD) &&
+ !(task_state & TASK_FROZEN);
+
+ if (unlikely(is_special_task_state(task_state)))
+ flags |= DEQUEUE_SPECIAL;
+
+ /*
+ * __schedule() ttwu()
+ * prev_state = prev->state; if (p->on_rq && ...)
+ * if (prev_state) goto out;
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
+ * p->state = TASK_WAKING
+ *
+ * Where __schedule() and ttwu() have matching control dependencies.
+ *
+ * After this, schedule() must not care about p->state any more.
+ */
+ block_task(rq, p, flags);
+ return true;
}
/*
@@ -4070,7 +6614,7 @@ restart:
* paths. For example, see arch/x86/entry_64.S.
*
* To drive preemption between tasks, the scheduler sets the flag in timer
- * interrupt handler scheduler_tick().
+ * interrupt handler sched_tick().
*
* 3. Wakeups don't really cause entry into schedule(). They add a
* task to the run-queue and that's it.
@@ -4098,9 +6642,14 @@ restart:
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched notrace __schedule(bool preempt)
+static void __sched notrace __schedule(int sched_mode)
{
struct task_struct *prev, *next;
+ /*
+ * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
+ * as a preemption by schedule_debug() and RCU.
+ */
+ bool preempt = sched_mode > SM_NONE;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
@@ -4113,7 +6662,7 @@ static void __sched notrace __schedule(bool preempt)
schedule_debug(prev, preempt);
- if (sched_feat(HRTICK))
+ if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
@@ -4132,7 +6681,9 @@ static void __sched notrace __schedule(bool preempt)
* if (signal_pending_state()) if (p->state & @state)
*
* Also, the membarrier system call requires a full memory barrier
- * after coming from user-space, before storing to rq->curr.
+ * after coming from user-space, before storing to rq->curr; this
+ * barrier matches a full barrier in the proximity of the membarrier
+ * system call exit.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
@@ -4140,53 +6691,37 @@ static void __sched notrace __schedule(bool preempt)
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
+ rq->clock_update_flags = RQCF_UPDATED;
switch_count = &prev->nivcsw;
+ /* Task state changes only considers SM_PREEMPT as preemption */
+ preempt = sched_mode == SM_PREEMPT;
+
/*
* We must load prev->state once (task_struct::state is volatile), such
- * that:
- *
- * - we form a control dependency vs deactivate_task() below.
- * - ptrace_{,un}freeze_traced() can change ->state underneath us.
- */
- prev_state = prev->state;
- if (!preempt && prev_state) {
- if (signal_pending_state(prev_state, prev)) {
- prev->state = TASK_RUNNING;
- } else {
- prev->sched_contributes_to_load =
- (prev_state & TASK_UNINTERRUPTIBLE) &&
- !(prev_state & TASK_NOLOAD) &&
- !(prev->flags & PF_FROZEN);
-
- if (prev->sched_contributes_to_load)
- rq->nr_uninterruptible++;
-
- /*
- * __schedule() ttwu()
- * prev_state = prev->state; if (p->on_rq && ...)
- * if (prev_state) goto out;
- * p->on_rq = 0; smp_acquire__after_ctrl_dep();
- * p->state = TASK_WAKING
- *
- * Where __schedule() and ttwu() have matching control dependencies.
- *
- * After this, schedule() must not care about p->state any more.
- */
- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
-
- if (prev->in_iowait) {
- atomic_inc(&rq->nr_iowait);
- delayacct_blkio_start();
- }
+ * that we form a control dependency vs deactivate_task() below.
+ */
+ prev_state = READ_ONCE(prev->__state);
+ if (sched_mode == SM_IDLE) {
+ /* SCX must consult the BPF scheduler to tell if rq is empty */
+ if (!rq->nr_running && !scx_enabled()) {
+ next = prev;
+ goto picked;
}
+ } else if (!preempt && prev_state) {
+ try_to_block_task(rq, prev, prev_state);
switch_count = &prev->nvcsw;
}
next = pick_next_task(rq, prev, &rf);
+ rq_set_donor(rq, next);
+picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
+ rq->last_seen_need_resched_ns = 0;
+#endif
if (likely(prev != next)) {
rq->nr_switches++;
@@ -4202,27 +6737,37 @@ static void __sched notrace __schedule(bool preempt)
*
* Here are the schemes providing that barrier on the
* various architectures:
- * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
- * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
+ * RISC-V. switch_mm() relies on membarrier_arch_switch_mm()
+ * on PowerPC and on RISC-V.
* - finish_lock_switch() for weakly-ordered
* architectures where spin_unlock is a full barrier,
* - switch_to() for arm64 (weakly-ordered, spin_unlock
* is a RELEASE barrier),
+ *
+ * The barrier matches a full barrier in the proximity of
+ * the membarrier system call entry.
+ *
+ * On RISC-V, this barrier pairing is also needed for the
+ * SYNC_CORE command when switching between processes, cf.
+ * the inline comments in membarrier_arch_switch_mm().
*/
++*switch_count;
- psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+ migrate_disable_switch(rq, prev);
+ psi_account_irqtime(rq, prev, next);
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
+ prev->se.sched_delayed);
- trace_sched_switch(preempt, prev, next);
+ trace_sched_switch(preempt, prev, next, prev_state);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
- rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- rq_unlock_irq(rq, &rf);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_rq_unlock_irq(rq);
}
-
- balance_callback(rq);
}
void __noreturn do_task_dead(void)
@@ -4233,7 +6778,7 @@ void __noreturn do_task_dead(void)
/* Tell freezer to ignore us: */
current->flags |= PF_NOFREEZE;
- __schedule(false);
+ __schedule(SM_NONE);
BUG();
/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
@@ -4243,57 +6788,73 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state)
- return;
+ static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
+ unsigned int task_flags;
/*
- * If a worker went to sleep, notify and ask workqueue whether
- * it wants to wake up a task to maintain concurrency.
- * As this function is called inside the schedule() context,
- * we disable preemption to avoid it calling schedule() again
- * in the possible wakeup of a kworker and because wq_worker_sleeping()
- * requires it.
+ * Establish LD_WAIT_CONFIG context to ensure none of the code called
+ * will use a blocking primitive -- which would lead to recursion.
*/
- if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
- preempt_disable();
- if (tsk->flags & PF_WQ_WORKER)
- wq_worker_sleeping(tsk);
- else
- io_wq_worker_sleeping(tsk);
- preempt_enable_no_resched();
- }
+ lock_map_acquire_try(&sched_map);
- if (tsk_is_pi_blocked(tsk))
- return;
+ task_flags = tsk->flags;
+ /*
+ * If a worker goes to sleep, notify and ask workqueue whether it
+ * wants to wake up a task to maintain concurrency.
+ */
+ if (task_flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+ else if (task_flags & PF_IO_WORKER)
+ io_wq_worker_sleeping(tsk);
+
+ /*
+ * spinlock and rwlock must not flush block requests. This will
+ * deadlock if the callback attempts to acquire a lock which is
+ * already acquired.
+ */
+ SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
*/
- if (blk_needs_flush_plug(tsk))
- blk_schedule_flush_plug(tsk);
+ blk_flush_plug(tsk->plug, true);
+
+ lock_map_release(&sched_map);
}
static void sched_update_worker(struct task_struct *tsk)
{
- if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) {
+ if (tsk->flags & PF_BLOCK_TS)
+ blk_plug_invalidate_ts(tsk);
if (tsk->flags & PF_WQ_WORKER)
wq_worker_running(tsk);
- else
+ else if (tsk->flags & PF_IO_WORKER)
io_wq_worker_running(tsk);
}
}
-asmlinkage __visible void __sched schedule(void)
+static __always_inline void __schedule_loop(int sched_mode)
{
- struct task_struct *tsk = current;
-
- sched_submit_work(tsk);
do {
preempt_disable();
- __schedule(false);
+ __schedule(sched_mode);
sched_preempt_enable_no_resched();
} while (need_resched());
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ struct task_struct *tsk = current;
+
+#ifdef CONFIG_RT_MUTEXES
+ lockdep_assert(!tsk->sched_rt_mutex);
+#endif
+
+ if (!task_is_running(tsk))
+ sched_submit_work(tsk);
+ __schedule_loop(SM_NONE);
sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
@@ -4312,18 +6873,18 @@ void __sched schedule_idle(void)
{
/*
* As this skips calling sched_submit_work(), which the idle task does
- * regardless because that function is a nop when the task is in a
+ * regardless because that function is a NOP when the task is in a
* TASK_RUNNING state, make sure this isn't used someplace that the
* current task can be in any other state. Note, idle is always in the
* TASK_RUNNING state.
*/
- WARN_ON_ONCE(current->state);
+ WARN_ON_ONCE(current->__state);
do {
- __schedule(false);
+ __schedule(SM_IDLE);
} while (need_resched());
}
-#ifdef CONFIG_CONTEXT_TRACKING
+#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
asmlinkage __visible void __sched schedule_user(void)
{
/*
@@ -4333,7 +6894,7 @@ asmlinkage __visible void __sched schedule_user(void)
* we find a better solution.
*
* NB: There are buggy callers of this function. Ideally we
- * should warn if prev_state != CONTEXT_USER, but that will trigger
+ * should warn if prev_state != CT_STATE_USER, but that will trigger
* too frequently to make sense yet.
*/
enum ctx_state prev_state = exception_enter();
@@ -4354,6 +6915,14 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
+#ifdef CONFIG_PREEMPT_RT
+void __sched notrace schedule_rtlock(void)
+{
+ __schedule_loop(SM_RTLOCK_WAIT); /* unlike schedule(): no sched_submit_work()/sched_update_worker() */
+}
+NOKPROBE_SYMBOL(schedule_rtlock);
+#endif /* CONFIG_PREEMPT_RT */
+
static void __sched notrace preempt_schedule_common(void)
{
do {
@@ -4372,7 +6941,7 @@ static void __sched notrace preempt_schedule_common(void)
*/
preempt_disable_notrace();
preempt_latency_start(1);
- __schedule(true);
+ __schedule(SM_PREEMPT);
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
@@ -4396,12 +6965,32 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
*/
if (likely(!preemptible()))
return;
-
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) /* static call variant */
+#ifndef preempt_schedule_dynamic_enabled /* defaults, unless already defined */
+#define preempt_schedule_dynamic_enabled preempt_schedule
+#define preempt_schedule_dynamic_disabled NULL
+#endif /* !preempt_schedule_dynamic_enabled */
+DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) /* static key variant */
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
+void __sched notrace dynamic_preempt_schedule(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
+ return; /* key is off: preempt_schedule() is not called */
+ preempt_schedule();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule);
+EXPORT_SYMBOL(dynamic_preempt_schedule);
+#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL / CONFIG_HAVE_PREEMPT_DYNAMIC_KEY */
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
*
@@ -4445,7 +7034,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
* an infinite recursion.
*/
prev_ctx = exception_enter();
- __schedule(true);
+ __schedule(SM_PREEMPT);
exception_exit(prev_ctx);
preempt_latency_stop(1);
@@ -4454,13 +7043,34 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) /* static call variant */
+#ifndef preempt_schedule_notrace_dynamic_enabled /* defaults, unless already defined */
+#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
+#define preempt_schedule_notrace_dynamic_disabled NULL
+#endif /* !preempt_schedule_notrace_dynamic_enabled */
+DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) /* static key variant */
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
+void __sched notrace dynamic_preempt_schedule_notrace(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
+ return; /* key is off: preempt_schedule_notrace() is not called */
+ preempt_schedule_notrace();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
+EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
+#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL / CONFIG_HAVE_PREEMPT_DYNAMIC_KEY */
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
#endif /* CONFIG_PREEMPTION */
/*
* This is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note, that this is called and return with irqs disabled. This will
- * protect us against recursive calling from irq.
+ * off of IRQ context.
+ * Note, that this is called and return with IRQs disabled. This will
+ * protect us against recursive calling from IRQ contexts.
*/
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
@@ -4474,7 +7084,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
do {
preempt_disable();
local_irq_enable();
- __schedule(true);
+ __schedule(SM_PREEMPT);
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());
@@ -4485,26 +7095,53 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
- WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
+const struct sched_class *__setscheduler_class(int policy, int prio)
+{
+ if (dl_prio(prio)) /* @prio is tested before @policy */
+ return &dl_sched_class;
+ /* Not a deadline priority: a realtime priority selects the RT class. */
+ if (rt_prio(prio))
+ return &rt_sched_class;
+ /* Neither: with CONFIG_SCHED_CLASS_EXT, @policy may select the ext class. */
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (task_should_scx(policy))
+ return &ext_sched_class;
+#endif
+ /* Everything else gets the fair class. */
+ return &fair_sched_class;
+}
+
#ifdef CONFIG_RT_MUTEXES
-static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
-{
- if (pi_task)
- prio = min(prio, pi_task->prio);
+/*
+ * Would be more useful with typeof()/auto_type but they don't mix with
+ * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ * name such that if someone were to implement this function we get to compare
+ * notes.
+ */
+#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
- return prio;
+void rt_mutex_pre_schedule(void)
+{
+ lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+ sched_submit_work(current);
}
-static inline int rt_effective_prio(struct task_struct *p, int prio)
+void rt_mutex_schedule(void)
{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ lockdep_assert(current->sched_rt_mutex);
+ __schedule_loop(SM_NONE);
+}
- return __rt_effective_prio(pi_task, prio);
+void rt_mutex_post_schedule(void)
+{
+ sched_update_worker(current);
+ lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
}
/*
@@ -4522,7 +7159,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
int prio, oldprio, queued, running, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- const struct sched_class *prev_class;
+ const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
struct rq *rq;
@@ -4545,7 +7182,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
* right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
* ensure a task is de-boosted (pi_task is set to NULL) before the
* task is allowed to run again (and can exit). This ensures the pointer
- * points to a blocked task -- which guaratees the task is present.
+ * points to a blocked task -- which guarantees the task is present.
*/
p->pi_top_task = pi_task;
@@ -4556,7 +7193,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
goto out_unlock;
/*
- * Idle task boosting is a nono in general. There is one
+ * Idle task boosting is a no-no in general. There is one
* exception, when PREEMPT_RT and NOHZ is active:
*
* The idle task calls get_next_timer_interrupt() and holds
@@ -4580,8 +7217,13 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
+ next_class = __setscheduler_class(p->policy, prio);
+
+ if (prev_class != next_class && p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, queue_flag);
if (running)
@@ -4600,27 +7242,28 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_prio(pi_task->prio) &&
dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.dl_boosted = 1;
+ p->dl.pi_se = pi_task->dl.pi_se;
queue_flag |= ENQUEUE_REPLENISH;
- } else
- p->dl.dl_boosted = 0;
- p->sched_class = &dl_sched_class;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
+ p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
- p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
+ p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
- p->sched_class = &fair_sched_class;
}
+ p->sched_class = next_class;
p->prio = prio;
+ check_class_changing(rq, p, prev_class);
+
if (queued)
enqueue_task(rq, p, queue_flag);
if (running)
@@ -4630,1282 +7273,390 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
- __task_rq_unlock(rq, &rf);
- balance_callback(rq);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_rq_unlock(rq);
+
preempt_enable();
}
-#else
-static inline int rt_effective_prio(struct task_struct *p, int prio)
-{
- return prio;
-}
#endif
-void set_user_nice(struct task_struct *p, long nice)
+#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+int __sched __cond_resched(void)
{
- bool queued, running;
- int old_prio;
- struct rq_flags rf;
- struct rq *rq;
-
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
- return;
- /*
- * We have to be careful, if called from sys_setpriority(),
- * the task might be in the middle of scheduling on another CPU.
- */
- rq = task_rq_lock(p, &rf);
- update_rq_clock(rq);
-
- /*
- * The RT priorities are set via sched_setscheduler(), but we still
- * allow the 'normal' nice value to be set - but as expected
- * it wont have any effect on scheduling until the task is
- * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
- */
- if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
- p->static_prio = NICE_TO_PRIO(nice);
- goto out_unlock;
+ if (should_resched(0) && !irqs_disabled()) {
+ preempt_schedule_common();
+ return 1;
}
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- if (running)
- put_prev_task(rq, p);
-
- p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p, true);
- old_prio = p->prio;
- p->prio = effective_prio(p);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
-
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- p->sched_class->prio_changed(rq, p, old_prio);
-
-out_unlock:
- task_rq_unlock(rq, p, &rf);
-}
-EXPORT_SYMBOL(set_user_nice);
-
-/*
- * can_nice - check if a task can reduce its nice value
- * @p: task
- * @nice: nice value
- */
-int can_nice(const struct task_struct *p, const int nice)
-{
- /* Convert nice value [19,-20] to rlimit style value [1,40]: */
- int nice_rlim = nice_to_rlimit(nice);
-
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
-}
-
-#ifdef __ARCH_WANT_SYS_NICE
-
-/*
- * sys_nice - change the priority of the current process.
- * @increment: priority increment
- *
- * sys_setpriority is a more generic, but much slower function that
- * does similar things.
- */
-SYSCALL_DEFINE1(nice, int, increment)
-{
- long nice, retval;
-
/*
- * Setpriority might change our priority at the same moment.
- * We don't have to worry. Conceptually one call occurs first
- * and we have a single winner.
+ * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+ * whether the current CPU is in an RCU read-side critical section,
+ * so the tick can report quiescent states even for CPUs looping
+ * in kernel context. In contrast, in non-preemptible kernels,
+ * RCU readers leave no in-memory hints, which means that CPU-bound
+ * processes executing in kernel context might never report an
+ * RCU quiescent state. Therefore, the following code causes
+ * cond_resched() to report a quiescent state, but only when RCU
+ * is in urgent need of one.
*/
- increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
- nice = task_nice(current) + increment;
-
- nice = clamp_val(nice, MIN_NICE, MAX_NICE);
- if (increment < 0 && !can_nice(current, nice))
- return -EPERM;
-
- retval = security_task_setnice(current, nice);
- if (retval)
- return retval;
-
- set_user_nice(current, nice);
+#ifndef CONFIG_PREEMPT_RCU
+ rcu_all_qs();
+#endif
return 0;
}
-
+EXPORT_SYMBOL(__cond_resched);
#endif
-/**
- * task_prio - return the priority value of a given task.
- * @p: the task in question.
- *
- * Return: The priority value as seen by users in /proc.
- * RT tasks are offset by -200. Normal tasks are centered
- * around 0, value goes from -16 to +15.
- */
-int task_prio(const struct task_struct *p)
-{
- return p->prio - MAX_RT_PRIO;
-}
-
-/**
- * idle_cpu - is a given CPU idle currently?
- * @cpu: the processor in question.
- *
- * Return: 1 if the CPU is currently idle. 0 otherwise.
- */
-int idle_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- if (rq->curr != rq->idle)
- return 0;
-
- if (rq->nr_running)
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define cond_resched_dynamic_enabled __cond_resched
+#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
+DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(cond_resched);
+
+#define might_resched_dynamic_enabled __cond_resched
+#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
+DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
+EXPORT_STATIC_CALL_TRAMP(might_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
+int __sched dynamic_cond_resched(void)
+{
+ klp_sched_try_switch();
+ if (!static_branch_unlikely(&sk_dynamic_cond_resched))
return 0;
-
-#ifdef CONFIG_SMP
- if (rq->ttwu_pending)
- return 0;
-#endif
-
- return 1;
+ return __cond_resched();
}
+EXPORT_SYMBOL(dynamic_cond_resched);
-/**
- * available_idle_cpu - is a given CPU idle for enqueuing work.
- * @cpu: the CPU in question.
- *
- * Return: 1 if the CPU is currently idle. 0 otherwise.
- */
-int available_idle_cpu(int cpu)
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
+int __sched dynamic_might_resched(void)
{
- if (!idle_cpu(cpu))
- return 0;
-
- if (vcpu_is_preempted(cpu))
+ if (!static_branch_unlikely(&sk_dynamic_might_resched))
return 0;
-
- return 1;
-}
-
-/**
- * idle_task - return the idle task for a given CPU.
- * @cpu: the processor in question.
- *
- * Return: The idle task for the CPU @cpu.
- */
-struct task_struct *idle_task(int cpu)
-{
- return cpu_rq(cpu)->idle;
-}
-
-/**
- * find_process_by_pid - find a process with a matching PID value.
- * @pid: the pid in question.
- *
- * The task of @pid, if found. %NULL otherwise.
- */
-static struct task_struct *find_process_by_pid(pid_t pid)
-{
- return pid ? find_task_by_vpid(pid) : current;
+ return __cond_resched();
}
-
-/*
- * sched_setparam() passes in -1 for its policy, to let the functions
- * it calls know not to change it.
- */
-#define SETPARAM_POLICY -1
-
-static void __setscheduler_params(struct task_struct *p,
- const struct sched_attr *attr)
-{
- int policy = attr->sched_policy;
-
- if (policy == SETPARAM_POLICY)
- policy = p->policy;
-
- p->policy = policy;
-
- if (dl_policy(policy))
- __setparam_dl(p, attr);
- else if (fair_policy(policy))
- p->static_prio = NICE_TO_PRIO(attr->sched_nice);
-
- /*
- * __sched_setscheduler() ensures attr->sched_priority == 0 when
- * !rt_policy. Always setting this ensures that things like
- * getparam()/getattr() don't report silly values for !rt tasks.
- */
- p->rt_priority = attr->sched_priority;
- p->normal_prio = normal_prio(p);
- set_load_weight(p, true);
-}
-
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr, bool keep_boost)
-{
- /*
- * If params can't change scheduling class changes aren't allowed
- * either.
- */
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
- return;
-
- __setscheduler_params(p, attr);
-
- /*
- * Keep a potential priority boosting if called from
- * sched_setscheduler().
- */
- p->prio = normal_prio(p);
- if (keep_boost)
- p->prio = rt_effective_prio(p, p->prio);
-
- if (dl_prio(p->prio))
- p->sched_class = &dl_sched_class;
- else if (rt_prio(p->prio))
- p->sched_class = &rt_sched_class;
- else
- p->sched_class = &fair_sched_class;
-}
-
-/*
- * Check the target process has a UID that matches the current process's:
- */
-static bool check_same_owner(struct task_struct *p)
-{
- const struct cred *cred = current_cred(), *pcred;
- bool match;
-
- rcu_read_lock();
- pcred = __task_cred(p);
- match = (uid_eq(cred->euid, pcred->euid) ||
- uid_eq(cred->euid, pcred->uid));
- rcu_read_unlock();
- return match;
-}
-
-static int __sched_setscheduler(struct task_struct *p,
- const struct sched_attr *attr,
- bool user, bool pi)
-{
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
- const struct sched_class *prev_class;
- struct rq_flags rf;
- int reset_on_fork;
- int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- struct rq *rq;
-
- /* The pi code expects interrupts enabled */
- BUG_ON(pi && in_interrupt());
-recheck:
- /* Double check policy once rq lock held: */
- if (policy < 0) {
- reset_on_fork = p->sched_reset_on_fork;
- policy = oldpolicy = p->policy;
- } else {
- reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
-
- if (!valid_policy(policy))
- return -EINVAL;
- }
-
- if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
- return -EINVAL;
-
- /*
- * Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
- * SCHED_BATCH and SCHED_IDLE is 0.
- */
- if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
- return -EINVAL;
- if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
- (rt_policy(policy) != (attr->sched_priority != 0)))
- return -EINVAL;
-
- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
- if (user) {
- if (attr->sched_flags & SCHED_FLAG_SUGOV)
- return -EINVAL;
-
- retval = security_task_setscheduler(p);
- if (retval)
- return retval;
- }
-
- /* Update task specific "requested" clamps */
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
- retval = uclamp_validate(p, attr);
- if (retval)
- return retval;
- }
-
- if (pi)
- cpuset_read_lock();
-
- /*
- * Make sure no PI-waiters arrive (or leave) while we are
- * changing the priority of the task:
- *
- * To be able to change p->policy safely, the appropriate
- * runqueue lock must be held.
- */
- rq = task_rq_lock(p, &rf);
- update_rq_clock(rq);
-
- /*
- * Changing the policy of the stop threads its a very bad idea:
- */
- if (p == rq->stop) {
- retval = -EINVAL;
- goto unlock;
- }
-
- /*
- * If not changing anything there's no need to proceed further,
- * but store a possible modification of reset_on_fork.
- */
- if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
- goto change;
- if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
- goto change;
- if (dl_policy(policy) && dl_param_changed(p, attr))
- goto change;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
- goto change;
-
- p->sched_reset_on_fork = reset_on_fork;
- retval = 0;
- goto unlock;
- }
-change:
-
- if (user) {
-#ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Do not allow realtime tasks into groups that have no runtime
- * assigned.
- */
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0 &&
- !task_group_is_autogroup(task_group(p))) {
- retval = -EPERM;
- goto unlock;
- }
+EXPORT_SYMBOL(dynamic_might_resched);
#endif
-#ifdef CONFIG_SMP
- if (dl_bandwidth_enabled() && dl_policy(policy) &&
- !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
- cpumask_t *span = rq->rd->span;
-
- /*
- * Don't allow tasks with an affinity mask smaller than
- * the entire root_domain to become SCHED_DEADLINE. We
- * will also fail if there's no bandwidth available.
- */
- if (!cpumask_subset(span, p->cpus_ptr) ||
- rq->rd->dl_bw.bw == 0) {
- retval = -EPERM;
- goto unlock;
- }
- }
#endif
- }
-
- /* Re-check policy now with rq lock held: */
- if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
- policy = oldpolicy = -1;
- task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
- goto recheck;
- }
-
- /*
- * If setscheduling to SCHED_DEADLINE (or changing the parameters
- * of a SCHED_DEADLINE task) we need to check if enough bandwidth
- * is available.
- */
- if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
- retval = -EBUSY;
- goto unlock;
- }
-
- p->sched_reset_on_fork = reset_on_fork;
- oldprio = p->prio;
-
- if (pi) {
- /*
- * Take priority boosted tasks into account. If the new
- * effective priority is unchanged, we just store the new
- * normal parameters and do not touch the scheduler class and
- * the runqueue. This will be done when the task deboost
- * itself.
- */
- new_effective_prio = rt_effective_prio(p, newprio);
- if (new_effective_prio == oldprio)
- queue_flags &= ~DEQUEUE_MOVE;
- }
-
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flags);
- if (running)
- put_prev_task(rq, p);
-
- prev_class = p->sched_class;
-
- __setscheduler(rq, p, attr, pi);
- __setscheduler_uclamp(p, attr);
-
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- if (oldprio < p->prio)
- queue_flags |= ENQUEUE_HEAD;
-
- enqueue_task(rq, p, queue_flags);
- }
- if (running)
- set_next_task(rq, p);
-
- check_class_changed(rq, p, prev_class, oldprio);
-
- /* Avoid rq from going away on us: */
- preempt_disable();
- task_rq_unlock(rq, p, &rf);
-
- if (pi) {
- cpuset_read_unlock();
- rt_mutex_adjust_pi(p);
- }
-
- /* Run balance callbacks after we've adjusted the PI chain: */
- balance_callback(rq);
- preempt_enable();
-
- return 0;
-
-unlock:
- task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
- return retval;
-}
-
-static int _sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param, bool check)
-{
- struct sched_attr attr = {
- .sched_policy = policy,
- .sched_priority = param->sched_priority,
- .sched_nice = PRIO_TO_NICE(p->static_prio),
- };
-
- /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
- if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- policy &= ~SCHED_RESET_ON_FORK;
- attr.sched_policy = policy;
- }
-
- return __sched_setscheduler(p, &attr, check, true);
-}
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Return: 0 on success. An error code otherwise.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param)
-{
- return _sched_setscheduler(p, policy, param, true);
-}
-EXPORT_SYMBOL_GPL(sched_setscheduler);
-
-int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
-{
- return __sched_setscheduler(p, attr, true, true);
-}
-EXPORT_SYMBOL_GPL(sched_setattr);
-
-int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
-{
- return __sched_setscheduler(p, attr, false, true);
-}
-
-/**
- * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Just like sched_setscheduler, only don't bother checking if the
- * current context has permission. For example, this is needed in
- * stop_machine(): we create temporary high priority worker threads,
- * but our caller might not have that capability.
- *
- * Return: 0 on success. An error code otherwise.
- */
-int sched_setscheduler_nocheck(struct task_struct *p, int policy,
- const struct sched_param *param)
-{
- return _sched_setscheduler(p, policy, param, false);
-}
-EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
-
-static int
-do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
-{
- struct sched_param lparam;
- struct task_struct *p;
- int retval;
-
- if (!param || pid < 0)
- return -EINVAL;
- if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
- return -EFAULT;
-
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
-
- if (likely(p)) {
- retval = sched_setscheduler(p, policy, &lparam);
- put_task_struct(p);
- }
-
- return retval;
-}
/*
- * Mimics kernel/events/core.c perf_copy_attr().
- */
-static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
-{
- u32 size;
- int ret;
-
- /* Zero the full structure, so that a short copy will be nice: */
- memset(attr, 0, sizeof(*attr));
-
- ret = get_user(size, &uattr->size);
- if (ret)
- return ret;
-
- /* ABI compatibility quirk: */
- if (!size)
- size = SCHED_ATTR_SIZE_VER0;
- if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
- goto err_size;
-
- ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
- if (ret) {
- if (ret == -E2BIG)
- goto err_size;
- return ret;
- }
-
- if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
- size < SCHED_ATTR_SIZE_VER1)
- return -EINVAL;
-
- /*
- * XXX: Do we want to be lenient like existing syscalls; or do we want
- * to be strict and return an error on out-of-bounds values?
- */
- attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
-
- return 0;
-
-err_size:
- put_user(sizeof(*attr), &uattr->size);
- return -E2BIG;
-}
-
-/**
- * sys_sched_setscheduler - set/change the scheduler policy and RT priority
- * @pid: the pid in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
*
- * Return: 0 on success. An error code otherwise.
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
*/
-SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
+int __cond_resched_lock(spinlock_t *lock)
{
- if (policy < 0)
- return -EINVAL;
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
- return do_sched_setscheduler(pid, policy, param);
-}
+ lockdep_assert_held(lock);
-/**
- * sys_sched_setparam - set/change the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the new RT priority.
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
-{
- return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+ if (spin_needbreak(lock) || resched) {
+ spin_unlock(lock);
+ if (!_cond_resched())
+ cpu_relax();
+ ret = 1;
+ spin_lock(lock);
+ }
+ return ret;
}
+EXPORT_SYMBOL(__cond_resched_lock);
-/**
- * sys_sched_setattr - same as above, but with extended sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @flags: for future extension.
- */
-SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, flags)
+int __cond_resched_rwlock_read(rwlock_t *lock)
{
- struct sched_attr attr;
- struct task_struct *p;
- int retval;
-
- if (!uattr || pid < 0 || flags)
- return -EINVAL;
-
- retval = sched_copy_attr(uattr, &attr);
- if (retval)
- return retval;
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
- if ((int)attr.sched_policy < 0)
- return -EINVAL;
- if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
- attr.sched_policy = SETPARAM_POLICY;
+ lockdep_assert_held_read(lock);
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
-
- if (likely(p)) {
- retval = sched_setattr(p, &attr);
- put_task_struct(p);
+ if (rwlock_needbreak(lock) || resched) {
+ read_unlock(lock);
+ if (!_cond_resched())
+ cpu_relax();
+ ret = 1;
+ read_lock(lock);
}
-
- return retval;
+ return ret;
}
+EXPORT_SYMBOL(__cond_resched_rwlock_read);
-/**
- * sys_sched_getscheduler - get the policy (scheduling class) of a thread
- * @pid: the pid in question.
- *
- * Return: On success, the policy of the thread. Otherwise, a negative error
- * code.
- */
-SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+int __cond_resched_rwlock_write(rwlock_t *lock)
{
- struct task_struct *p;
- int retval;
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
+ int ret = 0;
- if (pid < 0)
- return -EINVAL;
+ lockdep_assert_held_write(lock);
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (p) {
- retval = security_task_getscheduler(p);
- if (!retval)
- retval = p->policy
- | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+ if (rwlock_needbreak(lock) || resched) {
+ write_unlock(lock);
+ if (!_cond_resched())
+ cpu_relax();
+ ret = 1;
+ write_lock(lock);
}
- rcu_read_unlock();
- return retval;
+ return ret;
}
+EXPORT_SYMBOL(__cond_resched_rwlock_write);
-/**
- * sys_sched_getparam - get the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the RT priority.
- *
- * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
- * code.
- */
-SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
-{
- struct sched_param lp = { .sched_priority = 0 };
- struct task_struct *p;
- int retval;
-
- if (!param || pid < 0)
- return -EINVAL;
-
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
-
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- if (task_has_rt_policy(p))
- lp.sched_priority = p->rt_priority;
- rcu_read_unlock();
-
- /*
- * This one might sleep, we cannot do it with a spinlock held ...
- */
- retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+#ifdef CONFIG_PREEMPT_DYNAMIC
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
-}
+#ifdef CONFIG_GENERIC_ENTRY
+#include <linux/entry-common.h>
+#endif
/*
- * Copy the kernel size attribute structure (which might be larger
- * than what user-space knows about) to user-space.
- *
- * Note that all cases are valid: user-space buffer can be larger or
- * smaller than the kernel-space buffer. The usual case is that both
- * have the same size.
- */
-static int
-sched_attr_copy_to_user(struct sched_attr __user *uattr,
- struct sched_attr *kattr,
- unsigned int usize)
-{
- unsigned int ksize = sizeof(*kattr);
-
- if (!access_ok(uattr, usize))
- return -EFAULT;
-
- /*
- * sched_getattr() ABI forwards and backwards compatibility:
- *
- * If usize == ksize then we just copy everything to user-space and all is good.
- *
- * If usize < ksize then we only copy as much as user-space has space for,
- * this keeps ABI compatibility as well. We skip the rest.
- *
- * If usize > ksize then user-space is using a newer version of the ABI,
- * which part the kernel doesn't know about. Just ignore it - tooling can
- * detect the kernel's knowledge of attributes from the attr->size value
- * which is set to ksize in this case.
- */
- kattr->size = min(usize, ksize);
-
- if (copy_to_user(uattr, kattr, kattr->size))
- return -EFAULT;
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
+ *
+ *
+ * NONE:
+ * cond_resched <- __cond_resched
+ * might_resched <- RET0
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ * dynamic_preempt_lazy <- false
+ *
+ * VOLUNTARY:
+ * cond_resched <- __cond_resched
+ * might_resched <- __cond_resched
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ * dynamic_preempt_lazy <- false
+ *
+ * FULL:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ * dynamic_preempt_lazy <- false
+ *
+ * LAZY:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ * dynamic_preempt_lazy <- true
+ */
+
+enum {
+ preempt_dynamic_undefined = -1,
+ preempt_dynamic_none,
+ preempt_dynamic_voluntary,
+ preempt_dynamic_full,
+ preempt_dynamic_lazy,
+};
- return 0;
-}
+int preempt_dynamic_mode = preempt_dynamic_undefined;
-/**
- * sys_sched_getattr - similar to sched_getparam, but with sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @usize: sizeof(attr) for fwd/bwd comp.
- * @flags: for future extension.
- */
-SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, usize, unsigned int, flags)
+int sched_dynamic_mode(const char *str)
{
- struct sched_attr kattr = { };
- struct task_struct *p;
- int retval;
+#ifndef CONFIG_PREEMPT_RT
+ if (!strcmp(str, "none"))
+ return preempt_dynamic_none;
- if (!uattr || pid < 0 || usize > PAGE_SIZE ||
- usize < SCHED_ATTR_SIZE_VER0 || flags)
- return -EINVAL;
-
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
-
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- kattr.sched_policy = p->policy;
- if (p->sched_reset_on_fork)
- kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- if (task_has_dl_policy(p))
- __getparam_dl(p, &kattr);
- else if (task_has_rt_policy(p))
- kattr.sched_priority = p->rt_priority;
- else
- kattr.sched_nice = task_nice(p);
-
-#ifdef CONFIG_UCLAMP_TASK
- kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ if (!strcmp(str, "voluntary"))
+ return preempt_dynamic_voluntary;
#endif
- rcu_read_unlock();
+ if (!strcmp(str, "full"))
+ return preempt_dynamic_full;
- return sched_attr_copy_to_user(uattr, &kattr, usize);
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+ if (!strcmp(str, "lazy"))
+ return preempt_dynamic_lazy;
+#endif
-out_unlock:
- rcu_read_unlock();
- return retval;
+ return -EINVAL;
}
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
-{
- cpumask_var_t cpus_allowed, new_mask;
- struct task_struct *p;
- int retval;
+#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
- rcu_read_lock();
-
- p = find_process_by_pid(pid);
- if (!p) {
- rcu_read_unlock();
- return -ESRCH;
- }
-
- /* Prevent p going away */
- get_task_struct(p);
- rcu_read_unlock();
-
- if (p->flags & PF_NO_SETAFFINITY) {
- retval = -EINVAL;
- goto out_put_task;
- }
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_put_task;
- }
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_free_cpus_allowed;
- }
- retval = -EPERM;
- if (!check_same_owner(p)) {
- rcu_read_lock();
- if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
- rcu_read_unlock();
- goto out_free_new_mask;
- }
- rcu_read_unlock();
- }
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
+#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
+#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
+#else
+#error "Unsupported PREEMPT_DYNAMIC mechanism"
+#endif
- retval = security_task_setscheduler(p);
- if (retval)
- goto out_free_new_mask;
+static DEFINE_MUTEX(sched_dynamic_mutex);
+static bool klp_override;
+
+static void __sched_dynamic_update(int mode)
+{
+ /*
+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+ * the ZERO state, which is invalid.
+ */
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
+
+ switch (mode) {
+ case preempt_dynamic_none:
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: none\n");
+ break;
+ case preempt_dynamic_voluntary:
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: voluntary\n");
+ break;
- cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, in_mask, cpus_allowed);
+ case preempt_dynamic_full:
+ if (!klp_override)
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: full\n");
+ break;
- /*
- * Since bandwidth control happens on root_domain basis,
- * if admission test is enabled, we only admit -deadline
- * tasks allowed to run on all the CPUs in the task's
- * root_domain.
- */
-#ifdef CONFIG_SMP
- if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
- rcu_read_lock();
- if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
- retval = -EBUSY;
- rcu_read_unlock();
- goto out_free_new_mask;
- }
- rcu_read_unlock();
+ case preempt_dynamic_lazy:
+ if (!klp_override)
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_enable(preempt_lazy);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: lazy\n");
+ break;
}
-#endif
-again:
- retval = __set_cpus_allowed_ptr(p, new_mask, true);
- if (!retval) {
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset
- * update. Just reset the cpus_allowed to the
- * cpuset's cpus_allowed
- */
- cpumask_copy(new_mask, cpus_allowed);
- goto again;
- }
- }
-out_free_new_mask:
- free_cpumask_var(new_mask);
-out_free_cpus_allowed:
- free_cpumask_var(cpus_allowed);
-out_put_task:
- put_task_struct(p);
- return retval;
+ preempt_dynamic_mode = mode;
}
-static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
- struct cpumask *new_mask)
+void sched_dynamic_update(int mode)
{
- if (len < cpumask_size())
- cpumask_clear(new_mask);
- else if (len > cpumask_size())
- len = cpumask_size();
-
- return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+ mutex_lock(&sched_dynamic_mutex);
+ __sched_dynamic_update(mode);
+ mutex_unlock(&sched_dynamic_mutex);
}
-/**
- * sys_sched_setaffinity - set the CPU affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to the new CPU mask
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
- unsigned long __user *, user_mask_ptr)
-{
- cpumask_var_t new_mask;
- int retval;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
+#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
- retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
- if (retval == 0)
- retval = sched_setaffinity(pid, new_mask);
- free_cpumask_var(new_mask);
- return retval;
-}
-
-long sched_getaffinity(pid_t pid, struct cpumask *mask)
+static int klp_cond_resched(void)
{
- struct task_struct *p;
- unsigned long flags;
- int retval;
-
- rcu_read_lock();
-
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
-
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
-out_unlock:
- rcu_read_unlock();
-
- return retval;
+ __klp_sched_try_switch();
+ return __cond_resched();
}
-/**
- * sys_sched_getaffinity - get the CPU affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to hold the current CPU mask
- *
- * Return: size of CPU mask copied to user_mask_ptr on success. An
- * error code otherwise.
- */
-SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
- unsigned long __user *, user_mask_ptr)
+void sched_dynamic_klp_enable(void)
{
- int ret;
- cpumask_var_t mask;
-
- if ((len * BITS_PER_BYTE) < nr_cpu_ids)
- return -EINVAL;
- if (len & (sizeof(unsigned long)-1))
- return -EINVAL;
-
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
+ mutex_lock(&sched_dynamic_mutex);
- ret = sched_getaffinity(pid, mask);
- if (ret == 0) {
- unsigned int retlen = min(len, cpumask_size());
-
- if (copy_to_user(user_mask_ptr, mask, retlen))
- ret = -EFAULT;
- else
- ret = retlen;
- }
- free_cpumask_var(mask);
+ klp_override = true;
+ static_call_update(cond_resched, klp_cond_resched);
- return ret;
+ mutex_unlock(&sched_dynamic_mutex);
}
-/**
- * sys_sched_yield - yield the current processor to other threads.
- *
- * This function yields the current CPU to other tasks. If there are no
- * other threads running on this CPU then this function will return.
- *
- * Return: 0.
- */
-static void do_sched_yield(void)
+void sched_dynamic_klp_disable(void)
{
- struct rq_flags rf;
- struct rq *rq;
-
- rq = this_rq_lock_irq(&rf);
+ mutex_lock(&sched_dynamic_mutex);
- schedstat_inc(rq->yld_count);
- current->sched_class->yield_task(rq);
+ klp_override = false;
+ __sched_dynamic_update(preempt_dynamic_mode);
- /*
- * Since we are going to call schedule() anyway, there's
- * no need to preempt or enable interrupts:
- */
- preempt_disable();
- rq_unlock(rq, &rf);
- sched_preempt_enable_no_resched();
-
- schedule();
+ mutex_unlock(&sched_dynamic_mutex);
}
-SYSCALL_DEFINE0(sched_yield)
-{
- do_sched_yield();
- return 0;
-}
+#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
-#ifndef CONFIG_PREEMPTION
-int __sched _cond_resched(void)
+static int __init setup_preempt_mode(char *str)
{
- if (should_resched(0)) {
- preempt_schedule_common();
- return 1;
- }
- rcu_all_qs();
- return 0;
-}
-EXPORT_SYMBOL(_cond_resched);
-#endif
-
-/*
- * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
- * call schedule, and on return reacquire the lock.
- *
- * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
- * operations here to prevent schedule() from being called twice (once via
- * spin_unlock(), once by hand).
- */
-int __cond_resched_lock(spinlock_t *lock)
-{
- int resched = should_resched(PREEMPT_LOCK_OFFSET);
- int ret = 0;
-
- lockdep_assert_held(lock);
-
- if (spin_needbreak(lock) || resched) {
- spin_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
- cpu_relax();
- ret = 1;
- spin_lock(lock);
+ int mode = sched_dynamic_mode(str);
+ if (mode < 0) {
+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+ return 0;
}
- return ret;
-}
-EXPORT_SYMBOL(__cond_resched_lock);
-/**
- * yield - yield the current processor to other threads.
- *
- * Do not ever use this function, there's a 99% chance you're doing it wrong.
- *
- * The scheduler is at all times free to pick the calling task as the most
- * eligible task to run, if removing the yield() call from your code breaks
- * it, its already broken.
- *
- * Typical broken usage is:
- *
- * while (!event)
- * yield();
- *
- * where one assumes that yield() will let 'the other' process run that will
- * make event true. If the current task is a SCHED_FIFO task that will never
- * happen. Never use yield() as a progress guarantee!!
- *
- * If you want to use yield() to wait for something, use wait_event().
- * If you want to use yield() to be 'nice' for others, use cond_resched().
- * If you still want to use yield(), do not!
- */
-void __sched yield(void)
-{
- set_current_state(TASK_RUNNING);
- do_sched_yield();
+ sched_dynamic_update(mode);
+ return 1;
}
-EXPORT_SYMBOL(yield);
+__setup("preempt=", setup_preempt_mode);
-/**
- * yield_to - yield the current processor to another thread in
- * your thread group, or accelerate that thread toward the
- * processor it's on.
- * @p: target task
- * @preempt: whether task preemption is allowed or not
- *
- * It's the caller's job to ensure that the target task struct
- * can't go away on us before we can do any checks.
- *
- * Return:
- * true (>0) if we indeed boosted the target task.
- * false (0) if we failed to boost the target.
- * -ESRCH if there's no task to yield to.
- */
-int __sched yield_to(struct task_struct *p, bool preempt)
+static void __init preempt_dynamic_init(void)
{
- struct task_struct *curr = current;
- struct rq *rq, *p_rq;
- unsigned long flags;
- int yielded = 0;
-
- local_irq_save(flags);
- rq = this_rq();
-
-again:
- p_rq = task_rq(p);
- /*
- * If we're the only runnable task on the rq and target rq also
- * has only one task, there's absolutely no point in yielding.
- */
- if (rq->nr_running == 1 && p_rq->nr_running == 1) {
- yielded = -ESRCH;
- goto out_irq;
- }
-
- double_rq_lock(rq, p_rq);
- if (task_rq(p) != p_rq) {
- double_rq_unlock(rq, p_rq);
- goto again;
+ if (preempt_dynamic_mode == preempt_dynamic_undefined) {
+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
+ sched_dynamic_update(preempt_dynamic_none);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
+ sched_dynamic_update(preempt_dynamic_voluntary);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+ sched_dynamic_update(preempt_dynamic_lazy);
+ } else {
+ /* Default static call setting, nothing to do */
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
+ preempt_dynamic_mode = preempt_dynamic_full;
+ pr_info("Dynamic Preempt: full\n");
+ }
}
+}
- if (!curr->sched_class->yield_to_task)
- goto out_unlock;
-
- if (curr->sched_class != p->sched_class)
- goto out_unlock;
-
- if (task_running(p_rq, p) || p->state)
- goto out_unlock;
+#define PREEMPT_MODEL_ACCESSOR(mode) \
+ bool preempt_model_##mode(void) \
+ { \
+ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
+ return preempt_dynamic_mode == preempt_dynamic_##mode; \
+ } \
+ EXPORT_SYMBOL_GPL(preempt_model_##mode)
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
- if (yielded) {
- schedstat_inc(rq->yld_count);
- /*
- * Make p's CPU reschedule; pick_next_entity takes care of
- * fairness.
- */
- if (preempt && rq != p_rq)
- resched_curr(p_rq);
- }
+PREEMPT_MODEL_ACCESSOR(none);
+PREEMPT_MODEL_ACCESSOR(voluntary);
+PREEMPT_MODEL_ACCESSOR(full);
+PREEMPT_MODEL_ACCESSOR(lazy);
-out_unlock:
- double_rq_unlock(rq, p_rq);
-out_irq:
- local_irq_restore(flags);
+#else /* !CONFIG_PREEMPT_DYNAMIC: */
- if (yielded > 0)
- schedule();
+static inline void preempt_dynamic_init(void) { }
- return yielded;
-}
-EXPORT_SYMBOL_GPL(yield_to);
+#endif /* CONFIG_PREEMPT_DYNAMIC */
int io_schedule_prepare(void)
{
int old_iowait = current->in_iowait;
current->in_iowait = 1;
- blk_schedule_flush_plug(current);
-
+ blk_flush_plug(current->plug, true);
return old_iowait;
}
@@ -5941,156 +7692,31 @@ void __sched io_schedule(void)
}
EXPORT_SYMBOL(io_schedule);
-/**
- * sys_sched_get_priority_max - return maximum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the maximum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
-{
- int ret = -EINVAL;
-
- switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- ret = MAX_USER_RT_PRIO-1;
- break;
- case SCHED_DEADLINE:
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
- ret = 0;
- break;
- }
- return ret;
-}
-
-/**
- * sys_sched_get_priority_min - return minimum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the minimum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
-{
- int ret = -EINVAL;
-
- switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- ret = 1;
- break;
- case SCHED_DEADLINE:
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
- ret = 0;
- }
- return ret;
-}
-
-static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
-{
- struct task_struct *p;
- unsigned int time_slice;
- struct rq_flags rf;
- struct rq *rq;
- int retval;
-
- if (pid < 0)
- return -EINVAL;
-
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
-
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
-
- rq = task_rq_lock(p, &rf);
- time_slice = 0;
- if (p->sched_class->get_rr_interval)
- time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, p, &rf);
-
- rcu_read_unlock();
- jiffies_to_timespec64(time_slice, t);
- return 0;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
-}
-
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- * this syscall writes the default timeslice value of a given process
- * into the user-space timespec buffer. A value of '0' means infinity.
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
- struct __kernel_timespec __user *, interval)
-{
- struct timespec64 t;
- int retval = sched_rr_get_interval(pid, &t);
-
- if (retval == 0)
- retval = put_timespec64(&t, interval);
-
- return retval;
-}
-
-#ifdef CONFIG_COMPAT_32BIT_TIME
-SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
- struct old_timespec32 __user *, interval)
-{
- struct timespec64 t;
- int retval = sched_rr_get_interval(pid, &t);
-
- if (retval == 0)
- retval = put_old_timespec32(&t, interval);
- return retval;
-}
-#endif
-
void sched_show_task(struct task_struct *p)
{
- unsigned long free = 0;
+ unsigned long free;
int ppid;
if (!try_get_task_stack(p))
return;
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
- if (p->state == TASK_RUNNING)
- printk(KERN_CONT " running task ");
-#ifdef CONFIG_DEBUG_STACK_USAGE
+ if (task_is_running(p))
+ pr_cont(" running task ");
free = stack_not_used(p);
-#endif
ppid = 0;
rcu_read_lock();
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), ppid,
- (unsigned long)task_thread_info(p)->flags);
+ pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d task_flags:0x%04x flags:0x%08lx\n",
+ free, task_pid_nr(p), task_tgid_nr(p),
+ ppid, p->flags, read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
+ print_stop_info(KERN_INFO, p);
+ print_scx_info(KERN_INFO, p);
show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
@@ -6099,36 +7725,31 @@ EXPORT_SYMBOL_GPL(sched_show_task);
static inline bool
state_filter_match(unsigned long state_filter, struct task_struct *p)
{
+ unsigned int state = READ_ONCE(p->__state);
+
/* no filter, everything matches */
if (!state_filter)
return true;
/* filter, but doesn't match */
- if (!(p->state & state_filter))
+ if (!(state & state_filter))
return false;
/*
* When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
* TASK_KILLABLE).
*/
- if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
return false;
return true;
}
-void show_state_filter(unsigned long state_filter)
+void show_state_filter(unsigned int state_filter)
{
struct task_struct *g, *p;
-#if BITS_PER_LONG == 32
- printk(KERN_INFO
- " task PC stack pid father\n");
-#else
- printk(KERN_INFO
- " task PC stack pid father\n");
-#endif
rcu_read_lock();
for_each_process_thread(g, p) {
/*
@@ -6164,31 +7785,35 @@ void show_state_filter(unsigned long state_filter)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void init_idle(struct task_struct *idle, int cpu)
+void __init init_idle(struct task_struct *idle, int cpu)
{
+#ifdef CONFIG_SMP
+ struct affinity_context ac = (struct affinity_context) {
+ .new_mask = cpumask_of(cpu),
+ .flags = 0,
+ };
+#endif
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- __sched_fork(0, idle);
-
raw_spin_lock_irqsave(&idle->pi_lock, flags);
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock(rq);
- idle->state = TASK_RUNNING;
+ idle->__state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- idle->flags |= PF_IDLE;
-
- scs_task_reset(idle);
- kasan_unpoison_task_stack(idle);
+ /*
+ * PF_KTHREAD should already be set at this point; regardless, make it
+ * look like a proper per-CPU kthread.
+ */
+ idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
+ kthread_set_per_cpu(idle, cpu);
#ifdef CONFIG_SMP
/*
- * Its possible that init_idle() gets called multiple times on a task,
- * in that case do_set_cpus_allowed() will not do the right thing.
- *
- * And since this is boot we can forgo the serialization.
+ * No validation and serialization required at boot time and for
+ * setting up the idle tasks of not yet online CPUs.
*/
- set_cpus_allowed_common(idle, cpumask_of(cpu));
+ set_cpus_allowed_common(idle, &ac);
#endif
/*
* We're having a chicken and egg problem, even though we are
@@ -6205,12 +7830,13 @@ void init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->idle = idle;
+ rq_set_donor(rq, idle);
rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
#ifdef CONFIG_SMP
idle->on_cpu = 1;
#endif
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
@@ -6234,7 +7860,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
{
int ret = 1;
- if (!cpumask_weight(cur))
+ if (cpumask_empty(cur))
return ret;
ret = dl_cpuset_cpumask_can_shrink(cur, trial);
@@ -6242,8 +7868,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-int task_can_attach(struct task_struct *p,
- const struct cpumask *cs_cpus_allowed)
+int task_can_attach(struct task_struct *p)
{
int ret = 0;
@@ -6256,16 +7881,9 @@ int task_can_attach(struct task_struct *p,
* success of set_cpus_allowed_ptr() on all attached tasks
* before cpus_mask may be changed.
*/
- if (p->flags & PF_NO_SETAFFINITY) {
+ if (p->flags & PF_NO_SETAFFINITY)
ret = -EINVAL;
- goto out;
- }
-
- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_cpus_allowed))
- ret = dl_task_can_attach(p, cs_cpus_allowed);
-out:
return ret;
}
@@ -6302,7 +7920,7 @@ void sched_setnuma(struct task_struct *p, int nid)
rq = task_rq_lock(p, &rf);
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE);
@@ -6321,137 +7939,171 @@ void sched_setnuma(struct task_struct *p, int nid)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Ensure that the idle task is using init_mm right before its CPU goes
- * offline.
+ * Invoked on the outgoing CPU in context of the CPU hotplug thread
+ * after ensuring that there are no user space tasks left on the CPU.
+ *
+ * If there is a lazy mm in use on the hotplug thread, drop it and
+ * switch to init_mm.
+ *
+ * The reference count on init_mm is dropped in finish_cpu().
*/
-void idle_task_exit(void)
+static void sched_force_init_mm(void)
{
struct mm_struct *mm = current->active_mm;
- BUG_ON(cpu_online(smp_processor_id()));
- BUG_ON(current != this_rq()->idle);
-
if (mm != &init_mm) {
- switch_mm(mm, &init_mm, current);
+ mmgrab_lazy_tlb(&init_mm);
+ local_irq_disable();
+ current->active_mm = &init_mm;
+ switch_mm_irqs_off(mm, &init_mm, current);
+ local_irq_enable();
finish_arch_post_lock_switch();
+ mmdrop_lazy_tlb(mm);
}
/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable. We need to take the teardown thread which
- * is calling this into account, so we hand in adjust = 1 to the load
- * calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
+static int __balance_push_cpu_stop(void *arg)
{
- long delta = calc_load_fold_active(rq, 1);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-}
+ struct task_struct *p = arg;
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+ int cpu;
-static struct task_struct *__pick_migrate_task(struct rq *rq)
-{
- const struct sched_class *class;
- struct task_struct *next;
+ raw_spin_lock_irq(&p->pi_lock);
+ rq_lock(rq, &rf);
- for_each_class(class) {
- next = class->pick_next_task(rq);
- if (next) {
- next->sched_class->put_prev_task(rq, next);
- return next;
- }
+ update_rq_clock(rq);
+
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
+ cpu = select_fallback_rq(rq->cpu, p);
+ rq = __migrate_task(rq, &rf, p, cpu);
}
- /* The idle class should always have a runnable task */
- BUG();
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ return 0;
}
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
*
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
+ * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
+ * effective when the hotplug motion is down.
*/
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
+static void balance_push(struct rq *rq)
{
- struct rq *rq = dead_rq;
- struct task_struct *next, *stop = rq->stop;
- struct rq_flags orf = *rf;
- int dest_cpu;
+ struct task_struct *push_task = rq->curr;
+
+ lockdep_assert_rq_held(rq);
/*
- * Fudge the rq selection such that the below task selection loop
- * doesn't get stuck on the currently eligible stop task.
- *
- * We're currently inside stop_machine() and the rq is either stuck
- * in the stop_machine_cpu_stop() loop, or we're executing this code,
- * either way we should never end up calling schedule() until we're
- * done here.
+ * Ensure the thing is persistent until balance_push_set(.on = false);
*/
- rq->stop = NULL;
+ rq->balance_callback = &balance_push_callback;
/*
- * put_prev_task() and pick_next_task() sched
- * class method both need to have an up-to-date
- * value of rq->clock[_task]
+ * Only active while going offline and when invoked on the outgoing
+ * CPU.
*/
- update_rq_clock(rq);
-
- for (;;) {
- /*
- * There's this thread running, bail when that's the only
- * remaining thread:
- */
- if (rq->nr_running == 1)
- break;
+ if (!cpu_dying(rq->cpu) || rq != this_rq())
+ return;
- next = __pick_migrate_task(rq);
+ /*
+ * Both the cpu-hotplug and stop task are in this case and are
+ * required to complete the hotplug process.
+ */
+ if (kthread_is_per_cpu(push_task) ||
+ is_migration_disabled(push_task)) {
/*
- * Rules for changing task_struct::cpus_mask are holding
- * both pi_lock and rq->lock, such that holding either
- * stabilizes the mask.
+ * If this is the idle task on the outgoing CPU try to wake
+ * up the hotplug control thread which might wait for the
+ * last task to vanish. The rcuwait_active() check is
+ * accurate here because the waiter is pinned on this CPU
+ * and can't obviously be running in parallel.
*
- * Drop rq->lock is not quite as disastrous as it usually is
- * because !cpu_active at this point, which means load-balance
- * will not interfere. Also, stop-machine.
- */
- rq_unlock(rq, rf);
- raw_spin_lock(&next->pi_lock);
- rq_relock(rq, rf);
-
- /*
- * Since we're inside stop-machine, _nothing_ should have
- * changed the task, WARN if weird stuff happened, because in
- * that case the above rq->lock drop is a fail too.
+ * On RT kernels this also has to check whether there are
+ * pinned and scheduled out tasks on the runqueue. They
+ * need to leave the migrate disabled section first.
*/
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
- raw_spin_unlock(&next->pi_lock);
- continue;
+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+ rcuwait_active(&rq->hotplug_wait)) {
+ raw_spin_rq_unlock(rq);
+ rcuwait_wake_up(&rq->hotplug_wait);
+ raw_spin_rq_lock(rq);
}
+ return;
+ }
- /* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_rq->cpu, next);
- rq = __migrate_task(rq, rf, next, dest_cpu);
- if (rq != dead_rq) {
- rq_unlock(rq, rf);
- rq = dead_rq;
- *rf = orf;
- rq_relock(rq, rf);
- }
- raw_spin_unlock(&next->pi_lock);
+ get_task_struct(push_task);
+ /*
+ * Temporarily drop rq->lock such that we can wake-up the stop task.
+ * Both preemption and IRQs are still disabled.
+ */
+ preempt_disable();
+ raw_spin_rq_unlock(rq);
+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+ this_cpu_ptr(&push_work));
+ preempt_enable();
+ /*
+ * At this point need_resched() is true and we'll take the loop in
+ * schedule(). The next pick is obviously going to be the stop task
+ * which kthread_is_per_cpu() and will push this task away.
+ */
+ raw_spin_rq_lock(rq);
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ if (on) {
+ WARN_ON_ONCE(rq->balance_callback);
+ rq->balance_callback = &balance_push_callback;
+ } else if (rq->balance_callback == &balance_push_callback) {
+ rq->balance_callback = NULL;
}
+ rq_unlock_irqrestore(rq, &rf);
+}
- rq->stop = stop;
+/*
+ * Invoked from a CPU's hotplug control thread after the CPU has been marked
+ * inactive. All tasks which are not per CPU kernel threads are either
+ * pushed off this CPU now via balance_push() or placed on a different CPU
+ * during wakeup. Wait until the CPU is quiescent.
+ */
+static void balance_hotplug_wait(void)
+{
+ struct rq *rq = this_rq();
+
+ rcuwait_wait_event(&rq->hotplug_wait,
+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
+ TASK_UNINTERRUPTIBLE);
}
+
+#else
+
+static inline void balance_push(struct rq *rq)
+{
+}
+
+static inline void balance_push_set(int cpu, bool on)
+{
+}
+
+static inline void balance_hotplug_wait(void)
+{
+}
+
#endif /* CONFIG_HOTPLUG_CPU */
void set_rq_online(struct rq *rq)
@@ -6474,6 +8126,7 @@ void set_rq_offline(struct rq *rq)
if (rq->online) {
const struct sched_class *class;
+ update_rq_clock(rq);
for_each_class(class) {
if (class->rq_offline)
class->rq_offline(rq);
@@ -6484,6 +8137,30 @@ void set_rq_offline(struct rq *rq)
}
}
+static inline void sched_set_rq_online(struct rq *rq, int cpu)
+{
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_online(rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+static inline void sched_set_rq_offline(struct rq *rq, int cpu)
+{
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+}
+
/*
* used to mark begin/end of suspend/resume:
*/
@@ -6519,38 +8196,56 @@ static void cpuset_cpu_active(void)
cpuset_update_active_cpus();
}
-static int cpuset_cpu_inactive(unsigned int cpu)
+static void cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- if (dl_cpu_busy(cpu))
- return -EBUSY;
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
partition_sched_domains(1, NULL, NULL);
}
- return 0;
+}
+
+static inline void sched_smt_present_inc(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_inc_cpuslocked(&sched_smt_present);
+#endif
+}
+
+static inline void sched_smt_present_dec(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ static_branch_dec_cpuslocked(&sched_smt_present);
+#endif
}
int sched_cpu_activate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
-#ifdef CONFIG_SCHED_SMT
+ /*
+ * Clear the balance_push callback and prepare to schedule
+ * regular tasks.
+ */
+ balance_push_set(cpu, false);
+
/*
* When going up, increment the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
- static_branch_inc_cpuslocked(&sched_smt_present);
-#endif
+ sched_smt_present_inc(cpu);
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
+ sched_update_numa(cpu, true);
sched_domains_numa_masks_set(cpu);
cpuset_cpu_active();
}
+ scx_rq_activate(rq);
+
/*
* Put the rq online, if not already. This happens:
*
@@ -6560,46 +8255,67 @@ int sched_cpu_activate(unsigned int cpu)
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains.
*/
- rq_lock_irqsave(rq, &rf);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_online(rq);
- }
- rq_unlock_irqrestore(rq, &rf);
+ sched_set_rq_online(rq, cpu);
return 0;
}
int sched_cpu_deactivate(unsigned int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
int ret;
+ ret = dl_bw_deactivate(cpu);
+
+ if (ret)
+ return ret;
+
+ /*
+ * Remove CPU from nohz.idle_cpus_mask to prevent participating in
+ * load balancing when not active
+ */
+ nohz_balance_exit_idle(rq);
+
set_cpu_active(cpu, false);
+
+ /*
+ * From this point forward, this CPU will refuse to run any task that
+ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
+ * push those tasks away until this gets cleared, see
+ * sched_cpu_dying().
+ */
+ balance_push_set(cpu, true);
+
/*
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
- * users of this state to go away such that all new such users will
- * observe it.
+ * We've cleared cpu_active_mask / set balance_push, wait for all
+ * preempt-disabled and RCU users of this state to go away such that
+ * all new such users will observe it.
+ *
+ * Specifically, we rely on ttwu to no longer target this CPU, see
+ * ttwu_queue_cond() and is_cpu_allowed().
*
- * Do sync before park smpboot threads to take care the rcu boost case.
+ * Do sync before parking smpboot threads to take care of the RCU boost case.
*/
synchronize_rcu();
-#ifdef CONFIG_SCHED_SMT
+ sched_set_rq_offline(rq, cpu);
+
+ scx_rq_deactivate(rq);
+
/*
* When going down, decrement the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
- static_branch_dec_cpuslocked(&sched_smt_present);
+ sched_smt_present_dec(cpu);
+
+#ifdef CONFIG_SCHED_SMT
+ sched_core_cpu_deactivate(cpu);
#endif
if (!sched_smp_initialized)
return 0;
- ret = cpuset_cpu_inactive(cpu);
- if (ret) {
- set_cpu_active(cpu, true);
- return ret;
- }
+ sched_update_numa(cpu, false);
+ cpuset_cpu_inactive(cpu);
sched_domains_numa_masks_clear(cpu);
return 0;
}
@@ -6614,12 +8330,68 @@ static void sched_rq_cpu_starting(unsigned int cpu)
int sched_cpu_starting(unsigned int cpu)
{
+ sched_core_cpu_starting(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do its job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
+ */
+int sched_cpu_wait_empty(unsigned int cpu)
+{
+ balance_hotplug_wait();
+ sched_force_init_mm();
+ return 0;
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the tear-down thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+ long delta = calc_load_fold_active(rq, 1);
+
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+}
+
+static void dump_rq_tasks(struct rq *rq, const char *loglvl)
+{
+ struct task_struct *g, *p;
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_rq_held(rq);
+
+ printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
+ for_each_process_thread(g, p) {
+ if (task_cpu(p) != cpu)
+ continue;
+
+ if (!task_on_rq_queued(p))
+ continue;
+
+ printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
+ }
+}
+
int sched_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -6629,25 +8401,23 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
+ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
+ WARN(true, "Dying CPU not properly vacated!");
+ dump_rq_tasks(rq, KERN_WARNING);
}
- migrate_tasks(rq, &rf);
- BUG_ON(rq->nr_running != 1);
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
update_max_interval();
- nohz_balance_exit_idle(rq);
hrtick_clear(rq);
+ sched_core_cpu_dying(cpu);
return 0;
}
#endif
void __init sched_init_smp(void)
{
- sched_init_numa();
+ sched_init_numa(NUMA_NO_NODE);
/*
* There's no userspace yet to cause hotplug operations; hence all the
@@ -6659,8 +8429,9 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
BUG();
+ current->flags &= ~PF_NO_SETAFFINITY;
sched_init_granularity();
init_sched_rt_class();
@@ -6699,17 +8470,26 @@ struct task_group root_task_group;
LIST_HEAD(task_groups);
/* Cacheline aligned slab cache for task_group */
-static struct kmem_cache *task_group_cache __read_mostly;
+static struct kmem_cache *task_group_cache __ro_after_init;
#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
-
void __init sched_init(void)
{
unsigned long ptr = 0;
int i;
+ /* Make sure the linker didn't screw up */
+#ifdef CONFIG_SMP
+ BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class));
+#endif
+ BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
+ BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
+ BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
+#ifdef CONFIG_SCHED_CLASS_EXT
+ BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class));
+ BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class));
+#endif
+
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6729,8 +8509,11 @@ void __init sched_init(void)
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_EXT_GROUP_SCHED
+ root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
+#endif /* CONFIG_EXT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
@@ -6740,17 +8523,6 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
-#ifdef CONFIG_CPUMASK_OFFSTACK
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
-
- init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
- init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP
init_defrootdomain();
@@ -6774,7 +8546,7 @@ void __init sched_init(void)
struct rq *rq;
rq = cpu_rq(i);
- raw_spin_lock_init(&rq->lock);
+ raw_spin_lock_init(&rq->__lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -6787,7 +8559,7 @@ void __init sched_init(void)
/*
* How much CPU bandwidth does root_task_group get?
*
- * In case of task-groups formed thr' the cgroup filesystem, it
+ * In case of task-groups formed through the cgroup filesystem, it
* gets 100% of the CPU resources in the system. This overall
* system CPU resource is divided among the tasks of
* root_task_group and its child task-groups in a fair manner,
@@ -6806,15 +8578,20 @@ void __init sched_init(void)
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * This is required for init cpu because rt.c:__enable_runtime()
+ * starts working after scheduler_running, which is not the case
+ * yet.
+ */
+ rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
- rq->balance_callback = NULL;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
+ rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -6831,72 +8608,117 @@ void __init sched_init(void)
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
- rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
+ INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
+#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ rcuwait_init(&rq->hotplug_wait);
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+ fair_server_init(rq);
+
+#ifdef CONFIG_SCHED_CORE
+ rq->core = rq;
+ rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
+ rq->core_enabled = 0;
+ rq->core_tree = RB_ROOT;
+ rq->core_forceidle_count = 0;
+ rq->core_forceidle_occupation = 0;
+ rq->core_forceidle_start = 0;
+
+ rq->core_cookie = 0UL;
+#endif
+ zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
}
set_load_weight(&init_task, false);
+ init_task.se.slice = sysctl_sched_base_slice,
/*
* The boot idle thread does lazy MMU switching as well:
*/
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
+ * The idle task doesn't need the kthread struct to function, but it
+ * is dressed up as a per-CPU kthread and thus needs to play the part
+ * if we want to avoid special-casing it in code that deals with per-CPU
+ * kthreads.
+ */
+ WARN_ON(!set_kthread_struct(current));
+
+ /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
* when this runqueue becomes "idle".
*/
+ __sched_fork(0, current);
init_idle(current, smp_processor_id());
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
+ balance_push_set(smp_processor_id(), false);
#endif
init_sched_fair_class();
-
- init_schedstats();
+ init_sched_ext_class();
psi_init();
init_uclamp();
+ preempt_dynamic_init();
+
scheduler_running = 1;
}
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-static inline int preempt_count_equals(int preempt_offset)
-{
- int nested = preempt_count() + rcu_preempt_depth();
- return (nested == preempt_offset);
-}
-
-void __might_sleep(const char *file, int line, int preempt_offset)
+void __might_sleep(const char *file, int line)
{
+ unsigned int state = get_current_state();
/*
* Blocking primitives will set (and therefore destroy) current->state,
* since we will exit with TASK_RUNNING make sure we enter with it,
* otherwise we will destroy state.
*/
- WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+ WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
"do not call blocking ops when !TASK_RUNNING; "
- "state=%lx set at [<%p>] %pS\n",
- current->state,
+ "state=%x set at [<%p>] %pS\n", state,
(void *)current->task_state_change,
(void *)current->task_state_change);
- ___might_sleep(file, line, preempt_offset);
+ __might_resched(file, line, 0);
}
EXPORT_SYMBOL(__might_sleep);
-void ___might_sleep(const char *file, int line, int preempt_offset)
+static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
+{
+ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
+ return;
+
+ if (preempt_count() == preempt_offset)
+ return;
+
+ pr_err("Preemption disabled at:");
+ print_ip_sym(KERN_ERR, ip);
+}
+
+static inline bool resched_offsets_ok(unsigned int offsets)
+{
+ unsigned int nested = preempt_count();
+
+ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
+
+ return nested == offsets;
+}
+
+void __might_resched(const char *file, int line, unsigned int offsets)
{
/* Ratelimiting timestamp: */
static unsigned long prev_jiffy;
@@ -6906,7 +8728,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
/* WARN_ON_ONCE() by default, no rate limit required: */
rcu_sleep_check();
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
!is_idle_task(current) && !current->non_block_count) ||
system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
oops_in_progress)
@@ -6919,29 +8741,33 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
/* Save this before calling printk(), since that will clobber it: */
preempt_disable_ip = get_preempt_disable_ip(current);
- printk(KERN_ERR
- "BUG: sleeping function called from invalid context at %s:%d\n",
- file, line);
- printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(), current->non_block_count,
- current->pid, current->comm);
+ pr_err("BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
+ current->pid, current->comm);
+ pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
+ offsets & MIGHT_RESCHED_PREEMPT_MASK);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ pr_err("RCU nest depth: %d, expected: %u\n",
+ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
+ }
if (task_stack_end_corrupted(current))
- printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+ pr_emerg("Thread overran stack, or stack corrupted\n");
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
- && !preempt_count_equals(preempt_offset)) {
- pr_err("Preemption disabled at:");
- print_ip_sym(KERN_ERR, preempt_disable_ip);
- }
+
+ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK,
+ preempt_disable_ip);
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
-EXPORT_SYMBOL(___might_sleep);
+EXPORT_SYMBOL(__might_resched);
void __cant_sleep(const char *file, int line, int preempt_offset)
{
@@ -6970,6 +8796,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_sleep);
+
+#ifdef CONFIG_SMP
+void __cant_migrate(const char *file, int line)
+{
+ static unsigned long prev_jiffy;
+
+ if (irqs_disabled())
+ return;
+
+ if (is_migration_disabled(current))
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ if (preempt_count() > 0)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), is_migration_disabled(current),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_migrate);
+#endif
#endif
#ifdef CONFIG_MAGIC_SYSRQ
@@ -6989,11 +8848,11 @@ void normalize_rt_tasks(void)
continue;
p->se.exec_start = 0;
- schedstat_set(p->se.statistics.wait_start, 0);
- schedstat_set(p->se.statistics.sleep_start, 0);
- schedstat_set(p->se.statistics.block_start, 0);
+ schedstat_set(p->stats.wait_start, 0);
+ schedstat_set(p->stats.sleep_start, 0);
+ schedstat_set(p->stats.block_start, 0);
- if (!dl_task(p) && !rt_task(p)) {
+ if (!rt_or_dl_task(p)) {
/*
* Renice negative nice level userspace
* tasks back to 0:
@@ -7010,9 +8869,9 @@ void normalize_rt_tasks(void)
#endif /* CONFIG_MAGIC_SYSRQ */
-#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+#if defined(CONFIG_KGDB_KDB)
/*
- * These functions are only useful for the IA64 MCA handling, or kdb.
+ * These functions are only useful for KDB.
*
* They can only be called when the whole system has been
* stopped - every CPU needs to be quiescent, and no scheduling
@@ -7034,30 +8893,7 @@ struct task_struct *curr_task(int cpu)
return cpu_curr(cpu);
}
-#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
-
-#ifdef CONFIG_IA64
-/**
- * ia64_set_curr_task - set the current task for a given CPU.
- * @cpu: the processor in question.
- * @p: the task pointer to set.
- *
- * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a CPU in a non-blocking manner. This function
- * must be called with all CPU's synchronized, and interrupts disabled, the
- * and caller must save the original value of the current task (see
- * curr_task() above) and restore that value before reenabling interrupts and
- * re-starting the system.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-void ia64_set_curr_task(int cpu, struct task_struct *p)
-{
- cpu_curr(cpu) = p;
-}
-
-#endif
+#endif /* defined(CONFIG_KGDB_KDB) */
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
@@ -7085,6 +8921,22 @@ static void sched_free_group(struct task_group *tg)
kmem_cache_free(task_group_cache, tg);
}
+static void sched_free_group_rcu(struct rcu_head *rcu)
+{
+ sched_free_group(container_of(rcu, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
+{
+ unregister_fair_sched_group(tg);
+ unregister_rt_sched_group(tg);
+ /*
+ * We have to wait for yet another RCU grace period to expire, as
+ * print_cfs_stats() might run concurrently.
+ */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(struct task_group *parent)
{
@@ -7100,6 +8952,7 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
alloc_uclamp_sched_group(tg, parent);
return tg;
@@ -7127,33 +8980,43 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
online_fair_sched_group(tg);
}
-/* rcu callback to free various structures associated with a task group */
-static void sched_free_group_rcu(struct rcu_head *rhp)
+/* RCU callback to free various structures associated with a task group */
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
{
/* Now it should be safe to free those cfs_rqs: */
- sched_free_group(container_of(rhp, struct task_group, rcu));
+ sched_unregister_group(container_of(rhp, struct task_group, rcu));
}
void sched_destroy_group(struct task_group *tg)
{
/* Wait for possible concurrent references to cfs_rqs complete: */
- call_rcu(&tg->rcu, sched_free_group_rcu);
+ call_rcu(&tg->rcu, sched_unregister_group_rcu);
}
-void sched_offline_group(struct task_group *tg)
+void sched_release_group(struct task_group *tg)
{
unsigned long flags;
- /* End participation in shares distribution: */
- unregister_fair_sched_group(tg);
-
+ /*
+ * Unlink first, to prevent walk_tg_tree_from() from finding us (via
+ * sched_cfs_period_timer()).
+ *
+ * For this to be effective, we have to wait for all pending users of
+ * this task group to leave their RCU critical section to ensure no new
+ * user will see our dying task group any more. Specifically ensure
+ * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
+ *
+ * We therefore defer calling unregister_fair_sched_group() to
+ * sched_unregister_group() which is guaranteed to get called only after the
+ * current RCU grace period has expired.
+ */
spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static void sched_change_group(struct task_struct *tsk, int type)
+static void sched_change_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -7169,7 +9032,7 @@ static void sched_change_group(struct task_struct *tsk, int type)
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
- tsk->sched_class->task_change_group(tsk, type);
+ tsk->sched_class->task_change_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
@@ -7182,17 +9045,18 @@ static void sched_change_group(struct task_struct *tsk, int type)
* now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
* its new group.
*/
-void sched_move_task(struct task_struct *tsk)
+void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(tsk, &rf);
+ CLASS(task_rq_lock, rq_guard)(tsk);
+ rq = rq_guard.rq;
+
update_rq_clock(rq);
- running = task_current(rq, tsk);
+ running = task_current_donor(rq, tsk);
queued = task_on_rq_queued(tsk);
if (queued)
@@ -7200,7 +9064,9 @@ void sched_move_task(struct task_struct *tsk)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk, TASK_MOVE_GROUP);
+ sched_change_group(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -7213,13 +9079,6 @@ void sched_move_task(struct task_struct *tsk)
*/
resched_curr(rq);
}
-
- task_rq_unlock(rq, tsk, &rf);
-}
-
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
-{
- return css ? container_of(css, struct task_group, css) : NULL;
}
static struct cgroup_subsys_state *
@@ -7245,23 +9104,37 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
struct task_group *parent = css_tg(css->parent);
+ int ret;
+
+ ret = scx_tg_online(tg);
+ if (ret)
+ return ret;
if (parent)
sched_online_group(tg, parent);
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Propagate the effective uclamp value for the new group */
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
cpu_util_update_eff(css);
#endif
return 0;
}
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ scx_tg_offline(tg);
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- sched_offline_group(tg);
+ sched_release_group(tg);
}
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -7271,55 +9144,21 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
/*
* Relies on the RCU grace period between css_released() and this.
*/
- sched_free_group(tg);
-}
-
-/*
- * This is called before wake_up_new_task(), therefore we really only
- * have to set its group bits, all the other stuff does not apply.
- */
-static void cpu_cgroup_fork(struct task_struct *task)
-{
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(task, &rf);
-
- update_rq_clock(rq);
- sched_change_group(task, TASK_SET_GROUP);
-
- task_rq_unlock(rq, task, &rf);
+ sched_unregister_group(tg);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
+#ifdef CONFIG_RT_GROUP_SCHED
struct task_struct *task;
struct cgroup_subsys_state *css;
- int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
-#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#endif
- /*
- * Serialize against wake_up_new_task() such that if its
- * running, we're sure to observe its full state.
- */
- raw_spin_lock_irq(&task->pi_lock);
- /*
- * Avoid calling sched_move_task() before wake_up_new_task()
- * has happened. This would lead to problems with PELT, due to
- * move wanting to detach+attach while we're not attached yet.
- */
- if (task->state == TASK_NEW)
- ret = -EINVAL;
- raw_spin_unlock_irq(&task->pi_lock);
-
- if (ret)
- break;
}
- return ret;
+#endif
+ return scx_cgroup_can_attach(tset);
}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
@@ -7328,7 +9167,14 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
cgroup_taskset_for_each(task, css, tset)
- sched_move_task(task);
+ sched_move_task(task, false);
+
+ scx_cgroup_finish_attach();
+}
+
+static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
+{
+ scx_cgroup_cancel_attach(tset);
}
#ifdef CONFIG_UCLAMP_TASK_GROUP
@@ -7341,6 +9187,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
enum uclamp_id clamp_id;
unsigned int clamps;
+ lockdep_assert_held(&uclamp_mutex);
+ SCHED_WARN_ON(!rcu_read_lock_held());
+
css_for_each_descendant_pre(css, top_css) {
uc_parent = css_tg(css)->parent
? css_tg(css)->parent->uclamp : NULL;
@@ -7373,7 +9222,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
}
/* Immediately update descendants RUNNABLE tasks */
- uclamp_update_active_tasks(css, clamps);
+ uclamp_update_active_tasks(css);
}
}
@@ -7431,8 +9280,10 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
if (req.ret)
return req.ret;
- mutex_lock(&uclamp_mutex);
- rcu_read_lock();
+ static_branch_enable(&sched_uclamp_used);
+
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
tg = css_tg(of_css(of));
if (tg->uclamp_req[clamp_id].value != req.util)
@@ -7447,9 +9298,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
/* Update effective clamps to track the most restrictive value */
cpu_util_update_eff(of_css(of));
- rcu_read_unlock();
- mutex_unlock(&uclamp_mutex);
-
return nbytes;
}
@@ -7475,10 +9323,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf,
u64 percent;
u32 rem;
- rcu_read_lock();
- tg = css_tg(seq_css(sf));
- util_clamp = tg->uclamp_req[clamp_id].value;
- rcu_read_unlock();
+ scoped_guard (rcu) {
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
+ }
if (util_clamp == SCHED_CAPACITY_SCALE) {
seq_puts(sf, "max\n");
@@ -7503,22 +9351,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
}
#endif /* CONFIG_UCLAMP_TASK_GROUP */
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+static unsigned long tg_weight(struct task_group *tg)
+{
#ifdef CONFIG_FAIR_GROUP_SCHED
+ return scale_load_down(tg->shares);
+#else
+ return sched_weight_from_cgroup(tg->scx_weight);
+#endif
+}
+
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
{
+ int ret;
+
if (shareval > scale_load_down(ULONG_MAX))
shareval = MAX_SHARES;
- return sched_group_set_shares(css_tg(css), scale_load(shareval));
+ ret = sched_group_set_shares(css_tg(css), scale_load(shareval));
+ if (!ret)
+ scx_group_set_weight(css_tg(css),
+ sched_weight_to_cgroup(shareval));
+ return ret;
}
static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- struct task_group *tg = css_tg(css);
-
- return (u64) scale_load_down(tg->shares);
+ return tg_weight(css_tg(css));
}
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
#ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex);
@@ -7530,7 +9392,8 @@ static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
+ u64 burst)
{
int i, ret = 0, runtime_enabled, runtime_was_enabled;
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -7547,7 +9410,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
return -EINVAL;
/*
- * Likewise, bound things on the otherside by preventing insane quota
+ * Likewise, bound things on the other side by preventing insane quota
* periods. This also allows us to normalize in computing quota
* feasibility.
*/
@@ -7560,15 +9423,20 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (quota != RUNTIME_INF && quota > max_cfs_runtime)
return -EINVAL;
+ if (quota != RUNTIME_INF && (burst > quota ||
+ burst + quota > max_cfs_runtime))
+ return -EINVAL;
+
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
- get_online_cpus();
- mutex_lock(&cfs_constraints_mutex);
+ guard(cpus_read_lock)();
+ guard(mutex)(&cfs_constraints_mutex);
+
ret = __cfs_schedulable(tg, period, quota);
if (ret)
- goto out_unlock;
+ return ret;
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@ -7578,45 +9446,46 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
*/
if (runtime_enabled && !runtime_was_enabled)
cfs_bandwidth_usage_inc();
- raw_spin_lock_irq(&cfs_b->lock);
- cfs_b->period = ns_to_ktime(period);
- cfs_b->quota = quota;
- __refill_cfs_bandwidth_runtime(cfs_b);
+ scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
+ cfs_b->period = ns_to_ktime(period);
+ cfs_b->quota = quota;
+ cfs_b->burst = burst;
- /* Restart the period timer (if active) to handle new period expiry: */
- if (runtime_enabled)
- start_cfs_bandwidth(cfs_b);
+ __refill_cfs_bandwidth_runtime(cfs_b);
- raw_spin_unlock_irq(&cfs_b->lock);
+ /*
+ * Restart the period timer (if active) to handle new
+ * period expiry:
+ */
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
+ }
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
- struct rq_flags rf;
- rq_lock_irq(rq, &rf);
+ guard(rq_lock_irq)(rq);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
- rq_unlock_irq(rq, &rf);
}
+
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
-out_unlock:
- mutex_unlock(&cfs_constraints_mutex);
- put_online_cpus();
- return ret;
+ return 0;
}
static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
- u64 quota, period;
+ u64 quota, period, burst;
period = ktime_to_ns(tg->cfs_bandwidth.period);
+ burst = tg->cfs_bandwidth.burst;
if (cfs_quota_us < 0)
quota = RUNTIME_INF;
else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
@@ -7624,7 +9493,7 @@ static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
else
return -EINVAL;
- return tg_set_cfs_bandwidth(tg, period, quota);
+ return tg_set_cfs_bandwidth(tg, period, quota, burst);
}
static long tg_get_cfs_quota(struct task_group *tg)
@@ -7642,15 +9511,16 @@ static long tg_get_cfs_quota(struct task_group *tg)
static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
- u64 quota, period;
+ u64 quota, period, burst;
if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
return -EINVAL;
period = (u64)cfs_period_us * NSEC_PER_USEC;
quota = tg->cfs_bandwidth.quota;
+ burst = tg->cfs_bandwidth.burst;
- return tg_set_cfs_bandwidth(tg, period, quota);
+ return tg_set_cfs_bandwidth(tg, period, quota, burst);
}
static long tg_get_cfs_period(struct task_group *tg)
@@ -7663,6 +9533,30 @@ static long tg_get_cfs_period(struct task_group *tg)
return cfs_period_us;
}
+static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
+{
+ u64 quota, period, burst;
+
+ if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
+ burst = (u64)cfs_burst_us * NSEC_PER_USEC;
+ period = ktime_to_ns(tg->cfs_bandwidth.period);
+ quota = tg->cfs_bandwidth.quota;
+
+ return tg_set_cfs_bandwidth(tg, period, quota, burst);
+}
+
+static long tg_get_cfs_burst(struct task_group *tg)
+{
+ u64 burst_us;
+
+ burst_us = tg->cfs_bandwidth.burst;
+ do_div(burst_us, NSEC_PER_USEC);
+
+ return burst_us;
+}
+
static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -7687,6 +9581,18 @@ static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
return tg_set_cfs_period(css_tg(css), cfs_period_us);
}
+static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return tg_get_cfs_burst(css_tg(css));
+}
+
+static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 cfs_burst_us)
+{
+ return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
+}
+
struct cfs_schedulable_data {
struct task_group *tg;
u64 period, quota;
@@ -7732,11 +9638,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
/*
* Ensure max(child_quota) <= parent_quota. On cgroup2,
- * always take the min. On cgroup1, only inherit when no
- * limit is set:
+ * always take the non-RUNTIME_INF min. On cgroup1, only
+ * inherit when no limit is set. In both cases this is used
+ * by the scheduler to determine if a given CFS task has a
+ * bandwidth constraint at some higher level.
*/
if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
- quota = min(quota, parent_quota);
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF)
+ quota = min(quota, parent_quota);
} else {
if (quota == RUNTIME_INF)
quota = parent_quota;
@@ -7751,7 +9662,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
- int ret;
struct cfs_schedulable_data data = {
.tg = tg,
.period = period,
@@ -7763,11 +9673,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
do_div(data.quota, NSEC_PER_USEC);
}
- rcu_read_lock();
- ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
- rcu_read_unlock();
-
- return ret;
+ guard(rcu)();
+ return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
}
static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
@@ -7780,19 +9687,45 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
if (schedstat_enabled() && tg != &root_task_group) {
+ struct sched_statistics *stats;
u64 ws = 0;
int i;
- for_each_possible_cpu(i)
- ws += schedstat_val(tg->se[i]->statistics.wait_sum);
+ for_each_possible_cpu(i) {
+ stats = __schedstats_from_se(tg->se[i]);
+ ws += schedstat_val(stats->wait_sum);
+ }
seq_printf(sf, "wait_sum %llu\n", ws);
}
+ seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst);
+ seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time);
+
+ return 0;
+}
+
+static u64 throttled_time_self(struct task_group *tg)
+{
+ int i;
+ u64 total = 0;
+
+ for_each_possible_cpu(i) {
+ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+ }
+
+ return total;
+}
+
+static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg));
+
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -7820,13 +9753,37 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return css_tg(css)->idle;
+}
+
+static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 idle)
+{
+ int ret;
+
+ ret = sched_group_set_idle(css_tg(css), idle);
+ if (!ret)
+ scx_group_set_idle(css_tg(css), idle);
+ return ret;
+}
+#endif
+
static struct cftype cpu_legacy_files[] = {
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
{
.name = "shares",
.read_u64 = cpu_shares_read_u64,
.write_u64 = cpu_shares_write_u64,
},
+ {
+ .name = "idle",
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -7840,9 +9797,18 @@ static struct cftype cpu_legacy_files[] = {
.write_u64 = cpu_cfs_period_write_u64,
},
{
+ .name = "cfs_burst_us",
+ .read_u64 = cpu_cfs_burst_read_u64,
+ .write_u64 = cpu_cfs_burst_write_u64,
+ },
+ {
.name = "stat",
.seq_show = cpu_cfs_stat_show,
},
+ {
+ .name = "stat.local",
+ .seq_show = cpu_cfs_local_stat_show,
+ },
#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
@@ -7880,53 +9846,72 @@ static int cpu_extra_stat_show(struct seq_file *sf,
{
struct task_group *tg = css_tg(css);
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
- u64 throttled_usec;
+ u64 throttled_usec, burst_usec;
throttled_usec = cfs_b->throttled_time;
do_div(throttled_usec, NSEC_PER_USEC);
+ burst_usec = cfs_b->burst_time;
+ do_div(burst_usec, NSEC_PER_USEC);
seq_printf(sf, "nr_periods %d\n"
"nr_throttled %d\n"
- "throttled_usec %llu\n",
+ "throttled_usec %llu\n"
+ "nr_bursts %d\n"
+ "burst_usec %llu\n",
cfs_b->nr_periods, cfs_b->nr_throttled,
- throttled_usec);
+ throttled_usec, cfs_b->nr_burst, burst_usec);
}
#endif
return 0;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cpu_local_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ u64 throttled_self_usec;
+
+ throttled_self_usec = throttled_time_self(tg);
+ do_div(throttled_self_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "throttled_usec %llu\n",
+ throttled_self_usec);
+ }
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- struct task_group *tg = css_tg(css);
- u64 weight = scale_load_down(tg->shares);
-
- return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+ return sched_weight_to_cgroup(tg_weight(css_tg(css)));
}
static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 weight)
+ struct cftype *cft, u64 cgrp_weight)
{
- /*
- * cgroup weight knobs should use the common MIN, DFL and MAX
- * values which are 1, 100 and 10000 respectively. While it loses
- * a bit of range on both ends, it maps pretty well onto the shares
- * value used by scheduler and the round-trip conversions preserve
- * the original value over the entire range.
- */
- if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+ unsigned long weight;
+ int ret;
+
+ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX)
return -ERANGE;
- weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+ weight = sched_weight_from_cgroup(cgrp_weight);
- return sched_group_set_shares(css_tg(css), scale_load(weight));
+ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
+ if (!ret)
+ scx_group_set_weight(css_tg(css), cgrp_weight);
+ return ret;
}
static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- unsigned long weight = scale_load_down(css_tg(css)->shares);
+ unsigned long weight = tg_weight(css_tg(css));
int last_delta = INT_MAX;
int prio, delta;
@@ -7945,7 +9930,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
struct cftype *cft, s64 nice)
{
unsigned long weight;
- int idx;
+ int idx, ret;
if (nice < MIN_NICE || nice > MAX_NICE)
return -ERANGE;
@@ -7954,9 +9939,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
idx = array_index_nospec(idx, 40);
weight = sched_prio_to_weight[idx];
- return sched_group_set_shares(css_tg(css), scale_load(weight));
+ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
+ if (!ret)
+ scx_group_set_weight(css_tg(css),
+ sched_weight_to_cgroup(weight));
+ return ret;
}
-#endif
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
long period, long quota)
@@ -8004,18 +9993,19 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
{
struct task_group *tg = css_tg(of_css(of));
u64 period = tg_get_cfs_period(tg);
+ u64 burst = tg->cfs_bandwidth.burst;
u64 quota;
int ret;
ret = cpu_period_quota_parse(buf, &period, &quota);
if (!ret)
- ret = tg_set_cfs_bandwidth(tg, period, quota);
+ ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
return ret ?: nbytes;
}
#endif
static struct cftype cpu_files[] = {
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
{
.name = "weight",
.flags = CFTYPE_NOT_ON_ROOT,
@@ -8028,6 +10018,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_weight_nice_read_s64,
.write_s64 = cpu_weight_nice_write_s64,
},
+ {
+ .name = "idle",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -8036,6 +10032,12 @@ static struct cftype cpu_files[] = {
.seq_show = cpu_max_show,
.write = cpu_max_write,
},
+ {
+ .name = "max.burst",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_cfs_burst_read_u64,
+ .write_u64 = cpu_cfs_burst_write_u64,
+ },
#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
@@ -8057,12 +10059,14 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
- .fork = cpu_cgroup_fork,
+ .css_local_stat_show = cpu_local_stat_show,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
+ .cancel_attach = cpu_cgroup_cancel_attach,
.legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files,
.early_init = true,
@@ -8073,6 +10077,19 @@ struct cgroup_subsys cpu_cgrp_subsys = {
void dump_cpu_task(int cpu)
{
+ if (in_hardirq() && cpu == smp_processor_id()) {
+ struct pt_regs *regs;
+
+ regs = get_irq_regs();
+ if (regs) {
+ show_regs(regs);
+ return;
+ }
+ }
+
+ if (trigger_single_cpu_backtrace(cpu))
+ return;
+
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}
@@ -8101,10 +10118,10 @@ const int sched_prio_to_weight[40] = {
};
/*
- * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
+ * Inverse (2^32/x) values of the sched_prio_to_weight[] array, pre-calculated.
*
* In cases where the weight does not change often, we can use the
- * precalculated inverse to speed up arithmetics by turning divisions
+ * pre-calculated inverse to speed up arithmetics by turning divisions
* into multiplications:
*/
const u32 sched_prio_to_wmult[40] = {
@@ -8118,4 +10135,564 @@ const u32 sched_prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-#undef CREATE_TRACE_POINTS
+void call_trace_sched_update_nr_running(struct rq *rq, int count)
+{
+ trace_sched_update_nr_running_tp(rq, count);
+}
+
+#ifdef CONFIG_SCHED_MM_CID
+
+/*
+ * @cid_lock: Guarantee forward-progress of cid allocation.
+ *
+ * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
+ * is only used when contention is detected by the lock-free allocation so
+ * forward progress can be guaranteed.
+ */
+DEFINE_RAW_SPINLOCK(cid_lock);
+
+/*
+ * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
+ *
+ * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
+ * detected, it is set to 1 to ensure that all newly coming allocations are
+ * serialized by @cid_lock until the allocation which detected contention
+ * completes and sets @use_cid_lock back to 0. This guarantees forward progress
+ * of a cid allocation.
+ */
+int use_cid_lock;
+
+/*
+ * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
+ * concurrently with respect to the execution of the source runqueue context
+ * switch.
+ *
+ * There is one basic property we want to guarantee here:
+ *
+ * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
+ * used by a task. That would lead to concurrent allocation of the cid and
+ * userspace corruption.
+ *
+ * Provide this guarantee by introducing a Dekker memory ordering to guarantee
+ * that a pair of loads observe at least one of a pair of stores, which can be
+ * shown as:
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible. But rather than using
+ * values 0 and 1, this algorithm cares about specific state transitions of the
+ * runqueue current task (as updated by the scheduler context switch), and the
+ * per-mm/cpu cid value.
+ *
+ * Let's introduce task (Y) which has task->mm == mm and task (N) which has
+ * task->mm != mm for the rest of the discussion. There are two scheduler state
+ * transitions on context switch we care about:
+ *
+ * (TSA) Store to rq->curr with transition from (N) to (Y)
+ *
+ * (TSB) Store to rq->curr with transition from (Y) to (N)
+ *
+ * On the remote-clear side, there is one transition we care about:
+ *
+ * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
+ *
+ * There is also a transition to UNSET state which can be performed from all
+ * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
+ * guarantees that only a single thread will succeed:
+ *
+ * (TMB) cmpxchg to *pcpu_cid to mark UNSET
+ *
+ * Just to be clear, what we do _not_ want to happen is a transition to UNSET
+ * when a thread is actively using the cid (property (1)).
+ *
+ * Let's look at the relevant combinations of TSA/TSB, and TMA transitions.
+ *
+ * Scenario A) (TSA)+(TMA) (from next task perspective)
+ *
+ * CPU0 CPU1
+ *
+ * Context switch CS-1 Remote-clear
+ * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_cid to LAZY (TMA)
+ * (implied barrier after cmpxchg)
+ * - switch_mm_cid()
+ * - memory barrier (see switch_mm_cid()
+ * comment explaining how this barrier
+ * is combined with other scheduler
+ * barriers)
+ * - mm_cid_get (next)
+ * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr)
+ *
+ * This Dekker ensures that either task (Y) is observed by the
+ * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
+ * observed.
+ *
+ * If task (Y) store is observed by rcu_dereference(), it means that there is
+ * still an active task on the cpu. Remote-clear will therefore not transition
+ * to UNSET, which fulfills property (1).
+ *
+ * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
+ * it will move its state to UNSET, which clears the percpu cid perhaps
+ * uselessly (which is not an issue for correctness). Because task (Y) is not
+ * observed, CPU1 can move ahead to set the state to UNSET. Because moving
+ * state to UNSET is done with a cmpxchg expecting that the old state has the
+ * LAZY flag set, only one thread will successfully UNSET.
+ *
+ * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
+ * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
+ * CPU1 will observe task (Y) and do nothing more, which is fine.
+ *
+ * What we are effectively preventing with this Dekker is a scenario where
+ * neither LAZY flag nor store (Y) are observed, which would fail property (1)
+ * because this would UNSET a cid which is actively used.
+ */
+
+void sched_mm_cid_migrate_from(struct task_struct *t)
+{
+ t->migrate_from_cpu = task_cpu(t);
+}
+
+static
+int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid)
+{
+ struct mm_struct *mm = t->mm;
+ struct task_struct *src_task;
+ int src_cid, last_mm_cid;
+
+ if (!mm)
+ return -1;
+
+ last_mm_cid = t->last_mm_cid;
+ /*
+ * If the migrated task has no last cid, or if the current
+ * task on src rq uses the cid, it means the source cid does not need
+ * to be moved to the destination cpu.
+ */
+ if (last_mm_cid == -1)
+ return -1;
+ src_cid = READ_ONCE(src_pcpu_cid->cid);
+ if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
+ return -1;
+
+ /*
+ * If we observe an active task using the mm on this rq, it means we
+ * are not the last task to be migrated from this cpu for this mm, so
+ * there is no need to move src_cid to the destination cpu.
+ */
+ guard(rcu)();
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ t->last_mm_cid = -1;
+ return -1;
+ }
+
+ return src_cid;
+}
+
+static
+int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid,
+ int src_cid)
+{
+ struct task_struct *src_task;
+ struct mm_struct *mm = t->mm;
+ int lazy_cid;
+
+ if (src_cid == -1)
+ return -1;
+
+ /*
+ * Attempt to clear the source cpu cid to move it to the destination
+ * cpu.
+ */
+ lazy_cid = mm_cid_set_lazy_put(src_cid);
+ if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
+ return -1;
+
+ /*
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm matches the scheduler barrier in context_switch()
+ * between store to rq->curr and load of prev and next task's
+ * per-mm/cpu cid.
+ *
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm_cid_active matches the barrier in
+ * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+ * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+ * load of per-mm/cpu cid.
+ */
+
+ /*
+ * If we observe an active task using the mm on this rq after setting
+ * the lazy-put flag, this task will be responsible for transitioning
+ * from lazy-put flag set to MM_CID_UNSET.
+ */
+ scoped_guard (rcu) {
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ /*
+ * We observed an active task for this mm, there is therefore
+ * no point in moving this cid to the destination cpu.
+ */
+ t->last_mm_cid = -1;
+ return -1;
+ }
+ }
+
+ /*
+ * The src_cid is unused, so it can be unset.
+ */
+ if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ return -1;
+ WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
+ return src_cid;
+}
+
+/*
+ * Migration to dst cpu. Called with dst_rq lock held.
+ * Interrupts are disabled, which keeps the window of cid ownership without the
+ * source rq lock held small.
+ */
+void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
+{
+ struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
+ struct mm_struct *mm = t->mm;
+ int src_cid, src_cpu;
+ bool dst_cid_is_set;
+ struct rq *src_rq;
+
+ lockdep_assert_rq_held(dst_rq);
+
+ if (!mm)
+ return;
+ src_cpu = t->migrate_from_cpu;
+ if (src_cpu == -1) {
+ t->last_mm_cid = -1;
+ return;
+ }
+ /*
+ * Move the src cid if the dst cid is unset. This keeps id
+ * allocation closest to 0 in cases where few threads migrate around
+ * many CPUs.
+ *
+ * If destination cid or recent cid is already set, we may have
+ * to just clear the src cid to ensure compactness in frequent
+ * migration scenarios.
+ *
+ * It is not useful to clear the src cid when the number of threads is
+ * greater than or equal to the number of allowed CPUs, because
+ * user-space can expect that the number of allowed cids can reach the
+ * number of allowed CPUs.
+ */
+ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
+ dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) ||
+ !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
+ if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
+ return;
+ src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
+ src_rq = cpu_rq(src_cpu);
+ src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
+ if (src_cid == -1)
+ return;
+ src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
+ src_cid);
+ if (src_cid == -1)
+ return;
+ if (dst_cid_is_set) {
+ __mm_cid_put(mm, src_cid);
+ return;
+ }
+ /* Move src_cid to dst cpu. */
+ mm_cid_snapshot_time(dst_rq, mm);
+ WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
+ WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
+}
+
+static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
+ int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *t;
+ int cid, lazy_cid;
+
+ cid = READ_ONCE(pcpu_cid->cid);
+ if (!mm_cid_is_valid(cid))
+ return;
+
+ /*
+ * Clear the cpu cid if it is set to keep cid allocation compact. If
+ * there happens to be other tasks left on the source cpu using this
+ * mm, the next task using this mm will reallocate its cid on context
+ * switch.
+ */
+ lazy_cid = mm_cid_set_lazy_put(cid);
+ if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
+ return;
+
+ /*
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm matches the scheduler barrier in context_switch()
+ * between store to rq->curr and load of prev and next task's
+ * per-mm/cpu cid.
+ *
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm_cid_active matches the barrier in
+ * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+ * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+ * load of per-mm/cpu cid.
+ */
+
+ /*
+ * If we observe an active task using the mm on this rq after setting
+ * the lazy-put flag, that task will be responsible for transitioning
+ * from lazy-put flag set to MM_CID_UNSET.
+ */
+ scoped_guard (rcu) {
+ t = rcu_dereference(rq->curr);
+ if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
+ return;
+ }
+
+ /*
+ * The cid is unused, so it can be unset.
+ * Disable interrupts to keep the window of cid ownership without rq
+ * lock small.
+ */
+ scoped_guard (irqsave) {
+ if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ __mm_cid_put(mm, cid);
+ }
+}
+
+static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct mm_cid *pcpu_cid;
+ struct task_struct *curr;
+ u64 rq_clock;
+
+ /*
+ * rq->clock load is racy on 32-bit but one spurious clear once in a
+ * while is irrelevant.
+ */
+ rq_clock = READ_ONCE(rq->clock);
+ pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+
+ /*
+ * In order to take care of infrequently scheduled tasks, bump the time
+ * snapshot associated with this cid if an active task using the mm is
+ * observed on this rq.
+ */
+ scoped_guard (rcu) {
+ curr = rcu_dereference(rq->curr);
+ if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+ WRITE_ONCE(pcpu_cid->time, rq_clock);
+ return;
+ }
+ }
+
+ if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
+ return;
+ sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+}
+
+static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
+ int weight)
+{
+ struct mm_cid *pcpu_cid;
+ int cid;
+
+ pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+ cid = READ_ONCE(pcpu_cid->cid);
+ if (!mm_cid_is_valid(cid) || cid < weight)
+ return;
+ sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+}
+
+static void task_mm_cid_work(struct callback_head *work)
+{
+ unsigned long now = jiffies, old_scan, next_scan;
+ struct task_struct *t = current;
+ struct cpumask *cidmask;
+ struct mm_struct *mm;
+ int weight, cpu;
+
+ SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
+
+ work->next = work; /* Prevent double-add */
+ if (t->flags & PF_EXITING)
+ return;
+ mm = t->mm;
+ if (!mm)
+ return;
+ old_scan = READ_ONCE(mm->mm_cid_next_scan);
+ next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ if (!old_scan) {
+ unsigned long res;
+
+ res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
+ if (res != old_scan)
+ old_scan = res;
+ else
+ old_scan = next_scan;
+ }
+ if (time_before(now, old_scan))
+ return;
+ if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
+ return;
+ cidmask = mm_cidmask(mm);
+ /* Clear cids that were not recently used. */
+ for_each_possible_cpu(cpu)
+ sched_mm_cid_remote_clear_old(mm, cpu);
+ weight = cpumask_weight(cidmask);
+ /*
+ * Clear cids that are greater than or equal to the cidmask weight to
+ * recompact it.
+ */
+ for_each_possible_cpu(cpu)
+ sched_mm_cid_remote_clear_weight(mm, cpu, weight);
+}
+
+void init_sched_mm_cid(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ int mm_users = 0;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1)
+ mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ }
+ t->cid_work.next = &t->cid_work; /* Protect against double add */
+ init_task_work(&t->cid_work, task_mm_cid_work);
+}
+
+void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->cid_work;
+ unsigned long now = jiffies;
+
+ if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
+ work->next != work)
+ return;
+ if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
+ return;
+
+ /* No page allocation under rq lock */
+ task_work_add(curr, work, TWA_RESUME);
+}
+
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct rq *rq;
+
+ if (!mm)
+ return;
+
+ preempt_disable();
+ rq = this_rq();
+ guard(rq_lock_irqsave)(rq);
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 0);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches the implicit cmpxchg barrier in sched_mm_cid_remote_clear().
+ */
+ smp_mb();
+ mm_cid_put(mm);
+ t->last_mm_cid = t->mm_cid = -1;
+}
+
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct rq *rq;
+
+ if (!mm)
+ return;
+
+ preempt_disable();
+ rq = this_rq();
+ guard(rq_lock_irqsave)(rq);
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 0);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches the implicit cmpxchg barrier in sched_mm_cid_remote_clear().
+ */
+ smp_mb();
+ mm_cid_put(mm);
+ t->last_mm_cid = t->mm_cid = -1;
+}
+
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct rq *rq;
+
+ if (!mm)
+ return;
+
+ preempt_disable();
+ rq = this_rq();
+ scoped_guard (rq_lock_irqsave, rq) {
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 1);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches the implicit cmpxchg barrier in sched_mm_cid_remote_clear().
+ */
+ smp_mb();
+ t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
+ }
+ rseq_set_notify_resume(t);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+ WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
+ t->mm_cid_active = 1;
+}
+#endif
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
+ struct sched_enq_and_set_ctx *ctx)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_rq_held(rq);
+
+ *ctx = (struct sched_enq_and_set_ctx){
+ .p = p,
+ .queue_flags = queue_flags,
+ .queued = task_on_rq_queued(p),
+ .running = task_current(rq, p),
+ };
+
+ update_rq_clock(rq);
+ if (ctx->queued)
+ dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ if (ctx->running)
+ put_prev_task(rq, p);
+}
+
+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+{
+ struct rq *rq = task_rq(ctx->p);
+
+ lockdep_assert_rq_held(rq);
+
+ if (ctx->queued)
+ enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+ if (ctx->running)
+ set_next_task(rq, ctx->p);
+}
+#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
new file mode 100644
index 000000000000..1ef98a93eb1d
--- /dev/null
+++ b/kernel/sched/core_sched.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * A simple wrapper around refcount. An allocated sched_core_cookie's
+ * address is used to compute the cookie of the task.
+ */
+struct sched_core_cookie {
+ refcount_t refcnt;
+};
+
+static unsigned long sched_core_alloc_cookie(void)
+{
+ struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL);
+ if (!ck)
+ return 0;
+
+ refcount_set(&ck->refcnt, 1);
+ sched_core_get();
+
+ return (unsigned long)ck;
+}
+
+static void sched_core_put_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (void *)cookie;
+
+ if (ptr && refcount_dec_and_test(&ptr->refcnt)) {
+ kfree(ptr);
+ sched_core_put();
+ }
+}
+
+static unsigned long sched_core_get_cookie(unsigned long cookie)
+{
+ struct sched_core_cookie *ptr = (void *)cookie;
+
+ if (ptr)
+ refcount_inc(&ptr->refcnt);
+
+ return cookie;
+}
+
+/*
+ * sched_core_update_cookie - replace the cookie on a task
+ * @p: the task to update
+ * @cookie: the new cookie
+ *
+ * Effectively exchange the task cookie; caller is responsible for lifetimes on
+ * both ends.
+ *
+ * Returns: the old cookie
+ */
+static unsigned long sched_core_update_cookie(struct task_struct *p,
+ unsigned long cookie)
+{
+ unsigned long old_cookie;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Since creating a cookie implies sched_core_get(), and we cannot set
+ * a cookie until after we've created it, similarly, we cannot destroy
+ * a cookie until after we've removed it, we must have core scheduling
+ * enabled here.
+ */
+ SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
+
+ if (sched_core_enqueued(p))
+ sched_core_dequeue(rq, p, DEQUEUE_SAVE);
+
+ old_cookie = p->core_cookie;
+ p->core_cookie = cookie;
+
+ /*
+ * Consider the cases: !old_cookie and !cookie.
+ */
+ if (cookie && task_on_rq_queued(p))
+ sched_core_enqueue(rq, p);
+
+ /*
+ * If task is currently running, it may not be compatible anymore after
+ * the cookie change, so enter the scheduler on its CPU to schedule it
+ * away.
+ *
+ * Note that it is possible that as a result of this cookie change, the
+ * core has now entered/left forced idle state. Defer accounting to the
+ * next scheduling edge, rather than always forcing a reschedule here.
+ */
+ if (task_on_cpu(rq, p))
+ resched_curr(rq);
+
+ task_rq_unlock(rq, p, &rf);
+
+ return old_cookie;
+}
+
+static unsigned long sched_core_clone_cookie(struct task_struct *p)
+{
+ unsigned long cookie, flags;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ cookie = sched_core_get_cookie(p->core_cookie);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return cookie;
+}
+
+void sched_core_fork(struct task_struct *p)
+{
+ RB_CLEAR_NODE(&p->core_node);
+ p->core_cookie = sched_core_clone_cookie(current);
+}
+
+void sched_core_free(struct task_struct *p)
+{
+ sched_core_put_cookie(p->core_cookie);
+}
+
+static void __sched_core_set(struct task_struct *p, unsigned long cookie)
+{
+ cookie = sched_core_get_cookie(cookie);
+ cookie = sched_core_update_cookie(p, cookie);
+ sched_core_put_cookie(cookie);
+}
+
+/* Called from prctl interface: PR_SCHED_CORE */
+int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+ unsigned long uaddr)
+{
+ unsigned long cookie = 0, id = 0;
+ struct task_struct *task, *p;
+ struct pid *grp;
+ int err = 0;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -ENODEV;
+
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD_GROUP != PIDTYPE_TGID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_PROCESS_GROUP != PIDTYPE_PGID);
+
+ if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 ||
+ (cmd != PR_SCHED_CORE_GET && uaddr))
+ return -EINVAL;
+
+ rcu_read_lock();
+ if (pid == 0) {
+ task = current;
+ } else {
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ }
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ /*
+ * Check if this process has the right to modify the specified
+ * process. Use the regular "ptrace_may_access()" checks.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ switch (cmd) {
+ case PR_SCHED_CORE_GET:
+ if (type != PIDTYPE_PID || uaddr & 7) {
+ err = -EINVAL;
+ goto out;
+ }
+ cookie = sched_core_clone_cookie(task);
+ if (cookie) {
+ /* XXX improve ? */
+ ptr_to_hashval((void *)cookie, &id);
+ }
+ err = put_user(id, (u64 __user *)uaddr);
+ goto out;
+
+ case PR_SCHED_CORE_CREATE:
+ cookie = sched_core_alloc_cookie();
+ if (!cookie) {
+ err = -ENOMEM;
+ goto out;
+ }
+ break;
+
+ case PR_SCHED_CORE_SHARE_TO:
+ cookie = sched_core_clone_cookie(current);
+ break;
+
+ case PR_SCHED_CORE_SHARE_FROM:
+ if (type != PIDTYPE_PID) {
+ err = -EINVAL;
+ goto out;
+ }
+ cookie = sched_core_clone_cookie(task);
+ __sched_core_set(current, cookie);
+ goto out;
+
+ default:
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (type == PIDTYPE_PID) {
+ __sched_core_set(task, cookie);
+ goto out;
+ }
+
+ read_lock(&tasklist_lock);
+ grp = task_pid_type(task, type);
+
+ do_each_pid_thread(grp, type, p) {
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) {
+ err = -EPERM;
+ goto out_tasklist;
+ }
+ } while_each_pid_thread(grp, type, p);
+
+ do_each_pid_thread(grp, type, p) {
+ __sched_core_set(p, cookie);
+ } while_each_pid_thread(grp, type, p);
+out_tasklist:
+ read_unlock(&tasklist_lock);
+
+out:
+ sched_core_put_cookie(cookie);
+ put_task_struct(task);
+ return err;
+}
+
+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void __sched_core_account_forceidle(struct rq *rq)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ u64 delta, now = rq_clock(rq->core);
+ struct rq *rq_i;
+ struct task_struct *p;
+ int i;
+
+ lockdep_assert_rq_held(rq);
+
+ WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+ if (rq->core->core_forceidle_start == 0)
+ return;
+
+ delta = now - rq->core->core_forceidle_start;
+ if (unlikely((s64)delta <= 0))
+ return;
+
+ rq->core->core_forceidle_start = now;
+
+ if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+ /* can't be forced idle without a running task */
+ } else if (rq->core->core_forceidle_count > 1 ||
+ rq->core->core_forceidle_occupation > 1) {
+ /*
+ * For larger SMT configurations, we need to scale the charged
+ * forced idle amount since there can be more than one forced
+ * idle sibling and more than one running cookied task.
+ */
+ delta *= rq->core->core_forceidle_count;
+ delta = div_u64(delta, rq->core->core_forceidle_occupation);
+ }
+
+ for_each_cpu(i, smt_mask) {
+ rq_i = cpu_rq(i);
+ p = rq_i->core_pick ?: rq_i->curr;
+
+ if (p == rq_i->idle)
+ continue;
+
+ /*
+ * Note: this will account forceidle to the current CPU, even
+ * if it comes from our SMT sibling.
+ */
+ __account_forceidle_time(p, delta);
+ }
+}
+
+void __sched_core_tick(struct rq *rq)
+{
+ if (!rq->core->core_forceidle_count)
+ return;
+
+ if (rq != rq->core)
+ update_rq_clock(rq->core);
+
+ __sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 941c28cf9738..0de9dda09949 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* CPU accounting code for task groups.
*
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
-#include <asm/irq_regs.h>
-#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
@@ -21,15 +20,11 @@ static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_SYSTEM] = "system",
};
-struct cpuacct_usage {
- u64 usages[CPUACCT_STAT_NSTATS];
-};
-
/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every CPU */
- struct cpuacct_usage __percpu *cpuusage;
+ u64 __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
};
@@ -49,7 +44,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
return css_ca(ca->css.parent);
}
-static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
static struct cpuacct root_cpuacct = {
.cpustat = &kernel_cpustat,
.cpuusage = &root_cpuacct_cpuusage,
@@ -68,7 +63,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
if (!ca)
goto out;
- ca->cpuusage = alloc_percpu(struct cpuacct_usage);
+ ca->cpuusage = alloc_percpu(u64);
if (!ca->cpuusage)
goto out_free_ca;
@@ -99,56 +94,66 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
enum cpuacct_stat_index index)
{
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
u64 data;
/*
* We allow index == CPUACCT_STAT_NSTATS here to read
- * the sum of suages.
+ * the sum of usages.
*/
- BUG_ON(index > CPUACCT_STAT_NSTATS);
+ if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS))
+ return 0;
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit read safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
- if (index == CPUACCT_STAT_NSTATS) {
- int i = 0;
-
- data = 0;
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- data += cpuusage->usages[i];
- } else {
- data = cpuusage->usages[index];
+ switch (index) {
+ case CPUACCT_STAT_USER:
+ data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
+ break;
+ case CPUACCT_STAT_SYSTEM:
+ data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
+ cpustat[CPUTIME_SOFTIRQ];
+ break;
+ case CPUACCT_STAT_NSTATS:
+ data = *cpuusage;
+ break;
}
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
return data;
}
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu)
{
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- int i;
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
+
+ /* Don't allow resetting the global kernel_cpustat */
+ if (ca == &root_cpuacct)
+ return;
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit write safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
-
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- cpuusage->usages[i] = val;
+ *cpuusage = 0;
+ cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0;
+ cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0;
+ cpustat[CPUTIME_SOFTIRQ] = 0;
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
}
@@ -196,7 +201,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
return -EINVAL;
for_each_possible_cpu(cpu)
- cpuacct_cpuusage_write(ca, cpu, 0);
+ cpuacct_cpuusage_write(ca, cpu);
return 0;
}
@@ -243,25 +248,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
seq_puts(m, "\n");
for_each_possible_cpu(cpu) {
- struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
seq_printf(m, "%d", cpu);
-
- for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit read safe on 32-bit
- * platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-#endif
-
- seq_printf(m, " %llu", cpuusage->usages[index]);
-
-#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#endif
- }
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %llu",
+ cpuacct_cpuusage_read(ca, cpu, index));
seq_puts(m, "\n");
}
return 0;
@@ -270,25 +260,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
- s64 val[CPUACCT_STAT_NSTATS];
+ struct task_cputime cputime;
+ u64 val[CPUACCT_STAT_NSTATS];
int cpu;
int stat;
- memset(val, 0, sizeof(val));
+ memset(&cputime, 0, sizeof(cputime));
for_each_possible_cpu(cpu) {
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
- val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
- val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
+ cputime.utime += cpustat[CPUTIME_USER];
+ cputime.utime += cpustat[CPUTIME_NICE];
+ cputime.stime += cpustat[CPUTIME_SYSTEM];
+ cputime.stime += cpustat[CPUTIME_IRQ];
+ cputime.stime += cpustat[CPUTIME_SOFTIRQ];
+
+ cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu);
}
+ cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime,
+ &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]);
+
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
- seq_printf(sf, "%s %lld\n",
- cpuacct_stat_desc[stat],
- (long long)nsec_to_clock_t(val[stat]));
+ seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat],
+ nsec_to_clock_t(val[stat]));
}
return 0;
@@ -338,19 +333,13 @@ static struct cftype files[] = {
*/
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
+ unsigned int cpu = task_cpu(tsk);
struct cpuacct *ca;
- int index = CPUACCT_STAT_SYSTEM;
- struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
- if (regs && user_mode(regs))
- index = CPUACCT_STAT_USER;
-
- rcu_read_lock();
+ lockdep_assert_rq_held(cpu_rq(cpu));
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
- __this_cpu_add(ca->cpuusage->usages[index], cputime);
-
- rcu_read_unlock();
+ *per_cpu_ptr(ca->cpuusage, cpu) += cputime;
}
/*
@@ -362,10 +351,8 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
struct cpuacct *ca;
- rcu_read_lock();
for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
__this_cpu_add(ca->cpustat->cpustat[index], val);
- rcu_read_unlock();
}
struct cgroup_subsys cpuacct_cgrp_subsys = {
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5cc4012572ec..95baa12a1029 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * kernel/sched/cpudl.c
+ * kernel/sched/cpudeadline.c
*
* Global CPU deadline management
*
* Author: Juri Lelli <j.lelli@sssup.it>
*/
-#include "sched.h"
static inline int parent(int i)
{
@@ -120,14 +119,38 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
+ unsigned long cap, max_cap = 0;
+ int cpu, max_cpu = -1;
+
+ if (!sched_asym_cpucap_active())
+ return 1;
+
+ /* Ensure the capacity of the CPUs fits the task. */
+ for_each_cpu(cpu, later_mask) {
+ if (!dl_task_fits_capacity(p, cpu)) {
+ cpumask_clear_cpu(cpu, later_mask);
+
+ cap = arch_scale_cpu_capacity(cpu);
+
+ if (cap > max_cap ||
+ (cpu == task_cpu(p) && cap == max_cap)) {
+ max_cap = cap;
+ max_cpu = cpu;
+ }
+ }
+ }
+
+ if (cpumask_empty(later_mask))
+ cpumask_set_cpu(max_cpu, later_mask);
+
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
- if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
+ if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 7c2fe50fd76d..5252fb191fae 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,9 +5,6 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
-#include <linux/cpufreq.h>
-
-#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7fbaee24c824..1a19d69b91ed 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -6,13 +6,6 @@
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include "sched.h"
-
-#include <linux/sched/cpufreq.h>
-#include <trace/events/power.h>
-
#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
struct sugov_tunables {
@@ -26,7 +19,7 @@ struct sugov_policy {
struct sugov_tunables *tunables;
struct list_head tunables_hook;
- raw_spinlock_t update_lock; /* For shared policies */
+ raw_spinlock_t update_lock;
u64 last_freq_update_time;
s64 freq_update_delay_ns;
unsigned int next_freq;
@@ -53,8 +46,8 @@ struct sugov_cpu {
unsigned int iowait_boost;
u64 last_update;
- unsigned long bw_dl;
- unsigned long max;
+ unsigned long util;
+ unsigned long bw_min;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
@@ -90,7 +83,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
if (unlikely(sg_policy->limits_changed)) {
sg_policy->limits_changed = false;
- sg_policy->need_freq_update = true;
+ sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
return true;
}
@@ -102,7 +95,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
- if (sg_policy->next_freq == next_freq)
+ if (sg_policy->need_freq_update)
+ sg_policy->need_freq_update = false;
+ else if (sg_policy->next_freq == next_freq)
return false;
sg_policy->next_freq = next_freq;
@@ -111,37 +106,38 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
return true;
}
-static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
- unsigned int next_freq)
+static void sugov_deferred_update(struct sugov_policy *sg_policy)
{
- struct cpufreq_policy *policy = sg_policy->policy;
- int cpu;
-
- if (!sugov_update_next_freq(sg_policy, time, next_freq))
- return;
-
- next_freq = cpufreq_driver_fast_switch(policy, next_freq);
- if (!next_freq)
- return;
-
- policy->cur = next_freq;
-
- if (trace_cpu_frequency_enabled()) {
- for_each_cpu(cpu, policy->cpus)
- trace_cpu_frequency(next_freq, cpu);
+ if (!sg_policy->work_in_progress) {
+ sg_policy->work_in_progress = true;
+ irq_work_queue(&sg_policy->irq_work);
}
}
-static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
- unsigned int next_freq)
+/**
+ * get_capacity_ref_freq - get the reference frequency that has been used to
+ * correlate frequency and compute capacity for a given cpufreq policy. We use
+ * the CPU managing it for the arch_scale_freq_ref() call in the function.
+ * @policy: the cpufreq policy of the CPU in question.
+ *
+ * Return: the reference CPU frequency to compute a capacity.
+ */
+static __always_inline
+unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
{
- if (!sugov_update_next_freq(sg_policy, time, next_freq))
- return;
+ unsigned int freq = arch_scale_freq_ref(policy->cpu);
- if (!sg_policy->work_in_progress) {
- sg_policy->work_in_progress = true;
- irq_work_queue(&sg_policy->irq_work);
- }
+ if (freq)
+ return freq;
+
+ if (arch_scale_freq_invariant())
+ return policy->cpuinfo.max_freq;
+
+ /*
+ * Apply a 25% margin so that we select a higher frequency than
+ * the current one before the CPU is fully busy:
+ */
+ return policy->cur + (policy->cur >> 2);
}
/**
@@ -170,135 +166,45 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned long util, unsigned long max)
{
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned int freq = arch_scale_freq_invariant() ?
- policy->cpuinfo.max_freq : policy->cur;
+ unsigned int freq;
+ freq = get_capacity_ref_freq(policy);
freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
- sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- * cpu_util_{cfs,rt,dl,irq}()
- * cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the irq utilization.
- *
- * The DL bandwidth number otoh is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
- struct task_struct *p)
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max)
{
- unsigned long dl_util, util, irq;
- struct rq *rq = cpu_rq(cpu);
-
- if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
- return max;
- }
+ /* Add dvfs headroom to actual utilization */
+ actual = map_util_perf(actual);
+ /* Actually we don't need to target the max performance */
+ if (actual < max)
+ max = actual;
/*
- * Early check to see if IRQ/steal time saturates the CPU, can be
- * because of inaccuracies in how we track these -- see
- * update_irq_load_avg().
+ * Ensure at least minimum performance while providing more compute
+ * capacity when possible.
*/
- irq = cpu_util_irq(rq);
- if (unlikely(irq >= max))
- return max;
-
- /*
- * Because the time spend on RT/DL tasks is visible as 'lost' time to
- * CFS tasks and we use the same metric to track the effective
- * utilization (PELT windows are synchronized) we can directly add them
- * to obtain the CPU's actual utilization.
- *
- * CFS and RT utilization can be boosted or capped, depending on
- * utilization clamp constraints requested by currently RUNNABLE
- * tasks.
- * When there are no CFS RUNNABLE tasks, clamps are released and
- * frequency will be gracefully reduced with the utilization decay.
- */
- util = util_cfs + cpu_util_rt(rq);
- if (type == FREQUENCY_UTIL)
- util = uclamp_rq_util_with(rq, util, p);
-
- dl_util = cpu_util_dl(rq);
-
- /*
- * For frequency selection we do not make cpu_util_dl() a permanent part
- * of this sum because we want to use cpu_bw_dl() later on, but we need
- * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
- * that we select f_max when there is no idle time.
- *
- * NOTE: numerical errors or stop class might cause us to not quite hit
- * saturation when we should -- something for later.
- */
- if (util + dl_util >= max)
- return max;
-
- /*
- * OTOH, for energy computation we need the estimated running time, so
- * include util_dl and ignore dl_bw.
- */
- if (type == ENERGY_UTIL)
- util += dl_util;
-
- /*
- * There is still idle time; further improve the number by using the
- * irq metric. Because IRQ/steal time is hidden from the task clock we
- * need to scale the task numbers:
- *
- * max - irq
- * U' = irq + --------- * U
- * max
- */
- util = scale_irq_capacity(util, irq, max);
- util += irq;
-
- /*
- * Bandwidth required by DEADLINE must always be granted while, for
- * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
- * to gracefully reduce the frequency when no tasks show up for longer
- * periods of time.
- *
- * Ideally we would like to set bw_dl as min/guaranteed freq and util +
- * bw_dl as requested freq. However, cpufreq is not yet ready for such
- * an interface. So, we only do the latter for now.
- */
- if (type == FREQUENCY_UTIL)
- util += cpu_bw_dl(rq);
-
- return min(max, util);
+ return max(min, max);
}
-static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
- struct rq *rq = cpu_rq(sg_cpu->cpu);
- unsigned long util = cpu_util_cfs(rq);
- unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
+ unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);
- sg_cpu->max = max;
- sg_cpu->bw_dl = cpu_bw_dl(rq);
-
- return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
+ if (!scx_switched_all())
+ util += cpu_util_cfs_boost(sg_cpu->cpu);
+ util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
+ util = max(util, boost);
+ sg_cpu->bw_min = min;
+ sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
}
/**
@@ -375,8 +281,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* sugov_iowait_apply() - Apply the IO boost to a CPU.
* @sg_cpu: the sugov data for the cpu to boost
* @time: the update time from the caller
- * @util: the utilization to (eventually) boost
- * @max: the maximum value the utilization can be boosted to
+ * @max_cap: the max CPU capacity
*
* A CPU running a task which woken up after an IO operation can have its
* utilization boosted to speed up the completion of those IO operations.
@@ -391,17 +296,15 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* being more conservative on tasks which does sporadic IO operations.
*/
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
- unsigned long util, unsigned long max)
+ unsigned long max_cap)
{
- unsigned long boost;
-
/* No boost currently required */
if (!sg_cpu->iowait_boost)
- return util;
+ return 0;
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
- return util;
+ return 0;
if (!sg_cpu->iowait_boost_pending) {
/*
@@ -410,114 +313,173 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
sg_cpu->iowait_boost >>= 1;
if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
sg_cpu->iowait_boost = 0;
- return util;
+ return 0;
}
}
sg_cpu->iowait_boost_pending = false;
/*
- * @util is already in capacity scale; convert iowait_boost
+ * sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
- boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
- return max(boost, util);
+ return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
}
#ifdef CONFIG_NO_HZ_COMMON
-static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
{
- unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
- bool ret = idle_calls == sg_cpu->saved_idle_calls;
+ unsigned long idle_calls;
+ bool ret;
+
+ /*
+ * The heuristics in this function are for the fair class. For SCX, the
+ * performance target comes directly from the BPF scheduler. Let's just
+ * follow it.
+ */
+ if (scx_switched_all())
+ return false;
+
+ /* if capped by uclamp_max, always update to be in compliance */
+ if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
+ return false;
+
+ /*
+ * Maintain the frequency if the CPU has not been idle recently, as
+ * reduction is likely to be premature.
+ */
+ idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
+ ret = idle_calls == sg_cpu->saved_idle_calls;
sg_cpu->saved_idle_calls = idle_calls;
return ret;
}
#else
-static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */
/*
* Make sugov_should_update_freq() ignore the rate limit when DL
* has increased the utilization.
*/
-static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
+static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
- if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
- sg_policy->limits_changed = true;
+ if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
+ sg_cpu->sg_policy->limits_changed = true;
}
-static void sugov_update_single(struct update_util_data *hook, u64 time,
- unsigned int flags)
+static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
+ u64 time, unsigned long max_cap,
+ unsigned int flags)
{
- struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
- struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- unsigned long util, max;
- unsigned int next_f;
- bool busy;
+ unsigned long boost;
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
- ignore_dl_rate_limit(sg_cpu, sg_policy);
+ ignore_dl_rate_limit(sg_cpu);
+
+ if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
+ return false;
+
+ boost = sugov_iowait_apply(sg_cpu, time, max_cap);
+ sugov_get_util(sg_cpu, boost);
+
+ return true;
+}
+
+static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned int cached_freq = sg_policy->cached_raw_freq;
+ unsigned long max_cap;
+ unsigned int next_f;
+
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
- if (!sugov_should_update_freq(sg_policy, time))
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
- /* Limits may have changed, don't skip frequency update */
- busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
+ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
- util = sugov_get_util(sg_cpu);
- max = sg_cpu->max;
- util = sugov_iowait_apply(sg_cpu, time, util, max);
- next_f = get_next_freq(sg_policy, util, max);
- /*
- * Do not reduce the frequency if the CPU has not been idle
- * recently, as the reduction is likely to be premature then.
- */
- if (busy && next_f < sg_policy->next_freq) {
+ if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq &&
+ !sg_policy->need_freq_update) {
next_f = sg_policy->next_freq;
- /* Reset cached freq as next_freq has changed */
- sg_policy->cached_raw_freq = 0;
+ /* Restore cached freq as next_freq has changed */
+ sg_policy->cached_raw_freq = cached_freq;
}
+ if (!sugov_update_next_freq(sg_policy, time, next_f))
+ return;
+
/*
* This code runs under rq->lock for the target CPU, so it won't run
* concurrently on two different CPUs for the same target and it is not
* necessary to acquire the lock in the fast switch case.
*/
if (sg_policy->policy->fast_switch_enabled) {
- sugov_fast_switch(sg_policy, time, next_f);
+ cpufreq_driver_fast_switch(sg_policy->policy, next_f);
} else {
raw_spin_lock(&sg_policy->update_lock);
- sugov_deferred_update(sg_policy, time, next_f);
+ sugov_deferred_update(sg_policy);
raw_spin_unlock(&sg_policy->update_lock);
}
}
+static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ unsigned long prev_util = sg_cpu->util;
+ unsigned long max_cap;
+
+ /*
+ * Fall back to the "frequency" path if frequency invariance is not
+ * supported, because the direct mapping between the utilization and
+ * the performance levels depends on the frequency invariance.
+ */
+ if (!arch_scale_freq_invariant()) {
+ sugov_update_single_freq(hook, time, flags);
+ return;
+ }
+
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
+ return;
+
+ if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
+ sg_cpu->util = prev_util;
+
+ cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+ sg_cpu->util, max_cap);
+
+ sg_cpu->sg_policy->last_freq_update_time = time;
+}
+
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned long util = 0, max = 1;
+ unsigned long util = 0, max_cap;
unsigned int j;
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
- unsigned long j_util, j_max;
+ unsigned long boost;
- j_util = sugov_get_util(j_sg_cpu);
- j_max = j_sg_cpu->max;
- j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
+ boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
+ sugov_get_util(j_sg_cpu, boost);
- if (j_util * max > j_max * util) {
- util = j_util;
- max = j_max;
- }
+ util = max(j_sg_cpu->util, util);
}
- return get_next_freq(sg_policy, util, max);
+ return get_next_freq(sg_policy, util, max_cap);
}
static void
@@ -532,17 +494,20 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
- ignore_dl_rate_limit(sg_cpu, sg_policy);
+ ignore_dl_rate_limit(sg_cpu);
if (sugov_should_update_freq(sg_policy, time)) {
next_f = sugov_next_freq_shared(sg_cpu, time);
+ if (!sugov_update_next_freq(sg_policy, time, next_f))
+ goto unlock;
+
if (sg_policy->policy->fast_switch_enabled)
- sugov_fast_switch(sg_policy, time, next_f);
+ cpufreq_driver_fast_switch(sg_policy->policy, next_f);
else
- sugov_deferred_update(sg_policy, time, next_f);
+ sugov_deferred_update(sg_policy);
}
-
+unlock:
raw_spin_unlock(&sg_policy->update_lock);
}
@@ -554,7 +519,7 @@ static void sugov_work(struct kthread_work *work)
/*
* Hold sg_policy->update_lock shortly to handle the case where:
- * incase sg_policy->next_freq is read here, and then updated by
+ * in case sg_policy->next_freq is read here, and then updated by
* sugov_deferred_update() just before work_in_progress is set to false
* here, we may miss queueing the new update.
*
@@ -624,9 +589,17 @@ static struct attribute *sugov_attrs[] = {
};
ATTRIBUTE_GROUPS(sugov);
-static struct kobj_type sugov_tunables_ktype = {
+static void sugov_tunables_free(struct kobject *kobj)
+{
+ struct gov_attr_set *attr_set = to_gov_attr_set(kobj);
+
+ kfree(to_sugov_tunables(attr_set));
+}
+
+static const struct kobj_type sugov_tunables_ktype = {
.default_groups = sugov_groups,
.sysfs_ops = &governor_sysfs_ops,
+ .release = &sugov_tunables_free,
};
/********************** cpufreq governor interface *********************/
@@ -664,9 +637,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
* Fake (unused) bandwidth; workaround to "fix"
* priority inheritance.
*/
- .sched_runtime = 1000000,
- .sched_deadline = 10000000,
- .sched_period = 10000000,
+ .sched_runtime = NSEC_PER_MSEC,
+ .sched_deadline = 10 * NSEC_PER_MSEC,
+ .sched_period = 10 * NSEC_PER_MSEC,
};
struct cpufreq_policy *policy = sg_policy->policy;
int ret;
@@ -693,7 +666,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
}
sg_policy->thread = thread;
- kthread_bind_mask(thread, policy->related_cpus);
+ if (policy->dvfs_possible_from_any_cpu)
+ set_cpus_allowed_ptr(thread, policy->related_cpus);
+ else
+ kthread_bind_mask(thread, policy->related_cpus);
+
init_irq_work(&sg_policy->irq_work, sugov_irq_work);
mutex_init(&sg_policy->work_lock);
@@ -726,12 +703,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic
return tunables;
}
-static void sugov_tunables_free(struct sugov_tunables *tunables)
+static void sugov_clear_global_tunables(void)
{
if (!have_governor_per_policy())
global_tunables = NULL;
-
- kfree(tunables);
}
static int sugov_init(struct cpufreq_policy *policy)
@@ -788,13 +763,18 @@ static int sugov_init(struct cpufreq_policy *policy)
goto fail;
out:
+ /*
+ * Schedutil is the preferred governor for EAS, so rebuild sched domains
+ * on governor changes to make sure the scheduler knows about them.
+ */
+ em_rebuild_sched_domains();
mutex_unlock(&global_tunables_lock);
return 0;
fail:
kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
stop_kthread:
sugov_kthread_stop(sg_policy);
@@ -821,18 +801,21 @@ static void sugov_exit(struct cpufreq_policy *policy)
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
policy->governor_data = NULL;
if (!count)
- sugov_tunables_free(tunables);
+ sugov_clear_global_tunables();
mutex_unlock(&global_tunables_lock);
sugov_kthread_stop(sg_policy);
sugov_policy_free(sg_policy);
cpufreq_disable_fast_switch(policy);
+
+ em_rebuild_sched_domains();
}
static int sugov_start(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
+ void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
unsigned int cpu;
sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
@@ -840,24 +823,24 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
sg_policy->limits_changed = false;
- sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
- for_each_cpu(cpu, policy->cpus) {
- struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+ sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
- memset(sg_cpu, 0, sizeof(*sg_cpu));
- sg_cpu->cpu = cpu;
- sg_cpu->sg_policy = sg_policy;
- }
+ if (policy_is_shared(policy))
+ uu = sugov_update_shared;
+ else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
+ uu = sugov_update_single_perf;
+ else
+ uu = sugov_update_single_freq;
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
- cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- policy_is_shared(policy) ?
- sugov_update_shared :
- sugov_update_single);
+ memset(sg_cpu, 0, sizeof(*sg_cpu));
+ sg_cpu->cpu = cpu;
+ sg_cpu->sg_policy = sg_policy;
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
}
return 0;
}
@@ -894,7 +877,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
- .dynamic_switching = true,
+ .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
.init = sugov_init,
.exit = sugov_exit,
.start = sugov_start,
@@ -909,41 +892,4 @@ struct cpufreq_governor *cpufreq_default_governor(void)
}
#endif
-static int __init sugov_register(void)
-{
- return cpufreq_register_governor(&schedutil_gov);
-}
-core_initcall(sugov_register);
-
-#ifdef CONFIG_ENERGY_MODEL
-extern bool sched_energy_update;
-extern struct mutex sched_energy_mutex;
-
-static void rebuild_sd_workfn(struct work_struct *work)
-{
- mutex_lock(&sched_energy_mutex);
- sched_energy_update = true;
- rebuild_sched_domains();
- sched_energy_update = false;
- mutex_unlock(&sched_energy_mutex);
-}
-static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
-
-/*
- * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
- * on governor changes to make sure the scheduler knows about it.
- */
-void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
- struct cpufreq_governor *old_gov)
-{
- if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
- /*
- * When called from the cpufreq_register_driver() path, the
- * cpu_hotplug_lock is already held, so use a work item to
- * avoid nested locking in rebuild_sched_domains().
- */
- schedule_work(&rebuild_sd_work);
- }
-
-}
-#endif
+cpufreq_governor_init(schedutil_gov);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 0033731a0797..42c40cfdf836 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -11,7 +11,7 @@
* This code tracks the priority of each CPU so that global migration
* decisions are easy to calculate. Each CPU can be in a state as follows:
*
- * (INVALID), IDLE, NORMAL, RT1, ... RT99
+ * (INVALID), NORMAL, RT1, ... RT99, HIGHER
*
* going from the lowest priority to the highest. CPUs in the INVALID state
* are not eligible for routing. The system maintains this state with
@@ -19,24 +19,47 @@
* in that class). Therefore a typical application without affinity
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
* searches). For tasks with affinity restrictions, the algorithm has a
- * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*/
-#include "sched.h"
-/* Convert between a 140 based task->prio, and our 102 based cpupri */
+/*
+ * p->rt_priority p->prio newpri cpupri
+ *
+ * -1 -1 (CPUPRI_INVALID)
+ *
+ * 99 0 (CPUPRI_NORMAL)
+ *
+ * 1 98 98 1
+ * ...
+ * 49 50 50 49
+ * 50 49 49 50
+ * ...
+ * 99 0 0 99
+ *
+ * 100 100 (CPUPRI_HIGHER)
+ */
static int convert_prio(int prio)
{
int cpupri;
- if (prio == CPUPRI_INVALID)
- cpupri = CPUPRI_INVALID;
- else if (prio == MAX_PRIO)
- cpupri = CPUPRI_IDLE;
- else if (prio >= MAX_RT_PRIO)
- cpupri = CPUPRI_NORMAL;
- else
- cpupri = MAX_RT_PRIO - prio + 1;
+ switch (prio) {
+ case CPUPRI_INVALID:
+ cpupri = CPUPRI_INVALID; /* -1 */
+ break;
+
+ case 0 ... 98:
+ cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 99 */
+ break;
+
+ case MAX_RT_PRIO-1:
+ cpupri = CPUPRI_NORMAL; /* 0 */
+ break;
+
+ case MAX_RT_PRIO:
+ cpupri = CPUPRI_HIGHER; /* 100 */
+ break;
+ }
return cpupri;
}
@@ -53,7 +76,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
* When looking at the vector, we need to read the counter,
* do a memory barrier, then read the mask.
*
- * Note: This is still all racey, but we can deal with it.
+ * Note: This is still all racy, but we can deal with it.
* Ideally, we only want to look at masks that are set.
*
* If a mask is not set, then the only thing wrong is that we
@@ -73,11 +96,12 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
return 0;
- if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
return 0;
if (lowest_mask) {
- cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
+ cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
+ cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
/*
* We have to ensure that we have at least one bit
@@ -124,7 +148,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
int task_pri = convert_prio(p->prio);
int idx, cpu;
- BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+ WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
@@ -162,7 +186,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
* The cost of this trade-off is not entirely clear and will probably
* be good for some workloads and bad for others.
*
- * The main idea here is that if some CPUs were overcommitted, we try
+ * The main idea here is that if some CPUs were over-committed, we try
* to spread which is what the scheduler traditionally did. Sys admins
* must do proper RT planning to avoid overloading the system if they
* really care.
@@ -177,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
* cpupri_set - update the CPU priority setting
* @cp: The cpupri context
* @cpu: The target CPU
- * @newpri: The priority (INVALID-RT99) to assign to this CPU
+ * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
*
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index efbb492bb94c..d6cba0020064 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,11 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
+#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1)
#define CPUPRI_INVALID -1
-#define CPUPRI_IDLE 0
-#define CPUPRI_NORMAL 1
-/* values 2-101 are RT priorities 0-99 */
+#define CPUPRI_NORMAL 0
+/* values 1-99 are for RT1-RT99 priorities */
+#define CPUPRI_HIGHER 100
struct cpupri_vec {
atomic_t count;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ff9435dee1df..6dab4854c6c0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,7 +2,10 @@
/*
* Simple CPU accounting cgroup controller
*/
-#include "sched.h"
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ #include <asm/cputime.h>
+#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -11,15 +14,15 @@
* They are only modified in vtime_account, on corresponding CPU
* with interrupts disabled. So, writes are safe.
* They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
+ * This may result in other CPU reading this CPU's IRQ time and can
* race with irq/vtime_account on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of IRQ time to wrong
+ * task when IRQ is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each IRQ in account_system_time.
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-static int sched_clock_irqtime;
+int sched_clock_irqtime;
void enable_sched_clock_irqtime(void)
{
@@ -44,21 +47,23 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
}
/*
- * Called before incrementing preempt_count on {soft,}irq_enter
+ * Called after incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
*/
-void irqtime_account_irq(struct task_struct *curr)
+void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
+ unsigned int pc;
s64 delta;
int cpu;
- if (!sched_clock_irqtime)
+ if (!irqtime_enabled())
return;
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
+ pc = irq_count() - offset;
/*
* We do not account for softirq time from ksoftirqd here.
@@ -66,12 +71,11 @@ void irqtime_account_irq(struct task_struct *curr)
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
- if (hardirq_count())
+ if (pc & HARDIRQ_MASK)
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd())
irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
-EXPORT_SYMBOL_GPL(irqtime_account_irq);
static u64 irqtime_tick_accounted(u64 maxtime)
{
@@ -86,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime)
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-#define sched_clock_irqtime (0)
-
static u64 irqtime_tick_accounted(u64 dummy)
{
return 0;
@@ -147,10 +149,10 @@ void account_guest_time(struct task_struct *p, u64 cputime)
/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += cputime;
+ task_group_account_field(p, CPUTIME_NICE, cputime);
cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
- cpustat[CPUTIME_USER] += cputime;
+ task_group_account_field(p, CPUTIME_USER, cputime);
cpustat[CPUTIME_GUEST] += cputime;
}
}
@@ -226,6 +228,21 @@ void account_idle_time(u64 cputime)
cpustat[CPUTIME_IDLE] += cputime;
}
+
+#ifdef CONFIG_SCHED_CORE
+/*
+ * Account for forceidle time due to core scheduling.
+ *
+ * REQUIRES: schedstat is enabled.
+ */
+void __account_forceidle_time(struct task_struct *p, u64 delta)
+{
+ __schedstat_add(p->stats.core_forceidle_sum, delta);
+
+ task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
+}
+#endif
+
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
@@ -250,7 +267,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
}
/*
- * Account how much elapsed time was spent in steal, irq, or softirq time.
+ * Account how much elapsed time was spent in steal, IRQ, or softirq time.
*/
static inline u64 account_other_time(u64 max)
{
@@ -351,7 +368,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
* Check for hardirq is done both for system and user time as there is
* no timer going off while we are on hardirq and hence we may never get an
* opportunity to update it solely in system time.
- * p->stime and friends are only updated on system time and not on irq
+ * p->stime and friends are only updated on system time and not on IRQ
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
@@ -361,7 +378,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
/*
* When returning from idle, many ticks can get accounted at
- * once, including some ticks of steal, irq, and softirq time.
+ * once, including some ticks of steal, IRQ, and softirq time.
* Subtract those ticks from the amount of time accounted to
* idle, or potentially user or system time. Due to rounding,
* other time can exceed ticks occasionally.
@@ -405,37 +422,21 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
+void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
- if (is_idle_task(prev))
- vtime_account_idle(prev);
- else
- vtime_account_kernel(prev);
-
- vtime_flush(prev);
- arch_vtime_task_switch(prev);
-}
-# endif
-
-/*
- * Archs that account the whole time spent in the idle task
- * (outside irq) as idle time can rely on this and just implement
- * vtime_account_kernel() and vtime_account_idle(). Archs that
- * have other meaning of the idle time (s390 only includes the
- * time spent by the CPU when it's in low power mode) must override
- * vtime_account().
- */
-#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
-{
- if (!in_interrupt() && is_idle_task(tsk))
+ unsigned int pc = irq_count() - offset;
+
+ if (pc & HARDIRQ_OFFSET) {
+ vtime_account_hardirq(tsk);
+ } else if (pc & SOFTIRQ_OFFSET) {
+ vtime_account_softirq(tsk);
+ } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
+ is_idle_task(tsk)) {
vtime_account_idle(tsk);
- else
+ } else {
vtime_account_kernel(tsk);
+ }
}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
u64 *ut, u64 *st)
@@ -475,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (vtime_accounting_enabled_this_cpu())
return;
- if (sched_clock_irqtime) {
+ if (irqtime_enabled()) {
irqtime_account_process_tick(p, user_tick, 1);
return;
}
@@ -504,7 +505,7 @@ void account_idle_ticks(unsigned long ticks)
{
u64 cputime, steal;
- if (sched_clock_irqtime) {
+ if (irqtime_enabled()) {
irqtime_account_idle_ticks(ticks);
return;
}
@@ -520,50 +521,6 @@ void account_idle_ticks(unsigned long ticks)
}
/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
- u64 scaled;
-
- for (;;) {
- /* Make sure "rtime" is the bigger of stime/rtime */
- if (stime > rtime)
- swap(rtime, stime);
-
- /* Make sure 'total' fits in 32 bits */
- if (total >> 32)
- goto drop_precision;
-
- /* Does rtime (and thus stime) fit in 32 bits? */
- if (!(rtime >> 32))
- break;
-
- /* Can we just balance rtime/stime rather than dropping bits? */
- if (stime >> 31)
- goto drop_precision;
-
- /* We can grow stime and shrink rtime and try to make them both fit */
- stime <<= 1;
- rtime >>= 1;
- continue;
-
-drop_precision:
- /* We drop from rtime, it has more bits than stime */
- rtime >>= 1;
- total >>= 1;
- }
-
- /*
- * Make sure gcc understands that this is a 32x32->64 multiply,
- * followed by a 64/32->64 divide.
- */
- scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return scaled;
-}
-
-/*
* Adjust tick based cputime random precision against scheduler runtime
* accounting.
*
@@ -609,7 +566,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
/*
* If either stime or utime are 0, assume all runtime is userspace.
- * Once a task gets some ticks, the monotonicy code at 'update:'
+ * Once a task gets some ticks, the monotonicity code at 'update:'
* will ensure things converge to the observed ratio.
*/
if (stime == 0) {
@@ -622,7 +579,13 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
goto update;
}
- stime = scale_stime(stime, rtime, stime + utime);
+ stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
+ /*
+ * Because mul_u64_u64_div_u64() can approximate on some
+ * architectures; enforce the constraint that: a*b/(b+c) <= a.
+ */
+ if (unlikely(stime > rtime))
+ stime = rtime;
update:
/*
@@ -661,7 +624,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
.sum_exec_runtime = p->se.sum_exec_runtime,
};
- task_cputime(p, &cputime.utime, &cputime.stime);
+ if (task_cputime(p, &cputime.utime, &cputime.stime))
+ cputime.sum_exec_runtime = task_sched_runtime(p);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);
@@ -874,19 +838,21 @@ u64 task_gtime(struct task_struct *t)
* add up the pending nohz execution time since the last
* cputime snapshot.
*/
-void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
+bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
struct vtime *vtime = &t->vtime;
unsigned int seq;
u64 delta;
+ int ret;
if (!vtime_accounting_enabled()) {
*utime = t->utime;
*stime = t->stime;
- return;
+ return false;
}
do {
+ ret = false;
seq = read_seqcount_begin(&vtime->seqcount);
*utime = t->utime;
@@ -896,6 +862,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
if (vtime->state < VTIME_SYS)
continue;
+ ret = true;
delta = vtime_delta(vtime);
/*
@@ -907,6 +874,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
else
*utime += vtime->utime + delta;
} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return ret;
}
static int vtime_state_fetch(struct vtime *vtime, int cpu)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f63f337c7147..ff4df16b5186 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -15,13 +15,52 @@
* Michael Trimarchi <michael@amarulasolutions.com>,
* Fabio Checconi <fchecconi@gmail.com>
*/
-#include "sched.h"
-#include "pelt.h"
-struct dl_bandwidth def_dl_bandwidth;
+#include <linux/cpuset.h>
+
+/*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting ridiculously long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table sched_dl_sysctls[] = {
+ {
+ .procname = "sched_deadline_period_max_us",
+ .data = &sysctl_sched_dl_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)&sysctl_sched_dl_period_min,
+ },
+ {
+ .procname = "sched_deadline_period_min_us",
+ .data = &sysctl_sched_dl_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = (void *)&sysctl_sched_dl_period_max,
+ },
+};
+
+static int __init sched_dl_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_dl_sysctls);
+ return 0;
+}
+late_initcall(sched_dl_sysctl_init);
+#endif
+
+static bool dl_server(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_server;
+}
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
{
+ BUG_ON(dl_server(dl_se));
return container_of(dl_se, struct task_struct, dl);
}
@@ -30,12 +69,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
return container_of(dl_rq, struct rq, dl);
}
-static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se)
{
- struct task_struct *p = dl_task_of(dl_se);
- struct rq *rq = task_rq(p);
+ struct rq *rq = dl_se->rq;
+
+ if (!dl_server(dl_se))
+ rq = task_rq(dl_task_of(dl_se));
- return &rq->dl;
+ return rq;
+}
+
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+ return &rq_of_dl_se(dl_se)->dl;
}
static inline int on_dl_rq(struct sched_dl_entity *dl_se)
@@ -43,6 +89,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
+#ifdef CONFIG_RT_MUTEXES
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+ return dl_se->pi_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+ return pi_of(dl_se) != dl_se;
+}
+#else
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+ return dl_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+ return false;
+}
+#endif
+
#ifdef CONFIG_SMP
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -54,15 +122,75 @@ static inline struct dl_bw *dl_bw_of(int i)
static inline int dl_bw_cpus(int i)
{
struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
+ int cpus;
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
"sched RCU must be held");
+
+ if (cpumask_subset(rd->span, cpu_active_mask))
+ return cpumask_weight(rd->span);
+
+ cpus = 0;
+
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
return cpus;
}
+
+static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
+{
+ unsigned long cap = 0;
+ int i;
+
+ for_each_cpu_and(i, mask, cpu_active_mask)
+ cap += arch_scale_cpu_capacity(i);
+
+ return cap;
+}
+
+/*
+ * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
+ * of the CPU the task is running on rather than rd's \Sum CPU capacity.
+ */
+static inline unsigned long dl_bw_capacity(int i)
+{
+ if (!sched_asym_cpucap_active() &&
+ arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) {
+ return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
+ } else {
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+
+ return __dl_bw_capacity(cpu_rq(i)->rd->span);
+ }
+}
+
+static inline bool dl_bw_visited(int cpu, u64 gen)
+{
+ struct root_domain *rd = cpu_rq(cpu)->rd;
+
+ if (rd->visit_gen == gen)
+ return true;
+
+ rd->visit_gen = gen;
+ return false;
+}
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+ int i;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->dl.extra_bw += bw;
+ }
+}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -73,14 +201,53 @@ static inline int dl_bw_cpus(int i)
{
return 1;
}
+
+static inline unsigned long dl_bw_capacity(int i)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline bool dl_bw_visited(int cpu, u64 gen)
+{
+ return false;
+}
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+ dl->extra_bw += bw;
+}
#endif
static inline
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw -= tsk_bw;
+ __dl_update(dl_b, (s32)tsk_bw / cpus);
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw += tsk_bw;
+ __dl_update(dl_b, -((s32)tsk_bw / cpus));
+}
+
+static inline bool
+__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
+}
+
+static inline
void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
@@ -93,7 +260,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
@@ -107,7 +274,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw += dl_bw;
SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
}
@@ -117,7 +284,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
if (dl_rq->this_bw > old)
@@ -153,33 +320,63 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
__sub_running_bw(dl_se->dl_bw, dl_rq);
}
-static void dl_change_utilization(struct task_struct *p, u64 new_bw)
+static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw)
{
- struct rq *rq;
-
- BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
-
- if (task_on_rq_queued(p))
- return;
+ if (dl_se->dl_non_contending) {
+ sub_running_bw(dl_se, &rq->dl);
+ dl_se->dl_non_contending = 0;
- rq = task_rq(p);
- if (p->dl.dl_non_contending) {
- sub_running_bw(&p->dl, &rq->dl);
- p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
* will see that dl_non_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
- put_task_struct(p);
+ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
+ if (!dl_server(dl_se))
+ put_task_struct(dl_task_of(dl_se));
+ }
}
- __sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ __sub_rq_bw(dl_se->dl_bw, &rq->dl);
__add_rq_bw(new_bw, &rq->dl);
}
+static __always_inline
+void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer)
+{
+ /*
+ * If the timer callback was running (hrtimer_try_to_cancel == -1),
+ * it will eventually call put_task_struct().
+ */
+ if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se))
+ put_task_struct(dl_task_of(dl_se));
+}
+
+static __always_inline
+void cancel_replenish_timer(struct sched_dl_entity *dl_se)
+{
+ cancel_dl_timer(dl_se, &dl_se->dl_timer);
+}
+
+static __always_inline
+void cancel_inactive_timer(struct sched_dl_entity *dl_se)
+{
+ cancel_dl_timer(dl_se, &dl_se->inactive_timer);
+}
+
+static void dl_change_utilization(struct task_struct *p, u64 new_bw)
+{
+ WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
+
+ if (task_on_rq_queued(p))
+ return;
+
+ dl_rq_change_utilization(task_rq(p), &p->dl, new_bw);
+}
+
+static void __dl_clear_params(struct sched_dl_entity *dl_se);
+
/*
* The utilization of a task cannot be immediately removed from
* the rq active utilization (running_bw) when the task blocks.
@@ -190,7 +387,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
* fires.
*
* If the task wakes up again before the inactive timer fires,
- * the timer is cancelled, whereas if the task wakes up after the
+ * the timer is canceled, whereas if the task wakes up after the
* inactive timer fired (and running_bw has been decreased) the
* task's utilization has to be added to running_bw again.
* A flag in the deadline scheduling entity (dl_non_contending)
@@ -234,12 +431,11 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
* up, and checks if the task is still in the "ACTIVE non contending"
* state or not (in the second case, it updates running_bw).
*/
-static void task_non_contending(struct task_struct *p)
+static void task_non_contending(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
struct hrtimer *timer = &dl_se->inactive_timer;
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct rq *rq = rq_of_dl_se(dl_se);
+ struct dl_rq *dl_rq = &rq->dl;
s64 zerolag_time;
/*
@@ -269,24 +465,33 @@ static void task_non_contending(struct task_struct *p)
* utilization now, instead of starting a timer
*/
if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
- if (dl_task(p))
+ if (dl_server(dl_se)) {
sub_running_bw(dl_se, dl_rq);
- if (!dl_task(p) || p->state == TASK_DEAD) {
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-
- if (p->state == TASK_DEAD)
- sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_lock(&dl_b->lock);
- __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
- __dl_clear_params(p);
- raw_spin_unlock(&dl_b->lock);
+ } else {
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (dl_task(p))
+ sub_running_bw(dl_se, dl_rq);
+
+ if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (READ_ONCE(p->__state) == TASK_DEAD)
+ sub_rq_bw(dl_se, &rq->dl);
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(dl_se);
+ }
}
return;
}
dl_se->dl_non_contending = 1;
- get_task_struct(p);
+ if (!dl_server(dl_se))
+ get_task_struct(dl_task_of(dl_se));
+
hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
}
@@ -308,13 +513,12 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
dl_se->dl_non_contending = 0;
/*
* If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
* will see that dl_non_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
- put_task_struct(dl_task_of(dl_se));
+ cancel_inactive_timer(dl_se);
} else {
/*
* Since "dl_non_contending" is not set, the
@@ -327,31 +531,20 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
}
}
-static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- struct sched_dl_entity *dl_se = &p->dl;
-
- return dl_rq->root.rb_leftmost == &dl_se->rb_node;
+ return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
}
static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
-void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
-{
- raw_spin_lock_init(&dl_b->dl_runtime_lock);
- dl_b->dl_period = period;
- dl_b->dl_runtime = runtime;
-}
-
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
- raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
if (global_rt_runtime() == RUNTIME_INF)
dl_b->bw = -1;
else
dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
- raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
dl_b->total_bw = 0;
}
@@ -363,7 +556,6 @@ void init_dl_rq(struct dl_rq *dl_rq)
/* zero means no -deadline tasks */
dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
- dl_rq->dl_nr_migratory = 0;
dl_rq->overloaded = 0;
dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
#else
@@ -407,37 +599,17 @@ static inline void dl_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
}
-static void update_dl_migration(struct dl_rq *dl_rq)
-{
- if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
- if (!dl_rq->overloaded) {
- dl_set_overload(rq_of_dl_rq(dl_rq));
- dl_rq->overloaded = 1;
- }
- } else if (dl_rq->overloaded) {
- dl_clear_overload(rq_of_dl_rq(dl_rq));
- dl_rq->overloaded = 0;
- }
-}
+#define __node_2_pdl(node) \
+ rb_entry((node), struct task_struct, pushable_dl_tasks)
-static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
{
- struct task_struct *p = dl_task_of(dl_se);
-
- if (p->nr_cpus_allowed > 1)
- dl_rq->dl_nr_migratory++;
-
- update_dl_migration(dl_rq);
+ return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
}
-static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+static inline int has_pushable_dl_tasks(struct rq *rq)
{
- struct task_struct *p = dl_task_of(dl_se);
-
- if (p->nr_cpus_allowed > 1)
- dl_rq->dl_nr_migratory--;
-
- update_dl_migration(dl_rq);
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
}
/*
@@ -446,69 +618,52 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
*/
static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
- struct dl_rq *dl_rq = &rq->dl;
- struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct task_struct *entry;
- bool leftmost = true;
-
- BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct task_struct,
- pushable_dl_tasks);
- if (dl_entity_preempt(&p->dl, &entry->dl))
- link = &parent->rb_left;
- else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
+ struct rb_node *leftmost;
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
+
+ leftmost = rb_add_cached(&p->pushable_dl_tasks,
+ &rq->dl.pushable_dl_tasks_root,
+ __pushable_less);
if (leftmost)
- dl_rq->earliest_dl.next = p->dl.deadline;
+ rq->dl.earliest_dl.next = p->dl.deadline;
- rb_link_node(&p->pushable_dl_tasks, parent, link);
- rb_insert_color_cached(&p->pushable_dl_tasks,
- &dl_rq->pushable_dl_tasks_root, leftmost);
+ if (!rq->dl.overloaded) {
+ dl_set_overload(rq);
+ rq->dl.overloaded = 1;
+ }
}
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
struct dl_rq *dl_rq = &rq->dl;
+ struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root;
+ struct rb_node *leftmost;
if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
return;
- if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) {
- struct rb_node *next_node;
-
- next_node = rb_next(&p->pushable_dl_tasks);
- if (next_node) {
- dl_rq->earliest_dl.next = rb_entry(next_node,
- struct task_struct, pushable_dl_tasks)->dl.deadline;
- }
- }
+ leftmost = rb_erase_cached(&p->pushable_dl_tasks, root);
+ if (leftmost)
+ dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
- rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
-}
-static inline int has_pushable_dl_tasks(struct rq *rq)
-{
- return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
+ if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) {
+ dl_clear_overload(rq);
+ rq->dl.overloaded = 0;
+ }
}
static int push_dl_task(struct rq *rq);
static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
{
- return dl_task(prev);
+ return rq->online && dl_task(prev);
}
-static DEFINE_PER_CPU(struct callback_head, dl_push_head);
-static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
+static DEFINE_PER_CPU(struct balance_callback, dl_push_head);
+static DEFINE_PER_CPU(struct balance_callback, dl_pull_head);
static void push_dl_tasks(struct rq *);
static void pull_dl_task(struct rq *);
@@ -547,7 +702,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* Failed to find any suitable CPU.
* The task will never come back!
*/
- BUG_ON(dl_bandwidth_enabled());
+ WARN_ON_ONCE(dl_bandwidth_enabled());
/*
* If admission control is disabled we
@@ -578,7 +733,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
}
/*
- * And we finally need to fixup root_domain(s) bandwidth accounting,
+ * And we finally need to fix up root_domain(s) bandwidth accounting,
* since p is still hanging out in the old (now moved to default) root
* domain.
*/
@@ -620,15 +775,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
}
-static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
-{
- return false;
-}
-
-static inline void pull_dl_task(struct rq *rq)
-{
-}
-
static inline void deadline_queue_push_tasks(struct rq *rq)
{
}
@@ -638,9 +784,28 @@ static inline void deadline_queue_pull_task(struct rq *rq)
}
#endif /* CONFIG_SMP */
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags);
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
+
+static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
+ struct rq *rq)
+{
+ /* for non-boosted task, pi_of(dl_se) == dl_se */
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
+
+ /*
+ * If it is a deferred reservation, and the server
+ * is not handling a starvation case, defer it.
+ */
+ if (dl_se->dl_defer && !dl_se->dl_defer_running) {
+ dl_se->dl_throttled = 1;
+ dl_se->dl_defer_armed = 1;
+ }
+}
/*
* We are being explicitly informed that a new instance is starting,
@@ -659,7 +824,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- WARN_ON(dl_se->dl_boosted);
+ WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/*
@@ -675,10 +840,12 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
* future; in fact, we must consider execution overheads (time
* spent on hardirq context, etc.).
*/
- dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
- dl_se->runtime = dl_se->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
}
+static int start_dl_timer(struct sched_dl_entity *dl_se);
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t);
+
/*
* Pure Earliest Deadline First (EDF) scheduling does not deal with the
* possibility of an entity lasting more than what it declared, and thus
@@ -697,21 +864,27 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
* could happen are, typically, an entity voluntarily trying to overcome its
* runtime, or it just underestimated it during sched_setattr().
*/
-static void replenish_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static void replenish_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- BUG_ON(pi_se->dl_runtime <= 0);
+ WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0);
/*
* This could be the case for a !-dl task that is boosted.
* Just go with full inherited parameters.
+ *
+ * Or, it could be the case of a deferred reservation that
+ * was not able to consume its runtime in background and
+ * reached this point with current u > U.
+ *
+ * In both cases, set a new period.
*/
- if (dl_se->dl_deadline == 0) {
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ if (dl_se->dl_deadline == 0 ||
+ (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) {
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
}
if (dl_se->dl_yielded && dl_se->runtime > 0)
@@ -724,8 +897,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* arbitrary large.
*/
while (dl_se->runtime <= 0) {
- dl_se->deadline += pi_se->dl_period;
- dl_se->runtime += pi_se->dl_runtime;
+ dl_se->deadline += pi_of(dl_se)->dl_period;
+ dl_se->runtime += pi_of(dl_se)->dl_runtime;
}
/*
@@ -739,14 +912,51 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
printk_deferred_once("sched: DL replenish lagged too much\n");
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
}
if (dl_se->dl_yielded)
dl_se->dl_yielded = 0;
if (dl_se->dl_throttled)
dl_se->dl_throttled = 0;
+
+ /*
+ * If this is the replenishment of a deferred reservation,
+ * clear the flag and return.
+ */
+ if (dl_se->dl_defer_armed) {
+ dl_se->dl_defer_armed = 0;
+ return;
+ }
+
+ /*
+ * At this point, if the deferred server is not armed, and the deadline
+ * is in the future, if it is not running already, throttle the server
+ * and arm the defer timer.
+ */
+ if (dl_se->dl_defer && !dl_se->dl_defer_running &&
+ dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) {
+ if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) {
+
+ /*
+ * Set dl_se->dl_defer_armed and dl_throttled variables to
+ * inform the start_dl_timer() that this is a deferred
+ * activation.
+ */
+ dl_se->dl_defer_armed = 1;
+ dl_se->dl_throttled = 1;
+ if (!start_dl_timer(dl_se)) {
+ /*
+ * If for whatever reason (delays), a previous timer was
+ * queued but not serviced, cancel it and clean the
+ * deferrable server variables intended for start_dl_timer().
+ */
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
+ dl_se->dl_defer_armed = 0;
+ dl_se->dl_throttled = 0;
+ }
+ }
+ }
}
/*
@@ -773,8 +983,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* task with deadline equal to period this is the same of using
* dl_period instead of dl_deadline in the equation above.
*/
-static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, u64 t)
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
{
u64 left, right;
@@ -796,9 +1005,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
* of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;).
*/
- left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) *
- (pi_se->dl_runtime >> DL_SCALE);
+ (pi_of(dl_se)->dl_runtime >> DL_SCALE);
return dl_time_before(right, left);
}
@@ -867,7 +1076,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
* is detected, the runtime and deadline need to be updated.
*
* If the task has an implicit deadline, i.e., deadline == period, the Original
- * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * CBS is applied. The runtime is replenished and a new absolute deadline is
* set, as in the previous cases.
*
* However, the Original CBS does not work properly for tasks with
@@ -883,24 +1092,30 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
* Please refer to the comments update_dl_revised_wakeup() function to find
* more about the Revised CBS rule.
*/
-static void update_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static void update_dl_entity(struct sched_dl_entity *dl_se)
{
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct rq *rq = rq_of_dl_se(dl_se);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
- dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+ dl_entity_overflow(dl_se, rq_clock(rq))) {
if (unlikely(!dl_is_implicit(dl_se) &&
!dl_time_before(dl_se->deadline, rq_clock(rq)) &&
- !dl_se->dl_boosted)){
+ !is_dl_boosted(dl_se))) {
update_dl_revised_wakeup(dl_se, rq);
return;
}
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ replenish_dl_new_period(dl_se, rq);
+ } else if (dl_server(dl_se) && dl_se->dl_defer) {
+ /*
+ * The server can still use its previous deadline, so check if
+ * it left the dl_defer_running state.
+ */
+ if (!dl_se->dl_defer_running) {
+ dl_se->dl_defer_armed = 1;
+ dl_se->dl_throttled = 1;
+ }
}
}
@@ -919,22 +1134,35 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
* actually started or not (i.e., the replenishment instant is in
* the future or in the past).
*/
-static int start_dl_timer(struct task_struct *p)
+static int start_dl_timer(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
struct hrtimer *timer = &dl_se->dl_timer;
- struct rq *rq = task_rq(p);
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
ktime_t now, act;
s64 delta;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
/*
* We want the timer to fire at the deadline, but considering
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
+ *
+ * The deferred reservation will have its timer set to
+ * (deadline - runtime). At that point, the CBS rule will decide
+ * if the current deadline can be used, or if a replenishment is
+ * required to avoid adding too much pressure on the system
+ * (current u > U).
*/
- act = ns_to_ktime(dl_next_period(dl_se));
+ if (dl_se->dl_defer_armed) {
+ WARN_ON_ONCE(!dl_se->dl_throttled);
+ act = ns_to_ktime(dl_se->deadline - dl_se->runtime);
+ } else {
+ /* act = deadline - rel-deadline + period */
+ act = ns_to_ktime(dl_next_period(dl_se));
+ }
+
now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -957,13 +1185,89 @@ static int start_dl_timer(struct task_struct *p)
* and observe our state.
*/
if (!hrtimer_is_queued(timer)) {
- get_task_struct(p);
+ if (!dl_server(dl_se))
+ get_task_struct(dl_task_of(dl_se));
hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
}
return 1;
}
+static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Queueing this task back might have overloaded rq, check if we need
+ * to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq)) {
+ /*
+ * Nothing relies on rq->lock after this, so it's safe to drop
+ * rq->lock.
+ */
+ rq_unpin_lock(rq, rf);
+ push_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ }
+#endif
+}
+
+/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
+static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
+
+static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
+{
+ struct rq *rq = rq_of_dl_se(dl_se);
+ u64 fw;
+
+ scoped_guard (rq_lock, rq) {
+ struct rq_flags *rf = &scope.rf;
+
+ if (!dl_se->dl_throttled || !dl_se->dl_runtime)
+ return HRTIMER_NORESTART;
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ if (!dl_se->dl_runtime)
+ return HRTIMER_NORESTART;
+
+ if (!dl_se->server_has_tasks(dl_se)) {
+ replenish_dl_entity(dl_se);
+ return HRTIMER_NORESTART;
+ }
+
+ if (dl_se->dl_defer_armed) {
+ /*
+ * First check if the server could consume runtime in background.
+ * If so, it is possible to push the defer timer for this amount
+ * of time. The dl_server_min_res serves as a limit to avoid
+ * forwarding the timer for a too small amount of time.
+ */
+ if (dl_time_before(rq_clock(dl_se->rq),
+ (dl_se->deadline - dl_se->runtime - dl_server_min_res))) {
+
+ /* reset the defer timer */
+ fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime;
+
+ hrtimer_forward_now(timer, ns_to_ktime(fw));
+ return HRTIMER_RESTART;
+ }
+
+ dl_se->dl_defer_running = 1;
+ }
+
+ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+
+ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl))
+ resched_curr(rq);
+
+ __push_dl_task(rq, rf);
+ }
+
+ return HRTIMER_NORESTART;
+}
+
/*
* This is the bandwidth enforcement timer callback. If here, we know
* a task is not on its dl_rq, since the fact that the timer was running
@@ -982,10 +1286,14 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity *dl_se = container_of(timer,
struct sched_dl_entity,
dl_timer);
- struct task_struct *p = dl_task_of(dl_se);
+ struct task_struct *p;
struct rq_flags rf;
struct rq *rq;
+ if (dl_server(dl_se))
+ return dl_server_timer(timer, dl_se);
+
+ p = dl_task_of(dl_se);
rq = task_rq_lock(p, &rf);
/*
@@ -999,7 +1307,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* The task might have been boosted by someone else and might be in the
* boosting/deboosting path, it's not throttled.
*/
- if (dl_se->dl_boosted)
+ if (is_dl_boosted(dl_se))
goto unlock;
/*
@@ -1027,7 +1335,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* but do not enqueue -- wait for our wakeup to do that.
*/
if (!task_on_rq_queued(p)) {
- replenish_dl_entity(dl_se, dl_se);
+ replenish_dl_entity(dl_se);
goto unlock;
}
@@ -1037,9 +1345,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* If the runqueue is no longer available, migrate the
* task elsewhere. This necessarily changes rq.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ lockdep_unpin_lock(__rq_lockp(rq), rf.cookie);
rq = dl_task_offline_migration(rq, p);
- rf.cookie = lockdep_pin_lock(&rq->lock);
+ rf.cookie = lockdep_pin_lock(__rq_lockp(rq));
update_rq_clock(rq);
/*
@@ -1051,26 +1359,12 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
#endif
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ if (dl_task(rq->donor))
+ wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
-#ifdef CONFIG_SMP
- /*
- * Queueing this task back might have overloaded rq, check if we need
- * to kick someone away.
- */
- if (has_pushable_dl_tasks(rq)) {
- /*
- * Nothing relies on rq->lock after this, so its safe to drop
- * rq->lock.
- */
- rq_unpin_lock(rq, &rf);
- push_dl_task(rq);
- rq_repin_lock(rq, &rf);
- }
-#endif
+ __push_dl_task(rq, &rf);
unlock:
task_rq_unlock(rq, p, &rf);
@@ -1084,7 +1378,7 @@ unlock:
return HRTIMER_NORESTART;
}
-void init_dl_task_timer(struct sched_dl_entity *dl_se)
+static void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
@@ -1098,7 +1392,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
* cannot use the runtime, and so it replenishes the task. This rule
* works fine for implicit deadline tasks (deadline == period), and the
* CBS was designed for implicit deadline tasks. However, a task with
- * constrained deadline (deadine < period) might be awakened after the
+ * constrained deadline (deadline < period) might be awakened after the
* deadline, but before the next period. In this case, replenishing the
* task would allow it to run for runtime / deadline. As in this case
* deadline < period, CBS enables a task to run for more than the
@@ -1112,12 +1406,11 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
*/
static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
{
- struct task_struct *p = dl_task_of(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+ struct rq *rq = rq_of_dl_se(dl_se);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se)))
return;
dl_se->dl_throttled = 1;
if (dl_se->runtime > 0)
@@ -1131,91 +1424,46 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
return (dl_se->runtime <= 0);
}
-extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-
/*
- * This function implements the GRUB accounting rule:
- * according to the GRUB reclaiming algorithm, the runtime is
- * not decreased as "dq = -dt", but as
- * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * This function implements the GRUB accounting rule. According to the
+ * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt",
+ * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt",
* where u is the utilization of the task, Umax is the maximum reclaimable
* utilization, Uinact is the (per-runqueue) inactive utilization, computed
* as the difference between the "total runqueue utilization" and the
- * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * "runqueue active utilization", and Uextra is the (per runqueue) extra
* reclaimable utilization.
- * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
- * multiplied by 2^BW_SHIFT, the result has to be shifted right by
- * BW_SHIFT.
- * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT,
- * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
- * Since delta is a 64 bit variable, to have an overflow its value
- * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
- * So, overflow is not an issue here.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
+ * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
+ * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value should be
+ * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
+ * not an issue here.
*/
static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
- u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
u64 u_act;
- u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+ u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
/*
- * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
- * we compare u_inact + rq->dl.extra_bw with
- * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
- * u_inact + rq->dl.extra_bw can be larger than
- * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
- * leading to wrong results)
+ * Instead of computing max{u, (u_max - u_inact - u_extra)}, we
+ * compare u_inact + u_extra with u_max - u, because u_inact + u_extra
+ * can be larger than u_max. So, u_max - u_inact - u_extra would be
+ * negative leading to wrong results.
*/
- if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
- u_act = u_act_min;
+ if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw)
+ u_act = dl_se->dl_bw;
else
- u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+ u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw;
+ u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT;
return (delta * u_act) >> BW_SHIFT;
}
-/*
- * Update the current task's runtime statistics (provided it is still
- * a -deadline task and has not been removed from the dl_rq).
- */
-static void update_curr_dl(struct rq *rq)
+s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
{
- struct task_struct *curr = rq->curr;
- struct sched_dl_entity *dl_se = &curr->dl;
- u64 delta_exec, scaled_delta_exec;
- int cpu = cpu_of(rq);
- u64 now;
-
- if (!dl_task(curr) || !on_dl_rq(dl_se))
- return;
-
- /*
- * Consumed budget is computed considering the time as
- * observed by schedulable tasks (excluding time spent
- * in hardirq context, etc.). Deadlines are instead
- * computed using hard walltime. This seems to be the more
- * natural solution, but the full ramifications of this
- * approach need further study.
- */
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0)) {
- if (unlikely(dl_se->dl_yielded))
- goto throttle;
- return;
- }
-
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
-
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = now;
- cgroup_account_cputime(curr, delta_exec);
-
- if (dl_entity_is_special(dl_se))
- return;
+ s64 scaled_delta_exec;
/*
* For tasks that participate in GRUB, we implement GRUB-PA: the
@@ -1225,10 +1473,9 @@ static void update_curr_dl(struct rq *rq)
* according to current frequency and CPU maximum capacity.
*/
if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
- scaled_delta_exec = grub_reclaim(delta_exec,
- rq,
- &curr->dl);
+ scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se);
} else {
+ int cpu = cpu_of(rq);
unsigned long scale_freq = arch_scale_freq_capacity(cpu);
unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
@@ -1236,8 +1483,64 @@ static void update_curr_dl(struct rq *rq)
scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
}
+ return scaled_delta_exec;
+}
+
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags);
+static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+ s64 scaled_delta_exec;
+
+ if (unlikely(delta_exec <= 0)) {
+ if (unlikely(dl_se->dl_yielded))
+ goto throttle;
+ return;
+ }
+
+ if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer)
+ return;
+
+ if (dl_entity_is_special(dl_se))
+ return;
+
+ scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec);
+
dl_se->runtime -= scaled_delta_exec;
+ /*
+ * The fair server can consume its runtime while throttled (not queued/
+ * running as regular CFS).
+ *
+ * If the server consumes its entire runtime in this state, the server
+ * is not required for the current period. Thus, reset the server by
+ * starting a new period, pushing the activation.
+ */
+ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) {
+ /*
+ * If the server was previously activated - the starving condition
+ * took place - at this point it went away because the fair scheduler
+ * was able to get runtime in background. So return to the initial
+ * state.
+ */
+ dl_se->dl_defer_running = 0;
+
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
+
+ replenish_dl_new_period(dl_se, dl_se->rq);
+
+ /*
+ * Not being able to start the timer seems problematic. If it could not
+ * be started for whatever reason, we need to "unthrottle" the DL server
+ * and queue right away. Otherwise nothing might queue it. That's similar
+ * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn.
+ */
+ WARN_ON_ONCE(!start_dl_timer(dl_se));
+
+ return;
+ }
+
throttle:
if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
dl_se->dl_throttled = 1;
@@ -1247,15 +1550,32 @@ throttle:
(dl_se->flags & SCHED_FLAG_DL_OVERRUN))
dl_se->dl_overrun = 1;
- __dequeue_task_dl(rq, curr, 0);
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
- enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+ dequeue_dl_entity(dl_se, 0);
+ if (!dl_server(dl_se)) {
+ update_stats_dequeue_dl(&rq->dl, dl_se, 0);
+ dequeue_pushable_dl_task(rq, dl_task_of(dl_se));
+ }
+
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
+ if (dl_server(dl_se))
+ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+ else
+ enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+ }
- if (!is_leftmost(curr, &rq->dl))
+ if (!is_leftmost(dl_se, &rq->dl))
resched_curr(rq);
}
/*
+ * The fair server (sole dl_server) does not account for real-time
+ * workload because it is running fair work.
+ */
+ if (dl_se == &rq->fair_server)
+ return;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
* Because -- for now -- we share the rt bandwidth, we need to
* account our runtime there too, otherwise actual rt tasks
* would be able to exceed the shared quota.
@@ -1279,6 +1599,182 @@ throttle:
rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
+#endif
+}
+
+/*
+ * In the non-defer mode, the idle time is not accounted, as the
+ * server provides a guarantee.
+ *
+ * If the dl_server is in defer mode, the idle time is also considered
+ * as time available for the fair server, avoiding a penalty for the
+ * rt scheduler that did not consume that time.
+ */
+void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
+{
+ s64 delta_exec, scaled_delta_exec;
+
+ if (!rq->fair_server.dl_defer)
+ return;
+
+ /* no need to discount more */
+ if (rq->fair_server.runtime < 0)
+ return;
+
+ delta_exec = rq_clock_task(rq) - p->se.exec_start;
+ if (delta_exec < 0)
+ return;
+
+ scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec);
+
+ rq->fair_server.runtime -= scaled_delta_exec;
+
+ if (rq->fair_server.runtime < 0) {
+ rq->fair_server.dl_defer_running = 0;
+ rq->fair_server.runtime = 0;
+ }
+
+ p->se.exec_start = rq_clock_task(rq);
+}
+
+void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+ /* 0 runtime = fair server disabled */
+ if (dl_se->dl_runtime)
+ update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+}
+
+void dl_server_start(struct sched_dl_entity *dl_se)
+{
+ struct rq *rq = dl_se->rq;
+
+ /*
+ * XXX: the apply does not work fine at the init phase for the
+ * fair server because things are not yet set. We need to improve
+ * this before getting generic.
+ */
+ if (!dl_server(dl_se)) {
+ u64 runtime = 50 * NSEC_PER_MSEC;
+ u64 period = 1000 * NSEC_PER_MSEC;
+
+ dl_server_apply_params(dl_se, runtime, period, 1);
+
+ dl_se->dl_server = 1;
+ dl_se->dl_defer = 1;
+ setup_new_dl_entity(dl_se);
+ }
+
+ if (!dl_se->dl_runtime)
+ return;
+
+ dl_se->dl_server_active = 1;
+ enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
+ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
+ resched_curr(dl_se->rq);
+}
+
+void dl_server_stop(struct sched_dl_entity *dl_se)
+{
+ if (!dl_se->dl_runtime)
+ return;
+
+ dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
+ dl_se->dl_defer_armed = 0;
+ dl_se->dl_throttled = 0;
+ dl_se->dl_server_active = 0;
+}
+
+void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+ dl_server_has_tasks_f has_tasks,
+ dl_server_pick_f pick_task)
+{
+ dl_se->rq = rq;
+ dl_se->server_has_tasks = has_tasks;
+ dl_se->server_pick_task = pick_task;
+}
+
+void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+ u64 new_bw = dl_se->dl_bw;
+ int cpu = cpu_of(rq);
+ struct dl_bw *dl_b;
+
+ dl_b = dl_bw_of(cpu_of(rq));
+ guard(raw_spinlock)(&dl_b->lock);
+
+ if (!dl_bw_cpus(cpu))
+ return;
+
+ __dl_add(dl_b, new_bw, dl_bw_cpus(cpu));
+}
+
+int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init)
+{
+ u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ u64 new_bw = to_ratio(period, runtime);
+ struct rq *rq = dl_se->rq;
+ int cpu = cpu_of(rq);
+ struct dl_bw *dl_b;
+ unsigned long cap;
+ int retval = 0;
+ int cpus;
+
+ dl_b = dl_bw_of(cpu);
+ guard(raw_spinlock)(&dl_b->lock);
+
+ cpus = dl_bw_cpus(cpu);
+ cap = dl_bw_capacity(cpu);
+
+ if (__dl_overflow(dl_b, cap, old_bw, new_bw))
+ return -EBUSY;
+
+ if (init) {
+ __add_rq_bw(new_bw, &rq->dl);
+ __dl_add(dl_b, new_bw, cpus);
+ } else {
+ __dl_sub(dl_b, dl_se->dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+
+ dl_rq_change_utilization(rq, dl_se, new_bw);
+ }
+
+ dl_se->dl_runtime = runtime;
+ dl_se->dl_deadline = period;
+ dl_se->dl_period = period;
+
+ dl_se->runtime = 0;
+ dl_se->deadline = 0;
+
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
+
+ return retval;
+}
+
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+ struct task_struct *donor = rq->donor;
+ struct sched_dl_entity *dl_se = &donor->dl;
+ s64 delta_exec;
+
+ if (!dl_task(donor) || !on_dl_rq(dl_se))
+ return;
+
+ /*
+ * Consumed budget is computed considering the time as
+ * observed by schedulable tasks (excluding time spent
+ * in hardirq context, etc.). Deadlines are instead
+ * computed using hard walltime. This seems to be the more
+ * natural solution, but the full ramifications of this
+ * approach need further study.
+ */
+ delta_exec = update_curr_common(rq);
+ update_curr_dl_se(rq, dl_se, delta_exec);
}
static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
@@ -1286,19 +1782,28 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
struct sched_dl_entity *dl_se = container_of(timer,
struct sched_dl_entity,
inactive_timer);
- struct task_struct *p = dl_task_of(dl_se);
+ struct task_struct *p = NULL;
struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(p, &rf);
+ if (!dl_server(dl_se)) {
+ p = dl_task_of(dl_se);
+ rq = task_rq_lock(p, &rf);
+ } else {
+ rq = dl_se->rq;
+ rq_lock(rq, &rf);
+ }
sched_clock_tick();
update_rq_clock(rq);
- if (!dl_task(p) || p->state == TASK_DEAD) {
+ if (dl_server(dl_se))
+ goto no_task;
+
+ if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
- if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
+ if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) {
sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
dl_se->dl_non_contending = 0;
@@ -1307,23 +1812,30 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
raw_spin_lock(&dl_b->lock);
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&dl_b->lock);
- __dl_clear_params(p);
+ __dl_clear_params(dl_se);
goto unlock;
}
+
+no_task:
if (dl_se->dl_non_contending == 0)
goto unlock;
sub_running_bw(dl_se, &rq->dl);
dl_se->dl_non_contending = 0;
unlock:
- task_rq_unlock(rq, p, &rf);
- put_task_struct(p);
+
+ if (!dl_server(dl_se)) {
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ } else {
+ rq_unlock(rq, &rf);
+ }
return HRTIMER_NORESTART;
}
-void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
+static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->inactive_timer;
@@ -1331,6 +1843,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
timer->function = inactive_task_timer;
}
+#define __node_2_dle(node) \
+ rb_entry((node), struct sched_dl_entity, rb_node)
+
#ifdef CONFIG_SMP
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
@@ -1339,6 +1854,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
+ if (dl_rq->earliest_dl.curr == 0)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER);
dl_rq->earliest_dl.curr = deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
}
@@ -1356,11 +1873,11 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
cpudl_clear(&rq->rd->cpudl, rq->cpu);
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
} else {
- struct rb_node *leftmost = dl_rq->root.rb_leftmost;
- struct sched_dl_entity *entry;
+ struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
+ struct sched_dl_entity *entry = __node_2_dle(leftmost);
- entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
}
@@ -1376,54 +1893,106 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
u64 deadline = dl_se->deadline;
- WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
- inc_dl_migration(dl_se, dl_rq);
}
static inline
void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
-
- WARN_ON(!dl_prio(prio));
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
- dec_dl_migration(dl_se, dl_rq);
+}
+
+static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
+{
+ return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
+}
+
+static __always_inline struct sched_statistics *
+__schedstats_from_dl_se(struct sched_dl_entity *dl_se)
+{
+ if (!schedstat_enabled())
+ return NULL;
+
+ if (dl_server(dl_se))
+ return NULL;
+
+ return &dl_task_of(dl_se)->stats;
+}
+
+static inline void
+update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats = __schedstats_from_dl_se(dl_se);
+ if (stats)
+ __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats = __schedstats_from_dl_se(dl_se);
+ if (stats)
+ __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se)
+{
+ struct sched_statistics *stats = __schedstats_from_dl_se(dl_se);
+ if (stats)
+ __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats);
+}
+
+static inline void
+update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags)
+{
+ if (!schedstat_enabled())
+ return;
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper_dl(dl_rq, dl_se);
+}
+
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (!schedstat_enabled())
+ return;
+
+ if ((flags & DEQUEUE_SLEEP)) {
+ unsigned int state;
+
+ state = READ_ONCE(p->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(p->stats.sleep_start,
+ rq_clock(rq_of_dl_rq(dl_rq)));
+
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(p->stats.block_start,
+ rq_clock(rq_of_dl_rq(dl_rq)));
+ }
}
static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rb_node **link = &dl_rq->root.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct sched_dl_entity *entry;
- int leftmost = 1;
-
- BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_dl_entity, rb_node);
- if (dl_time_before(dl_se->deadline, entry->deadline))
- link = &parent->rb_left;
- else {
- link = &parent->rb_right;
- leftmost = 0;
- }
- }
- rb_link_node(&dl_se->rb_node, parent, link);
- rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost);
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node));
+
+ rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less);
inc_dl_tasks(dl_se, dl_rq);
}
@@ -1436,67 +2005,18 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
return;
rb_erase_cached(&dl_se->rb_node, &dl_rq->root);
+
RB_CLEAR_NODE(&dl_se->rb_node);
dec_dl_tasks(dl_se, dl_rq);
}
static void
-enqueue_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, int flags)
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
- BUG_ON(on_dl_rq(dl_se));
+ WARN_ON_ONCE(on_dl_rq(dl_se));
- /*
- * If this is a wakeup or a new instance, the scheduling
- * parameters of the task might need updating. Otherwise,
- * we want a replenishment of its runtime.
- */
- if (flags & ENQUEUE_WAKEUP) {
- task_contending(dl_se, flags);
- update_dl_entity(dl_se, pi_se);
- } else if (flags & ENQUEUE_REPLENISH) {
- replenish_dl_entity(dl_se, pi_se);
- } else if ((flags & ENQUEUE_RESTORE) &&
- dl_time_before(dl_se->deadline,
- rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
- setup_new_dl_entity(dl_se);
- }
-
- __enqueue_dl_entity(dl_se);
-}
-
-static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
-{
- __dequeue_dl_entity(dl_se);
-}
-
-static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
-{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
- struct sched_dl_entity *pi_se = &p->dl;
-
- /*
- * Use the scheduling parameters of the top pi-waiter task if:
- * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
- * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
- * smaller than our deadline OR we are a !SCHED_DEADLINE task getting
- * boosted due to a SCHED_DEADLINE pi-waiter).
- * Otherwise we keep our runtime and deadline.
- */
- if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
- pi_se = &pi_task->dl;
- } else if (!dl_prio(p->normal_prio)) {
- /*
- * Special case in which we have a !SCHED_DEADLINE task
- * that is going to be deboosted, but exceeds its
- * runtime while doing so. No point in replenishing
- * it, as it's going to return back to its original
- * scheduling class after this.
- */
- BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
- return;
- }
+ update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
/*
* Check if a constrained deadline task was activated
@@ -1504,12 +2024,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* If that is the case, the task will be throttled and
* the replenishment timer will be set to the next period.
*/
- if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
- dl_check_constrained_dl(&p->dl);
+ if (!dl_se->dl_throttled && !dl_is_implicit(dl_se))
+ dl_check_constrained_dl(dl_se);
- if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
- add_rq_bw(&p->dl, &rq->dl);
- add_running_bw(&p->dl, &rq->dl);
+ if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ add_rq_bw(dl_se, dl_rq);
+ add_running_bw(dl_se, dl_rq);
}
/*
@@ -1524,33 +2046,60 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* be counted in the active utilization; hence, we need to call
* add_running_bw().
*/
- if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+ if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
if (flags & ENQUEUE_WAKEUP)
- task_contending(&p->dl, flags);
+ task_contending(dl_se, flags);
return;
}
- enqueue_dl_entity(&p->dl, pi_se, flags);
+ /*
+ * If this is a wakeup or a new instance, the scheduling
+ * parameters of the task might need updating. Otherwise,
+ * we want a replenishment of its runtime.
+ */
+ if (flags & ENQUEUE_WAKEUP) {
+ task_contending(dl_se, flags);
+ update_dl_entity(dl_se);
+ } else if (flags & ENQUEUE_REPLENISH) {
+ replenish_dl_entity(dl_se);
+ } else if ((flags & ENQUEUE_RESTORE) &&
+ !is_dl_boosted(dl_se) &&
+ dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
+ setup_new_dl_entity(dl_se);
+ }
+
+ /*
+ * If the reservation is still throttled, e.g., it got replenished but is a
+ * deferred task and still got to wait, don't enqueue.
+ */
+ if (dl_se->dl_throttled && start_dl_timer(dl_se))
+ return;
- if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
- enqueue_pushable_dl_task(rq, p);
-}
+ /*
+ * We're about to enqueue, make sure we're not ->dl_throttled!
+ * In case the timer was not started, say because the defer time
+ * has passed, mark as not throttled and mark unarmed.
+ * Also cancel earlier timers, since letting those run is pointless.
+ */
+ if (dl_se->dl_throttled) {
+ hrtimer_try_to_cancel(&dl_se->dl_timer);
+ dl_se->dl_defer_armed = 0;
+ dl_se->dl_throttled = 0;
+ }
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
-{
- dequeue_dl_entity(&p->dl);
- dequeue_pushable_dl_task(rq, p);
+ __enqueue_dl_entity(dl_se);
}
-static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
- update_curr_dl(rq);
- __dequeue_task_dl(rq, p, flags);
+ __dequeue_dl_entity(dl_se);
- if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
- sub_running_bw(&p->dl, &rq->dl);
- sub_rq_bw(&p->dl, &rq->dl);
+ if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ sub_running_bw(dl_se, dl_rq);
+ sub_rq_bw(dl_se, dl_rq);
}
/*
@@ -1563,7 +2112,78 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* or "inactive")
*/
if (flags & DEQUEUE_SLEEP)
- task_non_contending(p);
+ task_non_contending(dl_se);
+}
+
+static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (is_dl_boosted(&p->dl)) {
+ /*
+ * Because of delays in the detection of the overrun of a
+ * thread's runtime, it might be the case that a thread
+ * goes to sleep in a rt mutex with negative runtime. As
+ * a consequence, the thread will be throttled.
+ *
+ * While waiting for the mutex, this thread can also be
+ * boosted via PI, resulting in a thread that is throttled
+ * and boosted at the same time.
+ *
+ * In this case, the boost overrides the throttle.
+ */
+ if (p->dl.dl_throttled) {
+ /*
+ * The replenish timer needs to be canceled. No
+ * problem if it fires concurrently: boosted threads
+ * are ignored in dl_task_timer().
+ */
+ cancel_replenish_timer(&p->dl);
+ p->dl.dl_throttled = 0;
+ }
+ } else if (!dl_prio(p->normal_prio)) {
+ /*
+ * Special case in which we have a !SCHED_DEADLINE task that is going
+ * to be deboosted, but exceeds its runtime while doing so. No point in
+ * replenishing it, as it's going to return back to its original
+ * scheduling class after this. If it has been throttled, we need to
+ * clear the flag, otherwise the task may wake up as throttled after
+ * being boosted again with no means to replenish the runtime and clear
+ * the throttle.
+ */
+ p->dl.dl_throttled = 0;
+ if (!(flags & ENQUEUE_REPLENISH))
+ printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n",
+ task_pid_nr(p));
+
+ return;
+ }
+
+ check_schedstat_required();
+ update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl);
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ flags |= ENQUEUE_MIGRATING;
+
+ enqueue_dl_entity(&p->dl, flags);
+
+ if (dl_server(&p->dl))
+ return;
+
+ if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+}
+
+static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+{
+ update_curr_dl(rq);
+
+ if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ flags |= DEQUEUE_MIGRATING;
+
+ dequeue_dl_entity(&p->dl, flags);
+ if (!p->dl.dl_throttled && !dl_server(&p->dl))
+ dequeue_pushable_dl_task(rq, p);
+
+ return true;
}
/*
@@ -1598,21 +2218,31 @@ static void yield_task_dl(struct rq *rq)
#ifdef CONFIG_SMP
+static inline bool dl_task_is_earliest_deadline(struct task_struct *p,
+ struct rq *rq)
+{
+ return (!rq->dl.dl_nr_running ||
+ dl_time_before(p->dl.deadline,
+ rq->dl.earliest_dl.curr));
+}
+
static int find_later_rq(struct task_struct *task);
static int
-select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_dl(struct task_struct *p, int cpu, int flags)
{
- struct task_struct *curr;
+ struct task_struct *curr, *donor;
+ bool select_rq;
struct rq *rq;
- if (sd_flag != SD_BALANCE_WAKE)
+ if (!(flags & WF_TTWU))
goto out;
rq = cpu_rq(cpu);
rcu_read_lock();
curr = READ_ONCE(rq->curr); /* unlocked access */
+ donor = READ_ONCE(rq->donor);
/*
* If we are dealing with a -deadline task, we must
@@ -1623,16 +2253,23 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
* other hand, if it has a shorter deadline, we
* try to make it stay here, it might be important.
*/
- if (unlikely(dl_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &curr->dl)) &&
- (p->nr_cpus_allowed > 1)) {
+ select_rq = unlikely(dl_task(donor)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &donor->dl)) &&
+ p->nr_cpus_allowed > 1;
+
+ /*
+ * Take the capacity of the CPU into account to
+ * ensure it fits the requirement of the task.
+ */
+ if (sched_asym_cpucap_active())
+ select_rq |= !dl_task_fits_capacity(p, cpu);
+
+ if (select_rq) {
int target = find_later_rq(p);
if (target != -1 &&
- (dl_time_before(p->dl.deadline,
- cpu_rq(target)->dl.earliest_dl.curr) ||
- (cpu_rq(target)->dl.dl_nr_running == 0)))
+ dl_task_is_earliest_deadline(p, cpu_rq(target)))
cpu = target;
}
rcu_read_unlock();
@@ -1643,9 +2280,10 @@ out:
static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
+ struct rq_flags rf;
struct rq *rq;
- if (p->state != TASK_WAKING)
+ if (READ_ONCE(p->__state) != TASK_WAKING)
return;
rq = task_rq(p);
@@ -1654,22 +2292,22 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it
*/
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
if (p->dl.dl_non_contending) {
+ update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
* will see that dl_not_contending is not set, and
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
- put_task_struct(p);
+ cancel_inactive_timer(&p->dl);
}
sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
@@ -1679,7 +2317,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
+ !cpudl_find(&rq->rd->cpudl, rq->donor, NULL))
return;
/*
@@ -1715,10 +2353,10 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
* Only called when both the current and waking task are -deadline
* tasks.
*/
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
int flags)
{
- if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
+ if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
resched_curr(rq);
return;
}
@@ -1728,26 +2366,31 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
* In the unlikely case current and p have the same deadline
* let us try to decide what's the best thing to do...
*/
- if ((p->dl.deadline == rq->curr->dl.deadline) &&
+ if ((p->dl.deadline == rq->donor->dl.deadline) &&
!test_tsk_need_resched(rq->curr))
check_preempt_equal_dl(rq, p);
#endif /* CONFIG_SMP */
}
#ifdef CONFIG_SCHED_HRTICK
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
- hrtick_start(rq, p->dl.runtime);
+ hrtick_start(rq, dl_se->runtime);
}
#else /* !CONFIG_SCHED_HRTICK */
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
}
#endif
static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct dl_rq *dl_rq = &rq->dl;
+
p->se.exec_start = rq_clock_task(rq);
+ if (on_dl_rq(&p->dl))
+ update_stats_wait_end_dl(dl_rq, dl_se);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
@@ -1755,44 +2398,72 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
if (!first)
return;
- if (hrtick_enabled(rq))
- start_hrtick_dl(rq, p);
-
- if (rq->curr->sched_class != &dl_sched_class)
+ if (rq->donor->sched_class != &dl_sched_class)
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
deadline_queue_push_tasks(rq);
+
+ if (hrtick_enabled_dl(rq))
+ start_hrtick_dl(rq, &p->dl);
}
-static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
- struct dl_rq *dl_rq)
+static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
{
struct rb_node *left = rb_first_cached(&dl_rq->root);
if (!left)
return NULL;
- return rb_entry(left, struct sched_dl_entity, rb_node);
+ return __node_2_dle(left);
}
-static struct task_struct *pick_next_task_dl(struct rq *rq)
+/*
+ * __pick_task_dl - Helper to pick the next -deadline task to run.
+ * @rq: The runqueue to pick the next task from.
+ */
+static struct task_struct *__pick_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
+again:
if (!sched_dl_runnable(rq))
return NULL;
- dl_se = pick_next_dl_entity(rq, dl_rq);
- BUG_ON(!dl_se);
- p = dl_task_of(dl_se);
- set_next_task_dl(rq, p, true);
+ dl_se = pick_next_dl_entity(dl_rq);
+ WARN_ON_ONCE(!dl_se);
+
+ if (dl_server(dl_se)) {
+ p = dl_se->server_pick_task(dl_se);
+ if (!p) {
+ if (dl_server_active(dl_se)) {
+ dl_se->dl_yielded = 1;
+ update_curr_dl_se(rq, dl_se, 0);
+ }
+ goto again;
+ }
+ rq->dl_server = dl_se;
+ } else {
+ p = dl_task_of(dl_se);
+ }
+
return p;
}
-static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+static struct task_struct *pick_task_dl(struct rq *rq)
+{
+ return __pick_task_dl(rq);
+}
+
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next)
{
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct dl_rq *dl_rq = &rq->dl;
+
+ if (on_dl_rq(&p->dl))
+ update_stats_wait_start_dl(dl_rq, dl_se);
+
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
@@ -1818,9 +2489,9 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
* not being the leftmost task anymore. In that case NEED_RESCHED will
* be set and schedule() will start a new hrtick for the next task.
*/
- if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
- is_leftmost(p, &rq->dl))
- start_hrtick_dl(rq, p);
+ if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
+ is_leftmost(&p->dl, &rq->dl))
+ start_hrtick_dl(rq, &p->dl);
}
static void task_fork_dl(struct task_struct *p)
@@ -1836,35 +2507,26 @@ static void task_fork_dl(struct task_struct *p)
/* Only try algorithms three times */
#define DL_MAX_TRIES 3
-static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
-{
- if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, p->cpus_ptr))
- return 1;
- return 0;
-}
-
/*
* Return the earliest pushable rq's task, which is suitable to be executed
* on the CPU, NULL otherwise:
*/
static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
{
- struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost;
struct task_struct *p = NULL;
+ struct rb_node *next_node;
if (!has_pushable_dl_tasks(rq))
return NULL;
-next_node:
- if (next_node) {
- p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
+ next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
+ while (next_node) {
+ p = __node_2_pdl(next_node);
- if (pick_dl_task(rq, p, cpu))
+ if (task_is_pushable(rq, p, cpu))
return p;
next_node = rb_next(next_node);
- goto next_node;
}
return NULL;
@@ -1929,8 +2591,8 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
- best_cpu = cpumask_first_and(later_mask,
- sched_domain_span(sd));
+ best_cpu = cpumask_any_and_distribute(later_mask,
+ sched_domain_span(sd));
/*
* Last chance: if a CPU being in both later_mask
* and current sd span is valid, that becomes our
@@ -1952,7 +2614,7 @@ static int find_later_rq(struct task_struct *task)
if (this_cpu != -1)
return this_cpu;
- cpu = cpumask_any(later_mask);
+ cpu = cpumask_any_distribute(later_mask);
if (cpu < nr_cpu_ids)
return cpu;
@@ -1974,9 +2636,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
- if (later_rq->dl.dl_nr_running &&
- !dl_time_before(task->dl.deadline,
- later_rq->dl.earliest_dl.curr)) {
+ if (!dl_task_is_earliest_deadline(task, later_rq)) {
/*
* Target rq has tasks of equal or earlier deadline,
* retrying does not release any lock and is unlikely
@@ -1989,9 +2649,10 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
- task_running(rq, task) ||
+ !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
+ task_on_cpu(rq, task) ||
!dl_task(task) ||
+ is_migration_disabled(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
@@ -2004,9 +2665,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
* its earliest one has a later deadline than our
* task, the rq is a good one.
*/
- if (!later_rq->dl.dl_nr_running ||
- dl_time_before(task->dl.deadline,
- later_rq->dl.earliest_dl.curr))
+ if (dl_task_is_earliest_deadline(task, later_rq))
break;
/* Otherwise we try again. */
@@ -2024,15 +2683,14 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
if (!has_pushable_dl_tasks(rq))
return NULL;
- p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost,
- struct task_struct, pushable_dl_tasks);
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!dl_task(p));
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+ WARN_ON_ONCE(!dl_task(p));
return p;
}
@@ -2048,29 +2706,29 @@ static int push_dl_task(struct rq *rq)
struct rq *later_rq;
int ret = 0;
- if (!rq->dl.overloaded)
- return 0;
-
next_task = pick_next_pushable_dl_task(rq);
if (!next_task)
return 0;
retry:
- if (WARN_ON(next_task == rq->curr))
- return 0;
-
/*
* If next_task preempts rq->curr, and rq->curr
* can move away, it makes sense to just reschedule
* without going further in pushing next_task.
*/
- if (dl_task(rq->curr) &&
- dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
+ if (dl_task(rq->donor) &&
+ dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) &&
rq->curr->nr_cpus_allowed > 1) {
resched_curr(rq);
return 0;
}
+ if (is_migration_disabled(next_task))
+ return 0;
+
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
/* We might release rq lock */
get_task_struct(next_task);
@@ -2102,15 +2760,7 @@ retry:
goto retry;
}
- deactivate_task(rq, next_task, 0);
- set_task_cpu(next_task, later_rq->cpu);
-
- /*
- * Update the later_rq clock here, because the clock is used
- * by the cpufreq_update_util() inside __add_running_bw().
- */
- update_rq_clock(later_rq);
- activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
+ move_queued_task_locked(rq, later_rq, next_task);
ret = 1;
resched_curr(later_rq);
@@ -2133,7 +2783,7 @@ static void push_dl_tasks(struct rq *rq)
static void pull_dl_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
- struct task_struct *p;
+ struct task_struct *p, *push_task;
bool resched = false;
struct rq *src_rq;
u64 dmin = LONG_MAX;
@@ -2154,7 +2804,7 @@ static void pull_dl_task(struct rq *this_rq)
src_rq = cpu_rq(cpu);
/*
- * It looks racy, abd it is! However, as in sched_rt.c,
+ * It looks racy, and it is! However, as in sched_rt.c,
* we are fine with this.
*/
if (this_rq->dl.dl_nr_running &&
@@ -2163,6 +2813,7 @@ static void pull_dl_task(struct rq *this_rq)
continue;
/* Might drop this_rq->lock */
+ push_task = NULL;
double_lock_balance(this_rq, src_rq);
/*
@@ -2180,9 +2831,7 @@ static void pull_dl_task(struct rq *this_rq)
* - it will preempt the last one we pulled (if any).
*/
if (p && dl_time_before(p->dl.deadline, dmin) &&
- (!this_rq->dl.dl_nr_running ||
- dl_time_before(p->dl.deadline,
- this_rq->dl.earliest_dl.curr))) {
+ dl_task_is_earliest_deadline(p, this_rq)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!task_on_rq_queued(p));
@@ -2191,20 +2840,30 @@ static void pull_dl_task(struct rq *this_rq)
* deadline than the current task of its runqueue.
*/
if (dl_time_before(p->dl.deadline,
- src_rq->curr->dl.deadline))
+ src_rq->donor->dl.deadline))
goto skip;
- resched = true;
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
- dmin = p->dl.deadline;
+ if (is_migration_disabled(p)) {
+ push_task = get_push_task(src_rq);
+ } else {
+ move_queued_task_locked(src_rq, this_rq, p);
+ dmin = p->dl.deadline;
+ resched = true;
+ }
/* Is there any other task even earlier? */
}
skip:
double_unlock_balance(this_rq, src_rq);
+
+ if (push_task) {
+ preempt_disable();
+ raw_spin_rq_unlock(this_rq);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ preempt_enable();
+ raw_spin_rq_lock(this_rq);
+ }
}
if (resched)
@@ -2217,23 +2876,23 @@ skip:
*/
static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
+ if (!task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
- dl_task(rq->curr) &&
+ dl_task(rq->donor) &&
(rq->curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
+ !dl_entity_preempt(&p->dl, &rq->donor->dl))) {
push_dl_tasks(rq);
}
}
static void set_cpus_allowed_dl(struct task_struct *p,
- const struct cpumask *new_mask)
+ struct affinity_context *ctx)
{
struct root_domain *src_rd;
struct rq *rq;
- BUG_ON(!dl_task(p));
+ WARN_ON_ONCE(!dl_task(p));
rq = task_rq(p);
src_rd = rq->rd;
@@ -2243,7 +2902,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* update. We already made space for us in the destination
* domain (see cpuset_can_attach()).
*/
- if (!cpumask_intersects(src_rd->span, new_mask)) {
+ if (!cpumask_intersects(src_rd->span, ctx->new_mask)) {
struct dl_bw *src_dl_b;
src_dl_b = dl_bw_of(cpu_of(rq));
@@ -2257,7 +2916,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
raw_spin_unlock(&src_dl_b->lock);
}
- set_cpus_allowed_common(p, new_mask);
+ set_cpus_allowed_common(p, ctx);
}
/* Assumes rq->lock is held */
@@ -2296,9 +2955,13 @@ void dl_add_task_root_domain(struct task_struct *p)
struct rq *rq;
struct dl_bw *dl_b;
- rq = task_rq_lock(p, &rf);
- if (!dl_task(p))
- goto unlock;
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ if (!dl_task(p)) {
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
+ return;
+ }
+
+ rq = __task_rq_lock(p, &rf);
dl_b = &rq->rd->dl_bw;
raw_spin_lock(&dl_b->lock);
@@ -2307,17 +2970,27 @@ void dl_add_task_root_domain(struct task_struct *p)
raw_spin_unlock(&dl_b->lock);
-unlock:
task_rq_unlock(rq, p, &rf);
}
void dl_clear_root_domain(struct root_domain *rd)
{
- unsigned long flags;
+ int i;
- raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+ guard(raw_spinlock_irqsave)(&rd->dl_bw.lock);
rd->dl_bw.total_bw = 0;
- raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+
+ /*
+ * dl_server bandwidth is only restored when CPUs are attached to root
+ * domains (after domains are created or CPUs moved back to the
+ * default root domain).
+ */
+ for_each_cpu(i, rd->span) {
+ struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
+
+ if (dl_server(dl_se) && cpu_active(i))
+ rd->dl_bw.total_bw += dl_se->dl_bw;
+ }
}
#endif /* CONFIG_SMP */
@@ -2333,7 +3006,13 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* will reset the task parameters.
*/
if (task_on_rq_queued(p) && p->dl.dl_runtime)
- task_non_contending(p);
+ task_non_contending(&p->dl);
+
+ /*
+ * In case a task is setscheduled out from SCHED_DEADLINE we need to
+ * keep track of that on its cpuset (for correct bandwidth tracking).
+ */
+ dec_dl_tasks_cs(p);
if (!task_on_rq_queued(p)) {
/*
@@ -2372,8 +3051,13 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
- put_task_struct(p);
+ cancel_inactive_timer(&p->dl);
+
+ /*
+ * In case a task is setscheduled to SCHED_DEADLINE we need to keep
+ * track of that on its cpuset (for correct bandwidth tracking).
+ */
+ inc_dl_tasks_cs(p);
/* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p)) {
@@ -2382,15 +3066,17 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
return;
}
- if (rq->curr != p) {
+ if (rq->donor != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
deadline_queue_push_tasks(rq);
#endif
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ if (dl_task(rq->donor))
+ wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
+ } else {
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
}
}
@@ -2401,17 +3087,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (task_on_rq_queued(p) || rq->curr == p) {
+ if (!task_on_rq_queued(p))
+ return;
+
#ifdef CONFIG_SMP
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
- deadline_queue_pull_task(rq);
+ /*
+ * This might be too much, but unfortunately
+ * we don't have the old deadline value, and
+ * we can't argue if the task is increasing
+ * or lowering its prio, so...
+ */
+ if (!rq->dl.overloaded)
+ deadline_queue_pull_task(rq);
+ if (task_current_donor(rq, p)) {
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
@@ -2419,26 +3108,42 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
resched_curr(rq);
-#else
+ } else {
/*
- * Again, we don't know if p has a earlier
- * or later deadline, so let's blindly set a
- * (maybe not needed) rescheduling point.
+ * Current may not be deadline in case p was throttled but we
+ * have just replenished it (e.g. rt_mutex_setprio()).
+ *
+ * Otherwise, if p was given an earlier deadline, reschedule.
*/
- resched_curr(rq);
-#endif /* CONFIG_SMP */
+ if (!dl_task(rq->curr) ||
+ dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
+ resched_curr(rq);
}
+#else
+ /*
+ * We don't know if p has an earlier or later deadline, so let's blindly
+ * set a (maybe not needed) rescheduling point.
+ */
+ resched_curr(rq);
+#endif
+}
+
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_dl(struct task_struct *p, int cpu)
+{
+ return p->dl.dl_throttled;
}
+#endif
+
+DEFINE_SCHED_CLASS(dl) = {
-const struct sched_class dl_sched_class = {
- .next = &rt_sched_class,
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
- .check_preempt_curr = check_preempt_curr_dl,
+ .wakeup_preempt = wakeup_preempt_dl,
- .pick_next_task = pick_next_task_dl,
+ .pick_task = pick_task_dl,
.put_prev_task = put_prev_task_dl,
.set_next_task = set_next_task_dl,
@@ -2450,6 +3155,7 @@ const struct sched_class dl_sched_class = {
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
+ .find_lock_rq = find_lock_later_rq,
#endif
.task_tick = task_tick_dl,
@@ -2460,35 +3166,44 @@ const struct sched_class dl_sched_class = {
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_dl,
+#endif
};
+/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
+static u64 dl_generation;
+
int sched_dl_global_validate(void)
{
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
+ u64 gen = ++dl_generation;
struct dl_bw *dl_b;
- int cpu, ret = 0;
+ int cpu, cpus, ret = 0;
unsigned long flags;
/*
* Here we want to check the bandwidth not being set to some
* value smaller than the currently allocated bandwidth in
* any of the root_domains.
- *
- * FIXME: Cycling on all the CPUs is overdoing, but simpler than
- * cycling on root_domains... Discussion on different/better
- * solutions is welcome!
*/
- for_each_possible_cpu(cpu) {
+ for_each_online_cpu(cpu) {
rcu_read_lock_sched();
+
+ if (dl_bw_visited(cpu, gen))
+ goto next;
+
dl_b = dl_bw_of(cpu);
+ cpus = dl_bw_cpus(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- if (new_bw < dl_b->total_bw)
+ if (new_bw * cpus < dl_b->total_bw)
ret = -EBUSY;
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+next:
rcu_read_unlock_sched();
if (ret)
@@ -2502,33 +3217,34 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
{
if (global_rt_runtime() == RUNTIME_INF) {
dl_rq->bw_ratio = 1 << RATIO_SHIFT;
- dl_rq->extra_bw = 1 << BW_SHIFT;
+ dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT;
} else {
dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
- dl_rq->extra_bw = to_ratio(global_rt_period(),
- global_rt_runtime());
+ dl_rq->max_bw = dl_rq->extra_bw =
+ to_ratio(global_rt_period(), global_rt_runtime());
}
}
void sched_dl_do_global(void)
{
u64 new_bw = -1;
+ u64 gen = ++dl_generation;
struct dl_bw *dl_b;
int cpu;
unsigned long flags;
- def_dl_bandwidth.dl_period = global_rt_period();
- def_dl_bandwidth.dl_runtime = global_rt_runtime();
-
if (global_rt_runtime() != RUNTIME_INF)
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
- /*
- * FIXME: As above...
- */
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
+
+ if (dl_bw_visited(cpu, gen)) {
+ rcu_read_unlock_sched();
+ continue;
+ }
+
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
@@ -2551,11 +3267,12 @@ void sched_dl_do_global(void)
int sched_dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
{
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
u64 period = attr->sched_period ?: attr->sched_deadline;
u64 runtime = attr->sched_runtime;
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
+ int cpus, err = -1, cpu = task_cpu(p);
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+ unsigned long cap;
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return 0;
@@ -2570,15 +3287,17 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* allocated bandwidth of the container.
*/
raw_spin_lock(&dl_b->lock);
- cpus = dl_bw_cpus(task_cpu(p));
+ cpus = dl_bw_cpus(cpu);
+ cap = dl_bw_capacity(cpu);
+
if (dl_policy(policy) && !task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ !__dl_overflow(dl_b, cap, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
/*
* XXX this is slightly incorrect: when the task
* utilization decreases, we should delay the total
@@ -2618,7 +3337,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
+ dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
@@ -2631,7 +3350,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
attr->sched_runtime = dl_se->dl_runtime;
attr->sched_deadline = dl_se->dl_deadline;
attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
+ attr->sched_flags &= ~SCHED_DL_FLAGS;
+ attr->sched_flags |= dl_se->flags;
}
/*
@@ -2646,6 +3366,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
*/
bool __checkparam_dl(const struct sched_attr *attr)
{
+ u64 period, max, min;
+
/* special dl tasks don't actually use any parameter */
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return true;
@@ -2669,22 +3391,29 @@ bool __checkparam_dl(const struct sched_attr *attr)
attr->sched_period & (1ULL << 63))
return false;
+ period = attr->sched_period;
+ if (!period)
+ period = attr->sched_deadline;
+
/* runtime <= deadline <= period (if period != 0) */
- if ((attr->sched_period != 0 &&
- attr->sched_period < attr->sched_deadline) ||
+ if (period < attr->sched_deadline ||
attr->sched_deadline < attr->sched_runtime)
return false;
+ max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
+ min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
+
+ if (period < min || period > max)
+ return false;
+
return true;
}
/*
* This function clears the sched_dl_entity static params.
*/
-void __dl_clear_params(struct task_struct *p)
+static void __dl_clear_params(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
-
dl_se->dl_runtime = 0;
dl_se->dl_deadline = 0;
dl_se->dl_period = 0;
@@ -2692,11 +3421,23 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
- dl_se->dl_boosted = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
+ dl_se->dl_server = 0;
+
+#ifdef CONFIG_RT_MUTEXES
+ dl_se->pi_se = dl_se;
+#endif
+}
+
+void init_dl_entity(struct sched_dl_entity *dl_se)
+{
+ RB_CLEAR_NODE(&dl_se->rb_node);
+ init_dl_task_timer(dl_se);
+ init_dl_inactive_task_timer(dl_se);
+ __dl_clear_params(dl_se);
}
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2706,60 +3447,25 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
if (dl_se->dl_runtime != attr->sched_runtime ||
dl_se->dl_deadline != attr->sched_deadline ||
dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
+ dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
return true;
return false;
}
#ifdef CONFIG_SMP
-int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
-{
- unsigned int dest_cpu;
- struct dl_bw *dl_b;
- bool overflow;
- int cpus, ret;
- unsigned long flags;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(dest_cpu);
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(dest_cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
- if (overflow) {
- ret = -EBUSY;
- } else {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- __dl_add(dl_b, p->dl.dl_bw, cpus);
- ret = 0;
- }
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- rcu_read_unlock_sched();
-
- return ret;
-}
-
int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
- int ret = 1, trial_cpus;
+ unsigned long flags, cap;
struct dl_bw *cur_dl_b;
- unsigned long flags;
+ int ret = 1;
rcu_read_lock_sched();
cur_dl_b = dl_bw_of(cpumask_any(cur));
- trial_cpus = cpumask_weight(trial);
-
+ cap = __dl_bw_capacity(trial);
raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
- if (cur_dl_b->bw != -1 &&
- cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ if (__dl_overflow(cur_dl_b, cap, 0, 0))
ret = 0;
raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
rcu_read_unlock_sched();
@@ -2767,22 +3473,97 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-bool dl_cpu_busy(unsigned int cpu)
+enum dl_bw_request {
+ dl_bw_req_deactivate = 0,
+ dl_bw_req_alloc,
+ dl_bw_req_free
+};
+
+static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
{
- unsigned long flags;
+ unsigned long flags, cap;
struct dl_bw *dl_b;
- bool overflow;
- int cpus;
+ bool overflow = 0;
+ u64 fair_server_bw = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
+
+ cap = dl_bw_capacity(cpu);
+ switch (req) {
+ case dl_bw_req_free:
+ __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
+ break;
+ case dl_bw_req_alloc:
+ overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
+
+ if (!overflow) {
+ /*
+ * We reserve space in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
+ }
+ break;
+ case dl_bw_req_deactivate:
+ /*
+ * cpu is not off yet, but we need to do the math by
+ * considering it off already (i.e., what would happen if we
+ * turn cpu off?).
+ */
+ cap -= arch_scale_cpu_capacity(cpu);
+
+ /*
+ * cpu is going offline and NORMAL tasks will be moved away
+ * from it. We can thus discount dl_server bandwidth
+ * contribution as it won't need to be servicing tasks after
+ * the cpu is off.
+ */
+ if (cpu_rq(cpu)->fair_server.dl_server)
+ fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw;
+
+ /*
+ * Not much to check if no DEADLINE bandwidth is present.
+ * dl_servers we can discount, as tasks will be moved out the
+ * offlined CPUs anyway.
+ */
+ if (dl_b->total_bw - fair_server_bw > 0) {
+ /*
+ * Leaving at least one CPU for DEADLINE tasks seems a
+ * wise thing to do. As said above, cpu is not offline
+ * yet, so account for that.
+ */
+ if (dl_bw_cpus(cpu) - 1)
+ overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0);
+ else
+ overflow = 1;
+ }
+
+ break;
+ }
+
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
- return overflow;
+ return overflow ? -EBUSY : 0;
+}
+
+int dl_bw_deactivate(int cpu)
+{
+ return dl_bw_manage(dl_bw_req_deactivate, cpu, 0);
+}
+
+int dl_bw_alloc(int cpu, u64 dl_bw)
+{
+ return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw);
+}
+
+void dl_bw_free(int cpu, u64 dl_bw)
+{
+ dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
}
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 36c54265bb2b..ef047add7f9e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -6,12 +6,9 @@
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
*/
-#include "sched.h"
-
-static DEFINE_SPINLOCK(sched_debug_lock);
/*
- * This allows printing both to /proc/sched_debug and
+ * This allows printing both to /sys/kernel/debug/sched/debug and
* to the console
*/
#define SEQ_printf(m, x...) \
@@ -169,191 +166,478 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};
-__read_mostly bool sched_debug_enabled;
+#ifdef CONFIG_SMP
-static __init int sched_init_debug(void)
+static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- debugfs_create_file("sched_features", 0644, NULL, NULL,
- &sched_feat_fops);
+ char buf[16];
+ unsigned int scaling;
- debugfs_create_bool("sched_debug", 0644, NULL,
- &sched_debug_enabled);
+ if (cnt > 15)
+ cnt = 15;
- return 0;
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+ buf[cnt] = '\0';
+
+ if (kstrtouint(buf, 10, &scaling))
+ return -EINVAL;
+
+ if (scaling >= SCHED_TUNABLESCALING_END)
+ return -EINVAL;
+
+ sysctl_sched_tunable_scaling = scaling;
+ if (sched_update_scaling())
+ return -EINVAL;
+
+ *ppos += cnt;
+ return cnt;
}
-late_initcall(sched_init_debug);
-#ifdef CONFIG_SMP
+static int sched_scaling_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", sysctl_sched_tunable_scaling);
+ return 0;
+}
-#ifdef CONFIG_SYSCTL
+static int sched_scaling_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_scaling_show, NULL);
+}
-static struct ctl_table sd_ctl_dir[] = {
- {
- .procname = "sched_domain",
- .mode = 0555,
- },
- {}
+static const struct file_operations sched_scaling_fops = {
+ .open = sched_scaling_open,
+ .write = sched_scaling_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
};
-static struct ctl_table sd_ctl_root[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = sd_ctl_dir,
- },
- {}
+#endif /* SMP */
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[16];
+ int mode;
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ mode = sched_dynamic_mode(strstrip(buf));
+ if (mode < 0)
+ return mode;
+
+ sched_dynamic_update(mode);
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static int sched_dynamic_show(struct seq_file *m, void *v)
+{
+ static const char * preempt_modes[] = {
+ "none", "voluntary", "full", "lazy",
+ };
+ int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
+ int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
+
+ for (; i < j; i++) {
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, "(");
+ seq_puts(m, preempt_modes[i]);
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, ")");
+
+ seq_puts(m, " ");
+ }
+
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static int sched_dynamic_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_dynamic_show, NULL);
+}
+
+static const struct file_operations sched_dynamic_fops = {
+ .open = sched_dynamic_open,
+ .write = sched_dynamic_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
};
-static struct ctl_table *sd_alloc_ctl_entry(int n)
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+__read_mostly bool sched_debug_verbose;
+
+#ifdef CONFIG_SMP
+static struct dentry *sd_dentry;
+
+
+static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- struct ctl_table *entry =
- kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+ ssize_t result;
+ bool orig;
- return entry;
+ cpus_read_lock();
+ mutex_lock(&sched_domains_mutex);
+
+ orig = sched_debug_verbose;
+ result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
+
+ if (sched_debug_verbose && !orig)
+ update_sched_domain_debugfs();
+ else if (!sched_debug_verbose && orig) {
+ debugfs_remove(sd_dentry);
+ sd_dentry = NULL;
+ }
+
+ mutex_unlock(&sched_domains_mutex);
+ cpus_read_unlock();
+
+ return result;
}
+#else
+#define sched_verbose_write debugfs_write_file_bool
+#endif
+
+static const struct file_operations sched_verbose_fops = {
+ .read = debugfs_read_file_bool,
+ .write = sched_verbose_write,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct seq_operations sched_debug_sops;
-static void sd_free_ctl_entry(struct ctl_table **tablep)
+static int sched_debug_open(struct inode *inode, struct file *filp)
{
- struct ctl_table *entry;
+ return seq_open(filp, &sched_debug_sops);
+}
- /*
- * In the intermediate directories, both the child directory and
- * procname are dynamically allocated and could fail but the mode
- * will always be set. In the lowest directory the names are
- * static strings and all have proc handlers.
- */
- for (entry = *tablep; entry->mode; entry++) {
- if (entry->child)
- sd_free_ctl_entry(&entry->child);
- if (entry->proc_handler == NULL)
- kfree(entry->procname);
+static const struct file_operations sched_debug_fops = {
+ .open = sched_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+enum dl_param {
+ DL_RUNTIME = 0,
+ DL_PERIOD,
+};
+
+static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */
+static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
+
+static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, enum dl_param param)
+{
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ struct rq *rq = cpu_rq(cpu);
+ u64 runtime, period;
+ size_t err;
+ int retval;
+ u64 value;
+
+ err = kstrtoull_from_user(ubuf, cnt, 10, &value);
+ if (err)
+ return err;
+
+ scoped_guard (rq_lock_irqsave, rq) {
+ runtime = rq->fair_server.dl_runtime;
+ period = rq->fair_server.dl_period;
+
+ switch (param) {
+ case DL_RUNTIME:
+ if (runtime == value)
+ break;
+ runtime = value;
+ break;
+ case DL_PERIOD:
+ if (value == period)
+ break;
+ period = value;
+ break;
+ }
+
+ if (runtime > period ||
+ period > fair_server_period_max ||
+ period < fair_server_period_min) {
+ return -EINVAL;
+ }
+
+ if (rq->cfs.h_nr_queued) {
+ update_rq_clock(rq);
+ dl_server_stop(&rq->fair_server);
+ }
+
+ retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
+ if (retval)
+ cnt = retval;
+
+ if (!runtime)
+ printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
+ cpu_of(rq));
+
+ if (rq->cfs.h_nr_queued)
+ dl_server_start(&rq->fair_server);
}
- kfree(*tablep);
- *tablep = NULL;
+ *ppos += cnt;
+ return cnt;
}
-static void
-set_table_entry(struct ctl_table *entry,
- const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler)
-{
- entry->procname = procname;
- entry->data = data;
- entry->maxlen = maxlen;
- entry->mode = mode;
- entry->proc_handler = proc_handler;
-}
-
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
-{
- struct ctl_table *table = sd_alloc_ctl_entry(9);
-
- if (table == NULL)
- return NULL;
-
- set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
- set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
- set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
- /* &table[8] is terminator */
-
- return table;
-}
-
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
- struct ctl_table *entry, *table;
- struct sched_domain *sd;
- int domain_num = 0, i;
- char buf[32];
-
- for_each_domain(cpu, sd)
- domain_num++;
- entry = table = sd_alloc_ctl_entry(domain_num + 1);
- if (table == NULL)
- return NULL;
-
- i = 0;
- for_each_domain(cpu, sd) {
- snprintf(buf, 32, "domain%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_domain_table(sd);
- entry++;
- i++;
+static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
+{
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+ u64 value;
+
+ switch (param) {
+ case DL_RUNTIME:
+ value = rq->fair_server.dl_runtime;
+ break;
+ case DL_PERIOD:
+ value = rq->fair_server.dl_period;
+ break;
}
- return table;
+
+ seq_printf(m, "%llu\n", value);
+ return 0;
+
}
-static cpumask_var_t sd_sysctl_cpus;
-static struct ctl_table_header *sd_sysctl_header;
+static ssize_t
+sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
+}
-void register_sched_domain_sysctl(void)
+static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
{
- static struct ctl_table *cpu_entries;
- static struct ctl_table **cpu_idx;
- static bool init_done = false;
- char buf[32];
- int i;
+ return sched_fair_server_show(m, v, DL_RUNTIME);
+}
- if (!cpu_entries) {
- cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
- if (!cpu_entries)
- return;
+static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_fair_server_runtime_show, inode->i_private);
+}
+
+static const struct file_operations fair_server_runtime_fops = {
+ .open = sched_fair_server_runtime_open,
+ .write = sched_fair_server_runtime_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static ssize_t
+sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
+}
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = cpu_entries;
+static int sched_fair_server_period_show(struct seq_file *m, void *v)
+{
+ return sched_fair_server_show(m, v, DL_PERIOD);
+}
+
+static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_fair_server_period_show, inode->i_private);
+}
+
+static const struct file_operations fair_server_period_fops = {
+ .open = sched_fair_server_period_open,
+ .write = sched_fair_server_period_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *debugfs_sched;
+
+static void debugfs_fair_server_init(void)
+{
+ struct dentry *d_fair;
+ unsigned long cpu;
+
+ d_fair = debugfs_create_dir("fair_server", debugfs_sched);
+ if (!d_fair)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct dentry *d_cpu;
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "cpu%lu", cpu);
+ d_cpu = debugfs_create_dir(buf, d_fair);
+
+ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops);
+ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops);
}
+}
- if (!cpu_idx) {
- struct ctl_table *e = cpu_entries;
+static __init int sched_init_debug(void)
+{
+ struct dentry __maybe_unused *numa;
- cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
- if (!cpu_idx)
- return;
+ debugfs_sched = debugfs_create_dir("sched", NULL);
- /* deal with sparse possible map */
- for_each_possible_cpu(i) {
- cpu_idx[i] = e;
- e++;
- }
+ debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
+ debugfs_create_file_unsafe("verbose", 0644, debugfs_sched, &sched_debug_verbose, &sched_verbose_fops);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+ debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
+#endif
+
+ debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice);
+
+ debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
+ debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
+
+#ifdef CONFIG_SMP
+ debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
+ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
+ debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
+
+ mutex_lock(&sched_domains_mutex);
+ update_sched_domain_debugfs();
+ mutex_unlock(&sched_domains_mutex);
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ numa = debugfs_create_dir("numa_balancing", debugfs_sched);
+
+ debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay);
+ debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
+ debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
+ debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+ debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
+#endif
+
+ debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
+
+ debugfs_fair_server_init();
+
+ return 0;
+}
+late_initcall(sched_init_debug);
+
+#ifdef CONFIG_SMP
+
+static cpumask_var_t sd_sysctl_cpus;
+
+static int sd_flags_show(struct seq_file *m, void *v)
+{
+ unsigned long flags = *(unsigned int *)m->private;
+ int idx;
+
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ seq_puts(m, sd_flag_debug[idx].name);
+ seq_puts(m, " ");
}
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static int sd_flags_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, sd_flags_show, inode->i_private);
+}
+
+static const struct file_operations sd_flags_fops = {
+ .open = sd_flags_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void register_sd(struct sched_domain *sd, struct dentry *parent)
+{
+#define SDM(type, mode, member) \
+ debugfs_create_##type(#member, mode, parent, &sd->member)
+
+ SDM(ulong, 0644, min_interval);
+ SDM(ulong, 0644, max_interval);
+ SDM(u64, 0644, max_newidle_lb_cost);
+ SDM(u32, 0644, busy_factor);
+ SDM(u32, 0644, imbalance_pct);
+ SDM(u32, 0644, cache_nice_tries);
+ SDM(str, 0444, name);
+
+#undef SDM
+
+ debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
+ debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
+ debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level);
+}
+
+void update_sched_domain_debugfs(void)
+{
+ int cpu, i;
+
+ /*
+ * This can unfortunately be invoked before sched_init_debug() creates
+ * the debug directory. Don't touch sd_sysctl_cpus until then.
+ */
+ if (!debugfs_sched)
+ return;
+
+ if (!sched_debug_verbose)
+ return;
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
}
- if (!init_done) {
- init_done = true;
- /* init to possible to not have holes in @cpu_entries */
- cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+ if (!sd_dentry) {
+ sd_dentry = debugfs_create_dir("domains", debugfs_sched);
+
+ /* rebuild sd_sysctl_cpus if empty since it gets cleared below */
+ if (cpumask_empty(sd_sysctl_cpus))
+ cpumask_copy(sd_sysctl_cpus, cpu_online_mask);
}
- for_each_cpu(i, sd_sysctl_cpus) {
- struct ctl_table *e = cpu_idx[i];
+ for_each_cpu(cpu, sd_sysctl_cpus) {
+ struct sched_domain *sd;
+ struct dentry *d_cpu;
+ char buf[32];
- if (e->child)
- sd_free_ctl_entry(&e->child);
+ snprintf(buf, sizeof(buf), "cpu%d", cpu);
+ debugfs_lookup_and_remove(buf, sd_dentry);
+ d_cpu = debugfs_create_dir(buf, sd_dentry);
- if (!e->procname) {
- snprintf(buf, 32, "cpu%d", i);
- e->procname = kstrdup(buf, GFP_KERNEL);
+ i = 0;
+ for_each_domain(cpu, sd) {
+ struct dentry *d_sd;
+
+ snprintf(buf, sizeof(buf), "domain%d", i);
+ d_sd = debugfs_create_dir(buf, d_cpu);
+
+ register_sd(sd, d_sd);
+ i++;
}
- e->mode = 0555;
- e->child = sd_alloc_ctl_cpu_table(i);
- __cpumask_clear_cpu(i, sd_sysctl_cpus);
+ __cpumask_clear_cpu(cpu, sd_sysctl_cpus);
}
-
- WARN_ON(sd_sysctl_header);
- sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
void dirty_sched_domain_sysctl(int cpu)
@@ -362,13 +646,6 @@ void dirty_sched_domain_sysctl(int cpu)
__cpumask_set_cpu(cpu, sd_sysctl_cpus);
}
-/* may be called multiple times per register */
-void unregister_sched_domain_sysctl(void)
-{
- unregister_sysctl_table(sd_sysctl_header);
- sd_sysctl_header = NULL;
-}
-#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -377,9 +654,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
struct sched_entity *se = tg->se[cpu];
#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
-#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
+#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \
+ #F, (long long)schedstat_val(stats->F))
#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
+#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", \
+ #F, SPLIT_NS((long long)schedstat_val(stats->F)))
if (!se)
return;
@@ -389,16 +668,19 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->sum_exec_runtime);
if (schedstat_enabled()) {
- PN_SCHEDSTAT(se->statistics.wait_start);
- PN_SCHEDSTAT(se->statistics.sleep_start);
- PN_SCHEDSTAT(se->statistics.block_start);
- PN_SCHEDSTAT(se->statistics.sleep_max);
- PN_SCHEDSTAT(se->statistics.block_max);
- PN_SCHEDSTAT(se->statistics.exec_max);
- PN_SCHEDSTAT(se->statistics.slice_max);
- PN_SCHEDSTAT(se->statistics.wait_max);
- PN_SCHEDSTAT(se->statistics.wait_sum);
- P_SCHEDSTAT(se->statistics.wait_count);
+ struct sched_statistics *stats;
+ stats = __schedstats_from_se(se);
+
+ PN_SCHEDSTAT(wait_start);
+ PN_SCHEDSTAT(sleep_start);
+ PN_SCHEDSTAT(block_start);
+ PN_SCHEDSTAT(sleep_max);
+ PN_SCHEDSTAT(block_max);
+ PN_SCHEDSTAT(exec_max);
+ PN_SCHEDSTAT(slice_max);
+ PN_SCHEDSTAT(wait_max);
+ PN_SCHEDSTAT(wait_sum);
+ P_SCHEDSTAT(wait_count);
}
P(se->load.weight);
@@ -416,43 +698,69 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#endif
#ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(sched_debug_lock);
static char group_path[PATH_MAX];
-static char *task_group_path(struct task_group *tg)
+static void task_group_path(struct task_group *tg, char *path, int plen)
{
- if (autogroup_path(tg, group_path, PATH_MAX))
- return group_path;
+ if (autogroup_path(tg, path, plen))
+ return;
- cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ cgroup_path(tg->css.cgroup, path, plen);
+}
- return group_path;
+/*
+ * Only 1 SEQ_printf_task_group_path() caller can use the full length
+ * group_path[] for cgroup path. Other simultaneous callers will have
+ * to use a shorter stack buffer. A "..." suffix is appended at the end
+ * of the stack buffer so that it will show up in case the output length
+ * matches the given buffer size to indicate possible path name truncation.
+ */
+#define SEQ_printf_task_group_path(m, tg, fmt...) \
+{ \
+ if (spin_trylock(&sched_debug_lock)) { \
+ task_group_path(tg, group_path, sizeof(group_path)); \
+ SEQ_printf(m, fmt, group_path); \
+ spin_unlock(&sched_debug_lock); \
+ } else { \
+ char buf[128]; \
+ char *bufend = buf + sizeof(buf) - 3; \
+ task_group_path(tg, buf, bufend - buf); \
+ strcpy(bufend - 1, "..."); \
+ SEQ_printf(m, fmt, buf); \
+ } \
}
#endif
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
- if (rq->curr == p)
+ if (task_current(rq, p))
SEQ_printf(m, ">R");
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+ SPLIT_NS(p->se.deadline),
+ p->se.custom_slice ? 'S' : ' ',
+ SPLIT_NS(p->se.slice),
+ SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
- SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
- SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
- SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
+ SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld",
+ SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
+ SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
#ifdef CONFIG_NUMA_BALANCING
- SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
+ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
- SEQ_printf(m, " %s", task_group_path(task_group(p)));
+ SEQ_printf_task_group_path(m, task_group(p), " %s")
#endif
SEQ_printf(m, "\n");
@@ -464,10 +772,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m, "\n");
SEQ_printf(m, "runnable tasks:\n");
- SEQ_printf(m, " S task PID tree-key switches prio"
- " wait-time sum-exec sum-sleep\n");
+ SEQ_printf(m, " S task PID vruntime eligible "
+ "deadline slice sum-exec switches "
+ "prio wait-time sum-sleep sum-block"
+#ifdef CONFIG_NUMA_BALANCING
+ " node group-id"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ " group-path"
+#endif
+ "\n");
SEQ_printf(m, "-------------------------------------------------------"
- "------------------------------------------------------\n");
+ "------------------------------------------------------"
+ "------------------------------------------------------"
+#ifdef CONFIG_NUMA_BALANCING
+ "--------------"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ "--------------"
+#endif
+ "\n");
rcu_read_lock();
for_each_process_thread(g, p) {
@@ -481,46 +805,48 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
- spread, rq0_min_vruntime, spread0;
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
- struct sched_entity *last;
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, "\n");
- SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+ SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu);
#else
SEQ_printf(m, "\n");
SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
#endif
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
- SPLIT_NS(cfs_rq->exec_clock));
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rb_first_cached(&cfs_rq->tasks_timeline))
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+ raw_spin_rq_lock_irqsave(rq, flags);
+ root = __pick_root_entity(cfs_rq);
+ if (root)
+ left_vruntime = root->min_vruntime;
+ first = __pick_first_entity(cfs_rq);
+ if (first)
+ left_deadline = first->deadline;
last = __pick_last_entity(cfs_rq);
if (last)
- max_vruntime = last->vruntime;
+ right_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
- SPLIT_NS(MIN_vruntime));
+ raw_spin_rq_unlock_irqrestore(rq, flags);
+
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
+ SPLIT_NS(left_deadline));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
+ SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
SPLIT_NS(min_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
- SPLIT_NS(max_vruntime));
- spread = max_vruntime - MIN_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
- SPLIT_NS(spread));
- spread0 = min_vruntime - rq0_min_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
- SPLIT_NS(spread0));
- SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
- cfs_rq->nr_spread_over);
- SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
+ SPLIT_NS(avg_vruntime(cfs_rq)));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
+ SPLIT_NS(right_vruntime));
+ spread = right_vruntime - left_vruntime;
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
+ SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
@@ -529,8 +855,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.runnable_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
- SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
- cfs_rq->avg.util_est.enqueued);
+ SEQ_printf(m, " .%-30s: %u\n", "util_est",
+ cfs_rq->avg.util_est);
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -560,7 +886,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#ifdef CONFIG_RT_GROUP_SCHED
SEQ_printf(m, "\n");
- SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+ SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu);
#else
SEQ_printf(m, "\n");
SEQ_printf(m, "rt_rq[%d]:\n", cpu);
@@ -574,12 +900,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
PU(rt_nr_running);
-#ifdef CONFIG_SMP
- PU(rt_nr_migratory);
-#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
+#endif
#undef PN
#undef PU
@@ -598,7 +924,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
PU(dl_nr_running);
#ifdef CONFIG_SMP
- PU(dl_nr_migratory);
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
#else
dl_bw = &dl_rq->dl_bw;
@@ -612,7 +937,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
#ifdef CONFIG_X86
{
@@ -628,7 +952,7 @@ static void print_cpu(struct seq_file *m, int cpu)
#define P(x) \
do { \
if (sizeof(rq->x) == 4) \
- SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \
else \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
} while (0)
@@ -663,13 +987,11 @@ do { \
}
#undef P
- spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
print_dl_stats(m, cpu);
print_rq(m, rq, cpu);
- spin_unlock_irqrestore(&sched_debug_lock, flags);
SEQ_printf(m, "\n");
}
@@ -716,10 +1038,7 @@ static void sched_debug_header(struct seq_file *m)
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
- PN(sysctl_sched_latency);
- PN(sysctl_sched_min_granularity);
- PN(sysctl_sched_wakeup_granularity);
- P(sysctl_sched_child_runs_first);
+ PN(sysctl_sched_base_slice);
P(sysctl_sched_features);
#undef PN
#undef P
@@ -761,7 +1080,7 @@ void sysrq_sched_debug_show(void)
}
/*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
* It returns 1 for the header position.
* This means 2 is CPU 0.
* In a hotplugged system some CPUs, including CPU 0, may be missing so we have
@@ -806,18 +1125,10 @@ static const struct seq_operations sched_debug_sops = {
.show = sched_debug_show,
};
-static int __init init_sched_debug_procfs(void)
-{
- if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops))
- return -ENOMEM;
- return 0;
-}
-
-__initcall(init_sched_debug_procfs);
-
#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
#define __P(F) __PS(#F, F)
#define P(F) __PS(#F, p->F)
+#define PM(F, M) __PS(#F, p->F & (M))
#define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
#define __PN(F) __PSN(#F, F)
#define PN(F) __PSN(#F, p->F)
@@ -837,25 +1148,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
- struct mempolicy *pol;
-
if (p->mm)
P(mm->numa_scan_seq);
- task_lock(p);
- pol = p->mempolicy;
- if (pol && !(pol->flags & MPOL_F_MORON))
- pol = NULL;
- mpol_get(pol);
- task_unlock(p);
-
P(numa_pages_migrated);
P(numa_preferred_nid);
P(total_numa_faults);
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
task_node(p), task_numa_group_id(p));
show_numa_stats(p, m);
- mpol_put(pol);
#endif
}
@@ -870,8 +1171,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
"---------------------------------------------------------"
"----------\n");
-#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F))
-#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F))
+#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F))
+#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F))
PN(se.exec_start);
PN(se.vruntime);
@@ -884,33 +1185,34 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
- PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
- PN_SCHEDSTAT(se.statistics.wait_start);
- PN_SCHEDSTAT(se.statistics.sleep_start);
- PN_SCHEDSTAT(se.statistics.block_start);
- PN_SCHEDSTAT(se.statistics.sleep_max);
- PN_SCHEDSTAT(se.statistics.block_max);
- PN_SCHEDSTAT(se.statistics.exec_max);
- PN_SCHEDSTAT(se.statistics.slice_max);
- PN_SCHEDSTAT(se.statistics.wait_max);
- PN_SCHEDSTAT(se.statistics.wait_sum);
- P_SCHEDSTAT(se.statistics.wait_count);
- PN_SCHEDSTAT(se.statistics.iowait_sum);
- P_SCHEDSTAT(se.statistics.iowait_count);
- P_SCHEDSTAT(se.statistics.nr_migrations_cold);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
- P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
- P_SCHEDSTAT(se.statistics.nr_forced_migrations);
- P_SCHEDSTAT(se.statistics.nr_wakeups);
- P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
- P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
- P_SCHEDSTAT(se.statistics.nr_wakeups_local);
- P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
- P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
- P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
- P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
- P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
+ PN_SCHEDSTAT(sum_sleep_runtime);
+ PN_SCHEDSTAT(sum_block_runtime);
+ PN_SCHEDSTAT(wait_start);
+ PN_SCHEDSTAT(sleep_start);
+ PN_SCHEDSTAT(block_start);
+ PN_SCHEDSTAT(sleep_max);
+ PN_SCHEDSTAT(block_max);
+ PN_SCHEDSTAT(exec_max);
+ PN_SCHEDSTAT(slice_max);
+ PN_SCHEDSTAT(wait_max);
+ PN_SCHEDSTAT(wait_sum);
+ P_SCHEDSTAT(wait_count);
+ PN_SCHEDSTAT(iowait_sum);
+ P_SCHEDSTAT(iowait_count);
+ P_SCHEDSTAT(nr_migrations_cold);
+ P_SCHEDSTAT(nr_failed_migrations_affine);
+ P_SCHEDSTAT(nr_failed_migrations_running);
+ P_SCHEDSTAT(nr_failed_migrations_hot);
+ P_SCHEDSTAT(nr_forced_migrations);
+ P_SCHEDSTAT(nr_wakeups);
+ P_SCHEDSTAT(nr_wakeups_sync);
+ P_SCHEDSTAT(nr_wakeups_migrate);
+ P_SCHEDSTAT(nr_wakeups_local);
+ P_SCHEDSTAT(nr_wakeups_remote);
+ P_SCHEDSTAT(nr_wakeups_affine);
+ P_SCHEDSTAT(nr_wakeups_affine_attempts);
+ P_SCHEDSTAT(nr_wakeups_passive);
+ P_SCHEDSTAT(nr_wakeups_idle);
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
@@ -928,6 +1230,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PN(avg_atom);
__PN(avg_per_cpu);
+
+#ifdef CONFIG_SCHED_CORE
+ PN_SCHEDSTAT(core_forceidle_sum);
+#endif
}
__P(nr_switches);
@@ -943,8 +1249,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.runnable_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
- P(se.avg.util_est.ewma);
- P(se.avg.util_est.enqueued);
+ PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
#endif
#ifdef CONFIG_UCLAMP_TASK
__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
@@ -957,7 +1262,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
if (task_has_dl_policy(p)) {
P(dl.runtime);
P(dl.deadline);
+ } else if (fair_policy(p->policy)) {
+ P(se.slice);
}
+#ifdef CONFIG_SCHED_CLASS_EXT
+ __PS("ext.enabled", task_on_scx(p));
+#endif
#undef PN_SCHEDSTAT
#undef P_SCHEDSTAT
@@ -976,6 +1286,18 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
- memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+ memset(&p->stats, 0, sizeof(p->stats));
#endif
}
+
+void resched_latency_warn(int cpu, u64 latency)
+{
+ static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+ if (likely(!__ratelimit(&latency_check_ratelimit)))
+ return;
+
+ pr_err("sched: CPU %d need_resched set for > %llu ns (%d ticks) without schedule\n",
+ cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+ dump_stack();
+}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
new file mode 100644
index 000000000000..7b9dfee858e7
--- /dev/null
+++ b/kernel/sched/ext.c
@@ -0,0 +1,7870 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+
+enum scx_consts {
+ SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_DSP_MAX_LOOPS = 32,
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+
+ SCX_EXIT_BT_LEN = 64,
+ SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
+
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+
+ /*
+ * Iterating all tasks may take a while. Periodically drop
+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+ */
+ SCX_OPS_TASK_ITER_BATCH = 32,
+};
+
+enum scx_exit_kind {
+ SCX_EXIT_NONE,
+ SCX_EXIT_DONE,
+
+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
+
+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
+};
+
+/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or
+ * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
+ * respectively. The codes are 64bit of the format:
+ *
+ * Bits: [63 .. 48 47 .. 32 31 .. 0]
+ * [ SYS ACT ] [ SYS RSN ] [ USR ]
+ *
+ * SYS ACT: System-defined exit actions
+ * SYS RSN: System-defined exit reasons
+ * USR : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+ /* Reasons */
+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+
+ /* Actions */
+ SCX_ECODE_ACT_RESTART = 1LLU << 48,
+};
+
+/*
+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+ * being disabled.
+ */
+struct scx_exit_info {
+ /* %SCX_EXIT_* - broad category of the exit reason */
+ enum scx_exit_kind kind;
+
+ /* exit code if gracefully exiting */
+ s64 exit_code;
+
+ /* textual representation of the above */
+ const char *reason;
+
+ /* backtrace if exiting due to an error */
+ unsigned long *bt;
+ u32 bt_len;
+
+ /* informational message */
+ char *msg;
+
+ /* debug dump */
+ char *dump;
+};
+
+/* sched_ext_ops.flags */
+enum scx_ops_flags {
+ /*
+ * Keep built-in idle tracking even if ops.update_idle() is implemented.
+ */
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+
+ /*
+ * By default, if there are no other tasks to run on the CPU, ext core
+ * keeps running the current task even after its slice expires. If this
+ * flag is specified, such tasks are passed to ops.enqueue() with
+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+ */
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
+
+ /*
+ * An exiting task may schedule after PF_EXITING is set. In such cases,
+ * bpf_task_from_pid() may not be able to find the task and if the BPF
+ * scheduler depends on pid lookup for dispatching, the task will be
+ * lost leading to various issues including RCU grace period stalls.
+ *
+ * To mask this problem, by default, unhashed tasks are automatically
+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+ * depend on pid lookups and wants to handle these tasks directly, the
+ * following flag can be used.
+ */
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
+
+ /*
+ * If set, only tasks with policy set to SCHED_EXT are attached to
+ * sched_ext. If clear, SCHED_NORMAL tasks are also included.
+ */
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
+ /*
+ * A migration disabled task can only execute on its current CPU. By
+ * default, such tasks are automatically put on the CPU's local DSQ with
+ * the default slice on enqueue. If this ops flag is set, they also go
+ * through ops.enqueue().
+ *
+ * A migration disabled task never invokes ops.select_cpu() as it can
+ * only select the current CPU. Also, p->cpus_ptr will only contain its
+ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+ * and thus may disagree with cpumask_weight(p->cpus_ptr).
+ */
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * CPU cgroup support flags
+ */
+ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */
+
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+};
+
+/* argument container for ops.init_task() */
+struct scx_init_task_args {
+ /*
+ * Set if ops.init_task() is being invoked on the fork path, as opposed
+ * to the scheduler transition path.
+ */
+ bool fork;
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /* the cgroup the task is joining */
+ struct cgroup *cgroup;
+#endif
+};
+
+/* argument container for ops.exit_task() */
+struct scx_exit_task_args {
+ /* Whether the task exited before running on sched_ext. */
+ bool cancelled;
+};
+
+/* argument container for ops->cgroup_init() */
+struct scx_cgroup_init_args {
+ /* the weight of the cgroup [1..10000] */
+ u32 weight;
+};
+
+enum scx_cpu_preempt_reason {
+ /* next task is being scheduled by &sched_class_rt */
+ SCX_CPU_PREEMPT_RT,
+ /* next task is being scheduled by &sched_class_dl */
+ SCX_CPU_PREEMPT_DL,
+ /* next task is being scheduled by &sched_class_stop */
+ SCX_CPU_PREEMPT_STOP,
+ /* unknown reason for SCX being preempted */
+ SCX_CPU_PREEMPT_UNKNOWN,
+};
+
+/*
+ * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * expanded in the future.
+ */
+struct scx_cpu_acquire_args {};
+
+/* argument container for ops->cpu_release() */
+struct scx_cpu_release_args {
+ /* the reason the CPU was preempted */
+ enum scx_cpu_preempt_reason reason;
+
+ /* the task that's going to be scheduled on the CPU */
+ struct task_struct *task;
+};
+
+/*
+ * Informational context provided to dump operations.
+ */
+struct scx_dump_ctx {
+ enum scx_exit_kind kind;
+ s64 exit_code;
+ const char *reason;
+ u64 at_ns;
+ u64 at_jiffies;
+};
+
+/**
+ * struct sched_ext_ops - Operation table for BPF scheduler implementation
+ *
+ * A BPF scheduler can implement an arbitrary scheduling policy by
+ * implementing and loading operations in this table. Note that a userland
+ * scheduling policy can also be implemented using the BPF scheduler
+ * as a shim layer.
+ */
+struct sched_ext_ops {
+ /**
+ * @select_cpu: Pick the target CPU for a task which is being woken up
+ * @p: task being woken up
+ * @prev_cpu: the cpu @p was on before sleeping
+ * @wake_flags: SCX_WAKE_*
+ *
+ * Decision made here isn't final. @p may be moved to any CPU while it
+ * is getting dispatched for execution later. However, as @p is not on
+ * the rq at this point, getting the eventual execution CPU right here
+ * saves a small bit of overhead down the line.
+ *
+ * If an idle CPU is returned, the CPU is kicked and will try to
+ * dispatch. While an explicit custom mechanism can be added,
+ * select_cpu() serves as the default way to wake up idle CPUs.
+ *
+ * @p may be inserted into a DSQ directly by calling
+ * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
+ * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
+ * of the CPU returned by this operation.
+ *
+ * Note that select_cpu() is never called for tasks that can only run
+ * on a single CPU or tasks with migration disabled, as they don't have
+ * the option to select a different CPU. See select_task_rq() for
+ * details.
+ */
+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
+
+ /**
+ * @enqueue: Enqueue a task on the BPF scheduler
+ * @p: task being enqueued
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * @p is ready to run. Insert directly into a DSQ by calling
+ * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
+ * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
+ * the task will stall.
+ *
+ * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
+ * skipped.
+ */
+ void (*enqueue)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @dequeue: Remove a task from the BPF scheduler
+ * @p: task being dequeued
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * Remove @p from the BPF scheduler. This is usually called to isolate
+ * the task while updating its scheduling properties (e.g. priority).
+ *
+ * The ext core keeps track of whether the BPF side owns a given task or
+ * not and can gracefully ignore spurious dispatches from BPF side,
+ * which makes it safe to not implement this method. However, depending
+ * on the scheduling logic, this can lead to confusing behaviors - e.g.
+ * scheduling position not being updated across a priority change.
+ */
+ void (*dequeue)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
+ * @cpu: CPU to dispatch tasks for
+ * @prev: previous task being switched out
+ *
+ * Called when a CPU's local dsq is empty. The operation should dispatch
+ * one or more tasks from the BPF scheduler into the DSQs using
+ * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
+ * using scx_bpf_dsq_move_to_local().
+ *
+ * The maximum number of times scx_bpf_dsq_insert() can be called
+ * without an intervening scx_bpf_dsq_move_to_local() is specified by
+ * ops.dispatch_max_batch. See the comments on top of the two functions
+ * for more details.
+ *
+ * When not %NULL, @prev is an SCX task with its slice depleted. If
+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after
+ * ops.dispatch() returns. To keep executing @prev, return without
+ * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
+ */
+ void (*dispatch)(s32 cpu, struct task_struct *prev);
+
+ /**
+ * @tick: Periodic tick
+ * @p: task running currently
+ *
+ * This operation is called every 1/HZ seconds on CPUs which are
+ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
+ * immediate dispatch cycle on the CPU.
+ */
+ void (*tick)(struct task_struct *p);
+
+ /**
+ * @runnable: A task is becoming runnable on its associated CPU
+ * @p: task becoming runnable
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * This and the following three functions can be used to track a task's
+ * execution state transitions. A task becomes ->runnable() on a CPU,
+ * and then goes through one or more ->running() and ->stopping() pairs
+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's
+ * done running on the CPU.
+ *
+ * @p is becoming runnable on the CPU because it's
+ *
+ * - waking up (%SCX_ENQ_WAKEUP)
+ * - being moved from another CPU
+ * - being restored after temporarily taken off the queue for an
+ * attribute change.
+ *
+ * This and ->enqueue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be followed by ->enqueue()
+ * e.g. when @p is being dispatched to a remote CPU, or when @p is
+ * being enqueued on a CPU experiencing a hotplug event. Likewise, a
+ * task may be ->enqueue()'d without being preceded by this operation
+ * e.g. after exhausting its slice.
+ */
+ void (*runnable)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @running: A task is starting to run on its associated CPU
+ * @p: task starting to run
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ */
+ void (*running)(struct task_struct *p);
+
+ /**
+ * @stopping: A task is stopping execution
+ * @p: task stopping to run
+ * @runnable: is task @p still runnable?
+ *
+ * See ->runnable() for explanation on the task state notifiers. If
+ * !@runnable, ->quiescent() will be invoked after this operation
+ * returns.
+ */
+ void (*stopping)(struct task_struct *p, bool runnable);
+
+ /**
+ * @quiescent: A task is becoming not runnable on its associated CPU
+ * @p: task becoming not runnable
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ *
+ * @p is becoming quiescent on the CPU because it's
+ *
+ * - sleeping (%SCX_DEQ_SLEEP)
+ * - being moved to another CPU
+ * - being temporarily taken off the queue for an attribute change
+ * (%SCX_DEQ_SAVE)
+ *
+ * This and ->dequeue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be preceded by ->dequeue()
+ * e.g. when @p is being dispatched to a remote CPU.
+ */
+ void (*quiescent)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @yield: Yield CPU
+ * @from: yielding task
+ * @to: optional yield target task
+ *
+ * If @to is NULL, @from is yielding the CPU to other runnable tasks.
+ * The BPF scheduler should ensure that other available tasks are
+ * dispatched before the yielding task. Return value is ignored in this
+ * case.
+ *
+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
+ * scheduler can implement the request, return %true; otherwise, %false.
+ */
+ bool (*yield)(struct task_struct *from, struct task_struct *to);
+
+ /**
+ * @core_sched_before: Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ *
+ * Used by core-sched to determine the ordering between two tasks. See
+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
+ * core-sched.
+ *
+ * Both @a and @b are runnable and may or may not currently be queued on
+ * the BPF scheduler. Should return %true if @a should run before @b.
+ * %false if there's no required ordering or @b should run before @a.
+ *
+ * If not specified, the default is ordering them according to when they
+ * became runnable.
+ */
+ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
+
+ /**
+ * @set_weight: Set task weight
+ * @p: task to set weight for
+ * @weight: new weight [1..10000]
+ *
+ * Update @p's weight to @weight.
+ */
+ void (*set_weight)(struct task_struct *p, u32 weight);
+
+ /**
+ * @set_cpumask: Set CPU affinity
+ * @p: task to set CPU affinity for
+ * @cpumask: cpumask of cpus that @p can run on
+ *
+ * Update @p's CPU affinity to @cpumask.
+ */
+ void (*set_cpumask)(struct task_struct *p,
+ const struct cpumask *cpumask);
+
+ /**
+ * @update_idle: Update the idle state of a CPU
+ * @cpu: CPU to update the idle state for
+ * @idle: whether entering or exiting the idle state
+ *
+ * This operation is called when @cpu enters or leaves the idle
+ * state. By default, implementing this operation disables the built-in
+ * idle CPU tracking and the following helpers become unavailable:
+ *
+ * - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_test_and_clear_cpu_idle()
+ * - scx_bpf_pick_idle_cpu()
+ *
+ * The user also must implement ops.select_cpu() as the default
+ * implementation relies on scx_bpf_select_cpu_dfl().
+ *
+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
+ * tracking.
+ */
+ void (*update_idle)(s32 cpu, bool idle);
+
+ /**
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
+ * @cpu: The CPU being acquired by the BPF scheduler.
+ * @args: Acquire arguments, see the struct definition.
+ *
+ * A CPU that was previously released from the BPF scheduler is now once
+ * again under its control.
+ */
+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+ /**
+ * @cpu_release: A CPU is taken away from the BPF scheduler
+ * @cpu: The CPU being released by the BPF scheduler.
+ * @args: Release arguments, see the struct definition.
+ *
+ * The specified CPU is no longer under the control of the BPF
+ * scheduler. This could be because it was preempted by a higher
+ * priority sched_class, though there may be other reasons as well. The
+ * caller should consult @args->reason to determine the cause.
+ */
+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
+ /**
+ * @init_task: Initialize a task to run in a BPF scheduler
+ * @p: task to initialize for BPF scheduling
+ * @args: init arguments, see the struct definition
+ *
+ * Either we're loading a BPF scheduler or a new task is being forked.
+ * Initialize @p for BPF scheduling. This operation may block and can
+ * be used for allocations, and is called exactly once for a task.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During a fork, it
+ * will abort that specific fork.
+ */
+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+
+ /**
+ * @exit_task: Exit a previously-running task from the system
+ * @p: task to exit
+ * @args: exit arguments, see the struct definition
+ *
+ * @p is exiting or the BPF scheduler is being unloaded. Perform any
+ * necessary cleanup for @p.
+ */
+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+
+ /**
+ * @enable: Enable BPF scheduling for a task
+ * @p: task to enable BPF scheduling for
+ *
+ * Enable @p for BPF scheduling. enable() is called on @p any time it
+ * enters SCX, and is always paired with a matching disable().
+ */
+ void (*enable)(struct task_struct *p);
+
+ /**
+ * @disable: Disable BPF scheduling for a task
+ * @p: task to disable BPF scheduling for
+ *
+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+ * Disable BPF scheduling for @p. A disable() call is always matched
+ * with a prior enable() call.
+ */
+ void (*disable)(struct task_struct *p);
+
+ /**
+ * @dump: Dump BPF scheduler state on error
+ * @ctx: debug dump context
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
+ */
+ void (*dump)(struct scx_dump_ctx *ctx);
+
+ /**
+ * @dump_cpu: Dump BPF scheduler state for a CPU on error
+ * @ctx: debug dump context
+ * @cpu: CPU to generate debug dump for
+ * @idle: @cpu is currently idle without any runnable tasks
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @cpu. If @idle is %true and this operation doesn't produce any
+ * output, @cpu is skipped for dump.
+ */
+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
+
+ /**
+ * @dump_task: Dump BPF scheduler state for a runnable task on error
+ * @ctx: debug dump context
+ * @p: runnable task to generate debug dump for
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @p.
+ */
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /**
+ * @cgroup_init: Initialize a cgroup
+ * @cgrp: cgroup being initialized
+ * @args: init arguments, see the struct definition
+ *
+ * Either the BPF scheduler is being loaded or @cgrp created, initialize
+ * @cgrp for sched_ext. This operation may block.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During cgroup
+ * creation, it will abort the specific cgroup creation.
+ */
+ s32 (*cgroup_init)(struct cgroup *cgrp,
+ struct scx_cgroup_init_args *args);
+
+ /**
+ * @cgroup_exit: Exit a cgroup
+ * @cgrp: cgroup being exited
+ *
+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
+ * @cgrp for sched_ext. This operation may block.
+ */
+ void (*cgroup_exit)(struct cgroup *cgrp);
+
+ /**
+ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Prepare @p for move from cgroup @from to @to. This operation may
+ * block and can be used for allocations.
+ *
+ * Return 0 for success, -errno for failure. An error return aborts the
+ * migration.
+ */
+ s32 (*cgroup_prep_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_move: Commit cgroup move
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Commit the move. @p is dequeued during this operation.
+ */
+ void (*cgroup_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_cancel_move: Cancel cgroup move
+ * @p: task whose cgroup move is being canceled
+ * @from: cgroup @p was being moved from
+ * @to: cgroup @p was being moved to
+ *
+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
+ * Undo the preparation.
+ */
+ void (*cgroup_cancel_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_set_weight: A cgroup's weight is being changed
+ * @cgrp: cgroup whose weight is being updated
+ * @weight: new weight [1..10000]
+ *
+ * Update @cgrp's weight to @weight.
+ */
+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+ /*
+ * All online ops must come before ops.cpu_online().
+ */
+
+ /**
+ * @cpu_online: A CPU became online
+ * @cpu: CPU which just came up
+ *
+ * @cpu just came online. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+ */
+ void (*cpu_online)(s32 cpu);
+
+ /**
+ * @cpu_offline: A CPU is going offline
+ * @cpu: CPU which is going offline
+ *
+ * @cpu is going offline. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+ */
+ void (*cpu_offline)(s32 cpu);
+
+ /*
+ * All CPU hotplug ops must come before ops.init().
+ */
+
+ /**
+ * @init: Initialize the BPF scheduler
+ */
+ s32 (*init)(void);
+
+ /**
+ * @exit: Clean up after the BPF scheduler
+ * @info: Exit info
+ *
+ * ops.exit() is also called on ops.init() failure, which is a bit
+ * unusual. This is to allow rich reporting through @info on how
+ * ops.init() failed.
+ */
+ void (*exit)(struct scx_exit_info *info);
+
+ /**
+ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
+ */
+ u32 dispatch_max_batch;
+
+ /**
+ * @flags: %SCX_OPS_* flags
+ */
+ u64 flags;
+
+ /**
+ * @timeout_ms: The maximum amount of time, in milliseconds, that a
+ * runnable task should be able to wait before being scheduled. The
+ * maximum timeout may not exceed the default timeout of 30 seconds.
+ *
+ * Defaults to the maximum allowed timeout value of 30 seconds.
+ */
+ u32 timeout_ms;
+
+ /**
+ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
+ * value of 32768 is used.
+ */
+ u32 exit_dump_len;
+
+ /**
+ * @hotplug_seq: A sequence number that may be set by the scheduler to
+ * detect when a hotplug event has occurred during the loading process.
+ * If 0, no detection occurs. Otherwise, the scheduler will fail to
+ * load if the sequence number does not match @scx_hotplug_seq on the
+ * enable path.
+ */
+ u64 hotplug_seq;
+
+ /**
+ * @name: BPF scheduler's name
+ *
+ * Must be a non-zero valid BPF object name including only isalnum(),
+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+ * BPF scheduler is enabled.
+ */
+ char name[SCX_OPS_NAME_LEN];
+};
+
+enum scx_opi {
+ SCX_OPI_BEGIN = 0,
+ SCX_OPI_NORMAL_BEGIN = 0,
+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
+ SCX_OPI_END = SCX_OP_IDX(init),
+};
+
+enum scx_wake_flags {
+ /* expose select WF_* flags as enums */
+ SCX_WAKE_FORK = WF_FORK,
+ SCX_WAKE_TTWU = WF_TTWU,
+ SCX_WAKE_SYNC = WF_SYNC,
+};
+
+enum scx_enq_flags {
+ /* expose select ENQUEUE_* flags as enums */
+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
+ SCX_ENQ_HEAD = ENQUEUE_HEAD,
+ SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * Set the following to trigger preemption when calling
+ * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
+ * current task is cleared to zero and the CPU is kicked into the
+ * scheduling path. Implies %SCX_ENQ_HEAD.
+ */
+ SCX_ENQ_PREEMPT = 1LLU << 32,
+
+ /*
+ * The task being enqueued was previously enqueued on the current CPU's
+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
+ * invoked in a ->cpu_release() callback, and the task is again
+ * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the
+ * task will not be scheduled on the CPU until at least the next invocation
+ * of the ->cpu_acquire() callback.
+ */
+ SCX_ENQ_REENQ = 1LLU << 40,
+
+ /*
+ * The task being enqueued is the only task available for the cpu. By
+ * default, ext core keeps executing such tasks but when
+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+ * %SCX_ENQ_LAST flag set.
+ *
+ * The BPF scheduler is responsible for triggering a follow-up
+ * scheduling event. Otherwise, execution may stall.
+ */
+ SCX_ENQ_LAST = 1LLU << 41,
+
+ /* high 8 bits are internal */
+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
+};
+
+enum scx_deq_flags {
+ /* expose select DEQUEUE_* flags as enums */
+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * The generic core-sched layer decided to execute the task even though
+ * it hasn't been dispatched yet. Dequeue from the BPF side.
+ */
+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
+};
+
+enum scx_pick_idle_cpu_flags {
+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+};
+
+enum scx_kick_flags {
+ /*
+ * Kick the target CPU if idle. Guarantees that the target CPU goes
+ * through at least one full scheduling cycle before going idle. If the
+ * target CPU can be determined to be currently not idle and going to go
+ * through a scheduling cycle before going idle, noop.
+ */
+ SCX_KICK_IDLE = 1LLU << 0,
+
+ /*
+ * Preempt the current task and execute the dispatch path. If the
+ * current task of the target CPU is an SCX task, its ->scx.slice is
+ * cleared to zero before the scheduling path is invoked so that the
+ * task expires and the dispatch path is invoked.
+ */
+ SCX_KICK_PREEMPT = 1LLU << 1,
+
+ /*
+ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
+ * return after the target CPU finishes picking the next task.
+ */
+ SCX_KICK_WAIT = 1LLU << 2,
+};
+
+enum scx_tg_flags {
+ SCX_TG_ONLINE = 1U << 0,
+ SCX_TG_INITED = 1U << 1,
+};
+
+enum scx_ops_enable_state {
+ SCX_OPS_ENABLING,
+ SCX_OPS_ENABLED,
+ SCX_OPS_DISABLING,
+ SCX_OPS_DISABLED,
+};
+
+static const char *scx_ops_enable_state_str[] = {
+ [SCX_OPS_ENABLING] = "enabling",
+ [SCX_OPS_ENABLED] = "enabled",
+ [SCX_OPS_DISABLING] = "disabling",
+ [SCX_OPS_DISABLED] = "disabled",
+};
+
+/*
+ * sched_ext_entity->ops_state
+ *
+ * Used to track the task ownership between the SCX core and the BPF scheduler.
+ * State transitions look as follows:
+ *
+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING
+ * ^ | |
+ * | v v
+ * \-------------------------------/
+ *
+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
+ * sites for explanations on the conditions being waited upon and why they are
+ * safe. Transitions out of them into NONE or QUEUED must store_release and the
+ * waiters should load_acquire.
+ *
+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether
+ * any given task can be dispatched by the BPF scheduler at all times and thus
+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
+ * to try to dispatch any task anytime regardless of its state as the SCX core
+ * can safely reject invalid dispatches.
+ */
+enum scx_ops_state {
+ SCX_OPSS_NONE, /* owned by the SCX core */
+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
+
+ /*
+ * QSEQ brands each QUEUED instance so that, when dispatch races
+ * dequeue/requeue, the dispatcher can tell whether it still has a claim
+ * on the task being dispatched.
+ *
+ * As some 32bit archs can't do 64bit store_release/load_acquire,
+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
+ * 32bit machines. The dispatch race window QSEQ protects is very narrow
+ * and runs with IRQ disabled. 30 bits should be sufficient.
+ */
+ SCX_OPSS_QSEQ_SHIFT = 2,
+};
+
+/* Use macros to ensure that the type is unsigned long for the masks */
+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
+
+/*
+ * During exit, a task may schedule after losing its PIDs. When disabling the
+ * BPF scheduler, we need to be able to iterate tasks in every state to
+ * guarantee system safety. Maintain a dedicated task list which contains every
+ * task between its fork and eventual free.
+ */
+static DEFINE_SPINLOCK(scx_tasks_lock);
+static LIST_HEAD(scx_tasks);
+
+/* ops enable/disable */
+static struct kthread_worker *scx_ops_helper;
+static DEFINE_MUTEX(scx_ops_enable_mutex);
+DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
+DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
+static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static unsigned long scx_in_softlockup;
+static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
+static int scx_ops_bypass_depth;
+static bool scx_ops_init_task_enabled;
+static bool scx_switching_all;
+DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+
+static struct sched_ext_ops scx_ops;
+static bool scx_warned_zero_slice;
+
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+
+#ifdef CONFIG_SMP
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+#endif
+
+static struct static_key_false scx_has_op[SCX_OPI_END] =
+ { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
+
+static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
+static struct scx_exit_info *scx_exit_info;
+
+static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
+static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
+
+/*
+ * A monotonically increasing sequence number that is incremented every time a
+ * scheduler is enabled. This can be used to check if any custom sched_ext
+ * scheduler has ever been used in the system.
+ */
+static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
+
+/*
+ * The maximum amount of time in jiffies that a task may be runnable without
+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger
+ * scx_ops_error().
+ */
+static unsigned long scx_watchdog_timeout;
+
+/*
+ * The last time the delayed work was run. This delayed work relies on
+ * ksoftirqd being able to run to service timer interrupts, so it's possible
+ * that this work itself could get wedged. To account for this, we check that
+ * it's not stalled in the timer tick, and trigger an error if it is.
+ */
+static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
+
+static struct delayed_work scx_watchdog_work;
+
+/* idle tracking */
+#ifdef CONFIG_SMP
+#ifdef CONFIG_CPUMASK_OFFSTACK
+#define CL_ALIGNED_IF_ONSTACK
+#else
+#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
+#endif
+
+static struct {
+ cpumask_var_t cpu;
+ cpumask_var_t smt;
+} idle_masks CL_ALIGNED_IF_ONSTACK;
+
+#endif /* CONFIG_SMP */
+
+/* for %SCX_KICK_WAIT */
+static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
+
+/*
+ * Direct dispatch marker.
+ *
+ * Non-NULL values are used for direct dispatch from enqueue path. A valid
+ * pointer points to the task currently being enqueued. An ERR_PTR value is used
+ * to indicate that direct dispatch has already happened.
+ */
+static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
+
+/*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
+ * to avoid live-locking in bypass mode where all tasks are dispatched to
+ * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
+ * sufficient, it can be further split.
+ *
+ * Indexed by NUMA node id, see find_global_dsq().
+ */
+static struct scx_dispatch_q **global_dsqs;
+
+/* hash table layout: entries are scx_dispatch_q's keyed by their ->id */
+static const struct rhashtable_params dsq_hash_params = {
+ .key_len = sizeof_field(struct scx_dispatch_q, id),
+ .key_offset = offsetof(struct scx_dispatch_q, id),
+ .head_offset = offsetof(struct scx_dispatch_q, hash_node),
+};
+
+/* DSQ ID -> DSQ lookup table, see find_user_dsq() */
+static struct rhashtable dsq_hash;
+/* NOTE(review): presumably DSQs queued for deferred freeing - confirm */
+static LLIST_HEAD(dsqs_to_free);
+
+/* dispatch buf */
+/* one buffered dispatch: the task, a qseq value and the target DSQ + flags */
+struct scx_dsp_buf_ent {
+ struct task_struct *task;
+ unsigned long qseq;
+ u64 dsq_id;
+ u64 enq_flags;
+};
+
+/* NOTE(review): presumably the number of entries in scx_dsp_ctx->buf[] - confirm */
+static u32 scx_dsp_max_batch;
+
+/* per-CPU dispatch context; @buf is a flexible array sized at allocation time */
+struct scx_dsp_ctx {
+ struct rq *rq;
+ u32 cursor;
+ u32 nr_tasks;
+ struct scx_dsp_buf_ent buf[];
+};
+
+static struct scx_dsp_ctx __percpu *scx_dsp_ctx;
+
+/* string formatting from BPF */
+/*
+ * @data holds up to %MAX_BPRINTF_VARARGS u64 arguments and @line is a
+ * %SCX_EXIT_MSG_LEN byte text buffer.
+ */
+struct scx_bstr_buf {
+ u64 data[MAX_BPRINTF_VARARGS];
+ char line[SCX_EXIT_MSG_LEN];
+};
+
+/* NOTE(review): the lock presumably serializes users of scx_exit_bstr_buf - confirm */
+static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
+static struct scx_bstr_buf scx_exit_bstr_buf;
+
+/* ops debug dump */
+/* state of an in-progress debug dump, including its own formatting buffer */
+struct scx_dump_data {
+ s32 cpu;
+ bool first;
+ s32 cursor;
+ struct seq_buf *s;
+ const char *prefix;
+ struct scx_bstr_buf buf;
+};
+
+/* ->cpu starts at -1; NOTE(review): presumably meaning "no CPU" - confirm */
+static struct scx_dump_data scx_dump_data = {
+ .cpu = -1,
+};
+
+/* /sys/kernel/sched_ext interface */
+/* kset and root kobject backing the sysfs directory named above */
+static struct kset *scx_kset;
+static struct kobject *scx_root_kobj;
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched_ext.h>
+
+/* forward declarations of functions defined later in this file */
+static void process_ddsp_deferred_locals(struct rq *rq);
+static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
+/* printf-style: @fmt is argument 3, the variadic arguments start at 4 */
+static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
+ s64 exit_code,
+ const char *fmt, ...);
+
+/* exit with kind @err and an exit code of 0 */
+#define scx_ops_error_kind(err, fmt, args...) \
+ scx_ops_exit_kind((err), 0, fmt, ##args)
+
+/* exit with kind %SCX_EXIT_UNREG_KERN and exit code @code */
+#define scx_ops_exit(code, fmt, args...) \
+ scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
+
+/* exit with kind %SCX_EXIT_ERROR and a printf-style message */
+#define scx_ops_error(fmt, args...) \
+ scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
+
+/* test the static branch in scx_has_op[] that corresponds to ops.@op */
+#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
+
+/*
+ * Signed distance from @now to @at in milliseconds: positive when @at lies
+ * after @now, zero or negative otherwise.
+ */
+static long jiffies_delta_msecs(unsigned long at, unsigned long now)
+{
+	if (!time_after(at, now))
+		return -(long)jiffies_to_msecs(now - at);
+
+	return jiffies_to_msecs(at - now);
+}
+
+/*
+ * If the highest set bit is N, return a mask with bits [N+1, 31] set. Returns
+ * all ones for @flags == 0 and 0 when bit 31 is set.
+ */
+static u32 higher_bits(u32 flags)
+{
+	/*
+	 * fls() can return up to 32. Shift in 64 bits, like highest_bit() does,
+	 * so that neither a shift by the full width of an int nor a signed
+	 * overflow into the sign bit can occur.
+	 */
+	return ~(u32)(((u64)1 << fls(flags)) - 1);
+}
+
+/* isolate the most significant set bit of @flags, 0 if @flags is 0 */
+static u32 highest_bit(u32 flags)
+{
+	u64 one_above = (u64)1 << fls(flags);
+
+	return one_above >> 1;
+}
+
+/* wrap-safe "@a comes before @b" test for u32 sequence numbers */
+static bool u32_before(u32 a, u32 b)
+{
+	s32 delta = (s32)(a - b);
+
+	return delta < 0;
+}
+
+/* return the global DSQ of the NUMA node that @p's CPU belongs to */
+static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
+{
+	int node = cpu_to_node(task_cpu(p));
+
+	return global_dsqs[node];
+}
+
+/* look @dsq_id up in dsq_hash and return whatever the lookup yields */
+static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
+{
+	struct scx_dispatch_q *dsq;
+
+	dsq = rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
+	return dsq;
+}
+
+/*
+ * scx_kf_mask enforcement. Certain kfuncs may only be invoked from specific SCX
+ * ops. SCX_CALL_OP[_RET]() is used when invoking SCX ops to record which kfuncs
+ * are permitted, and those kfuncs call scx_kf_allowed() to verify that they are
+ * running from a permitted context.
+ *
+ * @mask is constant, always inline to cull the mask calculations.
+ */
+static __always_inline void scx_kf_allow(u32 mask)
+{
+	/* nesting is only legal in increasing scx_kf_mask order */
+	u32 clash = (mask | higher_bits(mask)) & current->scx.kf_mask;
+
+	WARN_ONCE(clash,
+		  "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
+		  current->scx.kf_mask, mask);
+
+	current->scx.kf_mask |= mask;
+	barrier();
+}
+
+/* counterpart of scx_kf_allow(): drop @mask from current's kf_mask */
+static void scx_kf_disallow(u32 mask)
+{
+	barrier();
+	current->scx.kf_mask = current->scx.kf_mask & ~mask;
+}
+
+/*
+ * Invoke scx_ops.@op(@args). When @mask is non-zero, the call is bracketed by
+ * scx_kf_allow(@mask) and scx_kf_disallow(@mask); with a zero @mask the op is
+ * called directly. SCX_CALL_OP() discards the return value while
+ * SCX_CALL_OP_RET() evaluates to it.
+ */
+#define SCX_CALL_OP(mask, op, args...) \
+do { \
+ if (mask) { \
+ scx_kf_allow(mask); \
+ scx_ops.op(args); \
+ scx_kf_disallow(mask); \
+ } else { \
+ scx_ops.op(args); \
+ } \
+} while (0)
+
+#define SCX_CALL_OP_RET(mask, op, args...) \
+({ \
+ __typeof__(scx_ops.op(args)) __ret; \
+ if (mask) { \
+ scx_kf_allow(mask); \
+ __ret = scx_ops.op(args); \
+ scx_kf_disallow(mask); \
+ } else { \
+ __ret = scx_ops.op(args); \
+ } \
+ __ret; \
+})
+
+/*
+ * Some kfuncs are allowed only on the tasks that are subjects of the
+ * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such
+ * restrictions, the following SCX_CALL_OP_*() variants should be used when
+ * invoking scx_ops operations that take task arguments. These can only be used
+ * for non-nesting operations due to the way the tasks are tracked.
+ *
+ * kfuncs which can only operate on such tasks can in turn use
+ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
+ * the specific task.
+ *
+ * The subject tasks are recorded in current->scx.kf_tasks[] for the duration
+ * of the call and cleared afterwards. @mask must be within
+ * %__SCX_KF_TERMINAL, which is enforced at build time.
+ */
+/* single subject task, no return value */
+#define SCX_CALL_OP_TASK(mask, op, task, args...) \
+do { \
+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ current->scx.kf_tasks[0] = task; \
+ SCX_CALL_OP(mask, op, task, ##args); \
+ current->scx.kf_tasks[0] = NULL; \
+} while (0)
+
+/* single subject task, evaluates to the op's return value */
+#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \
+({ \
+ __typeof__(scx_ops.op(task, ##args)) __ret; \
+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ current->scx.kf_tasks[0] = task; \
+ __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \
+ current->scx.kf_tasks[0] = NULL; \
+ __ret; \
+})
+
+/* two subject tasks, evaluates to the op's return value */
+#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \
+({ \
+ __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \
+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
+ current->scx.kf_tasks[0] = task0; \
+ current->scx.kf_tasks[1] = task1; \
+ __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \
+ current->scx.kf_tasks[0] = NULL; \
+ current->scx.kf_tasks[1] = NULL; \
+ __ret; \
+})
+
+/*
+ * Test whether a kfunc which requires @mask may run in the current context.
+ * Returns %true if so. Otherwise, triggers scx_ops_error() and returns %false.
+ *
+ * @mask is constant, always inline to cull unnecessary branches
+ */
+static __always_inline bool scx_kf_allowed(u32 mask)
+{
+ /* none of the bits in @mask is currently allowed */
+ if (unlikely(!(current->scx.kf_mask & mask))) {
+ scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
+ return false;
+ }
+
+ /*
+ * Enforce nesting boundaries. e.g. A kfunc which can be called from
+ * DISPATCH must not be called if we're running DEQUEUE which is nested
+ * inside ops.dispatch(). We don't need to check boundaries for any
+ * blocking kfuncs as the verifier ensures they're only called from
+ * sleepable progs.
+ */
+ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
+ (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
+ scx_ops_error("cpu_release kfunc called from a nested operation");
+ return false;
+ }
+
+ if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
+ (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
+ scx_ops_error("dispatch kfunc called from a nested operation");
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Like scx_kf_allowed() but additionally require @p to be one of the tasks
+ * recorded in current->scx.kf_tasks[]. See SCX_CALL_OP_TASK().
+ */
+static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
+							 struct task_struct *p)
+{
+	bool is_arg_task;
+
+	if (!scx_kf_allowed(mask))
+		return false;
+
+	is_arg_task = p == current->scx.kf_tasks[0] ||
+		      p == current->scx.kf_tasks[1];
+	if (likely(is_arg_task))
+		return true;
+
+	scx_ops_error("called on a task not being operated on");
+	return false;
+}
+
+/* %true iff current's kf_mask is empty, i.e. no SCX_CALL_OP*() mask is set */
+static bool scx_kf_allowed_if_unlocked(void)
+{
+	return current->scx.kf_mask == 0;
+}
+
+/**
+ * nldsq_next_task - Iterate to the next task in a non-local DSQ
+ * @dsq: user dsq being iterated
+ * @cur: current position, %NULL to start iteration
+ * @rev: walk backwards
+ *
+ * Must be called with @dsq->lock held. BPF iteration cursors linked on the
+ * list are skipped. Returns %NULL when iteration is finished.
+ */
+static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
+					   struct task_struct *cur, bool rev)
+{
+	struct list_head *pos;
+
+	lockdep_assert_held(&dsq->lock);
+
+	/* start from @cur if given, from the list head otherwise */
+	pos = cur ? &cur->scx.dsq_list.node : &dsq->list;
+
+	for (;;) {
+		struct scx_dsq_list_node *lnode;
+
+		pos = rev ? pos->prev : pos->next;
+
+		/* wrapped around to the head, nothing left */
+		if (pos == &dsq->list)
+			return NULL;
+
+		lnode = container_of(pos, struct scx_dsq_list_node, node);
+
+		/* anything that isn't an iteration cursor is a task */
+		if (!(lnode->flags & SCX_DSQ_LNODE_ITER_CURSOR))
+			return container_of(lnode, struct task_struct,
+					    scx.dsq_list);
+	}
+}
+
+/*
+ * Walk the tasks on non-local @dsq in forward order with @p as the loop
+ * cursor. Built on nldsq_next_task() and thus requires @dsq->lock to be held.
+ */
+#define nldsq_for_each_task(p, dsq) \
+ for ((p) = nldsq_next_task((dsq), NULL, false); (p); \
+ (p) = nldsq_next_task((dsq), (p), false))
+
+
+/*
+ * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
+ * dispatch order. BPF-visible iterator is opaque and larger to allow future
+ * changes without breaking backward compatibility. Can be used with
+ * bpf_for_each(). See bpf_iter_scx_dsq_*().
+ */
+enum scx_dsq_iter_flags {
+ /* iterate in the reverse dispatch order */
+ SCX_DSQ_ITER_REV = 1U << 16,
+
+ /*
+ * NOTE(review): internal flags; presumably they mark the slice / vtime
+ * fields of struct bpf_iter_scx_dsq_kern as set - confirm.
+ */
+ __SCX_DSQ_ITER_HAS_SLICE = 1U << 30,
+ __SCX_DSQ_ITER_HAS_VTIME = 1U << 31,
+
+ /* flags a user may pass in, and the union of all defined flags */
+ __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV,
+ __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS |
+ __SCX_DSQ_ITER_HAS_SLICE |
+ __SCX_DSQ_ITER_HAS_VTIME,
+};
+
+/*
+ * Kernel-side layout of the DSQ iterator: a list cursor node, the DSQ being
+ * walked and slice / vtime values.
+ */
+struct bpf_iter_scx_dsq_kern {
+ struct scx_dsq_list_node cursor;
+ struct scx_dispatch_q *dsq;
+ u64 slice;
+ u64 vtime;
+} __attribute__((aligned(8)));
+
+/*
+ * Opaque BPF-visible counterpart, six u64s with 8-byte alignment.
+ * NOTE(review): the kernel-side struct presumably has to fit in this - confirm
+ * that a size check exists elsewhere.
+ */
+struct bpf_iter_scx_dsq {
+ u64 __opaque[6];
+} __attribute__((aligned(8)));
+
+
+/*
+ * SCX task iterator.
+ *
+ * @cursor: placeholder entity (%SCX_TASK_CURSOR) linked into scx_tasks to mark
+ *	    the current position
+ * @locked: task whose rq lock the iterator currently holds, %NULL if none
+ * @rq, @rf: rq and flags from task_rq_lock() on @locked
+ * @cnt: number of scx_task_iter_next() calls, used to batch lock breaks
+ */
+struct scx_task_iter {
+ struct sched_ext_entity cursor;
+ struct task_struct *locked;
+ struct rq *rq;
+ struct rq_flags rf;
+ u32 cnt;
+};
+
+/**
+ * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
+ * @iter: iterator to init
+ *
+ * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
+ * must eventually be stopped with scx_task_iter_stop().
+ *
+ * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
+ * between this and the first next() call or between any two next() calls. If
+ * the locks are released between two next() calls, the caller is responsible
+ * for ensuring that the task being iterated remains accessible either through
+ * RCU read lock or obtaining a reference count.
+ *
+ * All tasks which existed when the iteration started are guaranteed to be
+ * visited as long as they still exist.
+ */
+static void scx_task_iter_start(struct scx_task_iter *iter)
+{
+ /*
+ * Build-time check that none of the DSQ iterator flags fall within the
+ * bits below %__SCX_DSQ_LNODE_PRIV_SHIFT.
+ */
+ BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
+ ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
+
+ spin_lock_irq(&scx_tasks_lock);
+
+ /* the cursor goes at the head of scx_tasks so that the walk starts there */
+ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
+ list_add(&iter->cursor.tasks_node, &scx_tasks);
+ iter->locked = NULL;
+ iter->cnt = 0;
+}
+
+/* release the rq lock of the task being visited, if @iter is holding one */
+static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
+{
+	if (!iter->locked)
+		return;
+
+	task_rq_unlock(iter->rq, iter->locked, &iter->rf);
+	iter->locked = NULL;
+}
+
+/**
+ * scx_task_iter_unlock - Unlock rq and scx_tasks_lock held by a task iterator
+ * @iter: iterator to unlock
+ *
+ * If @iter is in the middle of a locked iteration, it may be locking the rq of
+ * the task currently being visited in addition to scx_tasks_lock. Unlock both.
+ * This function can be safely called anytime during an iteration.
+ */
+static void scx_task_iter_unlock(struct scx_task_iter *iter)
+{
+ /* the rq lock, if held, is dropped first, then scx_tasks_lock */
+ __scx_task_iter_rq_unlock(iter);
+ spin_unlock_irq(&scx_tasks_lock);
+}
+
+/**
+ * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
+ * @iter: iterator to re-lock
+ *
+ * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
+ * doesn't re-lock the rq lock. Must be called before other iterator operations.
+ */
+static void scx_task_iter_relock(struct scx_task_iter *iter)
+{
+ /* @iter itself is not touched, only the global lock is re-taken */
+ spin_lock_irq(&scx_tasks_lock);
+}
+
+/**
+ * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock
+ * @iter: iterator to exit
+ *
+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held
+ * which is released on return. If the iterator holds a task's rq lock, that rq
+ * lock is also released. See scx_task_iter_start() for details.
+ */
+static void scx_task_iter_stop(struct scx_task_iter *iter)
+{
+ /* take the cursor off scx_tasks while scx_tasks_lock is still held */
+ list_del_init(&iter->cursor.tasks_node);
+ scx_task_iter_unlock(iter);
+}
+
+/**
+ * scx_task_iter_next - Next task
+ * @iter: iterator to walk
+ *
+ * Visit the next task. See scx_task_iter_start() for details. Locks are dropped
+ * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
+ * stalls by holding scx_tasks_lock for too long.
+ *
+ * Returns the next task or %NULL when the end of scx_tasks is reached.
+ */
+static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
+{
+ struct list_head *cursor = &iter->cursor.tasks_node;
+ struct sched_ext_entity *pos;
+
+ /* periodic lock break, any held rq lock is released too */
+ if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
+ scx_task_iter_unlock(iter);
+ cond_resched();
+ scx_task_iter_relock(iter);
+ }
+
+ /*
+ * Walk forward starting right after our cursor. As the walk starts from
+ * the cursor rather than the list head, the scx_tasks head shows up as
+ * an entry and marks the end of the iteration.
+ */
+ list_for_each_entry(pos, cursor, tasks_node) {
+ if (&pos->tasks_node == &scx_tasks)
+ return NULL;
+ /* skip cursors, which may belong to other iterators */
+ if (!(pos->flags & SCX_TASK_CURSOR)) {
+ /* park the cursor right after the task being returned */
+ list_move(cursor, &pos->tasks_node);
+ return container_of(pos, struct task_struct, scx);
+ }
+ }
+
+ /* can't happen, should always terminate at scx_tasks above */
+ BUG();
+}
+
+/**
+ * scx_task_iter_next_locked - Next non-idle task with its rq locked
+ * @iter: iterator to walk
+ *
+ * Visit the next task which is not on the idle sched_class and return it with
+ * its rq lock held. The rq lock taken for the previously visited task, if any,
+ * is released first. Returns %NULL when there are no more tasks, in which case
+ * no rq lock is held. See scx_task_iter_start() for details.
+ */
+static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
+{
+ struct task_struct *p;
+
+ __scx_task_iter_rq_unlock(iter);
+
+ while ((p = scx_task_iter_next(iter))) {
+ /*
+ * scx_task_iter is used to prepare and move tasks into SCX
+ * while loading the BPF scheduler and vice-versa while
+ * unloading. The init_tasks ("swappers") should be excluded
+ * from the iteration because:
+ *
+ * - It's unsafe to use __setscheduler_prio() on an init_task to
+ * determine the sched_class to use as it won't preserve its
+ * idle_sched_class.
+ *
+ * - ops.init/exit_task() can easily be confused if called with
+ * init_tasks as they, e.g., share PID 0.
+ *
+ * As init_tasks are never scheduled through SCX, they can be
+ * skipped safely. Note that is_idle_task() which tests %PF_IDLE
+ * doesn't work here:
+ *
+ * - %PF_IDLE may not be set for an init_task whose CPU hasn't
+ * yet been onlined.
+ *
+ * - %PF_IDLE can be set on tasks that are not init_tasks. See
+ * play_idle_precise() used by CONFIG_IDLE_INJECT.
+ *
+ * Test for idle_sched_class as only init_tasks are on it.
+ */
+ if (p->sched_class != &idle_sched_class)
+ break;
+ }
+ if (!p)
+ return NULL;
+
+ /* remember what got locked so that it can be unlocked later */
+ iter->rq = task_rq_lock(p, &iter->rf);
+ iter->locked = p;
+
+ return p;
+}
+
+/* read the current value of scx_ops_enable_state_var */
+static enum scx_ops_enable_state scx_ops_enable_state(void)
+{
+ return atomic_read(&scx_ops_enable_state_var);
+}
+
+/* unconditionally switch the enable state to @to and return the old state */
+static enum scx_ops_enable_state
+scx_ops_set_enable_state(enum scx_ops_enable_state to)
+{
+ return atomic_xchg(&scx_ops_enable_state_var, to);
+}
+
+/*
+ * Switch the enable state to @to only if it currently is @from. Returns %true
+ * if the switch happened.
+ */
+static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
+					 enum scx_ops_enable_state from)
+{
+	/* atomic_try_cmpxchg() wants a pointer to a plain int */
+	int expected = from;
+	bool swapped;
+
+	swapped = atomic_try_cmpxchg(&scx_ops_enable_state_var, &expected, to);
+	return swapped;
+}
+
+/* is %SCX_RQ_BYPASSING set on @rq? Hinted as the unlikely case. */
+static bool scx_rq_bypassing(struct rq *rq)
+{
+	bool bypassing = rq->scx.flags & SCX_RQ_BYPASSING;
+
+	return unlikely(bypassing);
+}
+
+/**
+ * wait_ops_state - Busy-wait the specified ops state to end
+ * @p: target task
+ * @opss: state to wait the end of
+ *
+ * Busy-wait for @p to transition out of @opss. This can only be used when the
+ * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also
+ * has load_acquire semantics to ensure that the caller can see the updates made
+ * in the enqueueing and dispatching paths.
+ */
+static void wait_ops_state(struct task_struct *p, unsigned long opss)
+{
+ /* cpu_relax() runs at least once, before the first re-read of the state */
+ do {
+ cpu_relax();
+ } while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
+}
+
+/**
+ * ops_cpu_valid - Verify a cpu number
+ * @cpu: cpu number which came from a BPF ops
+ * @where: extra information reported on error
+ *
+ * @cpu originates from the BPF scheduler and may hold any value. Check that it
+ * is non-negative, below nr_cpu_ids and a possible CPU. Returns %true if so.
+ * Otherwise, triggers an ops error, which mentions @where if non-%NULL, and
+ * returns %false.
+ */
+static bool ops_cpu_valid(s32 cpu, const char *where)
+{
+	bool valid = cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu);
+
+	if (unlikely(!valid)) {
+		scx_ops_error("invalid CPU %d%s%s", cpu,
+			      where ? " " : "", where ?: "");
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * ops_sanitize_err - Sanitize a -errno value
+ * @ops_name: operation to blame on failure
+ * @err: -errno value to sanitize
+ *
+ * Check that @err lies in [-%MAX_ERRNO, -1] and return it unchanged if so.
+ * Anything else triggers scx_ops_error() and is replaced with -%EPROTO. This
+ * matters because a rogue -errno travelling up the call chain can cause
+ * misbehaviors. For example, a large negative return from ops.init_task()
+ * triggers an oops because the value fails the IS_ERR() test after being
+ * encoded with ERR_PTR() and then is handled as a pointer.
+ */
+static int ops_sanitize_err(const char *ops_name, s32 err)
+{
+	if (err >= 0 || err < -MAX_ERRNO) {
+		scx_ops_error("ops.%s() returned an invalid errno %d",
+			      ops_name, err);
+		return -EPROTO;
+	}
+
+	return err;
+}
+
+/* run @rq's deferred actions, currently the deferred local DSQ dispatches */
+static void run_deferred(struct rq *rq)
+{
+ process_ddsp_deferred_locals(rq);
+}
+
+#ifdef CONFIG_SMP
+/* balance callback queued by schedule_deferred(), SMP only */
+static void deferred_bal_cb_workfn(struct rq *rq)
+{
+ run_deferred(rq);
+}
+#endif
+
+/*
+ * irq_work handler queued by schedule_deferred(). Recover the rq from the
+ * embedded irq_work and run its deferred actions with the rq lock held.
+ */
+static void deferred_irq_workfn(struct irq_work *irq_work)
+{
+ struct rq *rq = container_of(irq_work, struct rq, scx.deferred_irq_work);
+
+ raw_spin_rq_lock(rq);
+ run_deferred(rq);
+ raw_spin_rq_unlock(rq);
+}
+
+/**
+ * schedule_deferred - Schedule execution of deferred actions on an rq
+ * @rq: target rq
+ *
+ * Schedule execution of deferred actions on @rq. Must be called with @rq
+ * locked. Deferred actions are executed with @rq locked but unpinned, and thus
+ * can unlock @rq to e.g. migrate tasks to other rqs.
+ *
+ * The wakeup and balance shortcuts below only exist on %CONFIG_SMP builds.
+ * Without it, an irq work is always queued.
+ */
+static void schedule_deferred(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+
+#ifdef CONFIG_SMP
+ /*
+ * If in the middle of waking up a task, task_woken_scx() will be called
+ * afterwards which will then run the deferred actions, no need to
+ * schedule anything.
+ */
+ if (rq->scx.flags & SCX_RQ_IN_WAKEUP)
+ return;
+
+ /*
+ * If in balance, the balance callbacks will be called before rq lock is
+ * released. Schedule one.
+ */
+ if (rq->scx.flags & SCX_RQ_IN_BALANCE) {
+ queue_balance_callback(rq, &rq->scx.deferred_bal_cb,
+ deferred_bal_cb_workfn);
+ return;
+ }
+#endif
+ /*
+ * No scheduler hooks available. Queue an irq work. They are executed on
+ * IRQ re-enable which may take a bit longer than the scheduler hooks.
+ * The above WAKEUP and BALANCE paths should cover most of the cases and
+ * the time to IRQ re-enable shouldn't be long.
+ */
+ irq_work_queue(&rq->scx.deferred_irq_work);
+}
+
+/**
+ * touch_core_sched - Update timestamp used for core-sched task ordering
+ * @rq: rq to read clock from, must be locked
+ * @p: task to update the timestamp for
+ *
+ * Stamp @p->scx.core_sched_at with the current clock of @rq's CPU. The
+ * timestamp is consumed by scx_prio_less() to implement global or local-DSQ
+ * FIFO ordering for core-sched. Should be called when a task becomes runnable
+ * and when its turn on the CPU ends (e.g. slice exhaustion). No-op without
+ * %CONFIG_SCHED_CORE.
+ */
+static void touch_core_sched(struct rq *rq, struct task_struct *p)
+{
+	lockdep_assert_rq_held(rq);
+
+#ifdef CONFIG_SCHED_CORE
+	/*
+	 * Spurious timestamp updates are harmless, so the cheaper
+	 * sched_core_disabled() test is used instead of enabled().
+	 */
+	if (sched_core_disabled())
+		return;
+
+	/*
+	 * As this determines ordering between tasks of sibling CPUs, a
+	 * per-core dispatch sequence may be a better fit than a timestamp.
+	 */
+	p->scx.core_sched_at = sched_clock_cpu(cpu_of(rq));
+#endif
+}
+
+/**
+ * touch_core_sched_dispatch - Update core-sched timestamp on dispatch
+ * @rq: rq to read clock from, must be locked
+ * @p: task being dispatched
+ *
+ * When the BPF scheduler provides its own core-sched ordering through
+ * ops.core_sched_before(), @p->scx.core_sched_at implements FIFO ordering
+ * within each local DSQ. Called from the dispatch paths, this refreshes the
+ * timestamp only if such custom ordering is in effect.
+ */
+static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
+{
+	lockdep_assert_rq_held(rq);
+
+#ifdef CONFIG_SCHED_CORE
+	if (!SCX_HAS_OP(core_sched_before))
+		return;
+
+	touch_core_sched(rq, p);
+#endif
+}
+
+/*
+ * Account the runtime of @rq's current task and charge it against the task's
+ * slice unless the slice is infinite. When the slice hits zero, the core-sched
+ * timestamp is refreshed.
+ */
+static void update_curr_scx(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	s64 delta_exec = update_curr_common(rq);
+
+	if (unlikely(delta_exec <= 0))
+		return;
+
+	/* infinite slices are never consumed */
+	if (curr->scx.slice == SCX_SLICE_INF)
+		return;
+
+	/* clamp the charge so that the slice can't drop below zero */
+	curr->scx.slice -= min_t(u64, curr->scx.slice, delta_exec);
+	if (!curr->scx.slice)
+		touch_core_sched(rq, curr);
+}
+
+/* rb_add() ordering callback: the task with the earlier dsq_vtime sorts first */
+static bool scx_dsq_priq_less(struct rb_node *node_a,
+			      const struct rb_node *node_b)
+{
+	const struct task_struct *lhs, *rhs;
+
+	lhs = container_of(node_a, struct task_struct, scx.dsq_priq);
+	rhs = container_of(node_b, struct task_struct, scx.dsq_priq);
+
+	return time_before64(lhs->scx.dsq_vtime, rhs->scx.dsq_vtime);
+}
+
+/* adjust @dsq's task count by @delta, which may be negative */
+static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
+{
+ /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
+ WRITE_ONCE(dsq->nr, dsq->nr + delta);
+}
+
+/*
+ * Link @p onto @dsq according to @enq_flags.
+ *
+ * For anything other than a local DSQ, @dsq->lock is taken here and released
+ * before returning. If @dsq turns out to be destroyed, an ops error is raised
+ * and @p goes to the global DSQ instead. %SCX_ENQ_DSQ_PRIQ selects vtime
+ * ordered insertion and is rejected for built-in DSQs. Otherwise, @p is added
+ * to the head (%SCX_ENQ_HEAD or %SCX_ENQ_PREEMPT) or the tail of the FIFO.
+ *
+ * With %SCX_ENQ_CLEAR_OPSS, @p's ops_state is reset to %SCX_OPSS_NONE. When
+ * the target is a local DSQ, the owning rq's current task may be preempted or
+ * rescheduled.
+ */
+static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
+ u64 enq_flags)
+{
+ bool is_local = dsq->id == SCX_DSQ_LOCAL;
+
+ /* @p must not already be on any DSQ list or priority queue */
+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
+ WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) ||
+ !RB_EMPTY_NODE(&p->scx.dsq_priq));
+
+ if (!is_local) {
+ raw_spin_lock(&dsq->lock);
+ if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
+ scx_ops_error("attempting to dispatch to a destroyed dsq");
+ /* fall back to the global dsq */
+ raw_spin_unlock(&dsq->lock);
+ dsq = find_global_dsq(p);
+ raw_spin_lock(&dsq->lock);
+ }
+ }
+
+ if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) &&
+ (enq_flags & SCX_ENQ_DSQ_PRIQ))) {
+ /*
+ * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from
+ * their FIFO queues. To avoid confusion and accidentally
+ * starving vtime-dispatched tasks by FIFO-dispatched tasks, we
+ * disallow any internal DSQ from doing vtime ordering of
+ * tasks.
+ */
+ scx_ops_error("cannot use vtime ordering for built-in DSQs");
+ enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
+ }
+
+ if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
+ struct rb_node *rbp;
+
+ /*
+ * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are
+ * linked to both the rbtree and list on PRIQs, this can only be
+ * tested easily when adding the first task.
+ */
+ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
+ nldsq_next_task(dsq, NULL, false)))
+ scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
+ dsq->id);
+
+ p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
+ rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
+
+ /*
+ * Find the previous task and insert after it on the list so
+ * that @dsq->list is vtime ordered.
+ */
+ rbp = rb_prev(&p->scx.dsq_priq);
+ if (rbp) {
+ struct task_struct *prev =
+ container_of(rbp, struct task_struct,
+ scx.dsq_priq);
+ list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
+ } else {
+ /* no predecessor in the rbtree, @p goes first on the list */
+ list_add(&p->scx.dsq_list.node, &dsq->list);
+ }
+ } else {
+ /* a FIFO DSQ shouldn't be using PRIQ enqueuing */
+ if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
+ scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
+ dsq->id);
+
+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
+ list_add(&p->scx.dsq_list.node, &dsq->list);
+ else
+ list_add_tail(&p->scx.dsq_list.node, &dsq->list);
+ }
+
+ /* seq records the order tasks are queued, used by BPF DSQ iterator */
+ dsq->seq++;
+ p->scx.dsq_seq = dsq->seq;
+
+ dsq_mod_nr(dsq, 1);
+ p->scx.dsq = dsq;
+
+ /*
+ * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
+ * direct dispatch path, but we clear them here because the direct
+ * dispatch verdict may be overridden on the enqueue path during e.g.
+ * bypass.
+ */
+ p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
+ p->scx.ddsp_enq_flags = 0;
+
+ /*
+ * We're transitioning out of QUEUEING or DISPATCHING. store_release to
+ * match waiters' load_acquire.
+ */
+ if (enq_flags & SCX_ENQ_CLEAR_OPSS)
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+
+ if (is_local) {
+ struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+ bool preempt = false;
+
+ /* preemption: zero the slice of the running SCX task */
+ if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
+ rq->curr->sched_class == &ext_sched_class) {
+ rq->curr->scx.slice = 0;
+ preempt = true;
+ }
+
+ /* also resched if the current task's class ranks below SCX */
+ if (preempt || sched_class_above(&ext_sched_class,
+ rq->curr->sched_class))
+ resched_curr(rq);
+ } else {
+ raw_spin_unlock(&dsq->lock);
+ }
+}
+
+/*
+ * Take @p off @dsq's list and, if it's on it, the priority rbtree, and
+ * decrement @dsq's task count. @p->scx.dsq is left for the caller to update.
+ */
+static void task_unlink_from_dsq(struct task_struct *p,
+ struct scx_dispatch_q *dsq)
+{
+ WARN_ON_ONCE(list_empty(&p->scx.dsq_list.node));
+
+ if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) {
+ rb_erase(&p->scx.dsq_priq, &dsq->priq);
+ RB_CLEAR_NODE(&p->scx.dsq_priq);
+ p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ;
+ }
+
+ list_del_init(&p->scx.dsq_list.node);
+ dsq_mod_nr(dsq, -1);
+}
+
+/*
+ * Remove @p from whichever DSQ it is on, if any. @dsq->lock is taken here
+ * unless the DSQ is @rq's local DSQ. Also handles @p sitting on @rq's
+ * ddsp_deferred_locals list and @p being in the middle of a transfer
+ * (holding_cpu set).
+ */
+static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
+{
+ struct scx_dispatch_q *dsq = p->scx.dsq;
+ bool is_local = dsq == &rq->scx.local_dsq;
+
+ if (!dsq) {
+ /*
+ * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
+ * Unlinking is all that's needed to cancel.
+ */
+ if (unlikely(!list_empty(&p->scx.dsq_list.node)))
+ list_del_init(&p->scx.dsq_list.node);
+
+ /*
+ * When dispatching directly from the BPF scheduler to a local
+ * DSQ, the task isn't associated with any DSQ but
+ * @p->scx.holding_cpu may be set under the protection of
+ * %SCX_OPSS_DISPATCHING.
+ */
+ if (p->scx.holding_cpu >= 0)
+ p->scx.holding_cpu = -1;
+
+ return;
+ }
+
+ if (!is_local)
+ raw_spin_lock(&dsq->lock);
+
+ /*
+ * Now that we hold @dsq->lock, @p->scx.holding_cpu and @p->scx.dsq_*
+ * can't change underneath us.
+ */
+ if (p->scx.holding_cpu < 0) {
+ /* @p must still be on @dsq, dequeue */
+ task_unlink_from_dsq(p, dsq);
+ } else {
+ /*
+ * We're racing against dispatch_to_local_dsq() which already
+ * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
+ * holding_cpu which tells dispatch_to_local_dsq() that it lost
+ * the race.
+ */
+ WARN_ON_ONCE(!list_empty(&p->scx.dsq_list.node));
+ p->scx.holding_cpu = -1;
+ }
+ p->scx.dsq = NULL;
+
+ if (!is_local)
+ raw_spin_unlock(&dsq->lock);
+}
+
+/*
+ * Translate @dsq_id into the DSQ that @p should be dispatched to:
+ *
+ * - %SCX_DSQ_LOCAL maps to @rq's local DSQ.
+ * - %SCX_DSQ_LOCAL_ON | cpu maps to that CPU's local DSQ.
+ * - %SCX_DSQ_GLOBAL maps to the global DSQ for @p.
+ * - Anything else is looked up as a user DSQ.
+ *
+ * An invalid CPU or a non-existent DSQ triggers an ops error and falls back to
+ * the global DSQ. Never returns %NULL as long as the global DSQ exists.
+ */
+static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
+						    struct task_struct *p)
+{
+	struct scx_dispatch_q *dsq;
+
+	if (dsq_id == SCX_DSQ_LOCAL)
+		return &rq->scx.local_dsq;
+
+	if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
+		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+
+		if (ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
+			return &cpu_rq(cpu)->scx.local_dsq;
+
+		return find_global_dsq(p);
+	}
+
+	dsq = dsq_id == SCX_DSQ_GLOBAL ? find_global_dsq(p) :
+					 find_user_dsq(dsq_id);
+	if (likely(dsq))
+		return dsq;
+
+	scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
+		      dsq_id, p->comm, p->pid);
+	return find_global_dsq(p);
+}
+
+/*
+ * Record a direct dispatch verdict for @p. @ddsp_task is the value of this
+ * CPU's direct_dispatch_task as read by the caller. On success, @dsq_id and
+ * @enq_flags are stashed in @p->scx.ddsp_dsq_id and ->ddsp_enq_flags. If @p
+ * isn't the task being enqueued, or a direct dispatch has already been
+ * recorded, an ops error is raised and nothing is stashed.
+ */
+static void mark_direct_dispatch(struct task_struct *ddsp_task,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ /*
+ * Mark that dispatch already happened from ops.select_cpu() or
+ * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value
+ * which can never match a valid task pointer.
+ */
+ __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
+
+ /* @p must match the task on the enqueue path */
+ if (unlikely(p != ddsp_task)) {
+ /* an ERR_PTR means the marker was already spoiled by an earlier call */
+ if (IS_ERR(ddsp_task))
+ scx_ops_error("%s[%d] already direct-dispatched",
+ p->comm, p->pid);
+ else
+ scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ ddsp_task->comm, ddsp_task->pid,
+ p->comm, p->pid);
+ return;
+ }
+
+ WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
+ WARN_ON_ONCE(p->scx.ddsp_enq_flags);
+
+ p->scx.ddsp_dsq_id = dsq_id;
+ p->scx.ddsp_enq_flags = enq_flags;
+}
+
+/*
+ * Carry out the direct dispatch verdict stashed in @p->scx.ddsp_dsq_id and
+ * @p->scx.ddsp_enq_flags. @enq_flags of the enqueue operation are merged into
+ * the stashed flags. If the verdict targets the local DSQ of another CPU, @p
+ * is put on @rq's ddsp_deferred_locals list and the actual enqueue is deferred
+ * through schedule_deferred(). Otherwise, @p is enqueued right away.
+ */
+static void direct_dispatch(struct task_struct *p, u64 enq_flags)
+{
+	struct rq *rq = task_rq(p);
+	struct scx_dispatch_q *dsq =
+		find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+
+	touch_core_sched_dispatch(rq, p);
+
+	p->scx.ddsp_enq_flags |= enq_flags;
+
+	/*
+	 * We are in the enqueue path with @rq locked and pinned, and thus can't
+	 * double lock a remote rq and enqueue to its local DSQ. For
+	 * DSQ_LOCAL_ON verdicts targeting the local DSQ of a remote CPU, defer
+	 * the enqueue so that it's executed when @rq can be unlocked.
+	 */
+	if (dsq->id == SCX_DSQ_LOCAL && dsq != &rq->scx.local_dsq) {
+		unsigned long opss;
+
+		/* only the state part matters here, mask it out once */
+		opss = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_STATE_MASK;
+
+		switch (opss) {
+		case SCX_OPSS_NONE:
+			break;
+		case SCX_OPSS_QUEUEING:
+			/*
+			 * As @p was never passed to the BPF side, _release is
+			 * not strictly necessary. Still do it for consistency.
+			 */
+			atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+			break;
+		default:
+			WARN_ONCE(true, "sched_ext: %s[%d] has invalid ops state 0x%lx in direct_dispatch()",
+				  p->comm, p->pid, opss);
+			atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+			break;
+		}
+
+		WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
+		list_add_tail(&p->scx.dsq_list.node,
+			      &rq->scx.ddsp_deferred_locals);
+		schedule_deferred(rq);
+		return;
+	}
+
+	dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+}
+
+/* is @rq online both from the BPF scheduler's and the kernel's point of view? */
+static bool scx_rq_online(struct rq *rq)
+{
+	/*
+	 * %SCX_RQ_ONLINE reflects the online state as the BPF scheduler sees
+	 * it. Additionally testing cpu_active() guarantees that, when %true is
+	 * returned, %SCX_RQ_ONLINE stays set until the current scheduling
+	 * operation completes even if @rq isn't locked.
+	 */
+	bool online = (rq->scx.flags & SCX_RQ_ONLINE) &&
+		      cpu_active(cpu_of(rq));
+
+	return likely(online);
+}
+
+/*
+ * Enqueue @p, which must already be marked %SCX_TASK_QUEUED, on @rq. Depending
+ * on the state, @p ends up in one of the following places:
+ *
+ * - @rq's local DSQ without a slice refill, if @sticky_cpu is @rq's CPU.
+ * - @rq's local DSQ with a fresh default slice, if @rq is offline, or if @p is
+ *   exiting or migration-disabled and the respective ops flag isn't enabled.
+ * - The global DSQ with a fresh default slice, if @rq is bypassing or the BPF
+ *   scheduler doesn't implement ops.enqueue().
+ * - Wherever the direct dispatch verdict says, if one has been recorded before
+ *   or during ops.enqueue().
+ * - On the BPF scheduler (%SCX_OPSS_QUEUED) otherwise.
+ */
+static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
+ int sticky_cpu)
+{
+ struct task_struct **ddsp_taskp;
+ unsigned long qseq;
+
+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
+
+ /* rq migration */
+ if (sticky_cpu == cpu_of(rq))
+ goto local_norefill;
+
+ /*
+ * If !scx_rq_online(), we already told the BPF scheduler that the CPU
+ * is offline and are just running the hotplug path. Don't bother the
+ * BPF scheduler.
+ */
+ if (!scx_rq_online(rq))
+ goto local;
+
+ if (scx_rq_bypassing(rq))
+ goto global;
+
+ /* a direct dispatch verdict was already recorded before getting here */
+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
+ goto direct;
+
+ /* see %SCX_OPS_ENQ_EXITING */
+ if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
+ unlikely(p->flags & PF_EXITING))
+ goto local;
+
+ /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
+ if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+ is_migration_disabled(p))
+ goto local;
+
+ if (!SCX_HAS_OP(enqueue))
+ goto global;
+
+ /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
+ qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
+
+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
+ atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
+
+ /* expose @p as the task being enqueued for the duration of ops.enqueue() */
+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
+ WARN_ON_ONCE(*ddsp_taskp);
+ *ddsp_taskp = p;
+
+ SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
+
+ *ddsp_taskp = NULL;
+ /* ops.enqueue() direct-dispatched @p */
+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
+ goto direct;
+
+ /*
+ * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
+ * dequeue may be waiting. The store_release matches their load_acquire.
+ */
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
+ return;
+
+direct:
+ direct_dispatch(p, enq_flags);
+ return;
+
+local:
+ /*
+ * For task-ordering, slice refill must be treated as implying the end
+ * of the current slice. Otherwise, the longer @p stays on the CPU, the
+ * higher priority it becomes from scx_prio_less()'s POV.
+ */
+ touch_core_sched(rq, p);
+ p->scx.slice = SCX_SLICE_DFL;
+local_norefill:
+ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
+ return;
+
+global:
+ touch_core_sched(rq, p); /* see the comment in local: */
+ p->scx.slice = SCX_SLICE_DFL;
+ dispatch_enqueue(find_global_dsq(p), p, enq_flags);
+}
+
+/* is @p linked on a runnable list? See set_task_runnable(). */
+static bool task_runnable(const struct task_struct *p)
+{
+ return !list_empty(&p->scx.runnable_node);
+}
+
+/*
+ * Append @p to @rq's runnable_list. Must be called with @rq locked. If
+ * %SCX_TASK_RESET_RUNNABLE_AT is set on @p, runnable_at is set to the current
+ * jiffies and the flag is cleared.
+ */
+static void set_task_runnable(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
+
+ if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
+ p->scx.runnable_at = jiffies;
+ p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
+ }
+
+ /*
+ * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
+ * appended to the runnable_list.
+ */
+ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
+}
+
+/*
+ * Unlink @p from the runnable list it is on. When @reset_runnable_at is set,
+ * also request that runnable_at be re-stamped the next time @p is made
+ * runnable.
+ */
+static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
+{
+	if (reset_runnable_at)
+		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
+
+	list_del_init(&p->scx.runnable_node);
+}
+
+/*
+ * sched_class enqueue hook for SCX tasks.
+ *
+ * @enq_flags arrives as an int from the core scheduler and can therefore only
+ * carry the lower 32 bits. SCX-specific flags in the upper 32 bits are handed
+ * over through @rq->scx.extra_enq_flags (see move_remote_task_to_local_dsq()).
+ * The two are merged into a 64-bit local so that the upper bits survive;
+ * OR-ing them into the int parameter would silently drop them again.
+ *
+ * If @p is already %SCX_TASK_QUEUED, nothing is enqueued and only the
+ * %SCX_RQ_IN_WAKEUP marker is cleared.
+ */
+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
+{
+	int sticky_cpu = p->scx.sticky_cpu;
+	u64 scx_enq_flags;
+
+	if (enq_flags & ENQUEUE_WAKEUP)
+		rq->scx.flags |= SCX_RQ_IN_WAKEUP;
+
+	/* widen first, then merge, so that bits 32 and up are preserved */
+	scx_enq_flags = (u64)enq_flags | rq->scx.extra_enq_flags;
+
+	/* the sticky CPU request is one-shot, consume it */
+	if (sticky_cpu >= 0)
+		p->scx.sticky_cpu = -1;
+
+	/*
+	 * Restoring a running task will be immediately followed by
+	 * set_next_task_scx() which expects the task to not be on the BPF
+	 * scheduler as tasks can only start running through local DSQs. Force
+	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
+	 */
+	if (unlikely(scx_enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
+		sticky_cpu = cpu_of(rq);
+
+	if (p->scx.flags & SCX_TASK_QUEUED) {
+		WARN_ON_ONCE(!task_runnable(p));
+		goto out;
+	}
+
+	set_task_runnable(rq, p);
+	p->scx.flags |= SCX_TASK_QUEUED;
+	rq->scx.nr_running++;
+	add_nr_running(rq, 1);
+
+	/* migration is not reported as a runnable <-> quiescent transition */
+	if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
+		SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, scx_enq_flags);
+
+	if (scx_enq_flags & SCX_ENQ_WAKEUP)
+		touch_core_sched(rq, p);
+
+	do_enqueue_task(rq, p, scx_enq_flags, sticky_cpu);
+out:
+	rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
+}
+
+/*
+ * Take @p off the runnable list and, if it is queued on the BPF scheduler,
+ * notify ops.dequeue() and reclaim it by moving ops_state back to NONE. If a
+ * dispatch of @p is in flight (DISPATCHING), wait for it to finish instead.
+ * The comments below assume that the caller holds @p's rq lock.
+ */
+static void ops_dequeue(struct task_struct *p, u64 deq_flags)
+{
+ unsigned long opss;
+
+ /* dequeue is always temporary, don't reset runnable_at */
+ clr_task_runnable(p, false);
+
+ /* acquire ensures that we see the preceding updates on QUEUED */
+ opss = atomic_long_read_acquire(&p->scx.ops_state);
+
+ switch (opss & SCX_OPSS_STATE_MASK) {
+ case SCX_OPSS_NONE:
+ /* not owned by the BPF scheduler, nothing to take back */
+ break;
+ case SCX_OPSS_QUEUEING:
+ /*
+ * QUEUEING is started and finished while holding @p's rq lock.
+ * As we're holding the rq lock now, we shouldn't see QUEUEING.
+ */
+ BUG();
+ case SCX_OPSS_QUEUED:
+ if (SCX_HAS_OP(dequeue))
+ SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
+
+ /*
+ * Try to claim @p back. Failure means that ops_state changed
+ * under us; fall through and wait out a possible DISPATCHING.
+ */
+ if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
+ SCX_OPSS_NONE))
+ break;
+ fallthrough;
+ case SCX_OPSS_DISPATCHING:
+ /*
+ * If @p is being dispatched from the BPF scheduler to a DSQ,
+ * wait for the transfer to complete so that @p doesn't get
+ * added to its DSQ after dequeueing is complete.
+ *
+ * As we're waiting on DISPATCHING with the rq locked, the
+ * dispatching side shouldn't try to lock the rq while
+ * DISPATCHING is set. See dispatch_to_local_dsq().
+ *
+ * DISPATCHING shouldn't have qseq set and control can reach
+ * here with NONE @opss from the above QUEUED case block.
+ * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
+ */
+ wait_ops_state(p, SCX_OPSS_DISPATCHING);
+ BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
+ break;
+ }
+}
+
+/*
+ * Dequeue @p from @rq: pull it back from the BPF scheduler, invoke
+ * ops.stopping() early if @p is @rq's current task, invoke ops.quiescent()
+ * unless @p is migrating, drop the QUEUED accounting and finally remove @p
+ * from its DSQ. A task that is not QUEUED is left untouched. Always returns
+ * %true.
+ */
+static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
+{
+ if (!(p->scx.flags & SCX_TASK_QUEUED)) {
+ WARN_ON_ONCE(task_runnable(p));
+ return true;
+ }
+
+ ops_dequeue(p, deq_flags);
+
+ /*
+ * A currently running task which is going off @rq first gets dequeued
+ * and then stops running. As we want running <-> stopping transitions
+ * to be contained within runnable <-> quiescent transitions, trigger
+ * ->stopping() early here instead of in put_prev_task_scx().
+ *
+ * @p may go through multiple stopping <-> running transitions between
+ * here and put_prev_task_scx() if task attribute changes occur while
+ * balance_scx() leaves @rq unlocked. However, they don't contain any
+ * information meaningful to the BPF scheduler and can be suppressed by
+ * skipping the callbacks if the task is !QUEUED.
+ */
+ if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
+ update_curr_scx(rq);
+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
+ }
+
+ if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
+
+ /* record whether this dequeue was for sleeping */
+ if (deq_flags & SCX_DEQ_SLEEP)
+ p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
+ else
+ p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
+
+ p->scx.flags &= ~SCX_TASK_QUEUED;
+ rq->scx.nr_running--;
+ sub_nr_running(rq, 1);
+
+ dispatch_dequeue(rq, p);
+ return true;
+}
+
+/*
+ * sched_class yield hook. Defer to ops.yield() with no target task if the BPF
+ * scheduler implements it; otherwise expire the current task's slice.
+ */
+static void yield_task_scx(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+
+	if (!SCX_HAS_OP(yield)) {
+		curr->scx.slice = 0;
+		return;
+	}
+
+	SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, curr, NULL);
+}
+
+/*
+ * sched_class yield_to hook. Returns whatever ops.yield() returns for the
+ * (current task, @to) pair, or %false if the BPF scheduler doesn't implement
+ * ops.yield().
+ */
+static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
+{
+	struct task_struct *yielder = rq->curr;
+
+	if (!SCX_HAS_OP(yield))
+		return false;
+
+	return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, yielder, to);
+}
+
+/*
+ * Link @p into @dst_rq's local DSQ: at the head if %SCX_ENQ_HEAD or
+ * %SCX_ENQ_PREEMPT is set in @enq_flags, at the tail otherwise. This function
+ * does not unlink @p from @src_dsq; the callers visible in this file do so
+ * right before calling in. @src_dsq is only used for the lock assertion.
+ */
+static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
+ struct scx_dispatch_q *src_dsq,
+ struct rq *dst_rq)
+{
+ struct scx_dispatch_q *dst_dsq = &dst_rq->scx.local_dsq;
+
+ /* @src_dsq is locked and @p is on @dst_rq */
+ lockdep_assert_held(&src_dsq->lock);
+ lockdep_assert_rq_held(dst_rq);
+
+ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
+
+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
+ list_add(&p->scx.dsq_list.node, &dst_dsq->list);
+ else
+ list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
+
+ /* account @p on the destination DSQ and record its new home */
+ dsq_mod_nr(dst_dsq, 1);
+ p->scx.dsq = dst_dsq;
+}
+
+#ifdef CONFIG_SMP
+/**
+ * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
+ * @p: task to move
+ * @enq_flags: %SCX_ENQ_*
+ * @src_rq: rq to move the task from, locked on entry, released on return
+ * @dst_rq: rq to move the task into, locked on return
+ *
+ * Move @p which is currently on @src_rq to @dst_rq's local DSQ.
+ */
+static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
+ struct rq *src_rq, struct rq *dst_rq)
+{
+ lockdep_assert_rq_held(src_rq);
+
+ /* the following marks @p MIGRATING which excludes dequeue */
+ deactivate_task(src_rq, p, 0);
+ set_task_cpu(p, cpu_of(dst_rq));
+ p->scx.sticky_cpu = cpu_of(dst_rq);
+
+ /* hand over from @src_rq's lock to @dst_rq's */
+ raw_spin_rq_unlock(src_rq);
+ raw_spin_rq_lock(dst_rq);
+
+ /*
+ * We want to pass scx-specific enq_flags but activate_task() will
+ * truncate the upper 32 bits. As we own @dst_rq, we can pass them
+ * through @dst_rq->scx.extra_enq_flags instead. The field is only set
+ * for the duration of the activate_task() call.
+ */
+ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr));
+ WARN_ON_ONCE(dst_rq->scx.extra_enq_flags);
+ dst_rq->scx.extra_enq_flags = enq_flags;
+ activate_task(dst_rq, p, 0);
+ dst_rq->scx.extra_enq_flags = 0;
+}
+
+/*
+ * Similar to kernel/sched/core.c::is_cpu_allowed(). However, there are two
+ * differences:
+ *
+ * - is_cpu_allowed() asks "Can this task run on this CPU?" while
+ * task_can_run_on_remote_rq() asks "Can the BPF scheduler migrate the task to
+ * this CPU?".
+ *
+ * While migration is disabled, is_cpu_allowed() has to say "yes" as the task
+ * must be allowed to finish on the CPU that it's currently on regardless of
+ * the CPU state. However, task_can_run_on_remote_rq() must say "no" as the
+ * BPF scheduler shouldn't attempt to migrate a task which has migration
+ * disabled.
+ *
+ * - The BPF scheduler is bypassed while the rq is offline and we can always say
+ * no to the BPF scheduler initiated migrations while offline.
+ *
+ * The caller must ensure that @p and @rq are on different CPUs.
+ *
+ * If @trigger_error is set, scx_ops_error() is raised when @p has migration
+ * disabled or when @rq's CPU is outside @p's allowed mask. An offline @rq is
+ * always rejected without raising an error.
+ *
+ * Returns %true if @p may be migrated to @rq, %false otherwise.
+ */
+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
+ bool trigger_error)
+{
+ int cpu = cpu_of(rq);
+
+ SCHED_WARN_ON(task_cpu(p) == cpu);
+
+ /*
+ * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+ * the pinned CPU in migrate_disable_switch() while @p is being switched
+ * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
+ * updated and thus another CPU may see @p on a DSQ inbetween leading to
+ * @p passing the below task_allowed_on_cpu() check while migration is
+ * disabled.
+ *
+ * Test the migration disabled state first as the race window is narrow
+ * and the BPF scheduler failing to check migration disabled state can
+ * easily be masked if task_allowed_on_cpu() is done first.
+ */
+ if (unlikely(is_migration_disabled(p))) {
+ if (trigger_error)
+ scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+ p->comm, p->pid, task_cpu(p), cpu);
+ return false;
+ }
+
+ /*
+ * We don't require the BPF scheduler to avoid dispatching to offline
+ * CPUs mostly for convenience but also because CPUs can go offline
+ * between scx_bpf_dsq_insert() calls and here. Trigger error iff the
+ * picked CPU is outside the allowed mask.
+ */
+ if (!task_allowed_on_cpu(p, cpu)) {
+ if (trigger_error)
+ scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+ cpu, p->comm, p->pid);
+ return false;
+ }
+
+ /* offline rq: refuse silently even when @trigger_error is set */
+ if (!scx_rq_online(rq))
+ return false;
+
+ return true;
+}
+
+/**
+ * unlink_dsq_and_lock_src_rq() - Unlink task from its DSQ and lock its task_rq
+ * @p: target task
+ * @dsq: locked DSQ @p is currently on
+ * @src_rq: rq @p is currently on, stable with @dsq locked
+ *
+ * Called with @dsq locked but no rq's locked. We want to move @p to a different
+ * DSQ, including any local DSQ, but are not locking @src_rq. Locking @src_rq is
+ * required when transferring into a local DSQ. Even when transferring into a
+ * non-local DSQ, it's better to use the same mechanism to protect against
+ * dequeues and maintain the invariant that @p->scx.dsq can only change while
+ * @src_rq is locked, which e.g. scx_dump_task() depends on.
+ *
+ * We want to grab @src_rq but that can deadlock if we try while locking @dsq,
+ * so we want to unlink @p from @dsq, drop its lock and then lock @src_rq. As
+ * this may race with dequeue, which can't drop the rq lock or fail, do a little
+ * dancing from our side.
+ *
+ * @p->scx.holding_cpu is set to this CPU before @dsq is unlocked. If @p gets
+ * dequeued after we unlock @dsq but before locking @src_rq, the holding_cpu
+ * would be cleared to -1. While other cpus may have updated it to different
+ * values afterwards, as this operation can't be preempted or recurse, the
+ * holding_cpu can never become this CPU again before we're done. Thus, we can
+ * tell whether we lost to dequeue by testing whether the holding_cpu still
+ * points to this CPU. See dispatch_dequeue() for the counterpart.
+ *
+ * On return, @dsq is unlocked and @src_rq is locked. Returns %true if @p is
+ * still valid. %false if lost to dequeue. @src_rq stays locked in both cases
+ * and it is up to the caller to release it.
+ */
+static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
+ struct scx_dispatch_q *dsq,
+ struct rq *src_rq)
+{
+ s32 cpu = raw_smp_processor_id();
+
+ lockdep_assert_held(&dsq->lock);
+
+ /* claim @p for this CPU while @dsq is still locked */
+ WARN_ON_ONCE(p->scx.holding_cpu >= 0);
+ task_unlink_from_dsq(p, dsq);
+ p->scx.holding_cpu = cpu;
+
+ /* trade the @dsq lock for the @src_rq lock */
+ raw_spin_unlock(&dsq->lock);
+ raw_spin_rq_lock(src_rq);
+
+ /* task_rq couldn't have changed if we're still the holding cpu */
+ return likely(p->scx.holding_cpu == cpu) &&
+ !WARN_ON_ONCE(src_rq != task_rq(p));
+}
+
+/*
+ * Pull @p, which is on @dsq and belongs to @src_rq, into @this_rq's local DSQ.
+ * Entered with @this_rq and @dsq locked. On return, @dsq is unlocked and only
+ * @this_rq is locked. Returns %true if @p was moved, %false if the race
+ * against dequeue was lost.
+ */
+static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
+				struct scx_dispatch_q *dsq, struct rq *src_rq)
+{
+	bool claimed;
+
+	raw_spin_rq_unlock(this_rq);
+
+	claimed = unlink_dsq_and_lock_src_rq(p, dsq, src_rq);
+	if (!claimed) {
+		/* lost to dequeue, trade the @src_rq lock back for @this_rq's */
+		raw_spin_rq_unlock(src_rq);
+		raw_spin_rq_lock(this_rq);
+		return false;
+	}
+
+	move_remote_task_to_local_dsq(p, 0, src_rq, this_rq);
+	return true;
+}
+#else /* CONFIG_SMP */
+/* UP stub: there is no remote rq to move a task from, so warn if called. */
+static inline void move_remote_task_to_local_dsq(struct task_struct *p,
+						 u64 enq_flags,
+						 struct rq *src_rq,
+						 struct rq *dst_rq)
+{
+	WARN_ON_ONCE(1);
+}
+
+/* UP stub: always reports that @p can't run on a remote rq. */
+static inline bool task_can_run_on_remote_rq(struct task_struct *p,
+					     struct rq *rq,
+					     bool trigger_error)
+{
+	return false;
+}
+
+/* UP stub: never consumes anything and leaves all locks untouched. */
+static inline bool consume_remote_task(struct rq *this_rq,
+				       struct task_struct *p,
+				       struct scx_dispatch_q *dsq,
+				       struct rq *task_rq)
+{
+	return false;
+}
+#endif /* CONFIG_SMP */
+
+/**
+ * move_task_between_dsqs() - Move a task from one DSQ to another
+ * @p: target task
+ * @enq_flags: %SCX_ENQ_*
+ * @src_dsq: DSQ @p is currently on, must not be a local DSQ
+ * @dst_dsq: DSQ @p is being moved to, can be any DSQ
+ *
+ * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local
+ * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq
+ * will change. As @p's task_rq is locked, this function doesn't need to use the
+ * holding_cpu mechanism.
+ *
+ * If @dst_dsq is the local DSQ of a remote rq that @p can't be migrated to,
+ * @p is put on the global DSQ returned by find_global_dsq() instead and stays
+ * on its current rq.
+ *
+ * On return, @src_dsq is unlocked and only @p's new task_rq, which is the
+ * return value, is locked.
+ */
+static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
+ struct scx_dispatch_q *src_dsq,
+ struct scx_dispatch_q *dst_dsq)
+{
+ struct rq *src_rq = task_rq(p), *dst_rq;
+
+ BUG_ON(src_dsq->id == SCX_DSQ_LOCAL);
+ lockdep_assert_held(&src_dsq->lock);
+ lockdep_assert_rq_held(src_rq);
+
+ /* resolve the destination rq, falling back to the global DSQ if needed */
+ if (dst_dsq->id == SCX_DSQ_LOCAL) {
+ dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+ dst_dsq = find_global_dsq(p);
+ dst_rq = src_rq;
+ }
+ } else {
+ /* no need to migrate if destination is a non-local DSQ */
+ dst_rq = src_rq;
+ }
+
+ /*
+ * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different
+ * CPU, @p will be migrated.
+ */
+ if (dst_dsq->id == SCX_DSQ_LOCAL) {
+ /* @p is going from a non-local DSQ to a local DSQ */
+ if (src_rq == dst_rq) {
+ task_unlink_from_dsq(p, src_dsq);
+ move_local_task_to_local_dsq(p, enq_flags,
+ src_dsq, dst_rq);
+ raw_spin_unlock(&src_dsq->lock);
+ } else {
+ raw_spin_unlock(&src_dsq->lock);
+ move_remote_task_to_local_dsq(p, enq_flags,
+ src_rq, dst_rq);
+ }
+ } else {
+ /*
+ * @p is going from a non-local DSQ to a non-local DSQ. As
+ * $src_dsq is already locked, do an abbreviated dequeue.
+ */
+ task_unlink_from_dsq(p, src_dsq);
+ p->scx.dsq = NULL;
+ raw_spin_unlock(&src_dsq->lock);
+
+ dispatch_enqueue(dst_dsq, p, enq_flags);
+ }
+
+ return dst_rq;
+}
+
+/*
+ * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
+ * banging on the same DSQ on a large NUMA system to the point where switching
+ * to the bypass mode can take a long time. Inject artificial delays while the
+ * bypass mode is switching to guarantee timely completion.
+ *
+ * Must be called with @rq locked. If scx_ops_breather_depth is non-zero, @rq
+ * is unlocked while busy-waiting and relocked before returning. The wait ends
+ * when the depth drops to zero or after roughly %NSEC_PER_MSEC nanoseconds,
+ * whichever comes first.
+ */
+static void scx_ops_breather(struct rq *rq)
+{
+ u64 until;
+
+ lockdep_assert_rq_held(rq);
+
+ /* fast path: no breather requested */
+ if (likely(!atomic_read(&scx_ops_breather_depth)))
+ return;
+
+ raw_spin_rq_unlock(rq);
+
+ until = ktime_get_ns() + NSEC_PER_MSEC;
+
+ /* spin in bounded bursts so that the clock is read only once per burst */
+ do {
+ int cnt = 1024;
+ while (atomic_read(&scx_ops_breather_depth) && --cnt)
+ cpu_relax();
+ } while (atomic_read(&scx_ops_breather_depth) &&
+ time_before64(ktime_get_ns(), until));
+
+ raw_spin_rq_lock(rq);
+}
+
+/*
+ * Try to move one task from @dsq to @rq's local DSQ. Tasks already on @rq are
+ * moved directly; tasks on other rqs are migrated if
+ * task_can_run_on_remote_rq() allows it. Returns %true if a task was consumed,
+ * %false if @dsq looked empty or held no eligible task.
+ */
+static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
+{
+ struct task_struct *p;
+retry:
+ /*
+ * This retry loop can repeatedly race against scx_ops_bypass()
+ * dequeueing tasks from @dsq trying to put the system into the bypass
+ * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can
+ * live-lock the machine into soft lockups. Give a breather.
+ */
+ scx_ops_breather(rq);
+
+ /*
+ * The caller can't expect to successfully consume a task if the task's
+ * addition to @dsq isn't guaranteed to be visible somehow. Test
+ * @dsq->list without locking and skip if it seems empty.
+ */
+ if (list_empty(&dsq->list))
+ return false;
+
+ raw_spin_lock(&dsq->lock);
+
+ nldsq_for_each_task(p, dsq) {
+ struct rq *task_rq = task_rq(p);
+
+ /* @p is already on @rq, no migration needed */
+ if (rq == task_rq) {
+ task_unlink_from_dsq(p, dsq);
+ move_local_task_to_local_dsq(p, 0, dsq, rq);
+ raw_spin_unlock(&dsq->lock);
+ return true;
+ }
+
+ /*
+ * consume_remote_task() releases @dsq's lock whether it
+ * succeeds or not, so the iteration can't continue after a
+ * failure. Start over.
+ */
+ if (task_can_run_on_remote_rq(p, rq, false)) {
+ if (likely(consume_remote_task(rq, p, dsq, task_rq)))
+ return true;
+ goto retry;
+ }
+ }
+
+ raw_spin_unlock(&dsq->lock);
+ return false;
+}
+
+/*
+ * Try to consume a task from the global DSQ of the NUMA node that @rq's CPU
+ * belongs to. See consume_dispatch_q() for the return value.
+ */
+static bool consume_global_dsq(struct rq *rq)
+{
+	struct scx_dispatch_q *node_dsq = global_dsqs[cpu_to_node(cpu_of(rq))];
+
+	return consume_dispatch_q(rq, node_dsq);
+}
+
+/**
+ * dispatch_to_local_dsq - Dispatch a task to a local dsq
+ * @rq: current rq which is locked
+ * @dst_dsq: destination DSQ
+ * @p: task to dispatch
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * We're holding @rq lock and want to dispatch @p to @dst_dsq which is a local
+ * DSQ. This function performs all the synchronization dancing needed because
+ * local DSQs are protected with rq locks.
+ *
+ * The caller must have exclusive ownership of @p (e.g. through
+ * %SCX_OPSS_DISPATCHING).
+ *
+ * @rq may be unlocked and relocked in the process. It is locked again on
+ * return.
+ */
+static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
+ struct task_struct *p, u64 enq_flags)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+#ifdef CONFIG_SMP
+ struct rq *locked_rq = rq;
+#endif
+
+ /*
+ * We're synchronized against dequeue through DISPATCHING. As @p can't
+ * be dequeued, its task_rq and cpus_allowed are stable too.
+ *
+ * If dispatching to @rq that @p is already on, no lock dancing needed.
+ */
+ if (rq == src_rq && rq == dst_rq) {
+ dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ return;
+ }
+
+#ifdef CONFIG_SMP
+ /* @p can't be migrated to @dst_rq, fall back to the global DSQ */
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+ dispatch_enqueue(find_global_dsq(p), p,
+ enq_flags | SCX_ENQ_CLEAR_OPSS);
+ return;
+ }
+
+ /*
+ * @p is on a possibly remote @src_rq which we need to lock to move the
+ * task. If dequeue is in progress, it'd be locking @src_rq and waiting
+ * on DISPATCHING, so we can't grab @src_rq lock while holding
+ * DISPATCHING.
+ *
+ * As DISPATCHING guarantees that @p is wholly ours, we can pretend that
+ * we're moving from a DSQ and use the same mechanism - mark the task
+ * under transfer with holding_cpu, release DISPATCHING and then follow
+ * the same protocol. See unlink_dsq_and_lock_src_rq().
+ */
+ p->scx.holding_cpu = raw_smp_processor_id();
+
+ /* store_release ensures that dequeue sees the above */
+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
+
+ /* switch to @src_rq lock */
+ if (locked_rq != src_rq) {
+ raw_spin_rq_unlock(locked_rq);
+ locked_rq = src_rq;
+ raw_spin_rq_lock(src_rq);
+ }
+
+ /* task_rq couldn't have changed if we're still the holding cpu */
+ if (likely(p->scx.holding_cpu == raw_smp_processor_id()) &&
+ !WARN_ON_ONCE(src_rq != task_rq(p))) {
+ /*
+ * If @p is staying on the same rq, there's no need to go
+ * through the full deactivate/activate cycle. Optimize by
+ * abbreviating move_remote_task_to_local_dsq().
+ */
+ if (src_rq == dst_rq) {
+ p->scx.holding_cpu = -1;
+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags);
+ } else {
+ move_remote_task_to_local_dsq(p, enq_flags,
+ src_rq, dst_rq);
+ /* task has been moved to dst_rq, which is now locked */
+ locked_rq = dst_rq;
+ }
+
+ /*
+ * Reschedule @dst_rq if @p's sched_class ranks above that of
+ * the task it is currently running, e.g. when it is idle.
+ */
+ if (sched_class_above(p->sched_class, dst_rq->curr->sched_class))
+ resched_curr(dst_rq);
+ }
+
+ /* switch back to @rq lock */
+ if (locked_rq != rq) {
+ raw_spin_rq_unlock(locked_rq);
+ raw_spin_rq_lock(rq);
+ }
+#else /* CONFIG_SMP */
+ BUG(); /* control can not reach here on UP */
+#endif /* CONFIG_SMP */
+}
+
+/**
+ * finish_dispatch - Asynchronously finish dispatching a task
+ * @rq: current rq which is locked
+ * @p: task to finish dispatching
+ * @qseq_at_dispatch: qseq when @p started getting dispatched
+ * @dsq_id: destination DSQ ID
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * Dispatching to local DSQs may need to wait for queueing to complete or
+ * require rq lock dancing. As we don't wanna do either while inside
+ * ops.dispatch() to avoid locking order inversion, we split dispatching into
+ * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the
+ * task and its qseq. Once ops.dispatch() returns, this function is called to
+ * finish up.
+ *
+ * There is no guarantee that @p is still valid for dispatching or even that it
+ * was valid in the first place. Make sure that the task is still owned by the
+ * BPF scheduler and claim the ownership before dispatching.
+ *
+ * If the claim can't be made, the dispatch is silently dropped.
+ */
+static void finish_dispatch(struct rq *rq, struct task_struct *p,
+ unsigned long qseq_at_dispatch,
+ u64 dsq_id, u64 enq_flags)
+{
+ struct scx_dispatch_q *dsq;
+ unsigned long opss;
+
+ touch_core_sched_dispatch(rq, p);
+retry:
+ /*
+ * No need for _acquire here. @p is accessed only after a successful
+ * try_cmpxchg to DISPATCHING.
+ */
+ opss = atomic_long_read(&p->scx.ops_state);
+
+ switch (opss & SCX_OPSS_STATE_MASK) {
+ case SCX_OPSS_DISPATCHING:
+ case SCX_OPSS_NONE:
+ /* someone else already got to it */
+ return;
+ case SCX_OPSS_QUEUED:
+ /*
+ * If qseq doesn't match, @p has gone through at least one
+ * dispatch/dequeue and re-enqueue cycle between
+ * scx_bpf_dsq_insert() and here and we have no claim on it.
+ */
+ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
+ return;
+
+ /*
+ * While we know @p is accessible, we don't yet have a claim on
+ * it - the BPF scheduler is allowed to dispatch tasks
+ * spuriously and there can be a racing dequeue attempt. Let's
+ * claim @p by atomically transitioning it from QUEUED to
+ * DISPATCHING.
+ */
+ if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
+ SCX_OPSS_DISPATCHING)))
+ break;
+ goto retry;
+ case SCX_OPSS_QUEUEING:
+ /*
+ * do_enqueue_task() is in the process of transferring the task
+ * to the BPF scheduler while holding @p's rq lock. As we aren't
+ * holding any kernel or BPF resource that the enqueue path may
+ * depend upon, it's safe to wait.
+ */
+ wait_ops_state(p, opss);
+ goto retry;
+ }
+
+ /* only reached with @p claimed through DISPATCHING */
+ BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
+
+ /*
+ * NOTE(review): the DSQ lookup is keyed on this_rq() rather than @rq.
+ * Confirm that the two always match, e.g. when balance_one() is run on
+ * an SMT sibling's rq from balance_scx().
+ */
+ dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
+
+ if (dsq->id == SCX_DSQ_LOCAL)
+ dispatch_to_local_dsq(rq, dsq, p, enq_flags);
+ else
+ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+}
+
+/*
+ * Finish every dispatch recorded in this CPU's dispatch buffer, then credit
+ * the flushed entries to nr_tasks and rewind the buffer cursor.
+ */
+static void flush_dispatch_buf(struct rq *rq)
+{
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+	u32 idx = 0;
+
+	while (idx < dspc->cursor) {
+		struct scx_dsp_buf_ent *ent = &dspc->buf[idx];
+
+		finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id,
+				ent->enq_flags);
+		idx++;
+	}
+
+	dspc->nr_tasks += dspc->cursor;
+	dspc->cursor = 0;
+}
+
+/*
+ * Make sure that @rq has an SCX task to run next, trying in order: keeping
+ * @prev, the local DSQ, the global DSQ and finally ops.dispatch().
+ * %SCX_RQ_BAL_KEEP is set when @prev should keep running. %SCX_RQ_IN_BALANCE
+ * is set for the duration of the call. Must be called with @rq locked.
+ *
+ * Returns %true if there is something to run, %false otherwise.
+ */
+static int balance_one(struct rq *rq, struct task_struct *prev)
+{
+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
+ int nr_loops = SCX_DSP_MAX_LOOPS;
+
+ lockdep_assert_rq_held(rq);
+ rq->scx.flags |= SCX_RQ_IN_BALANCE;
+ rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
+
+ if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
+ unlikely(rq->scx.cpu_released)) {
+ /*
+ * If the previous sched_class for the current CPU was not SCX,
+ * notify the BPF scheduler that it again has control of the
+ * core. This callback complements ->cpu_release(), which is
+ * emitted in switch_class().
+ */
+ if (SCX_HAS_OP(cpu_acquire))
+ SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
+ rq->scx.cpu_released = false;
+ }
+
+ if (prev_on_scx) {
+ update_curr_scx(rq);
+
+ /*
+ * If @prev is runnable & has slice left, it has priority and
+ * fetching more just increases latency for the fetched tasks.
+ * Tell pick_task_scx() to keep running @prev. If the BPF
+ * scheduler wants to handle this explicitly, it should
+ * implement ->cpu_release().
+ *
+ * See scx_ops_disable_workfn() for the explanation on the
+ * bypassing test.
+ */
+ if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
+ }
+
+ /* if there already are tasks to run, nothing to do */
+ if (rq->scx.local_dsq.nr)
+ goto has_tasks;
+
+ if (consume_global_dsq(rq))
+ goto has_tasks;
+
+ /* ops.dispatch() is only consulted on an online, non-bypassing rq */
+ if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ goto no_tasks;
+
+ dspc->rq = rq;
+
+ /*
+ * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
+ * the local DSQ might still end up empty after a successful
+ * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
+ * produced some tasks, retry. The BPF scheduler may depend on this
+ * looping behavior to simplify its implementation.
+ */
+ do {
+ dspc->nr_tasks = 0;
+
+ SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
+ prev_on_scx ? prev : NULL);
+
+ flush_dispatch_buf(rq);
+
+ /* @prev's slice may have been refilled in the meantime */
+ if (prev_on_rq && prev->scx.slice) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
+ if (rq->scx.local_dsq.nr)
+ goto has_tasks;
+ if (consume_global_dsq(rq))
+ goto has_tasks;
+
+ /*
+ * ops.dispatch() can trap us in this loop by repeatedly
+ * dispatching ineligible tasks. Break out once in a while to
+ * allow the watchdog to run. As IRQ can't be enabled in
+ * balance(), we want to complete this scheduling cycle and then
+ * start a new one. IOW, we want to call resched_curr() on the
+ * next, most likely idle, task, not the current one. Use
+ * scx_bpf_kick_cpu() for deferred kicking.
+ */
+ if (unlikely(!--nr_loops)) {
+ scx_bpf_kick_cpu(cpu_of(rq), 0);
+ break;
+ }
+ } while (dspc->nr_tasks);
+
+no_tasks:
+ /*
+ * Didn't find another task to run. Keep running @prev unless
+ * %SCX_OPS_ENQ_LAST is in effect.
+ */
+ if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
+ scx_rq_bypassing(rq))) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
+ rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
+ return false;
+
+has_tasks:
+ rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
+ return true;
+}
+
+/*
+ * sched_class balance hook. Runs balance_one() on @rq with the rq lock
+ * unpinned and, when core-sched is enabled, on the rq of every other SMT
+ * sibling as well. Returns balance_one()'s result for @rq; the results for
+ * the siblings are ignored.
+ */
+static int balance_scx(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
+{
+ int ret;
+
+ rq_unpin_lock(rq, rf);
+
+ ret = balance_one(rq, prev);
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * When core-sched is enabled, this ops.balance() call will be followed
+ * by pick_task_scx() on this CPU and the SMT siblings. Balance the
+ * siblings too.
+ */
+ if (sched_core_enabled(rq)) {
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+ int scpu;
+
+ /* visit every sibling in @smt_mask except @rq's own CPU */
+ for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
+ struct rq *srq = cpu_rq(scpu);
+ struct task_struct *sprev = srq->curr;
+
+ /* the sibling is expected to share @rq's lock */
+ WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
+ update_rq_clock(srq);
+ balance_one(srq, sprev);
+ }
+ }
+#endif
+ rq_repin_lock(rq, rf);
+
+ return ret;
+}
+
+/*
+ * Drain @rq->scx.ddsp_deferred_locals, dispatching each queued task to the
+ * local DSQ named by its ddsp_dsq_id. Must be called with @rq locked.
+ */
+static void process_ddsp_deferred_locals(struct rq *rq)
+{
+ struct task_struct *p;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Now that @rq can be unlocked, execute the deferred enqueueing of
+ * tasks directly dispatched to the local DSQs of other CPUs. See
+ * direct_dispatch(). Keep popping from the head instead of using
+ * list_for_each_entry_safe() as dispatch_to_local_dsq() may unlock @rq
+ * temporarily.
+ */
+ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
+ struct task_struct, scx.dsq_list.node))) {
+ struct scx_dispatch_q *dsq;
+
+ list_del_init(&p->scx.dsq_list.node);
+
+ /* a non-local destination is unexpected here; warn and skip it */
+ dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
+ dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags);
+ }
+}
+
+/*
+ * sched_class set_next_task hook: @p is about to start running on @rq. Takes
+ * @p off the BPF scheduler and its DSQ if it is still queued, stamps
+ * exec_start, notifies ops.running(), removes @p from the runnable list and
+ * refreshes the tick-stop state. @first is not used by this implementation.
+ */
+static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
+{
+ if (p->scx.flags & SCX_TASK_QUEUED) {
+ /*
+ * Core-sched might decide to execute @p before it is
+ * dispatched. Call ops_dequeue() to notify the BPF scheduler.
+ */
+ ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
+ dispatch_dequeue(rq, p);
+ }
+
+ p->se.exec_start = rq_clock_task(rq);
+
+ /* see dequeue_task_scx() on why we skip when !QUEUED */
+ if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(SCX_KF_REST, running, p);
+
+ /* @p is running now; have runnable_at reset the next time it's queued */
+ clr_task_runnable(p, true);
+
+ /*
+ * @p is getting newly scheduled or got kicked after someone updated its
+ * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
+ */
+ if ((p->scx.slice == SCX_SLICE_INF) !=
+ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
+ if (p->scx.slice == SCX_SLICE_INF)
+ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
+ else
+ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
+
+ sched_update_tick_dependency(rq);
+
+ /*
+ * For now, let's refresh the load_avgs just when transitioning
+ * in and out of nohz. In the future, we might want to add a
+ * mechanism which calls the following periodically on
+ * tick-stopped CPUs.
+ */
+ update_other_load_avgs(rq);
+ }
+}
+
+/*
+ * Map the sched_class that took the CPU away from SCX to the reason code
+ * reported to ops.cpu_release(). Classes other than stop (SMP only), deadline
+ * and RT map to %SCX_CPU_PREEMPT_UNKNOWN.
+ */
+static enum scx_cpu_preempt_reason
+preempt_reason_from_class(const struct sched_class *class)
+{
+	enum scx_cpu_preempt_reason reason = SCX_CPU_PREEMPT_UNKNOWN;
+
+#ifdef CONFIG_SMP
+	if (class == &stop_sched_class)
+		return SCX_CPU_PREEMPT_STOP;
+#endif
+	if (class == &dl_sched_class)
+		reason = SCX_CPU_PREEMPT_DL;
+	else if (class == &rt_sched_class)
+		reason = SCX_CPU_PREEMPT_RT;
+
+	return reason;
+}
+
+/*
+ * Handle @rq switching to @next, which put_prev_task_scx() calls for a @next
+ * that is not an SCX task. Bumps @rq->scx.pnt_seq (SMP only) and, if the
+ * cpu_preempt static branch is enabled and @next's class is not below SCX,
+ * reports the loss of the CPU through ops.cpu_release(). The report is made
+ * at most once until @rq->scx.cpu_released is cleared again.
+ */
+static void switch_class(struct rq *rq, struct task_struct *next)
+{
+ const struct sched_class *next_class = next->sched_class;
+
+#ifdef CONFIG_SMP
+ /*
+ * Pairs with the smp_load_acquire() issued by a CPU in
+ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
+ * resched.
+ */
+ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
+#endif
+ if (!static_branch_unlikely(&scx_ops_cpu_preempt))
+ return;
+
+ /*
+ * The callback is conceptually meant to convey that the CPU is no
+ * longer under the control of SCX. Therefore, don't invoke the callback
+ * if the next class is below SCX (in which case the BPF scheduler has
+ * actively decided not to schedule any tasks on the CPU).
+ */
+ if (sched_class_above(&ext_sched_class, next_class))
+ return;
+
+ /*
+ * At this point we know that SCX was preempted by a higher priority
+ * sched_class, so invoke the ->cpu_release() callback if we have not
+ * done so already. We only send the callback once between SCX being
+ * preempted, and it regaining control of the CPU.
+ *
+ * ->cpu_release() complements ->cpu_acquire(), which is emitted the
+ * next time that balance_scx() is invoked.
+ */
+ if (!rq->scx.cpu_released) {
+ if (SCX_HAS_OP(cpu_release)) {
+ struct scx_cpu_release_args args = {
+ .reason = preempt_reason_from_class(next_class),
+ .task = next,
+ };
+
+ SCX_CALL_OP(SCX_KF_CPU_RELEASE,
+ cpu_release, cpu_of(rq), &args);
+ }
+ /* set even without ops.cpu_release() so cpu_acquire() stays paired */
+ rq->scx.cpu_released = true;
+ }
+}
+
+/*
+ * sched_class put_prev_task hook: @p stops being @rq's running task.
+ *
+ * Notifies ops.stopping() and, if @p is still %SCX_TASK_QUEUED, makes it
+ * runnable again: either back at the head of the local DSQ when it has slice
+ * left, or through the regular enqueue path.
+ *
+ * @next is the task taking over and may be %NULL. Every use of it is guarded
+ * so that a %NULL @next is treated as "not switching to a lower class" and
+ * skips the class switch handling.
+ */
+static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
+			      struct task_struct *next)
+{
+	update_curr_scx(rq);
+
+	/* see dequeue_task_scx() on why we skip when !QUEUED */
+	if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
+		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
+
+	if (p->scx.flags & SCX_TASK_QUEUED) {
+		set_task_runnable(rq, p);
+
+		/*
+		 * If @p has slice left and is being put, @p is getting
+		 * preempted by a higher priority scheduler class or core-sched
+		 * forcing a different task. Leave it at the head of the local
+		 * DSQ.
+		 */
+		if (p->scx.slice && !scx_rq_bypassing(rq)) {
+			dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+			goto switch_class;
+		}
+
+		/*
+		 * If @p is runnable but we're about to enter a lower
+		 * sched_class, %SCX_OPS_ENQ_LAST must be set. Tell
+		 * ops.enqueue() that @p is the only one available for this cpu,
+		 * which should trigger an explicit follow-up scheduling event.
+		 *
+		 * @next may be %NULL, in which case there is no class to
+		 * compare against and @p takes the plain enqueue path.
+		 */
+		if (next && sched_class_above(&ext_sched_class, next->sched_class)) {
+			WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last));
+			do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
+		} else {
+			do_enqueue_task(rq, p, 0, -1);
+		}
+	}
+
+switch_class:
+	if (next && next->sched_class != &ext_sched_class)
+		switch_class(rq, next);
+}
+
+/* Peek at the task at the head of @rq's local DSQ, %NULL if it is empty. */
+static struct task_struct *first_local_task(struct rq *rq)
+{
+	struct list_head *local_list = &rq->scx.local_dsq.list;
+
+	return list_first_entry_or_null(local_list, struct task_struct,
+					scx.dsq_list.node);
+}
+
+/*
+ * sched_class pick_task hook. Picks @rq's current task if %SCX_RQ_BAL_KEEP
+ * says to keep it running, otherwise the first task on the local DSQ. The
+ * picked task is not removed from the DSQ here. A zero slice on the picked
+ * task is refilled with %SCX_SLICE_DFL. Returns %NULL if there is nothing to
+ * run.
+ */
+static struct task_struct *pick_task_scx(struct rq *rq)
+{
+ struct task_struct *prev = rq->curr;
+ struct task_struct *p;
+ bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+ bool kick_idle = false;
+
+ /*
+ * WORKAROUND:
+ *
+ * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
+ * have gone through balance_scx(). Unfortunately, there currently is a
+ * bug where fair could say yes on balance() but no on pick_task(),
+ * which then ends up calling pick_task_scx() without preceding
+ * balance_scx().
+ *
+ * Keep running @prev if possible and avoid stalling from entering idle
+ * without balancing.
+ *
+ * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
+ * if pick_task_scx() is called without preceding balance_scx().
+ */
+ if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
+ if (prev->scx.flags & SCX_TASK_QUEUED) {
+ keep_prev = true;
+ } else {
+ keep_prev = false;
+ kick_idle = true;
+ }
+ } else if (unlikely(keep_prev &&
+ prev->sched_class != &ext_sched_class)) {
+ /*
+ * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
+ * conditional on scx_enabled() and may have been skipped.
+ */
+ WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+ keep_prev = false;
+ }
+
+ /*
+ * If balance_scx() is telling us to keep running @prev, replenish slice
+ * if necessary and keep running @prev. Otherwise, pick the first one
+ * from the local DSQ.
+ */
+ if (keep_prev) {
+ p = prev;
+ if (!p->scx.slice)
+ p->scx.slice = SCX_SLICE_DFL;
+ } else {
+ p = first_local_task(rq);
+ if (!p) {
+ /* balance was skipped; kick the CPU so that it isn't left idle */
+ if (kick_idle)
+ scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
+ return NULL;
+ }
+
+ if (unlikely(!p->scx.slice)) {
+ /* warn only once, and not while bypassing */
+ if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
+ p->comm, p->pid, __func__);
+ scx_warned_zero_slice = true;
+ }
+ p->scx.slice = SCX_SLICE_DFL;
+ }
+ }
+
+ return p;
+}
+
+#ifdef CONFIG_SCHED_CORE
+/**
+ * scx_prio_less - Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ * @in_fi: in forced idle state
+ *
+ * Core-sched is implemented as an additional scheduling layer on top of the
+ * usual sched_class'es and needs to find out the expected task ordering. For
+ * SCX, core-sched calls this function to interrogate the task ordering.
+ *
+ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
+ * to implement the default task ordering. The older the timestamp, the higher
+ * priority the task - the global FIFO ordering matching the default scheduling
+ * behavior.
+ *
+ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
+ * implement FIFO ordering within each local DSQ. See pick_task_scx().
+ *
+ * Return: the result of ops.core_sched_before() if it is implemented and @a's
+ * rq is not bypassing; otherwise %true iff @a's core_sched_at timestamp is
+ * later than @b's. @in_fi is not used by this function.
+ */
+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi)
+{
+ /*
+ * The const qualifiers are dropped from task_struct pointers when
+ * calling ops.core_sched_before(). Accesses are controlled by the
+ * verifier.
+ */
+ if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+ (struct task_struct *)a,
+ (struct task_struct *)b);
+ else
+ return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
+}
+#endif /* CONFIG_SCHED_CORE */
+
+#ifdef CONFIG_SMP
+
+/*
+ * Clear @cpu from idle_masks.cpu and return whether the bit was set, i.e.
+ * whether the caller managed to claim @cpu as idle. With SMT active, the
+ * corresponding bits in idle_masks.smt are cleared first, regardless of the
+ * outcome.
+ */
+static bool test_and_clear_cpu_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * SMT mask should be cleared whether we can claim @cpu or not. The SMT
+ * cluster is not wholly idle either way. This also prevents
+ * scx_pick_idle_cpu() from getting caught in an infinite loop.
+ */
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+
+ /*
+ * If offline, @cpu is not its own sibling and
+ * scx_pick_idle_cpu() can get caught in an infinite loop as
+ * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
+ * is eventually cleared.
+ *
+ * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
+ * reduce memory writes, which may help alleviate cache
+ * coherence pressure.
+ */
+ if (cpumask_intersects(smt, idle_masks.smt))
+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
+ else if (cpumask_test_cpu(cpu, idle_masks.smt))
+ __cpumask_clear_cpu(cpu, idle_masks.smt);
+ }
+#endif
+ return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
+}
+
+/*
+ * Pick and claim an idle CPU out of @cpus_allowed.
+ *
+ * With SMT active, CPUs from idle_masks.smt are tried first; if there is
+ * none and %SCX_PICK_IDLE_CORE is set in @flags, -EBUSY is returned without
+ * looking at idle_masks.cpu. The candidate is claimed with
+ * test_and_clear_cpu_idle(); if the claim fails, the whole selection is
+ * retried.
+ *
+ * Returns the claimed CPU or -EBUSY if no idle CPU is available.
+ */
+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
+{
+	int cpu;
+
+	for (;;) {
+		/* start each round without a candidate */
+		cpu = nr_cpu_ids;
+
+		if (sched_smt_active()) {
+			cpu = cpumask_any_and_distribute(idle_masks.smt,
+							 cpus_allowed);
+			if (cpu >= nr_cpu_ids && (flags & SCX_PICK_IDLE_CORE))
+				return -EBUSY;
+		}
+
+		/* no candidate from the SMT mask, fall back to any idle CPU */
+		if (cpu >= nr_cpu_ids) {
+			cpu = cpumask_any_and_distribute(idle_masks.cpu,
+							 cpus_allowed);
+			if (cpu >= nr_cpu_ids)
+				return -EBUSY;
+		}
+
+		if (test_and_clear_cpu_idle(cpu))
+			return cpu;
+	}
+}
+
+/*
+ * Return the number of CPUs in the LLC domain of @cpu, or zero if @cpu has no
+ * LLC domain. The domain pointer is read with rcu_dereference().
+ */
+static unsigned int llc_weight(s32 cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, cpu));
+
+	return sd ? sd->span_weight : 0;
+}
+
+/*
+ * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
+ * domain is not defined). The domain pointer is read with rcu_dereference().
+ */
+static struct cpumask *llc_span(s32 cpu)
+{
+	struct sched_domain *sd;
+
+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
+	if (!sd)
+		return NULL;
+
+	return sched_domain_span(sd);
+}
+
+/*
+ * Return the number of CPUs in the first sched group of @cpu's NUMA domain,
+ * or zero if either the NUMA domain or its group list is not defined.
+ */
+static unsigned int numa_weight(s32 cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_numa, cpu));
+	struct sched_group *sg = sd ? sd->groups : NULL;
+
+	return sg ? sg->group_weight : 0;
+}
+
+/*
+ * Return the cpumask of the first sched group of @cpu's NUMA domain, or NULL
+ * if either the NUMA domain or its group list is not defined.
+ */
+static struct cpumask *numa_span(s32 cpu)
+{
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_numa, cpu));
+	struct sched_group *sg = sd ? sd->groups : NULL;
+
+	return sg ? sched_group_span(sg) : NULL;
+}
+
+/*
+ * Return true if the LLC domains do not perfectly overlap with the NUMA
+ * domains, false otherwise.
+ *
+ * The comparison is done by weight only: a mismatch is reported as soon as an
+ * online CPU has llc_weight() != numa_weight(). Both helpers use
+ * rcu_dereference(); update_selcpu_topology() calls this under
+ * rcu_read_lock().
+ */
+static bool llc_numa_mismatch(void)
+{
+ int cpu;
+
+ /*
+ * We need to scan all online CPUs to verify whether their scheduling
+ * domains overlap.
+ *
+ * While it is rare to encounter architectures with asymmetric NUMA
+ * topologies, CPU hotplugging or virtualized environments can result
+ * in asymmetric configurations.
+ *
+ * For example:
+ *
+ * NUMA 0:
+ * - LLC 0: cpu0..cpu7
+ * - LLC 1: cpu8..cpu15 [offline]
+ *
+ * NUMA 1:
+ * - LLC 0: cpu16..cpu23
+ * - LLC 1: cpu24..cpu31
+ *
+ * In this case, if we only check the first online CPU (cpu0), we might
+ * incorrectly assume that the LLC and NUMA domains are fully
+ * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
+ * domains).
+ */
+ for_each_online_cpu(cpu)
+ if (llc_weight(cpu) != numa_weight(cpu))
+ return true;
+
+ return false;
+}
+
+/*
+ * Initialize topology-aware scheduling.
+ *
+ * Detect if the system has multiple LLC or multiple NUMA domains and enable
+ * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
+ * selection policy.
+ *
+ * Assumption: the kernel's internal topology representation assumes that each
+ * CPU belongs to a single LLC domain, and that each LLC domain is entirely
+ * contained within a single NUMA node.
+ *
+ * The result is published through the scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa static branches.
+ * NOTE(review): the *_cpuslocked() variants are used, so the caller
+ * presumably holds the CPU hotplug lock - confirm against the callers.
+ */
+static void update_selcpu_topology(void)
+{
+ bool enable_llc = false, enable_numa = false;
+ unsigned int nr_cpus;
+ s32 cpu = cpumask_first(cpu_online_mask);
+
+ /*
+ * Enable LLC domain optimization only when there are multiple LLC
+ * domains among the online CPUs. If all online CPUs are part of a
+ * single LLC domain, the idle CPU selection logic can choose any
+ * online CPU without bias.
+ *
+ * Note that it is sufficient to check the LLC domain of the first
+ * online CPU to determine whether a single LLC domain includes all
+ * CPUs.
+ */
+ rcu_read_lock();
+ nr_cpus = llc_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus())
+ enable_llc = true;
+ pr_debug("sched_ext: LLC=%*pb weight=%u\n",
+ cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
+ }
+
+ /*
+ * Enable NUMA optimization only when there are multiple NUMA domains
+ * among the online CPUs and the NUMA domains don't perfectly overlap
+ * with the LLC domains.
+ *
+ * If all CPUs belong to the same NUMA node and the same LLC domain,
+ * enabling both NUMA and LLC optimizations is unnecessary, as checking
+ * for an idle CPU in the same domain twice is redundant.
+ */
+ nr_cpus = numa_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
+ enable_numa = true;
+ pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
+ cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
+ }
+ rcu_read_unlock();
+
+ pr_debug("sched_ext: LLC idle selection %s\n",
+ str_enabled_disabled(enable_llc));
+ pr_debug("sched_ext: NUMA idle selection %s\n",
+ str_enabled_disabled(enable_numa));
+
+ /* publish the decisions through the static branches */
+ if (enable_llc)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+ if (enable_numa)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
+}
+
+/*
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ * - always prioritize CPUs from fully idle cores (both logical CPUs are
+ * idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ * - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ * branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * - if the above conditions aren't met, pick a CPU that shares the same LLC
+ * to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node, if enabled:
+ * - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * 5. Pick any idle CPU usable by the task.
+ *
+ * Step 3 and 4 are performed only if the system has, respectively, multiple
+ * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa).
+ *
+ * Before the steps above, a %SCX_WAKE_SYNC wakeup first tries to keep the
+ * wakee on an idle, cache-affine @prev_cpu or to place it on the waker's CPU.
+ *
+ * @p: task to select a CPU for
+ * @prev_cpu: CPU @p was previously associated with
+ * @wake_flags: %SCX_WAKE_* flags of the wakeup
+ * @found: set to true if a CPU was claimed for @p, false otherwise
+ *
+ * Returns the selected CPU. If nothing suitable is found, @prev_cpu is
+ * returned with *@found set to false.
+ *
+ * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
+ * we never call ops.select_cpu() for them, see select_task_rq().
+ */
+static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
+ u64 wake_flags, bool *found)
+{
+ const struct cpumask *llc_cpus = NULL;
+ const struct cpumask *numa_cpus = NULL;
+ s32 cpu;
+
+ *found = false;
+
+ /*
+ * This is necessary to protect llc_cpus and numa_cpus, which both
+ * point into RCU-protected scheduling domain data.
+ */
+ rcu_read_lock();
+
+ /*
+ * Determine the scheduling domain only if the task is allowed to run
+ * on all CPUs.
+ *
+ * This is done primarily for efficiency, as it avoids the overhead of
+ * updating a cpumask every time we need to select an idle CPU (which
+ * can be costly in large SMP systems), but it also aligns logically:
+ * if a task's scheduling domain is restricted by user-space (through
+ * CPU affinity), the task will simply use the flat scheduling domain
+ * defined by user-space.
+ */
+ if (p->nr_cpus_allowed >= num_possible_cpus()) {
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+ numa_cpus = numa_span(prev_cpu);
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
+ llc_cpus = llc_span(prev_cpu);
+ }
+
+ /*
+ * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
+ */
+ if (wake_flags & SCX_WAKE_SYNC) {
+ cpu = smp_processor_id();
+
+ /*
+ * If the waker's CPU is cache affine and prev_cpu is idle,
+ * then avoid a migration.
+ */
+ if (cpus_share_cache(cpu, prev_cpu) &&
+ test_and_clear_cpu_idle(prev_cpu)) {
+ cpu = prev_cpu;
+ goto cpu_found;
+ }
+
+ /*
+ * If the waker's local DSQ is empty, and the system is under
+ * utilized, try to wake up @p to the local DSQ of the waker.
+ *
+ * Checking only for an empty local DSQ is insufficient as it
+ * could give the wakee an unfair advantage when the system is
+ * oversaturated.
+ *
+ * Checking only for the presence of idle CPUs is also
+ * insufficient as the local DSQ of the waker could have tasks
+ * piled up on it even if there is an idle core elsewhere on
+ * the system.
+ */
+ if (!cpumask_empty(idle_masks.cpu) &&
+ !(current->flags & PF_EXITING) &&
+ cpu_rq(cpu)->scx.local_dsq.nr == 0) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ goto cpu_found;
+ }
+ }
+
+ /*
+ * If CPU has SMT, any wholly idle CPU is likely a better pick than
+ * partially idle @prev_cpu.
+ */
+ if (sched_smt_active()) {
+ /*
+ * Keep using @prev_cpu if it's part of a fully idle core.
+ */
+ if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
+ test_and_clear_cpu_idle(prev_cpu)) {
+ cpu = prev_cpu;
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any fully idle core in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any fully idle core in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any fully idle core usable by the task.
+ */
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Use @prev_cpu if it's idle.
+ */
+ if (test_and_clear_cpu_idle(prev_cpu)) {
+ cpu = prev_cpu;
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any idle CPU in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = scx_pick_idle_cpu(llc_cpus, 0);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any idle CPU in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = scx_pick_idle_cpu(numa_cpus, 0);
+ if (cpu >= 0)
+ goto cpu_found;
+ }
+
+ /*
+ * Search for any idle CPU usable by the task.
+ */
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
+ if (cpu >= 0)
+ goto cpu_found;
+
+ /* nothing idle: stay on @prev_cpu, *found remains false */
+ rcu_read_unlock();
+ return prev_cpu;
+
+cpu_found:
+ rcu_read_unlock();
+
+ *found = true;
+ return cpu;
+}
+
+/*
+ * Select the CPU for @p on wakeup.
+ *
+ * If ops.select_cpu() is implemented and @p's rq is not bypassing, the BPF
+ * scheduler decides and an invalid answer falls back to @prev_cpu. Otherwise
+ * the built-in policy in scx_select_cpu_dfl() is used; if it claims an idle
+ * CPU, @p gets a default slice and is marked for direct dispatch to the
+ * local DSQ.
+ */
+static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
+{
+ /*
+ * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
+ * can be a good migration opportunity with low cache and memory
+ * footprint. Returning a CPU different than @prev_cpu triggers
+ * immediate rq migration. However, for SCX, as the current rq
+ * association doesn't dictate where the task is going to run, this
+ * doesn't fit well. If necessary, we can later add a dedicated method
+ * which can decide to preempt self to force it through the regular
+ * scheduling path.
+ */
+ if (unlikely(wake_flags & WF_EXEC))
+ return prev_cpu;
+
+ if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
+ s32 cpu;
+ struct task_struct **ddsp_taskp;
+
+ /* publish @p in direct_dispatch_task for the duration of the op */
+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
+ WARN_ON_ONCE(*ddsp_taskp);
+ *ddsp_taskp = p;
+
+ cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
+ select_cpu, p, prev_cpu, wake_flags);
+ *ddsp_taskp = NULL;
+ if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
+ return cpu;
+ else
+ return prev_cpu;
+ } else {
+ bool found;
+ s32 cpu;
+
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
+ if (found) {
+ p->scx.slice = SCX_SLICE_DFL;
+ p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
+ }
+ return cpu;
+ }
+}
+
+/* Task-woken hook for SCX: only runs the deferred work of @rq; @p is unused. */
+static void task_woken_scx(struct rq *rq, struct task_struct *p)
+{
+ run_deferred(rq);
+}
+
+/*
+ * Apply the affinity change described by @ac to @p through
+ * set_cpus_allowed_common() and then report the resulting effective cpumask
+ * to the BPF scheduler via ops.set_cpumask(), if implemented.
+ */
+static void set_cpus_allowed_scx(struct task_struct *p,
+ struct affinity_context *ac)
+{
+ set_cpus_allowed_common(p, ac);
+
+ /*
+ * The effective cpumask is stored in @p->cpus_ptr which may temporarily
+ * differ from the configured one in @p->cpus_mask. Always tell the bpf
+ * scheduler the effective one.
+ *
+ * Fine-grained memory write control is enforced by BPF making the const
+ * designation pointless. Cast it away when calling the operation.
+ */
+ if (SCX_HAS_OP(set_cpumask))
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
+ (struct cpumask *)p->cpus_ptr);
+}
+
+/* Reinitialize both built-in idle masks from cpu_online_mask. */
+static void reset_idle_masks(void)
+{
+ /*
+ * Consider all online cpus idle. Should converge to the actual state
+ * quickly.
+ */
+ cpumask_copy(idle_masks.cpu, cpu_online_mask);
+ cpumask_copy(idle_masks.smt, cpu_online_mask);
+}
+
+/*
+ * Record @cpu as idle or busy (@idle) in the built-in idle masks.
+ *
+ * idle_masks.cpu always tracks @cpu itself. With SMT active, idle_masks.smt
+ * gets all of @cpu's siblings set only once every sibling is set in
+ * idle_masks.cpu, and gets all of them cleared as soon as @cpu turns busy.
+ */
+static void update_builtin_idle(int cpu, bool idle)
+{
+ assign_cpu(cpu, idle_masks.cpu, idle);
+
+#ifdef CONFIG_SCHED_SMT
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+
+ if (idle) {
+ /*
+ * idle_masks.smt handling is racy but that's fine as
+ * it's only for optimization and self-correcting.
+ */
+ if (!cpumask_subset(smt, idle_masks.cpu))
+ return;
+ cpumask_or(idle_masks.smt, idle_masks.smt, smt);
+ } else {
+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
+ }
+ }
+#endif
+}
+
+/*
+ * Set the idle state of @rq's CPU to @idle. Must be called with @rq locked.
+ *
+ * @do_notify tells whether this is an actual idle state transition (idle to
+ * busy or the other way around). Only then is ops.update_idle() invoked. With
+ * @do_notify false, just the built-in idle masks are refreshed.
+ *
+ * The two cases need to be told apart because an idle CPU can be "reserved"
+ * and woken up through scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), which
+ * marks it busy even if nothing gets dispatched to it. Such a CPU may go back
+ * to idle without a real state transition. Refreshing the masks without
+ * calling ops.update_idle() keeps the idle tracking accurate while keeping
+ * the notifications seen by the BPF scheduler balanced.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+	int cpu = cpu_of(rq);
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * ops.update_idle() is only triggered when switching between a task
+	 * and the idle thread. Those transitions are flagged by @do_notify,
+	 * which is managed by put_prev_task_idle()/set_next_task_idle().
+	 */
+	if (do_notify && SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq))
+		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu, idle);
+
+	if (!static_branch_likely(&scx_builtin_idle_enabled))
+		return;
+
+	/*
+	 * The idle masks are updated:
+	 * - on real idle transitions (@do_notify == true)
+	 * - on idle-to-idle transitions, recognizable by the previous task
+	 *   being the idle thread (managed by pick_task_idle())
+	 *
+	 * If the previous task is not the idle thread, set_next_task_idle()
+	 * has already taken care of the masks while switching from the task to
+	 * the idle thread (calling this function with @do_notify == true), so
+	 * skip the update to avoid doing it twice.
+	 */
+	if (do_notify || is_idle_task(rq->curr))
+		update_builtin_idle(cpu, idle);
+}
+
+/*
+ * Common CPU hotplug handling for @rq's CPU going online (@online == true)
+ * or offline.
+ *
+ * Bumps scx_hotplug_seq, refreshes the topology-aware CPU selection settings
+ * when SCX is enabled and notifies the BPF scheduler through
+ * ops.cpu_online() / ops.cpu_offline(). If the callback matching the
+ * direction is not implemented, the scheduler is asked to exit with a
+ * restart exit code instead.
+ */
+static void handle_hotplug(struct rq *rq, bool online)
+{
+ int cpu = cpu_of(rq);
+
+ atomic_long_inc(&scx_hotplug_seq);
+
+ if (scx_enabled())
+ update_selcpu_topology();
+
+ if (online && SCX_HAS_OP(cpu_online))
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
+ else if (!online && SCX_HAS_OP(cpu_offline))
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
+ else
+ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "cpu %d going %s, exiting scheduler", cpu,
+ online ? "online" : "offline");
+}
+
+/* Run the SCX hotplug handling for @rq's CPU coming online. */
+void scx_rq_activate(struct rq *rq)
+{
+ handle_hotplug(rq, true);
+}
+
+/* Run the SCX hotplug handling for @rq's CPU going offline. */
+void scx_rq_deactivate(struct rq *rq)
+{
+ handle_hotplug(rq, false);
+}
+
+/* Mark @rq as online for SCX by setting %SCX_RQ_ONLINE. */
+static void rq_online_scx(struct rq *rq)
+{
+ rq->scx.flags |= SCX_RQ_ONLINE;
+}
+
+/* Mark @rq as offline for SCX by clearing %SCX_RQ_ONLINE. */
+static void rq_offline_scx(struct rq *rq)
+{
+ rq->scx.flags &= ~SCX_RQ_ONLINE;
+}
+
+#else /* CONFIG_SMP */
+
+/*
+ * !CONFIG_SMP stubs: there is no idle tracking, so no CPU can ever be claimed
+ * as idle and there are no masks to reset.
+ */
+static bool test_and_clear_cpu_idle(int cpu) { return false; }
+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
+static void reset_idle_masks(void) {}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Walk @rq's runnable list under the rq lock looking for a task whose
+ * runnable_at timestamp is more than scx_watchdog_timeout jiffies in the
+ * past. On the first such task a %SCX_EXIT_ERROR_STALL error is raised and
+ * the scan stops.
+ *
+ * Returns true if a timed out task was found, false otherwise.
+ */
+static bool check_rq_for_timeouts(struct rq *rq)
+{
+ struct task_struct *p;
+ struct rq_flags rf;
+ bool timed_out = false;
+
+ rq_lock_irqsave(rq, &rf);
+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
+ unsigned long last_runnable = p->scx.runnable_at;
+
+ if (unlikely(time_after(jiffies,
+ last_runnable + scx_watchdog_timeout))) {
+ u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
+
+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid,
+ dur_ms / 1000, dur_ms % 1000);
+ timed_out = true;
+ break;
+ }
+ }
+ rq_unlock_irqrestore(rq, &rf);
+
+ return timed_out;
+}
+
+/*
+ * Watchdog work function. Records the check-in time in
+ * scx_watchdog_timestamp, scans the rq of every online CPU for stalled tasks
+ * (stopping at the first timeout found) and re-queues itself to run again
+ * after half of scx_watchdog_timeout.
+ */
+static void scx_watchdog_workfn(struct work_struct *work)
+{
+ int cpu;
+
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+
+ for_each_online_cpu(cpu) {
+ if (unlikely(check_rq_for_timeouts(cpu_rq(cpu))))
+ break;
+
+ cond_resched();
+ }
+ queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+ scx_watchdog_timeout / 2);
+}
+
+/*
+ * Per-tick SCX work for @rq; does nothing unless SCX is enabled.
+ *
+ * Checks that the watchdog work itself is still making progress: if
+ * scx_watchdog_timestamp has not been refreshed for longer than
+ * scx_watchdog_timeout, a %SCX_EXIT_ERROR_STALL error is raised. Afterwards
+ * update_other_load_avgs() is called for @rq.
+ */
+void scx_tick(struct rq *rq)
+{
+ unsigned long last_check;
+
+ if (!scx_enabled())
+ return;
+
+ last_check = READ_ONCE(scx_watchdog_timestamp);
+ if (unlikely(time_after(jiffies,
+ last_check + READ_ONCE(scx_watchdog_timeout)))) {
+ u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
+
+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
+ "watchdog failed to check in for %u.%03us",
+ dur_ms / 1000, dur_ms % 1000);
+ }
+
+ update_other_load_avgs(rq);
+}
+
+/*
+ * Tick handler for the SCX task @curr running on @rq. Updates the current
+ * task through update_curr_scx(), lets the BPF scheduler observe the tick
+ * via ops.tick() unless bypassing, and requests a reschedule once @curr's
+ * slice is zero. @queued is unused.
+ */
+static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
+{
+ update_curr_scx(rq);
+
+ /*
+ * While disabling, always resched and refresh core-sched timestamp as
+ * we can't trust the slice management or ops.core_sched_before().
+ */
+ if (scx_rq_bypassing(rq)) {
+ curr->scx.slice = 0;
+ touch_core_sched(rq, curr);
+ } else if (SCX_HAS_OP(tick)) {
+ SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
+ }
+
+ if (!curr->scx.slice)
+ resched_curr(rq);
+}
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+/*
+ * Map @tg to the cgroup it belongs to, falling back to the default root
+ * cgroup when @tg doesn't have one.
+ */
+static struct cgroup *tg_cgrp(struct task_group *tg)
+{
+	/*
+	 * @tg is NULL when CGROUP_SCHED is disabled and @tg->css.cgroup is
+	 * NULL when @tg is an autogroup. Either way, @tg can be treated as
+	 * the root cgroup.
+	 */
+	if (!tg || !tg->css.cgroup)
+		return &cgrp_dfl_root.cgrp;
+
+	return tg->css.cgroup;
+}
+
+#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg),
+
+#else /* CONFIG_EXT_GROUP_SCHED */
+
+#define SCX_INIT_TASK_ARGS_CGROUP(tg)
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+/* Extract the SCX task state stored in the state bits of @p->scx.flags. */
+static enum scx_task_state scx_get_task_state(const struct task_struct *p)
+{
+ return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
+}
+
+/*
+ * Move @p to SCX task state @state by rewriting the state bits of
+ * @p->scx.flags.
+ *
+ * Warns once when the transition is unexpected: INIT is only expected from
+ * NONE, READY from anything but NONE and ENABLED only from READY. NONE is
+ * always accepted. An unknown @state also triggers the warning and leaves
+ * @p->scx.flags untouched.
+ */
+static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
+{
+	enum scx_task_state prev_state = scx_get_task_state(p);
+	bool warn = false;
+
+	BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS));
+
+	switch (state) {
+	case SCX_TASK_NONE:
+		break;
+	case SCX_TASK_INIT:
+		warn = prev_state != SCX_TASK_NONE;
+		break;
+	case SCX_TASK_READY:
+		warn = prev_state == SCX_TASK_NONE;
+		break;
+	case SCX_TASK_ENABLED:
+		warn = prev_state != SCX_TASK_READY;
+		break;
+	default:
+		/*
+		 * Unknown state. Warn right here, as returning early skips the
+		 * WARN_ONCE() below, and don't store the bogus value in the
+		 * flags.
+		 */
+		WARN_ONCE(true, "sched_ext: Invalid task state transition %d -> %d for %s[%d]",
+			  prev_state, state, p->comm, p->pid);
+		return;
+	}
+
+	WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]",
+		  prev_state, state, p->comm, p->pid);
+
+	p->scx.flags &= ~SCX_TASK_STATE_MASK;
+	p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
+}
+
+/*
+ * Initialize @p for the BPF scheduler.
+ * @p: task to initialize
+ * @tg: task group of @p, used to fill in the cgroup argument when
+ * CONFIG_EXT_GROUP_SCHED is enabled
+ * @fork: true when called for a task being forked
+ *
+ * Calls ops.init_task() if implemented and moves @p to %SCX_TASK_INIT. If
+ * the BPF scheduler set @p->scx.disallow, @p is kept out of SCX: outside of
+ * fork its policy is reverted to SCHED_NORMAL, during fork of a SCHED_EXT
+ * task a scheduler error is raised.
+ *
+ * Returns 0 on success or the sanitized error returned by ops.init_task().
+ */
+static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork)
+{
+ int ret;
+
+ p->scx.disallow = false;
+
+ if (SCX_HAS_OP(init_task)) {
+ struct scx_init_task_args args = {
+ SCX_INIT_TASK_ARGS_CGROUP(tg)
+ .fork = fork,
+ };
+
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
+ if (unlikely(ret)) {
+ ret = ops_sanitize_err("init_task", ret);
+ return ret;
+ }
+ }
+
+ scx_set_task_state(p, SCX_TASK_INIT);
+
+ if (p->scx.disallow) {
+ if (!fork) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * We're in the load path and @p->policy will be applied
+ * right after. Reverting @p->policy here and rejecting
+ * %SCHED_EXT transitions from scx_check_setscheduler()
+ * guarantees that if ops.init_task() sets @p->disallow,
+ * @p can never be in SCX.
+ */
+ if (p->policy == SCHED_EXT) {
+ p->policy = SCHED_NORMAL;
+ atomic_long_inc(&scx_nr_rejected);
+ }
+
+ task_rq_unlock(rq, p, &rf);
+ } else if (p->policy == SCHED_EXT) {
+ scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork",
+ p->comm, p->pid);
+ }
+ }
+
+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
+ return 0;
+}
+
+/*
+ * Enable @p on the BPF scheduler. Must be called with @p's rq locked.
+ *
+ * Computes @p->scx.weight from the task's policy / static priority, calls
+ * ops.enable(), moves @p to %SCX_TASK_ENABLED and finally reports the weight
+ * through ops.set_weight().
+ */
+static void scx_ops_enable_task(struct task_struct *p)
+{
+ u32 weight;
+
+ lockdep_assert_rq_held(task_rq(p));
+
+ /*
+ * Set the weight before calling ops.enable() so that the scheduler
+ * doesn't see a stale value if they inspect the task struct.
+ */
+ if (task_has_idle_policy(p))
+ weight = WEIGHT_IDLEPRIO;
+ else
+ weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO];
+
+ p->scx.weight = sched_weight_to_cgroup(weight);
+
+ if (SCX_HAS_OP(enable))
+ SCX_CALL_OP_TASK(SCX_KF_REST, enable, p);
+ scx_set_task_state(p, SCX_TASK_ENABLED);
+
+ if (SCX_HAS_OP(set_weight))
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+}
+
+/*
+ * Disable @p on the BPF scheduler: call ops.disable() if implemented and
+ * move @p from %SCX_TASK_ENABLED back to %SCX_TASK_READY. Must be called with
+ * @p's rq locked; warns once if @p is not in the ENABLED state.
+ */
+static void scx_ops_disable_task(struct task_struct *p)
+{
+ lockdep_assert_rq_held(task_rq(p));
+ WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
+
+ if (SCX_HAS_OP(disable))
+ SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
+ scx_set_task_state(p, SCX_TASK_READY);
+}
+
+/*
+ * Tear down @p's BPF scheduler state. Must be called with @p's rq locked.
+ *
+ * Nothing is done for a task in %SCX_TASK_NONE. An ENABLED task is disabled
+ * first. ops.exit_task() is then called, with args.cancelled set when @p
+ * only got as far as %SCX_TASK_INIT, and @p is moved to %SCX_TASK_NONE.
+ */
+static void scx_ops_exit_task(struct task_struct *p)
+{
+ struct scx_exit_task_args args = {
+ .cancelled = false,
+ };
+
+ lockdep_assert_rq_held(task_rq(p));
+
+ switch (scx_get_task_state(p)) {
+ case SCX_TASK_NONE:
+ return;
+ case SCX_TASK_INIT:
+ args.cancelled = true;
+ break;
+ case SCX_TASK_READY:
+ break;
+ case SCX_TASK_ENABLED:
+ scx_ops_disable_task(p);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ return;
+ }
+
+ if (SCX_HAS_OP(exit_task))
+ SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
+ scx_set_task_state(p, SCX_TASK_NONE);
+}
+
+/*
+ * Initialize @scx to its pristine state: everything zeroed, list / rbtree
+ * nodes initialized as unlinked, no sticky or holding CPU, no pending direct
+ * dispatch, runnable_at set to the current jiffies and the default slice.
+ */
+void init_scx_entity(struct sched_ext_entity *scx)
+{
+ memset(scx, 0, sizeof(*scx));
+ INIT_LIST_HEAD(&scx->dsq_list.node);
+ RB_CLEAR_NODE(&scx->dsq_priq);
+ scx->sticky_cpu = -1;
+ scx->holding_cpu = -1;
+ INIT_LIST_HEAD(&scx->runnable_node);
+ scx->runnable_at = jiffies;
+ scx->ddsp_dsq_id = SCX_DSQ_INVALID;
+ scx->slice = SCX_SLICE_DFL;
+}
+
+/*
+ * Start of the SCX fork sequence: take scx_fork_rwsem for reading. It is
+ * released again by scx_post_fork() or scx_cancel_fork(). @p is unused here.
+ */
+void scx_pre_fork(struct task_struct *p)
+{
+ /*
+ * BPF scheduler enable/disable paths want to be able to iterate and
+ * update all tasks which can become complex when racing forks. As
+ * enable/disable are very cold paths, let's use a percpu_rwsem to
+ * exclude forks.
+ */
+ percpu_down_read(&scx_fork_rwsem);
+}
+
+/*
+ * Initialize the forked task @p for the BPF scheduler when task
+ * initialization is enabled. Expects scx_fork_rwsem to be held.
+ *
+ * Returns 0 if there is nothing to do, otherwise whatever
+ * scx_ops_init_task() returns.
+ */
+int scx_fork(struct task_struct *p)
+{
+	percpu_rwsem_assert_held(&scx_fork_rwsem);
+
+	if (!scx_ops_init_task_enabled)
+		return 0;
+
+	return scx_ops_init_task(p, task_group(p), true);
+}
+
+/*
+ * Finish a successful fork of @p: mark it READY (and enable it right away if
+ * it already runs on sched_ext) when task initialization is enabled, add it
+ * to the scx_tasks list and drop the scx_fork_rwsem read lock.
+ */
+void scx_post_fork(struct task_struct *p)
+{
+ if (scx_ops_init_task_enabled) {
+ scx_set_task_state(p, SCX_TASK_READY);
+
+ /*
+ * Enable the task immediately if it's running on sched_ext.
+ * Otherwise, it'll be enabled in switching_to_scx() if and
+ * when it's ever configured to run with a SCHED_EXT policy.
+ */
+ if (p->sched_class == &ext_sched_class) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ scx_ops_enable_task(p);
+ task_rq_unlock(rq, p, &rf);
+ }
+ }
+
+ spin_lock_irq(&scx_tasks_lock);
+ list_add_tail(&p->scx.tasks_node, &scx_tasks);
+ spin_unlock_irq(&scx_tasks_lock);
+
+ percpu_up_read(&scx_fork_rwsem);
+}
+
+/*
+ * Undo scx_fork() for a fork that failed: when SCX is enabled, run the task
+ * exit path for @p under its rq lock (warning once if @p already reached
+ * READY or beyond), then drop the scx_fork_rwsem read lock.
+ */
+void scx_cancel_fork(struct task_struct *p)
+{
+ if (scx_enabled()) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ rq = task_rq_lock(p, &rf);
+ WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
+ scx_ops_exit_task(p);
+ task_rq_unlock(rq, p, &rf);
+ }
+
+ percpu_up_read(&scx_fork_rwsem);
+}
+
+/*
+ * Release the SCX state of @p: unlink it from scx_tasks and, if it still has
+ * a task state other than %SCX_TASK_NONE, run the task exit path under its
+ * rq lock.
+ */
+void sched_ext_free(struct task_struct *p)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&scx_tasks_lock, flags);
+ list_del_init(&p->scx.tasks_node);
+ spin_unlock_irqrestore(&scx_tasks_lock, flags);
+
+ /*
+ * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY ->
+ * ENABLED transitions can't race us. Disable ops for @p.
+ */
+ if (scx_get_task_state(p) != SCX_TASK_NONE) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ scx_ops_exit_task(p);
+ task_rq_unlock(rq, p, &rf);
+ }
+}
+
+/*
+ * Update @p's SCX weight from the load weight @lw and report the new value
+ * through ops.set_weight() if implemented. Must be called with @p's rq
+ * locked.
+ */
+static void reweight_task_scx(struct rq *rq, struct task_struct *p,
+ const struct load_weight *lw)
+{
+ lockdep_assert_rq_held(task_rq(p));
+
+ p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
+ if (SCX_HAS_OP(set_weight))
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+}
+
+/* Priority-change hook; intentionally empty for SCX. */
+static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
+{
+}
+
+/*
+ * @p is about to switch to the SCX class: enable it on the BPF scheduler and
+ * bring the scheduler's view of @p's cpumask up to date.
+ */
+static void switching_to_scx(struct rq *rq, struct task_struct *p)
+{
+ scx_ops_enable_task(p);
+
+ /*
+ * set_cpus_allowed_scx() is not called while @p is associated with a
+ * different scheduler class. Keep the BPF scheduler up-to-date.
+ */
+ if (SCX_HAS_OP(set_cpumask))
+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
+ (struct cpumask *)p->cpus_ptr);
+}
+
+/* @p has left the SCX class: disable it on the BPF scheduler. */
+static void switched_from_scx(struct rq *rq, struct task_struct *p)
+{
+ scx_ops_disable_task(p);
+}
+
+/* sched_class hooks which need no work for SCX; intentionally empty. */
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
+
+/*
+ * Check whether @p may be switched to @policy. Must be called with @p's rq
+ * locked.
+ *
+ * Returns -EACCES if SCX is enabled, @p->scx.disallow is set and the change
+ * would move @p into %SCHED_EXT from another policy; 0 otherwise.
+ */
+int scx_check_setscheduler(struct task_struct *p, int policy)
+{
+ lockdep_assert_rq_held(task_rq(p));
+
+ /* if disallow, reject transitioning into SCX */
+ if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
+ p->policy != policy && policy == SCHED_EXT)
+ return -EACCES;
+
+ return 0;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * Tell whether SCX allows the tick to be stopped on @rq.
+ *
+ * Never while bypassing. Always when the current task is not an SCX task.
+ * Otherwise only if %SCX_RQ_CAN_STOP_TICK is set on @rq.
+ */
+bool scx_can_stop_tick(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ if (scx_rq_bypassing(rq))
+ return false;
+
+ if (p->sched_class != &ext_sched_class)
+ return true;
+
+ /*
+ * @rq can dispatch from different DSQs, so we can't tell whether it
+ * needs the tick or not by looking at nr_running. Allow stopping ticks
+ * iff the BPF scheduler indicated so. See set_next_task_scx().
+ */
+ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
+}
+#endif
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+
+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
+static bool scx_cgroup_enabled;
+static bool cgroup_warned_missing_weight;
+static bool cgroup_warned_missing_idle;
+
+/*
+ * Warn, at most once, that the loaded BPF scheduler does not implement
+ * cgroup cpu.weight. Nothing is printed while SCX is disabled, when the
+ * scheduler sets %SCX_OPS_HAS_CGROUP_WEIGHT, or when @tg has no parent css.
+ */
+static void scx_cgroup_warn_missing_weight(struct task_group *tg)
+{
+ if (scx_ops_enable_state() == SCX_OPS_DISABLED ||
+ cgroup_warned_missing_weight)
+ return;
+
+ if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent)
+ return;
+
+ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n",
+ scx_ops.name);
+ cgroup_warned_missing_weight = true;
+}
+
+/*
+ * Warn, at most once, that the loaded BPF scheduler does not implement
+ * cgroup cpu.idle. Only triggers while SCX cgroup support is enabled and
+ * @tg->idle is set.
+ */
+static void scx_cgroup_warn_missing_idle(struct task_group *tg)
+{
+ if (!scx_cgroup_enabled || cgroup_warned_missing_idle)
+ return;
+
+ if (!tg->idle)
+ return;
+
+ pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n",
+ scx_ops.name);
+ cgroup_warned_missing_idle = true;
+}
+
+/*
+ * Bring @tg online for SCX.
+ *
+ * With SCX cgroup support enabled, ops.cgroup_init() is called (if
+ * implemented) and @tg is marked ONLINE and INITED on success. Otherwise @tg
+ * is only marked ONLINE.
+ *
+ * Returns 0 on success or the sanitized error from ops.cgroup_init(), in
+ * which case no flag is set on @tg.
+ */
+int scx_tg_online(struct task_group *tg)
+{
+ int ret = 0;
+
+ WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
+
+ percpu_down_read(&scx_cgroup_rwsem);
+
+ scx_cgroup_warn_missing_weight(tg);
+
+ if (scx_cgroup_enabled) {
+ if (SCX_HAS_OP(cgroup_init)) {
+ struct scx_cgroup_init_args args =
+ { .weight = tg->scx_weight };
+
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ tg->css.cgroup, &args);
+ if (ret)
+ ret = ops_sanitize_err("cgroup_init", ret);
+ }
+ if (ret == 0)
+ tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
+ } else {
+ tg->scx_flags |= SCX_TG_ONLINE;
+ }
+
+ percpu_up_read(&scx_cgroup_rwsem);
+ return ret;
+}
+
+/*
+ * Take @tg offline for SCX: call ops.cgroup_exit() if @tg was initialized by
+ * the BPF scheduler and clear both the ONLINE and INITED flags. Warns once if
+ * @tg was not online.
+ */
+void scx_tg_offline(struct task_group *tg)
+{
+ WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
+
+ percpu_down_read(&scx_cgroup_rwsem);
+
+ if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
+ tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
+
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+/*
+ * Prepare the cgroup migration of all tasks in @tset.
+ *
+ * For every task that actually changes cgroup, ops.cgroup_prep_move() is
+ * called (if implemented) and the source cgroup is remembered in
+ * @p->scx.cgrp_moving_from.
+ *
+ * Locking: scx_cgroup_rwsem is taken for reading here. On success (return 0,
+ * including when SCX cgroup support is disabled) it intentionally stays held
+ * and is released by scx_cgroup_finish_attach() or
+ * scx_cgroup_cancel_attach(). On failure, the moves prepared so far are
+ * cancelled, the lock is dropped and the sanitized error is returned.
+ */
+int scx_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct task_struct *p;
+ int ret;
+
+ /* released in scx_finish/cancel_attach() */
+ percpu_down_read(&scx_cgroup_rwsem);
+
+ if (!scx_cgroup_enabled)
+ return 0;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ struct cgroup *from = tg_cgrp(task_group(p));
+ struct cgroup *to = tg_cgrp(css_tg(css));
+
+ WARN_ON_ONCE(p->scx.cgrp_moving_from);
+
+ /*
+ * sched_move_task() omits identity migrations. Let's match the
+ * behavior so that ops.cgroup_prep_move() and ops.cgroup_move()
+ * always match one-to-one.
+ */
+ if (from == to)
+ continue;
+
+ if (SCX_HAS_OP(cgroup_prep_move)) {
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
+ p, from, css->cgroup);
+ if (ret)
+ goto err;
+ }
+
+ p->scx.cgrp_moving_from = from;
+ }
+
+ return 0;
+
+err:
+ /* undo the tasks already prepared; the others have no cgrp_moving_from */
+ cgroup_taskset_for_each(p, css, tset) {
+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
+ p->scx.cgrp_moving_from, css->cgroup);
+ p->scx.cgrp_moving_from = NULL;
+ }
+
+ percpu_up_read(&scx_cgroup_rwsem);
+ return ops_sanitize_err("cgroup_prep_move", ret);
+}
+
+/*
+ * Report the cgroup move of @p to the scheduler through ops.cgroup_move(),
+ * passing the recorded source cgroup and @p's current task group's cgroup,
+ * then clear @p's cgrp_moving_from. Does nothing while cgroup support is
+ * disabled.
+ */
+void scx_cgroup_move_task(struct task_struct *p)
+{
+ if (!scx_cgroup_enabled)
+ return;
+
+ /*
+ * @p must have ops.cgroup_prep_move() called on it and thus
+ * cgrp_moving_from set.
+ */
+ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
+ p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
+ p->scx.cgrp_moving_from = NULL;
+}
+
+/*
+ * Drop the scx_cgroup_rwsem read lock that scx_cgroup_can_attach() left held
+ * on success.
+ */
+void scx_cgroup_finish_attach(void)
+{
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+/*
+ * Abort a cgroup migration of the tasks in @tset: call
+ * ops.cgroup_cancel_move() for every task that still has cgrp_moving_from set,
+ * clear that marker and release the scx_cgroup_rwsem read lock.
+ */
+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct task_struct *p;
+
+ /* the read lock must be dropped even when there is nothing to cancel */
+ if (!scx_cgroup_enabled)
+ goto out_unlock;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
+ p->scx.cgrp_moving_from, css->cgroup);
+ p->scx.cgrp_moving_from = NULL;
+ }
+out_unlock:
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+/*
+ * Record a new @weight for @tg and forward it to ops.cgroup_set_weight() when
+ * the scheduler implements it. Nothing happens while cgroup support is
+ * disabled or when @weight equals the value already stored in @tg.
+ */
+void scx_group_set_weight(struct task_group *tg, unsigned long weight)
+{
+	percpu_down_read(&scx_cgroup_rwsem);
+
+	if (!scx_cgroup_enabled || tg->scx_weight == weight)
+		goto out_unlock;
+
+	if (SCX_HAS_OP(cgroup_set_weight))
+		SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
+			    tg_cgrp(tg), weight);
+	tg->scx_weight = weight;
+
+out_unlock:
+	percpu_up_read(&scx_cgroup_rwsem);
+}
+
+/*
+ * Hook for a cpu.idle change on @tg. @idle is not used here; the function
+ * only gives scx_cgroup_warn_missing_idle() a chance to warn, under the
+ * scx_cgroup_rwsem read lock.
+ */
+void scx_group_set_idle(struct task_group *tg, bool idle)
+{
+ percpu_down_read(&scx_cgroup_rwsem);
+ scx_cgroup_warn_missing_idle(tg);
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
+/* Take scx_cgroup_rwsem for writing. */
+static void scx_cgroup_lock(void)
+{
+ percpu_down_write(&scx_cgroup_rwsem);
+}
+
+/* Release the write side of scx_cgroup_rwsem taken by scx_cgroup_lock(). */
+static void scx_cgroup_unlock(void)
+{
+ percpu_up_write(&scx_cgroup_rwsem);
+}
+
+#else /* CONFIG_EXT_GROUP_SCHED */
+
+/* no-op stand-ins used when CONFIG_EXT_GROUP_SCHED is not set */
+static inline void scx_cgroup_lock(void) {}
+static inline void scx_cgroup_unlock(void) {}
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+/*
+ * Omitted operations:
+ *
+ * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
+ * isn't tied to the CPU at that point. Preemption is implemented by resetting
+ * the victim task's slice to 0 and triggering reschedule on the target CPU.
+ *
+ * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
+ *
+ * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
+ * their current sched_class. Call them directly from sched core instead.
+ */
+/* sched_class callback table for the sched_ext scheduling class */
+DEFINE_SCHED_CLASS(ext) = {
+ .enqueue_task = enqueue_task_scx,
+ .dequeue_task = dequeue_task_scx,
+ .yield_task = yield_task_scx,
+ .yield_to_task = yield_to_task_scx,
+
+ .wakeup_preempt = wakeup_preempt_scx,
+
+ .balance = balance_scx,
+ .pick_task = pick_task_scx,
+
+ .put_prev_task = put_prev_task_scx,
+ .set_next_task = set_next_task_scx,
+
+#ifdef CONFIG_SMP
+ /* these callbacks are only wired up on CONFIG_SMP builds */
+ .select_task_rq = select_task_rq_scx,
+ .task_woken = task_woken_scx,
+ .set_cpus_allowed = set_cpus_allowed_scx,
+
+ .rq_online = rq_online_scx,
+ .rq_offline = rq_offline_scx,
+#endif
+
+ .task_tick = task_tick_scx,
+
+ .switching_to = switching_to_scx,
+ .switched_from = switched_from_scx,
+ .switched_to = switched_to_scx,
+ .reweight_task = reweight_task_scx,
+ .prio_changed = prio_changed_scx,
+
+ .update_curr = update_curr_scx,
+
+#ifdef CONFIG_UCLAMP_TASK
+ .uclamp_enabled = 1,
+#endif
+};
+
+/* Reset @dsq to an empty dispatch queue carrying the ID @dsq_id. */
+static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
+{
+	/* start from an all-zero structure, then set up the live members */
+	memset(dsq, 0, sizeof(*dsq));
+
+	dsq->id = dsq_id;
+	INIT_LIST_HEAD(&dsq->list);
+	raw_spin_lock_init(&dsq->lock);
+}
+
+/*
+ * Allocate a dispatch queue with ID @dsq_id on NUMA node @node and insert it
+ * into dsq_hash.
+ *
+ * Returns the new queue, or an ERR_PTR(): -EINVAL when @dsq_id carries
+ * SCX_DSQ_FLAG_BUILTIN, -ENOMEM when the allocation fails, or the error
+ * reported by rhashtable_insert_fast().
+ */
+static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
+{
+	struct scx_dispatch_q *new_dsq;
+	int err;
+
+	if (dsq_id & SCX_DSQ_FLAG_BUILTIN)
+		return ERR_PTR(-EINVAL);
+
+	new_dsq = kmalloc_node(sizeof(*new_dsq), GFP_KERNEL, node);
+	if (!new_dsq)
+		return ERR_PTR(-ENOMEM);
+
+	init_dsq(new_dsq, dsq_id);
+
+	err = rhashtable_insert_fast(&dsq_hash, &new_dsq->hash_node,
+				     dsq_hash_params);
+	if (!err)
+		return new_dsq;
+
+	/* never became visible through the hash table, free it directly */
+	kfree(new_dsq);
+	return ERR_PTR(err);
+}
+
+/*
+ * irq_work callback: detach everything queued on dsqs_to_free and hand each
+ * dispatch queue to kfree_rcu().
+ */
+static void free_dsq_irq_workfn(struct irq_work *irq_work)
+{
+ struct llist_node *to_free = llist_del_all(&dsqs_to_free);
+ struct scx_dispatch_q *dsq, *tmp_dsq;
+
+ llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
+ kfree_rcu(dsq, rcu);
+}
+
+/* queued by destroy_dsq() after it adds a queue to dsqs_to_free */
+static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
+
+/*
+ * Destroy the user dispatch queue identified by @dsq_id.
+ *
+ * Nothing happens when no such queue is found. A queue that still holds tasks
+ * is not destroyed; scx_ops_error() is raised instead. Otherwise the queue is
+ * removed from dsq_hash, marked dead and queued for freeing from irq_work.
+ */
+static void destroy_dsq(u64 dsq_id)
+{
+ struct scx_dispatch_q *dsq;
+ unsigned long flags;
+
+ rcu_read_lock();
+
+ dsq = find_user_dsq(dsq_id);
+ if (!dsq)
+ goto out_unlock_rcu;
+
+ raw_spin_lock_irqsave(&dsq->lock, flags);
+
+ if (dsq->nr) {
+ scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)",
+ dsq->id, dsq->nr);
+ goto out_unlock_dsq;
+ }
+
+ /* a non-zero return means the removal failed; leave the queue alone */
+ if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params))
+ goto out_unlock_dsq;
+
+ /*
+ * Mark dead by invalidating ->id to prevent dispatch_enqueue() from
+ * queueing more tasks. As this function can be called from anywhere,
+ * freeing is bounced through an irq work to avoid nesting RCU
+ * operations inside scheduler locks.
+ */
+ dsq->id = SCX_DSQ_INVALID;
+ llist_add(&dsq->free_node, &dsqs_to_free);
+ irq_work_queue(&free_dsq_irq_work);
+
+out_unlock_dsq:
+ raw_spin_unlock_irqrestore(&dsq->lock, flags);
+out_unlock_rcu:
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+/*
+ * Disable cgroup support and exit every task group that has SCX_TG_INITED
+ * set, calling ops.cgroup_exit() on each when the scheduler implements it.
+ * scx_cgroup_rwsem must be held by the caller.
+ */
+static void scx_cgroup_exit(void)
+{
+ struct cgroup_subsys_state *css;
+
+ percpu_rwsem_assert_held(&scx_cgroup_rwsem);
+
+ scx_cgroup_enabled = false;
+
+ /*
+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * cgroups and exit all the inited ones, all online cgroups are exited.
+ */
+ rcu_read_lock();
+ css_for_each_descendant_post(css, &root_task_group.css) {
+ struct task_group *tg = css_tg(css);
+
+ if (!(tg->scx_flags & SCX_TG_INITED))
+ continue;
+ tg->scx_flags &= ~SCX_TG_INITED;
+
+ if (!scx_ops.cgroup_exit)
+ continue;
+
+ /* hold a reference on @css while RCU is dropped for the callback */
+ if (WARN_ON_ONCE(!css_tryget(css)))
+ continue;
+ rcu_read_unlock();
+
+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
+
+ rcu_read_lock();
+ css_put(css);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Initialize every task group that is online but not yet inited and then
+ * enable cgroup support. scx_cgroup_rwsem must be held by the caller.
+ *
+ * Returns 0 on success. If ops.cgroup_init() fails for a group, the walk stops,
+ * scx_ops_error() is raised and the callback's return value is returned as is;
+ * cgroup support is not enabled in that case.
+ */
+static int scx_cgroup_init(void)
+{
+ struct cgroup_subsys_state *css;
+ int ret;
+
+ percpu_rwsem_assert_held(&scx_cgroup_rwsem);
+
+ /* re-arm the one-time warnings */
+ cgroup_warned_missing_weight = false;
+ cgroup_warned_missing_idle = false;
+
+ /*
+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * cgroups and init, all online cgroups are initialized.
+ */
+ rcu_read_lock();
+ css_for_each_descendant_pre(css, &root_task_group.css) {
+ struct task_group *tg = css_tg(css);
+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
+
+ scx_cgroup_warn_missing_weight(tg);
+ scx_cgroup_warn_missing_idle(tg);
+
+ /* only groups that are ONLINE and not yet INITED need work */
+ if ((tg->scx_flags &
+ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
+ continue;
+
+ if (!scx_ops.cgroup_init) {
+ tg->scx_flags |= SCX_TG_INITED;
+ continue;
+ }
+
+ /* hold a reference on @css while RCU is dropped for the callback */
+ if (WARN_ON_ONCE(!css_tryget(css)))
+ continue;
+ rcu_read_unlock();
+
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ css->cgroup, &args);
+ if (ret) {
+ /* RCU read lock is not held at this point */
+ css_put(css);
+ scx_ops_error("ops.cgroup_init() failed (%d)", ret);
+ return ret;
+ }
+ tg->scx_flags |= SCX_TG_INITED;
+
+ rcu_read_lock();
+ css_put(css);
+ }
+ rcu_read_unlock();
+
+ WARN_ON_ONCE(scx_cgroup_enabled);
+ scx_cgroup_enabled = true;
+
+ return 0;
+}
+
+#else
+/* stubs for builds without CONFIG_EXT_GROUP_SCHED: nothing to exit or init */
+static void scx_cgroup_exit(void) {}
+static int scx_cgroup_init(void) { return 0; }
+#endif
+
+
+/********************************************************************************
+ * Sysfs interface and ops enable/disable.
+ */
+
+/*
+ * Define the kobj_attribute scx_attr_<_name>: a mode 0444 attribute named
+ * <_name> whose ->show is scx_attr_<_name>_show(). No ->store is set.
+ */
+#define SCX_ATTR(_name) \
+ static struct kobj_attribute scx_attr_##_name = { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .show = scx_attr_##_name##_show, \
+ }
+
+/* sysfs ->show: emit the string for the current ops enable state */
+static ssize_t scx_attr_state_show(struct kobject *kobj,
+				   struct kobj_attribute *ka, char *buf)
+{
+	const char *state_str = scx_ops_enable_state_str[scx_ops_enable_state()];
+
+	return sysfs_emit(buf, "%s\n", state_str);
+}
+SCX_ATTR(state);
+
+/* sysfs ->show: emit the current value of scx_switching_all as a decimal */
+static ssize_t scx_attr_switch_all_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all));
+}
+SCX_ATTR(switch_all);
+
+/* sysfs ->show: emit the current value of the scx_nr_rejected counter */
+static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
+					 struct kobj_attribute *ka, char *buf)
+{
+	long nr_rejected = atomic_long_read(&scx_nr_rejected);
+
+	return sysfs_emit(buf, "%ld\n", nr_rejected);
+}
+SCX_ATTR(nr_rejected);
+
+/* sysfs ->show: emit the current value of scx_hotplug_seq */
+static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
+					 struct kobj_attribute *ka, char *buf)
+{
+	long hotplug_seq = atomic_long_read(&scx_hotplug_seq);
+
+	return sysfs_emit(buf, "%ld\n", hotplug_seq);
+}
+SCX_ATTR(hotplug_seq);
+
+/* sysfs ->show: emit the current value of scx_enable_seq */
+static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
+					struct kobj_attribute *ka, char *buf)
+{
+	long enable_seq = atomic_long_read(&scx_enable_seq);
+
+	return sysfs_emit(buf, "%ld\n", enable_seq);
+}
+SCX_ATTR(enable_seq);
+
+/* NULL-terminated list of the global attributes defined above */
+static struct attribute *scx_global_attrs[] = {
+ &scx_attr_state.attr,
+ &scx_attr_switch_all.attr,
+ &scx_attr_nr_rejected.attr,
+ &scx_attr_hotplug_seq.attr,
+ &scx_attr_enable_seq.attr,
+ NULL,
+};
+
+/* attribute group wrapping scx_global_attrs */
+static const struct attribute_group scx_global_attr_group = {
+ .attrs = scx_global_attrs,
+};
+
+/* kobj_type ->release for scx_ktype: free the kobject's memory */
+static void scx_kobj_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
+/* sysfs ->show: emit scx_ops.name */
+static ssize_t scx_attr_ops_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", scx_ops.name);
+}
+SCX_ATTR(ops);
+
+/* NULL-terminated attribute list used as scx_ktype's default groups */
+static struct attribute *scx_sched_attrs[] = {
+ &scx_attr_ops.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(scx_sched);
+
+/* kobject type: freed by scx_kobj_release(), exposes the scx_sched groups */
+static const struct kobj_type scx_ktype = {
+ .release = scx_kobj_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = scx_sched_groups,
+};
+
+/* kset ->uevent: add SCXOPS=<scx_ops.name> to the uevent environment */
+static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
+{
+ return add_uevent_var(env, "SCXOPS=%s", scx_ops.name);
+}
+
+static const struct kset_uevent_ops scx_uevent_ops = {
+ .uevent = scx_uevent,
+};
+
+/*
+ * Used by sched_fork() and __setscheduler_prio() to pick the matching
+ * sched_class. dl/rt are already handled.
+ */
+bool task_should_scx(int policy)
+{
+	/* nothing belongs on SCX while it is off or being torn down */
+	if (!scx_enabled())
+		return false;
+	if (unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
+		return false;
+
+	/* with switch-all in effect the policy doesn't matter */
+	return READ_ONCE(scx_switching_all) || policy == SCHED_EXT;
+}
+
+/**
+ * scx_softlockup - sched_ext softlockup handler
+ * @dur_s: number of seconds of CPU stuck due to soft lockup
+ *
+ * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
+ * live-lock the system by making many CPUs target the same DSQ to the point
+ * where soft-lockup detection triggers. This function is called from
+ * soft-lockup watchdog when the triggering point is close and tries to unjam
+ * the system by enabling the breather and aborting the BPF scheduler.
+ */
+void scx_softlockup(u32 dur_s)
+{
+ /* only act while a scheduler is being enabled or is enabled */
+ switch (scx_ops_enable_state()) {
+ case SCX_OPS_ENABLING:
+ case SCX_OPS_ENABLED:
+ break;
+ default:
+ return;
+ }
+
+ /* allow only one instance, cleared at the end of scx_ops_bypass() */
+ if (test_and_set_bit(0, &scx_in_softlockup))
+ return;
+
+ printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
+ smp_processor_id(), dur_s, scx_ops.name);
+
+ /*
+ * Some CPUs may be trapped in the dispatch paths. Enable breather
+ * immediately; otherwise, we might not even be able to get to
+ * scx_ops_bypass().
+ */
+ atomic_inc(&scx_ops_breather_depth);
+
+ scx_ops_error("soft lockup - CPU#%d stuck for %us",
+ smp_processor_id(), dur_s);
+}
+
+/*
+ * Undo scx_softlockup(): if the softlockup bit was set, clear it and drop the
+ * breather depth that scx_softlockup() raised.
+ */
+static void scx_clear_softlockup(void)
+{
+	if (!test_and_clear_bit(0, &scx_in_softlockup))
+		return;
+
+	atomic_dec(&scx_ops_breather_depth);
+}
+
+/**
+ * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * @bypass: true for bypass, false for unbypass
+ *
+ * Bypassing guarantees that all runnable tasks make forward progress without
+ * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might
+ * be held by tasks that the BPF scheduler is forgetting to run, which
+ * unfortunately also excludes toggling the static branches.
+ *
+ * Let's work around by overriding a couple ops and modifying behaviors based on
+ * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
+ * to force global FIFO scheduling.
+ *
+ * - ops.select_cpu() is ignored and the default select_cpu() is used.
+ *
+ * - ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
+ * %SCX_OPS_ENQ_LAST is also ignored.
+ *
+ * - ops.dispatch() is ignored.
+ *
+ * - balance_scx() does not set %SCX_RQ_BAL_KEEP on non-zero slice as slice
+ * can't be trusted. Whenever a tick triggers, the running task is rotated to
+ * the tail of the queue with core_sched_at touched.
+ *
+ * - pick_next_task() suppresses zero slice warning.
+ *
+ * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
+ * operations.
+ *
+ * - scx_prio_less() reverts to the default core_sched_at order.
+ */
+static void scx_ops_bypass(bool bypass)
+{
+ static DEFINE_RAW_SPINLOCK(bypass_lock);
+ int cpu;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&bypass_lock, flags);
+ /*
+ * Bypass requests nest through scx_ops_bypass_depth. Only the 0 -> 1
+ * and 1 -> 0 transitions go on to flip the per-rq state below.
+ */
+ if (bypass) {
+ scx_ops_bypass_depth++;
+ WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
+ if (scx_ops_bypass_depth != 1)
+ goto unlock;
+ } else {
+ scx_ops_bypass_depth--;
+ WARN_ON_ONCE(scx_ops_bypass_depth < 0);
+ if (scx_ops_bypass_depth != 0)
+ goto unlock;
+ }
+
+ /* raised for the duration of the rq walk, dropped right after it */
+ atomic_inc(&scx_ops_breather_depth);
+
+ /*
+ * No task property is changing. We just need to make sure all currently
+ * queued tasks are re-queued according to the new scx_rq_bypassing()
+ * state. As an optimization, walk each rq's runnable_list instead of
+ * the scx_tasks list.
+ *
+ * This function can't trust the scheduler and thus can't use
+ * cpus_read_lock(). Walk all possible CPUs instead of online.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *p, *n;
+
+ raw_spin_rq_lock(rq);
+
+ if (bypass) {
+ WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
+ rq->scx.flags |= SCX_RQ_BYPASSING;
+ } else {
+ WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
+ rq->scx.flags &= ~SCX_RQ_BYPASSING;
+ }
+
+ /*
+ * We need to guarantee that no tasks are on the BPF scheduler
+ * while bypassing. Either we see enabled or the enable path
+ * sees scx_rq_bypassing() before moving tasks to SCX.
+ */
+ if (!scx_enabled()) {
+ raw_spin_rq_unlock(rq);
+ continue;
+ }
+
+ /*
+ * The use of list_for_each_entry_safe_reverse() is required
+ * because each task is going to be removed from and added back
+ * to the runnable_list during iteration. Because they're added
+ * to the tail of the list, safe reverse iteration can still
+ * visit all nodes.
+ */
+ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
+ scx.runnable_node) {
+ struct sched_enq_and_set_ctx ctx;
+
+ /* cycling deq/enq is enough, see the function comment */
+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+ sched_enq_and_set_task(&ctx);
+ }
+
+ /* resched to restore ticks and idle state */
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
+
+ raw_spin_rq_unlock(rq);
+ }
+
+ atomic_dec(&scx_ops_breather_depth);
+unlock:
+ raw_spin_unlock_irqrestore(&bypass_lock, flags);
+ /* runs on every call, including nested ones that skipped the rq walk */
+ scx_clear_softlockup();
+}
+
+/* Free @ei together with the backtrace, message and dump buffers it owns. */
+static void free_exit_info(struct scx_exit_info *ei)
+{
+	kfree(ei->bt);
+	kfree(ei->msg);
+	kfree(ei->dump);
+	kfree(ei);
+}
+
+/*
+ * Allocate a zeroed scx_exit_info with a backtrace array of SCX_EXIT_BT_LEN
+ * entries, a message buffer of SCX_EXIT_MSG_LEN bytes and a dump buffer of
+ * @exit_dump_len bytes.
+ *
+ * Returns the new object, or NULL if any of the allocations failed.
+ */
+static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
+{
+	struct scx_exit_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (!info)
+		return NULL;
+
+	info->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(info->bt[0]), GFP_KERNEL);
+	info->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
+	info->dump = kzalloc(exit_dump_len, GFP_KERNEL);
+
+	if (info->bt && info->msg && info->dump)
+		return info;
+
+	/* releases whichever buffers did get allocated along with @info */
+	free_exit_info(info);
+	return NULL;
+}
+
+/*
+ * Map @kind to a human readable reason string. Kinds without a dedicated
+ * string map to "<UNKNOWN>".
+ */
+static const char *scx_exit_reason(enum scx_exit_kind kind)
+{
+	const char *reason = "<UNKNOWN>";
+
+	switch (kind) {
+	case SCX_EXIT_UNREG:
+		reason = "unregistered from user space";
+		break;
+	case SCX_EXIT_UNREG_BPF:
+		reason = "unregistered from BPF";
+		break;
+	case SCX_EXIT_UNREG_KERN:
+		reason = "unregistered from the main kernel";
+		break;
+	case SCX_EXIT_SYSRQ:
+		reason = "disabled by sysrq-S";
+		break;
+	case SCX_EXIT_ERROR:
+		reason = "runtime error";
+		break;
+	case SCX_EXIT_ERROR_BPF:
+		reason = "scx_bpf_error";
+		break;
+	case SCX_EXIT_ERROR_STALL:
+		reason = "runnable task stall";
+		break;
+	default:
+		break;
+	}
+
+	return reason;
+}
+
+/*
+ * kthread work function that tears the current scheduler down.
+ *
+ * It claims the pending exit by moving scx_exit_kind to SCX_EXIT_DONE, enters
+ * bypass mode, moves every task to the class __setscheduler_class() picks for
+ * it, turns off the static branches, reports the exit, calls ops.exit() when
+ * implemented and releases the per-scheduler resources before leaving bypass
+ * mode again.
+ */
+static void scx_ops_disable_workfn(struct kthread_work *work)
+{
+ struct scx_exit_info *ei = scx_exit_info;
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ struct rhashtable_iter rht_iter;
+ struct scx_dispatch_q *dsq;
+ int i, kind, cpu;
+
+ kind = atomic_read(&scx_exit_kind);
+ while (true) {
+ /*
+ * NONE indicates that a new scx_ops has been registered since
+ * disable was scheduled - don't kill the new ops. DONE
+ * indicates that the ops has already been disabled.
+ */
+ if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
+ return;
+ /* on failure @kind is refreshed with the current value; retry */
+ if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
+ break;
+ }
+ ei->kind = kind;
+ ei->reason = scx_exit_reason(ei->kind);
+
+ /* guarantee forward progress by bypassing scx_ops */
+ scx_ops_bypass(true);
+
+ /* the switch acts on the state that was in effect before DISABLING */
+ switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) {
+ case SCX_OPS_DISABLING:
+ WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
+ break;
+ case SCX_OPS_DISABLED:
+ pr_warn("sched_ext: ops error detected without ops (%s)\n",
+ scx_exit_info->msg);
+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
+ SCX_OPS_DISABLING);
+ goto done;
+ default:
+ break;
+ }
+
+ /*
+ * Here, every runnable task is guaranteed to make forward progress and
+ * we can safely use blocking synchronization constructs. Actually
+ * disable ops.
+ */
+ mutex_lock(&scx_ops_enable_mutex);
+
+ static_branch_disable(&__scx_switched_all);
+ WRITE_ONCE(scx_switching_all, false);
+
+ /*
+ * Shut down cgroup support before tasks so that the cgroup attach path
+ * doesn't race against scx_ops_exit_task().
+ */
+ scx_cgroup_lock();
+ scx_cgroup_exit();
+ scx_cgroup_unlock();
+
+ /*
+ * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
+ * must be switched out and exited synchronously.
+ */
+ percpu_down_write(&scx_fork_rwsem);
+
+ scx_ops_init_task_enabled = false;
+
+ scx_task_iter_start(&sti);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ const struct sched_class *old_class = p->sched_class;
+ const struct sched_class *new_class =
+ __setscheduler_class(p->policy, p->prio);
+ struct sched_enq_and_set_ctx ctx;
+
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
+ /* switch the class while @p is dequeued, then put it back */
+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+
+ p->sched_class = new_class;
+ check_class_changing(task_rq(p), p, old_class);
+
+ sched_enq_and_set_task(&ctx);
+
+ check_class_changed(task_rq(p), p, old_class, p->prio);
+ scx_ops_exit_task(p);
+ }
+ scx_task_iter_stop(&sti);
+ percpu_up_write(&scx_fork_rwsem);
+
+ /*
+ * Invalidate all the rq clocks to prevent getting outdated
+ * rq clocks from a previous scx scheduler.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ scx_rq_clock_invalidate(rq);
+ }
+
+ /* no task is on scx, turn off all the switches and flush in-progress calls */
+ static_branch_disable(&__scx_ops_enabled);
+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
+ static_branch_disable(&scx_has_op[i]);
+ static_branch_disable(&scx_ops_enq_last);
+ static_branch_disable(&scx_ops_enq_exiting);
+ static_branch_disable(&scx_ops_enq_migration_disabled);
+ static_branch_disable(&scx_ops_cpu_preempt);
+ static_branch_disable(&scx_builtin_idle_enabled);
+ synchronize_rcu();
+
+ /* error exits are logged at error level with message and backtrace */
+ if (ei->kind >= SCX_EXIT_ERROR) {
+ pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
+ scx_ops.name, ei->reason);
+
+ if (ei->msg[0] != '\0')
+ pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
+#ifdef CONFIG_STACKTRACE
+ stack_trace_print(ei->bt, ei->bt_len, 2);
+#endif
+ } else {
+ pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
+ scx_ops.name, ei->reason);
+ }
+
+ if (scx_ops.exit)
+ SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
+
+ cancel_delayed_work_sync(&scx_watchdog_work);
+
+ /*
+ * Delete the kobject from the hierarchy eagerly in addition to just
+ * dropping a reference. Otherwise, if the object is deleted
+ * asynchronously, sysfs could observe an object of the same name still
+ * in the hierarchy when another scheduler is loaded.
+ */
+ kobject_del(scx_root_kobj);
+ kobject_put(scx_root_kobj);
+ scx_root_kobj = NULL;
+
+ memset(&scx_ops, 0, sizeof(scx_ops));
+
+ /* destroy every DSQ left in dsq_hash; restart the walk on -EAGAIN */
+ rhashtable_walk_enter(&dsq_hash, &rht_iter);
+ do {
+ rhashtable_walk_start(&rht_iter);
+
+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
+ destroy_dsq(dsq->id);
+
+ rhashtable_walk_stop(&rht_iter);
+ } while (dsq == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&rht_iter);
+
+ free_percpu(scx_dsp_ctx);
+ scx_dsp_ctx = NULL;
+ scx_dsp_max_batch = 0;
+
+ free_exit_info(scx_exit_info);
+ scx_exit_info = NULL;
+
+ mutex_unlock(&scx_ops_enable_mutex);
+
+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
+ SCX_OPS_DISABLING);
+done:
+ /* pairs with scx_ops_bypass(true) above */
+ scx_ops_bypass(false);
+}
+
+/* work item that runs scx_ops_disable_workfn() */
+static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn);
+
+/* Queue scx_ops_disable_work on the scx_ops_helper worker, if it exists. */
+static void schedule_scx_ops_disable_work(void)
+{
+ struct kthread_worker *helper = READ_ONCE(scx_ops_helper);
+
+ /*
+ * We may be called spuriously before the first bpf_sched_ext_reg(). If
+ * scx_ops_helper isn't set up yet, there's nothing to do.
+ */
+ if (helper)
+ kthread_queue_work(helper, &scx_ops_disable_work);
+}
+
+/*
+ * Request that the scheduler be disabled with exit kind @kind.
+ *
+ * @kind is recorded only if no exit kind is pending yet (scx_exit_kind is
+ * still SCX_EXIT_NONE); the disable work is scheduled either way. Passing
+ * SCX_EXIT_NONE or SCX_EXIT_DONE triggers a warning and is treated as
+ * SCX_EXIT_ERROR.
+ */
+static void scx_ops_disable(enum scx_exit_kind kind)
+{
+ int none = SCX_EXIT_NONE;
+
+ if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
+ kind = SCX_EXIT_ERROR;
+
+ /* result deliberately ignored: an already pending kind is kept */
+ atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
+
+ schedule_scx_ops_disable_work();
+}
+
+/* Emit an empty dump line to the sched_ext_dump tracepoint and to @s. */
+static void dump_newline(struct seq_buf *s)
+{
+	trace_sched_ext_dump("");
+
+	/* @s may be zero sized and seq_buf triggers WARN if so */
+	if (!s->size)
+		return;
+
+	seq_buf_putc(s, '\n');
+}
+
+/*
+ * Format one dump line from @fmt and its arguments. When the sched_ext_dump
+ * tracepoint is enabled the line is emitted through it, and when @s is not
+ * zero sized the line is appended to @s followed by a newline.
+ */
+static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
+{
+ va_list args;
+
+#ifdef CONFIG_TRACEPOINTS
+ if (trace_sched_ext_dump_enabled()) {
+ /* protected by scx_dump_state()::dump_lock */
+ static char line_buf[SCX_EXIT_MSG_LEN];
+
+ va_start(args, fmt);
+ vscnprintf(line_buf, sizeof(line_buf), fmt, args);
+ va_end(args);
+
+ trace_sched_ext_dump(line_buf);
+ }
+#endif
+ /* @s may be zero sized and seq_buf triggers WARN if so */
+ if (s->size) {
+ /* @args is started afresh; the tracepoint pass may have consumed it */
+ va_start(args, fmt);
+ seq_buf_vprintf(s, fmt, args);
+ va_end(args);
+
+ seq_buf_putc(s, '\n');
+ }
+}
+
+/*
+ * Emit the @len backtrace entries in @bt, one dump line per entry, each
+ * prefixed with @prefix and printed as a symbol via %pS.
+ */
+static void dump_stack_trace(struct seq_buf *s, const char *prefix,
+			     const unsigned long *bt, unsigned int len)
+{
+	unsigned int idx = 0;
+
+	while (idx < len) {
+		dump_line(s, "%s%pS", prefix, (void *)bt[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Set up scx_dump_data so that output produced until ops_dump_exit() is
+ * written to @s with @prefix in front of each line. Must be called with IRQs
+ * disabled.
+ */
+static void ops_dump_init(struct seq_buf *s, const char *prefix)
+{
+ struct scx_dump_data *dd = &scx_dump_data;
+
+ lockdep_assert_irqs_disabled();
+
+ dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */
+ dd->first = true;
+ dd->cursor = 0;
+ dd->s = s;
+ dd->prefix = prefix;
+}
+
+/*
+ * Emit the text buffered in scx_dump_data line by line through dump_line(),
+ * each line carrying the configured prefix, then reset the cursor. Does
+ * nothing when the cursor is zero.
+ */
+static void ops_dump_flush(void)
+{
+ struct scx_dump_data *dd = &scx_dump_data;
+ char *line = dd->buf.line;
+
+ if (!dd->cursor)
+ return;
+
+ /*
+ * There's something to flush and this is the first line. Insert a blank
+ * line to distinguish ops dump.
+ */
+ if (dd->first) {
+ dump_newline(dd->s);
+ dd->first = false;
+ }
+
+ /*
+ * There may be multiple lines in $line. Scan and emit each line
+ * separately.
+ */
+ while (true) {
+ char *end = line;
+ char c;
+
+ while (*end != '\n' && *end != '\0')
+ end++;
+
+ /*
+ * If $line overflowed, it may not have newline at the end.
+ * Always emit with a newline.
+ */
+ c = *end;
+ /* terminate in place; the original character is kept in @c */
+ *end = '\0';
+ dump_line(dd->s, "%s%s", dd->prefix, line);
+ if (c == '\0')
+ break;
+
+ /* move to the next line */
+ end++;
+ /* a trailing newline doesn't start another, empty line */
+ if (*end == '\0')
+ break;
+ line = end;
+ }
+
+ dd->cursor = 0;
+}
+
+/*
+ * Finish a dump section started with ops_dump_init(): flush what is buffered
+ * and reset scx_dump_data.cpu to -1.
+ */
+static void ops_dump_exit(void)
+{
+ ops_dump_flush();
+ scx_dump_data.cpu = -1;
+}
+
+/*
+ * Dump the sched_ext state of @p into @s: a summary line flagged with @marker,
+ * the scx fields, the allowed CPUs, whatever ops.dump_task() adds when
+ * implemented and, if one could be captured, @p's stack trace.
+ */
+static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
+ struct task_struct *p, char marker)
+{
+ /* NOTE(review): static buffer; presumably serialized by scx_dump_state()'s dump_lock */
+ static unsigned long bt[SCX_EXIT_BT_LEN];
+ char dsq_id_buf[19] = "(n/a)";
+ unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
+ unsigned int bt_len = 0;
+
+ if (p->scx.dsq)
+ scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
+ (unsigned long long)p->scx.dsq->id);
+
+ dump_newline(s);
+ dump_line(s, " %c%c %s[%d] %+ldms",
+ marker, task_state_to_char(p), p->comm, p->pid,
+ jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
+ dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu",
+ scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
+ p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
+ ops_state >> SCX_OPSS_QSEQ_SHIFT);
+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+ dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
+ p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
+ dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
+
+ if (SCX_HAS_OP(dump_task)) {
+ ops_dump_init(s, " ");
+ SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
+ ops_dump_exit();
+ }
+
+#ifdef CONFIG_STACKTRACE
+ bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
+#endif
+ /* bt_len stays 0 without CONFIG_STACKTRACE, so nothing is printed */
+ if (bt_len) {
+ dump_newline(s);
+ dump_stack_trace(s, " ", bt, bt_len);
+ }
+}
+
+/*
+ * Write a debug dump of up to @dump_len bytes into @ei->dump: the exit header,
+ * the output of ops.dump() when implemented, and a section per possible CPU
+ * with its runqueue state and tasks. The whole dump is generated under
+ * dump_lock with IRQs disabled. If the output overflowed and the buffer is
+ * large enough, its tail is overwritten with a truncation marker.
+ */
+static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
+{
+ static DEFINE_SPINLOCK(dump_lock);
+ static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
+ struct scx_dump_ctx dctx = {
+ .kind = ei->kind,
+ .exit_code = ei->exit_code,
+ .reason = ei->reason,
+ .at_ns = ktime_get_ns(),
+ .at_jiffies = jiffies,
+ };
+ struct seq_buf s;
+ unsigned long flags;
+ char *buf;
+ int cpu;
+
+ spin_lock_irqsave(&dump_lock, flags);
+
+ seq_buf_init(&s, ei->dump, dump_len);
+
+ if (ei->kind == SCX_EXIT_NONE) {
+ dump_line(&s, "Debug dump triggered by %s", ei->reason);
+ } else {
+ dump_line(&s, "%s[%d] triggered exit kind %d:",
+ current->comm, current->pid, ei->kind);
+ dump_line(&s, " %s (%s)", ei->reason, ei->msg);
+ dump_newline(&s);
+ dump_line(&s, "Backtrace:");
+ dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
+ }
+
+ if (SCX_HAS_OP(dump)) {
+ ops_dump_init(&s, "");
+ SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
+ ops_dump_exit();
+ }
+
+ dump_newline(&s);
+ dump_line(&s, "CPU states");
+ dump_line(&s, "----------");
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct task_struct *p;
+ struct seq_buf ns;
+ size_t avail, used;
+ bool idle;
+
+ rq_lock(rq, &rf);
+
+ /* idle: nothing runnable on SCX and the idle class is running */
+ idle = list_empty(&rq->scx.runnable_list) &&
+ rq->curr->sched_class == &idle_sched_class;
+
+ if (idle && !SCX_HAS_OP(dump_cpu))
+ goto next;
+
+ /*
+ * We don't yet know whether ops.dump_cpu() will produce output
+ * and we may want to skip the default CPU dump if it doesn't.
+ * Use a nested seq_buf to generate the standard dump so that we
+ * can decide whether to commit later.
+ */
+ avail = seq_buf_get_buf(&s, &buf);
+ seq_buf_init(&ns, buf, avail);
+
+ dump_newline(&ns);
+ dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu",
+ cpu, rq->scx.nr_running, rq->scx.flags,
+ rq->scx.cpu_released, rq->scx.ops_qseq,
+ rq->scx.pnt_seq);
+ dump_line(&ns, " curr=%s[%d] class=%ps",
+ rq->curr->comm, rq->curr->pid,
+ rq->curr->sched_class);
+ if (!cpumask_empty(rq->scx.cpus_to_kick))
+ dump_line(&ns, " cpus_to_kick : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_kick));
+ if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
+ dump_line(&ns, " idle_to_kick : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
+ if (!cpumask_empty(rq->scx.cpus_to_preempt))
+ dump_line(&ns, " cpus_to_preempt: %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_preempt));
+ if (!cpumask_empty(rq->scx.cpus_to_wait))
+ dump_line(&ns, " cpus_to_wait : %*pb",
+ cpumask_pr_args(rq->scx.cpus_to_wait));
+
+ /* remember the size of the standard part to detect ops output */
+ used = seq_buf_used(&ns);
+ if (SCX_HAS_OP(dump_cpu)) {
+ ops_dump_init(&ns, " ");
+ SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
+ ops_dump_exit();
+ }
+
+ /*
+ * If idle && nothing generated by ops.dump_cpu(), there's
+ * nothing interesting. Skip.
+ */
+ if (idle && used == seq_buf_used(&ns))
+ goto next;
+
+ /*
+ * $s may already have overflowed when $ns was created. If so,
+ * calling commit on it will trigger BUG.
+ */
+ if (avail) {
+ seq_buf_commit(&s, seq_buf_used(&ns));
+ if (seq_buf_has_overflowed(&ns))
+ seq_buf_set_overflow(&s);
+ }
+
+ /* the running task is marked with '*', queued tasks with ' ' */
+ if (rq->curr->sched_class == &ext_sched_class)
+ scx_dump_task(&s, &dctx, rq->curr, '*');
+
+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
+ scx_dump_task(&s, &dctx, p, ' ');
+ next:
+ rq_unlock(rq, &rf);
+ }
+
+ /* sizeof() includes the terminating NUL, so the dump stays terminated */
+ if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
+ memcpy(ei->dump + dump_len - sizeof(trunc_marker),
+ trunc_marker, sizeof(trunc_marker));
+
+ spin_unlock_irqrestore(&dump_lock, flags);
+}
+
+/*
+ * irq_work callback queued by scx_ops_exit_kind(): generate the debug dump
+ * for error exits and then schedule the disable work.
+ */
+static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
+{
+ struct scx_exit_info *ei = scx_exit_info;
+
+ /* non-error exit kinds skip the dump */
+ if (ei->kind >= SCX_EXIT_ERROR)
+ scx_dump_state(ei, scx_ops.exit_dump_len);
+
+ schedule_scx_ops_disable_work();
+}
+
+static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn);
+
+/*
+ * Record an exit request of @kind with @exit_code and a printf-style message
+ * and queue the irq_work that finishes processing it.
+ *
+ * Only the caller that moves scx_exit_kind from SCX_EXIT_NONE to @kind
+ * proceeds; when an exit kind is already pending the function returns without
+ * touching the exit info.
+ */
+static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
+ s64 exit_code,
+ const char *fmt, ...)
+{
+ struct scx_exit_info *ei = scx_exit_info;
+ int none = SCX_EXIT_NONE;
+ va_list args;
+
+ if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind))
+ return;
+
+ ei->exit_code = exit_code;
+#ifdef CONFIG_STACKTRACE
+ /* a backtrace is captured for error kinds only */
+ if (kind >= SCX_EXIT_ERROR)
+ ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
+#endif
+ va_start(args, fmt);
+ vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
+ va_end(args);
+
+ /*
+ * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
+ * in scx_ops_disable_workfn().
+ */
+ ei->kind = kind;
+ ei->reason = scx_exit_reason(ei->kind);
+
+ irq_work_queue(&scx_ops_error_irq_work);
+}
+
+/*
+ * Create a kthread worker named @name and switch its task to SCHED_FIFO with
+ * sched_set_fifo().
+ *
+ * kthread_run_worker() reports failure through an ERR_PTR() value rather than
+ * NULL, so the result has to be checked with IS_ERR() before ->task is
+ * dereferenced. Callers test the return value against NULL, so every failure
+ * is folded into a NULL return.
+ *
+ * Returns the new worker, or NULL if it could not be created.
+ */
+static struct kthread_worker *scx_create_rt_helper(const char *name)
+{
+	struct kthread_worker *helper;
+
+	helper = kthread_run_worker(0, name);
+	if (IS_ERR_OR_NULL(helper))
+		return NULL;
+
+	sched_set_fifo(helper->task);
+	return helper;
+}
+
+/*
+ * Compare the hotplug sequence number @ops was set up with against the
+ * current global one and request an exit if they differ. A zero
+ * @ops->hotplug_seq disables the check.
+ */
+static void check_hotplug_seq(const struct sched_ext_ops *ops)
+{
+	unsigned long long global_hotplug_seq;
+
+	if (!ops->hotplug_seq)
+		return;
+
+	/*
+	 * If a hotplug event has occurred between when a scheduler was
+	 * initialized, and when we were able to attach, exit and notify user
+	 * space about it.
+	 */
+	global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
+	if (ops->hotplug_seq == global_hotplug_seq)
+		return;
+
+	scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+		     "expected hotplug seq %llu did not match actual %llu",
+		     ops->hotplug_seq, global_hotplug_seq);
+}
+
+/*
+ * Sanity-check the flag/callback combination in @ops.
+ *
+ * Returns 0 when @ops is acceptable. Returns -EINVAL, after raising
+ * scx_ops_error(), when it is not.
+ */
+static int validate_ops(const struct sched_ext_ops *ops)
+{
+	bool wants_enq_last = ops->flags & SCX_OPS_ENQ_LAST;
+
+	/*
+	 * SCX_OPS_ENQ_LAST only makes sense together with an ops.enqueue()
+	 * implementation.
+	 */
+	if (wants_enq_last && !ops->enqueue) {
+		scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * scx_ops_enable - install @ops as the BPF scheduler and switch tasks to it
+ * @ops: operations table to install; copied into the global scx_ops
+ * @link: not referenced by this function
+ *
+ * The sequence is:
+ *
+ * 1. Refuse to run if the HK_TYPE_DOMAIN housekeeping mask differs from
+ *    cpu_possible_mask (-EINVAL). Everything below runs with
+ *    scx_ops_enable_mutex held.
+ * 2. Lazily create scx_ops_helper and the per-node global DSQs, then bail
+ *    out with -EBUSY unless the enable state is SCX_OPS_DISABLED.
+ * 3. Set up the root kobject and the exit info, copy @ops into scx_ops and
+ *    move to SCX_OPS_ENABLING.
+ * 4. Under cpus_read_lock(), call ops.init() and enable the static branches
+ *    of the CPU hotplug ops.
+ * 5. Allocate the per-CPU dispatch contexts, arm the watchdog, enter bypass
+ *    mode and enable the static branches for the remaining ops and flags.
+ * 6. With scx_fork_rwsem write-locked and the cgroup lock held, initialize
+ *    every task and mark it SCX_TASK_READY.
+ * 7. Turn on __scx_ops_enabled, move every task to the class picked by
+ *    __setscheduler_class(), leave bypass mode and move to SCX_OPS_ENABLED.
+ *
+ * Return: a negative errno for failures up to and including the exit info
+ * allocation in step 3 (the err_del, err and err_unlock paths). Later
+ * failures go through the err_disable paths, which report the failure with
+ * scx_ops_error(), flush scx_ops_disable_work and return 0. 0 is also
+ * returned on success.
+ */
+static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+{
+ struct scx_task_iter sti;
+ struct task_struct *p;
+ unsigned long timeout;
+ int i, cpu, node, ret;
+
+ if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
+ cpu_possible_mask)) {
+ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&scx_ops_enable_mutex);
+
+ if (!scx_ops_helper) {
+ WRITE_ONCE(scx_ops_helper,
+ scx_create_rt_helper("sched_ext_ops_helper"));
+ if (!scx_ops_helper) {
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+ }
+
+ if (!global_dsqs) {
+ struct scx_dispatch_q **dsqs;
+
+ dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
+ if (!dsqs) {
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ for_each_node_state(node, N_POSSIBLE) {
+ struct scx_dispatch_q *dsq;
+
+ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq) {
+ /*
+ * @node is reused as the cleanup iterator. That is fine
+ * because the outer loop is abandoned right after.
+ */
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(dsqs[node]);
+ kfree(dsqs);
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ init_dsq(dsq, SCX_DSQ_GLOBAL);
+ dsqs[node] = dsq;
+ }
+
+ global_dsqs = dsqs;
+ }
+
+ if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
+ ret = -EBUSY;
+ goto err_unlock;
+ }
+
+ scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL);
+ if (!scx_root_kobj) {
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ scx_root_kobj->kset = scx_kset;
+ ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root");
+ if (ret < 0)
+ goto err;
+
+ scx_exit_info = alloc_exit_info(ops->exit_dump_len);
+ if (!scx_exit_info) {
+ ret = -ENOMEM;
+ goto err_del;
+ }
+
+ /*
+ * Set scx_ops, transition to ENABLING and clear exit info to arm the
+ * disable path. Failure triggers full disabling from here on.
+ */
+ scx_ops = *ops;
+
+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) !=
+ SCX_OPS_DISABLED);
+
+ atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
+ scx_warned_zero_slice = false;
+
+ atomic_long_set(&scx_nr_rejected, 0);
+
+ for_each_possible_cpu(cpu)
+ cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+
+ /*
+ * Keep CPUs stable during enable so that the BPF scheduler can track
+ * online CPUs by watching ->on/offline_cpu() after ->init().
+ */
+ cpus_read_lock();
+
+ if (scx_ops.init) {
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
+ if (ret) {
+ ret = ops_sanitize_err("init", ret);
+ cpus_read_unlock();
+ scx_ops_error("ops.init() failed (%d)", ret);
+ goto err_disable;
+ }
+ }
+
+ for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
+ if (((void (**)(void))ops)[i])
+ static_branch_enable_cpuslocked(&scx_has_op[i]);
+
+ check_hotplug_seq(ops);
+#ifdef CONFIG_SMP
+ update_selcpu_topology();
+#endif
+ cpus_read_unlock();
+
+ ret = validate_ops(ops);
+ if (ret)
+ goto err_disable;
+
+ WARN_ON_ONCE(scx_dsp_ctx);
+ scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
+ scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf,
+ scx_dsp_max_batch),
+ __alignof__(struct scx_dsp_ctx));
+ if (!scx_dsp_ctx) {
+ ret = -ENOMEM;
+ goto err_disable;
+ }
+
+ if (ops->timeout_ms)
+ timeout = msecs_to_jiffies(ops->timeout_ms);
+ else
+ timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+
+ WRITE_ONCE(scx_watchdog_timeout, timeout);
+ WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+ queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
+ scx_watchdog_timeout / 2);
+
+ /*
+ * Once __scx_ops_enabled is set, %current can be switched to SCX
+ * anytime. This can lead to stalls as some BPF schedulers (e.g.
+ * userspace scheduling) may not function correctly before all tasks are
+ * switched. Init in bypass mode to guarantee forward progress.
+ */
+ scx_ops_bypass(true);
+
+ for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
+ if (((void (**)(void))ops)[i])
+ static_branch_enable(&scx_has_op[i]);
+
+ if (ops->flags & SCX_OPS_ENQ_LAST)
+ static_branch_enable(&scx_ops_enq_last);
+
+ if (ops->flags & SCX_OPS_ENQ_EXITING)
+ static_branch_enable(&scx_ops_enq_exiting);
+ if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
+ static_branch_enable(&scx_ops_enq_migration_disabled);
+ if (scx_ops.cpu_acquire || scx_ops.cpu_release)
+ static_branch_enable(&scx_ops_cpu_preempt);
+
+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
+ reset_idle_masks();
+ static_branch_enable(&scx_builtin_idle_enabled);
+ } else {
+ static_branch_disable(&scx_builtin_idle_enabled);
+ }
+
+ /*
+ * Lock out forks, cgroup on/offlining and moves before opening the
+ * floodgate so that they don't wander into the operations prematurely.
+ */
+ percpu_down_write(&scx_fork_rwsem);
+
+ WARN_ON_ONCE(scx_ops_init_task_enabled);
+ scx_ops_init_task_enabled = true;
+
+ /*
+ * Enable ops for every task. Fork is excluded by scx_fork_rwsem
+ * preventing new tasks from being added. No need to exclude tasks
+ * leaving as sched_ext_free() can handle both prepped and enabled
+ * tasks. Prep all tasks first and then enable them with preemption
+ * disabled.
+ *
+ * All cgroups should be initialized before scx_ops_init_task() so that
+ * the BPF scheduler can reliably track each task's cgroup membership
+ * from scx_ops_init_task(). Lock out cgroup on/offlining and task
+ * migrations while tasks are being initialized so that
+ * scx_cgroup_can_attach() never sees uninitialized tasks.
+ */
+ scx_cgroup_lock();
+ ret = scx_cgroup_init();
+ if (ret)
+ goto err_disable_unlock_all;
+
+ scx_task_iter_start(&sti);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ /*
+ * @p may already be dead, have lost all its usages counts and
+ * be waiting for RCU grace period before being freed. @p can't
+ * be initialized for SCX in such cases and should be ignored.
+ */
+ if (!tryget_task_struct(p))
+ continue;
+
+ scx_task_iter_unlock(&sti);
+
+ ret = scx_ops_init_task(p, task_group(p), false);
+ if (ret) {
+ put_task_struct(p);
+ scx_task_iter_relock(&sti);
+ scx_task_iter_stop(&sti);
+ scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
+ ret, p->comm, p->pid);
+ goto err_disable_unlock_all;
+ }
+
+ scx_set_task_state(p, SCX_TASK_READY);
+
+ put_task_struct(p);
+ scx_task_iter_relock(&sti);
+ }
+ scx_task_iter_stop(&sti);
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+
+ /*
+ * All tasks are READY. It's safe to turn on scx_enabled() and switch
+ * all eligible tasks.
+ */
+ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
+ static_branch_enable(&__scx_ops_enabled);
+
+ /*
+ * We're fully committed and can't fail. The task READY -> ENABLED
+ * transitions here are synchronized against sched_ext_free() through
+ * scx_tasks_lock.
+ */
+ percpu_down_write(&scx_fork_rwsem);
+ scx_task_iter_start(&sti);
+ while ((p = scx_task_iter_next_locked(&sti))) {
+ const struct sched_class *old_class = p->sched_class;
+ const struct sched_class *new_class =
+ __setscheduler_class(p->policy, p->prio);
+ struct sched_enq_and_set_ctx ctx;
+
+ if (old_class != new_class && p->se.sched_delayed)
+ dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+
+ p->scx.slice = SCX_SLICE_DFL;
+ p->sched_class = new_class;
+ check_class_changing(task_rq(p), p, old_class);
+
+ sched_enq_and_set_task(&ctx);
+
+ check_class_changed(task_rq(p), p, old_class, p->prio);
+ }
+ scx_task_iter_stop(&sti);
+ percpu_up_write(&scx_fork_rwsem);
+
+ scx_ops_bypass(false);
+
+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+ goto err_disable;
+ }
+
+ if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
+ static_branch_enable(&__scx_switched_all);
+
+ pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
+ scx_ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(scx_root_kobj, KOBJ_ADD);
+ mutex_unlock(&scx_ops_enable_mutex);
+
+ atomic_long_inc(&scx_enable_seq);
+
+ return 0;
+
+ /*
+ * Early failures: nothing has been armed yet, so undo the kobject and
+ * exit info setup by hand and return the errno directly.
+ */
+err_del:
+ kobject_del(scx_root_kobj);
+err:
+ kobject_put(scx_root_kobj);
+ scx_root_kobj = NULL;
+ if (scx_exit_info) {
+ free_exit_info(scx_exit_info);
+ scx_exit_info = NULL;
+ }
+err_unlock:
+ mutex_unlock(&scx_ops_enable_mutex);
+ return ret;
+
+err_disable_unlock_all:
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
+ scx_ops_bypass(false);
+err_disable:
+ mutex_unlock(&scx_ops_enable_mutex);
+ /*
+ * Returning an error code here would not pass all the error information
+ * to userspace. Record errno using scx_ops_error() for cases
+ * scx_ops_error() wasn't already invoked and exit indicating success so
+ * that the error is notified through ops.exit() with all the details.
+ *
+ * Flush scx_ops_disable_work to ensure that error is reported before
+ * init completion.
+ */
+ scx_ops_error("scx_ops_enable() failed (%d)", ret);
+ kthread_flush_work(&scx_ops_disable_work);
+ return 0;
+}
+
+
+/********************************************************************************
+ * bpf_struct_ops plumbing.
+ */
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+
+static const struct btf_type *task_struct_type;
+
+/*
+ * bpf_verifier_ops ->is_valid_access() callback. Only reads at non-negative,
+ * size-aligned offsets below sizeof(__u64) * MAX_BPF_FUNC_ARGS are handed on
+ * to btf_ctx_access(); everything else is rejected.
+ */
+static bool bpf_scx_is_valid_access(int off, int size,
+				    enum bpf_access_type type,
+				    const struct bpf_prog *prog,
+				    struct bpf_insn_access_aux *info)
+{
+	if (type != BPF_READ || off < 0 ||
+	    off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS || off % size != 0)
+		return false;
+
+	return btf_ctx_access(off, size, type, prog, info);
+}
+
+/*
+ * bpf_verifier_ops ->btf_struct_access() callback. For task_struct, accesses
+ * that fall entirely within scx.slice, scx.dsq_vtime or scx.disallow yield
+ * SCALAR_VALUE. Any other access gets -EACCES.
+ */
+static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
+				     const struct bpf_reg_state *reg, int off,
+				     int size)
+{
+	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
+	int end = off + size;
+
+	if (t != task_struct_type)
+		return -EACCES;
+
+	if (off >= offsetof(struct task_struct, scx.slice) &&
+	    end <= offsetofend(struct task_struct, scx.slice))
+		return SCALAR_VALUE;
+
+	if (off >= offsetof(struct task_struct, scx.dsq_vtime) &&
+	    end <= offsetofend(struct task_struct, scx.dsq_vtime))
+		return SCALAR_VALUE;
+
+	if (off >= offsetof(struct task_struct, scx.disallow) &&
+	    end <= offsetofend(struct task_struct, scx.disallow))
+		return SCALAR_VALUE;
+
+	return -EACCES;
+}
+
+/*
+ * bpf_verifier_ops ->get_func_proto() callback. Adds the task storage get and
+ * delete helpers on top of whatever bpf_base_func_proto() provides.
+ */
+static const struct bpf_func_proto *
+bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	if (func_id == BPF_FUNC_task_storage_get)
+		return &bpf_task_storage_get_proto;
+
+	if (func_id == BPF_FUNC_task_storage_delete)
+		return &bpf_task_storage_delete_proto;
+
+	return bpf_base_func_proto(func_id, prog);
+}
+
+/* Verifier callbacks for sched_ext programs; hooked up via bpf_sched_ext_ops. */
+static const struct bpf_verifier_ops bpf_scx_verifier_ops = {
+ .get_func_proto = bpf_scx_get_func_proto,
+ .is_valid_access = bpf_scx_is_valid_access,
+ .btf_struct_access = bpf_scx_btf_struct_access,
+};
+
+/*
+ * bpf_struct_ops ->init_member() callback. Validates selected data members of
+ * the user-supplied sched_ext_ops at @udata and stores them into the kernel
+ * copy at @kdata.
+ *
+ * Returns 1 when the member was handled here, 0 for members this function
+ * doesn't handle, and a negative errno when the supplied value is rejected.
+ */
+static int bpf_scx_init_member(const struct btf_type *t,
+			       const struct btf_member *member,
+			       void *kdata, const void *udata)
+{
+	const struct sched_ext_ops *uops = udata;
+	struct sched_ext_ops *ops = kdata;
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+	const void *src = udata + moff;
+	int ret;
+
+	switch (moff) {
+	case offsetof(struct sched_ext_ops, dispatch_max_batch): {
+		u32 max_batch = *(const u32 *)src;
+
+		if (max_batch > INT_MAX)
+			return -E2BIG;
+		ops->dispatch_max_batch = max_batch;
+		return 1;
+	}
+	case offsetof(struct sched_ext_ops, flags): {
+		u64 flags = *(const u64 *)src;
+
+		/* unknown flag bits are refused */
+		if (flags & ~SCX_OPS_ALL_FLAGS)
+			return -EINVAL;
+		ops->flags = flags;
+		return 1;
+	}
+	case offsetof(struct sched_ext_ops, name):
+		ret = bpf_obj_name_cpy(ops->name, uops->name,
+				       sizeof(ops->name));
+		if (ret < 0)
+			return ret;
+		/* a zero return from the copy is refused as well */
+		if (ret == 0)
+			return -EINVAL;
+		return 1;
+	case offsetof(struct sched_ext_ops, timeout_ms): {
+		u32 timeout_ms = *(const u32 *)src;
+
+		if (msecs_to_jiffies(timeout_ms) > SCX_WATCHDOG_MAX_TIMEOUT)
+			return -E2BIG;
+		ops->timeout_ms = timeout_ms;
+		return 1;
+	}
+	case offsetof(struct sched_ext_ops, exit_dump_len): {
+		u32 dump_len = *(const u32 *)src;
+
+		/* zero selects the default dump length */
+		ops->exit_dump_len = dump_len ?: SCX_EXIT_DUMP_DFL_LEN;
+		return 1;
+	}
+	case offsetof(struct sched_ext_ops, hotplug_seq):
+		ops->hotplug_seq = *(const u64 *)src;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * bpf_struct_ops ->check_member() callback. Sleepable programs are accepted
+ * only for the callbacks listed in the switch below; attaching one to any
+ * other member fails with -EINVAL.
+ */
+static int bpf_scx_check_member(const struct btf_type *t,
+				const struct btf_member *member,
+				const struct bpf_prog *prog)
+{
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+	switch (moff) {
+	case offsetof(struct sched_ext_ops, init_task):
+#ifdef CONFIG_EXT_GROUP_SCHED
+	case offsetof(struct sched_ext_ops, cgroup_init):
+	case offsetof(struct sched_ext_ops, cgroup_exit):
+	case offsetof(struct sched_ext_ops, cgroup_prep_move):
+#endif
+	case offsetof(struct sched_ext_ops, cpu_online):
+	case offsetof(struct sched_ext_ops, cpu_offline):
+	case offsetof(struct sched_ext_ops, init):
+	case offsetof(struct sched_ext_ops, exit):
+		/* no restriction on these members */
+		return 0;
+	}
+
+	return prog->sleepable ? -EINVAL : 0;
+}
+
+/* bpf_struct_ops ->reg() callback: hand @kdata and @link to scx_ops_enable(). */
+static int bpf_scx_reg(void *kdata, struct bpf_link *link)
+{
+ return scx_ops_enable(kdata, link);
+}
+
+/*
+ * bpf_struct_ops ->unreg() callback: request disabling with SCX_EXIT_UNREG
+ * and flush scx_ops_disable_work before returning. Neither @kdata nor @link
+ * is used.
+ */
+static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
+{
+ scx_ops_disable(SCX_EXIT_UNREG);
+ kthread_flush_work(&scx_ops_disable_work);
+}
+
+/*
+ * bpf_struct_ops ->init() callback: look up the BTF type of task_struct in
+ * @btf and cache it in task_struct_type, which bpf_scx_btf_struct_access()
+ * compares against. Always returns 0.
+ */
+static int bpf_scx_init(struct btf *btf)
+{
+ task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]);
+
+ return 0;
+}
+
+/* bpf_struct_ops ->update() callback: always fails with -EOPNOTSUPP. */
+static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link)
+{
+ /*
+ * sched_ext does not support updating the actively-loaded BPF
+ * scheduler, as registering a BPF scheduler can always fail if the
+ * scheduler returns an error code for e.g. ops.init(), ops.init_task(),
+ * etc. Similarly, we can always race with unregistration happening
+ * elsewhere, such as with sysrq.
+ */
+ return -EOPNOTSUPP;
+}
+
+/* bpf_struct_ops ->validate() callback: performs no checks, always returns 0. */
+static int bpf_scx_validate(void *kdata)
+{
+ return 0;
+}
+
+/*
+ * Stub implementations, one per sched_ext_ops callback. Their only use in
+ * this file is to populate __bpf_ops_sched_ext_ops, which is registered as
+ * the .cfi_stubs of bpf_sched_ext_ops. Stubs returning s32 return -EINVAL,
+ * stubs returning bool return false, and the rest do nothing.
+ */
+static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
+static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {}
+static void sched_ext_ops__tick(struct task_struct *p) {}
+static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {}
+static void sched_ext_ops__running(struct task_struct *p) {}
+static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {}
+static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {}
+static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; }
+static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; }
+static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {}
+static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {}
+static void sched_ext_ops__update_idle(s32 cpu, bool idle) {}
+static void sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {}
+static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {}
+static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
+static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {}
+static void sched_ext_ops__enable(struct task_struct *p) {}
+static void sched_ext_ops__disable(struct task_struct *p) {}
+#ifdef CONFIG_EXT_GROUP_SCHED
+static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; }
+static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {}
+static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; }
+static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
+static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
+#endif
+static void sched_ext_ops__cpu_online(s32 cpu) {}
+static void sched_ext_ops__cpu_offline(s32 cpu) {}
+static s32 sched_ext_ops__init(void) { return -EINVAL; }
+static void sched_ext_ops__exit(struct scx_exit_info *info) {}
+static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {}
+static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {}
+static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {}
+
+/*
+ * A sched_ext_ops instance with every callback pointing at its
+ * sched_ext_ops__*() stub. Referenced as .cfi_stubs by bpf_sched_ext_ops.
+ */
+static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
+ .select_cpu = sched_ext_ops__select_cpu,
+ .enqueue = sched_ext_ops__enqueue,
+ .dequeue = sched_ext_ops__dequeue,
+ .dispatch = sched_ext_ops__dispatch,
+ .tick = sched_ext_ops__tick,
+ .runnable = sched_ext_ops__runnable,
+ .running = sched_ext_ops__running,
+ .stopping = sched_ext_ops__stopping,
+ .quiescent = sched_ext_ops__quiescent,
+ .yield = sched_ext_ops__yield,
+ .core_sched_before = sched_ext_ops__core_sched_before,
+ .set_weight = sched_ext_ops__set_weight,
+ .set_cpumask = sched_ext_ops__set_cpumask,
+ .update_idle = sched_ext_ops__update_idle,
+ .cpu_acquire = sched_ext_ops__cpu_acquire,
+ .cpu_release = sched_ext_ops__cpu_release,
+ .init_task = sched_ext_ops__init_task,
+ .exit_task = sched_ext_ops__exit_task,
+ .enable = sched_ext_ops__enable,
+ .disable = sched_ext_ops__disable,
+#ifdef CONFIG_EXT_GROUP_SCHED
+ .cgroup_init = sched_ext_ops__cgroup_init,
+ .cgroup_exit = sched_ext_ops__cgroup_exit,
+ .cgroup_prep_move = sched_ext_ops__cgroup_prep_move,
+ .cgroup_move = sched_ext_ops__cgroup_move,
+ .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
+ .cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
+#endif
+ .cpu_online = sched_ext_ops__cpu_online,
+ .cpu_offline = sched_ext_ops__cpu_offline,
+ .init = sched_ext_ops__init,
+ .exit = sched_ext_ops__exit,
+ .dump = sched_ext_ops__dump,
+ .dump_cpu = sched_ext_ops__dump_cpu,
+ .dump_task = sched_ext_ops__dump_task,
+};
+
+/*
+ * struct_ops descriptor named "sched_ext_ops". Ties together the verifier
+ * ops, the reg/unreg/update/validate and member callbacks defined in this
+ * file, and the CFI stub table.
+ */
+static struct bpf_struct_ops bpf_sched_ext_ops = {
+ .verifier_ops = &bpf_scx_verifier_ops,
+ .reg = bpf_scx_reg,
+ .unreg = bpf_scx_unreg,
+ .check_member = bpf_scx_check_member,
+ .init_member = bpf_scx_init_member,
+ .init = bpf_scx_init,
+ .update = bpf_scx_update,
+ .validate = bpf_scx_validate,
+ .name = "sched_ext_ops",
+ .owner = THIS_MODULE,
+ .cfi_stubs = &__bpf_ops_sched_ext_ops
+};
+
+
+/********************************************************************************
+ * System integration and init.
+ */
+
+/*
+ * SysRq handler: disable the BPF scheduler with SCX_EXIT_SYSRQ. If
+ * scx_ops_helper has never been created, only log a message. @key is unused.
+ */
+static void sysrq_handle_sched_ext_reset(u8 key)
+{
+	if (!scx_ops_helper) {
+		pr_info("sched_ext: BPF scheduler not yet used\n");
+		return;
+	}
+
+	scx_ops_disable(SCX_EXIT_SYSRQ);
+}
+
+/* SysRq operation registered for the 'S' key in init_sched_ext_class(). */
+static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
+ .handler = sysrq_handle_sched_ext_reset,
+ .help_msg = "reset-sched-ext(S)",
+ .action_msg = "Disable sched_ext and revert all tasks to CFS",
+ .enable_mask = SYSRQ_ENABLE_RTNICE,
+};
+
+/*
+ * SysRq handler: if sched_ext is enabled, dump its state using a local exit
+ * info of kind SCX_EXIT_NONE and reason "SysRq-D". @key is unused.
+ */
+static void sysrq_handle_sched_ext_dump(u8 key)
+{
+	struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
+
+	if (!scx_enabled())
+		return;
+
+	scx_dump_state(&ei, 0);
+}
+
+/* SysRq operation registered for the 'D' key in init_sched_ext_class(). */
+static const struct sysrq_key_op sysrq_sched_ext_dump_op = {
+ .handler = sysrq_handle_sched_ext_dump,
+ .help_msg = "dump-sched-ext(D)",
+ .action_msg = "Trigger sched_ext debug dump",
+ .enable_mask = SYSRQ_ENABLE_RTNICE,
+};
+
+/*
+ * Tell whether an idle kick of @rq can be skipped. Must be called with @rq
+ * locked.
+ */
+static bool can_skip_idle_kick(struct rq *rq)
+{
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * Skipping is fine only if @rq is going to go through at least one
+	 * full SCX scheduling cycle before going idle. A non-idle curr alone
+	 * doesn't prove that: we may be racing balance_one() trying to pull
+	 * the next task from a remote rq, which may fail and leave @rq idle.
+	 *
+	 * The race window is small and we don't and can't guarantee that @rq
+	 * is only kicked while idle anyway. Skip only when sure.
+	 */
+	if (is_idle_task(rq->curr))
+		return false;
+
+	return !(rq->scx.flags & SCX_RQ_IN_BALANCE);
+}
+
+/*
+ * Kick @cpu on behalf of @this_rq.
+ *
+ * If @cpu is set in @this_rq's cpus_to_preempt, the slice of @cpu's current
+ * task is zeroed when that task is on the ext class. If @cpu is set in
+ * cpus_to_wait, its current pnt_seq is recorded in @pseqs[@cpu]. The CPU is
+ * then rescheduled. For an offline CPU other than the local one, the preempt
+ * and wait requests are dropped instead.
+ *
+ * Returns %true if the caller should wait for @cpu's pnt_seq to change.
+ */
+static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs)
+{
+	struct scx_rq *this_scx = &this_rq->scx;
+	struct rq *target = cpu_rq(cpu);
+	unsigned long flags;
+	bool wait = false;
+
+	raw_spin_rq_lock_irqsave(target, flags);
+
+	/*
+	 * During CPU hotplug, a CPU may depend on kicking itself to make
+	 * forward progress. Allow kicking self regardless of online state.
+	 */
+	if (!cpu_online(cpu) && cpu != cpu_of(this_rq)) {
+		cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+		cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+		goto out_unlock;
+	}
+
+	if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
+		if (target->curr->sched_class == &ext_sched_class)
+			target->curr->scx.slice = 0;
+		cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
+	}
+
+	if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) {
+		pseqs[cpu] = target->scx.pnt_seq;
+		wait = true;
+	}
+
+	resched_curr(target);
+
+out_unlock:
+	raw_spin_rq_unlock_irqrestore(target, flags);
+
+	return wait;
+}
+
+/*
+ * Reschedule @cpu unless can_skip_idle_kick() says the kick is unnecessary.
+ * Offline CPUs are left alone, except for the local CPU of @this_rq.
+ */
+static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
+{
+	struct rq *target = cpu_rq(cpu);
+	unsigned long flags;
+
+	raw_spin_rq_lock_irqsave(target, flags);
+
+	if (can_skip_idle_kick(target))
+		goto out_unlock;
+
+	if (cpu_online(cpu) || cpu == cpu_of(this_rq))
+		resched_curr(target);
+
+out_unlock:
+	raw_spin_rq_unlock_irqrestore(target, flags);
+}
+
+/*
+ * irq_work handler (installed as rq->scx.kick_cpus_irq_work) which processes
+ * the kick requests accumulated in the local rq's scx cpumasks.
+ *
+ * Every CPU in cpus_to_kick goes through kick_one_cpu(); CPUs still left in
+ * cpus_to_kick_if_idle afterwards go through kick_one_cpu_if_idle(). Bits are
+ * cleared as they are handled. If any kick_one_cpu() call asked for waiting,
+ * busy-wait on each remote CPU in cpus_to_wait until its pnt_seq differs from
+ * the value recorded in the per-CPU scx_kick_cpus_pnt_seqs buffer.
+ */
+static void kick_cpus_irq_workfn(struct irq_work *irq_work)
+{
+ struct rq *this_rq = this_rq();
+ struct scx_rq *this_scx = &this_rq->scx;
+ unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs);
+ bool should_wait = false;
+ s32 cpu;
+
+ for_each_cpu(cpu, this_scx->cpus_to_kick) {
+ should_wait |= kick_one_cpu(cpu, this_rq, pseqs);
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
+ /* an unconditional kick covers a pending kick-if-idle request too */
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
+ }
+
+ for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
+ kick_one_cpu_if_idle(cpu, this_rq);
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
+ }
+
+ if (!should_wait)
+ return;
+
+ for_each_cpu(cpu, this_scx->cpus_to_wait) {
+ unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq;
+
+ /* never wait on the local CPU */
+ if (cpu != cpu_of(this_rq)) {
+ /*
+ * Pairs with smp_store_release() issued by this CPU in
+ * switch_class() on the resched path.
+ *
+ * We busy-wait here to guarantee that no other task can
+ * be scheduled on our core before the target CPU has
+ * entered the resched path.
+ */
+ while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu])
+ cpu_relax();
+ }
+
+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait);
+ }
+}
+
+/**
+ * print_scx_info - print out sched_ext scheduler state
+ * @log_lvl: the log level to use when printing
+ * @p: target task
+ *
+ * Nothing is printed while sched_ext is disabled. Otherwise, print the name
+ * and state of the scheduler and, if @p is on sched_ext, how long ago it
+ * became runnable.
+ *
+ * This function can be safely called on any task as long as the task_struct
+ * itself is accessible. While safe, it isn't synchronized and may print
+ * mixed-up or garbage values of limited length.
+ */
+void print_scx_info(const char *log_lvl, struct task_struct *p)
+{
+	enum scx_ops_enable_state state = scx_ops_enable_state();
+	const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
+	char runnable_at_buf[22] = "?";
+	struct sched_class *class;
+	unsigned long runnable_at;
+	bool on_scx;
+
+	if (state == SCX_OPS_DISABLED)
+		return;
+
+	/*
+	 * @p may be in any state, so fetch its class with a fault-tolerant
+	 * copy before deciding whether it is running on sched_ext.
+	 */
+	on_scx = !copy_from_kernel_nofault(&class, &p->sched_class,
+					   sizeof(class)) &&
+		 class == &ext_sched_class;
+	if (!on_scx) {
+		printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
+		       scx_ops_enable_state_str[state], all);
+		return;
+	}
+
+	/* same care for runnable_at; the buffer keeps "?" if the copy fails */
+	if (copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
+				     sizeof(runnable_at)) == 0)
+		scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
+			  jiffies_delta_msecs(runnable_at, jiffies));
+
+	/* print everything onto one line to conserve console space */
+	printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
+	       log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
+	       runnable_at_buf);
+}
+
+/*
+ * PM notifier callback: enter bypass mode when a hibernation, suspend or
+ * restore operation is being prepared and leave it once the operation is
+ * over. Always returns NOTIFY_OK.
+ */
+static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
+{
+	/*
+	 * SCX schedulers often have userspace components which are sometimes
+	 * involved in critical scheduling paths. PM operations involve
+	 * freezing userspace which can lead to scheduling misbehaviors
+	 * including stalls. Let's bypass while PM operations are in progress.
+	 */
+	if (event == PM_HIBERNATION_PREPARE ||
+	    event == PM_SUSPEND_PREPARE ||
+	    event == PM_RESTORE_PREPARE)
+		scx_ops_bypass(true);
+	else if (event == PM_POST_HIBERNATION ||
+		 event == PM_POST_SUSPEND ||
+		 event == PM_POST_RESTORE)
+		scx_ops_bypass(false);
+
+	return NOTIFY_OK;
+}
+
+/* Notifier block which routes PM events to scx_pm_handler(). */
+static struct notifier_block scx_pm_notifier = {
+ .notifier_call = scx_pm_handler,
+};
+
+/*
+ * Boot-time setup of sched_ext state: the DSQ hash table, the built-in idle
+ * cpumasks (CONFIG_SMP only), the per-CPU pnt_seq buffers used by
+ * kick_cpus_irq_workfn(), and the scx part of every possible CPU's rq. Also
+ * registers the SysRq 'S' and 'D' handlers and initializes the watchdog
+ * delayed work. Every allocation or init failure hits BUG_ON().
+ */
+void __init init_sched_ext_class(void)
+{
+ s32 cpu, v;
+
+ /*
+ * The following is to prevent the compiler from optimizing out the enum
+ * definitions so that BPF scheduler implementations can use them
+ * through the generated vmlinux.h.
+ */
+ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
+ SCX_TG_ONLINE);
+
+ BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
+#ifdef CONFIG_SMP
+ BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
+#endif
+ /* each CPU gets an array with one slot per CPU id */
+ scx_kick_cpus_pnt_seqs =
+ __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
+ __alignof__(scx_kick_cpus_pnt_seqs[0]));
+ BUG_ON(!scx_kick_cpus_pnt_seqs);
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+ INIT_LIST_HEAD(&rq->scx.runnable_list);
+ INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
+
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+ init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
+ init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
+
+ if (cpu_online(cpu))
+ cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
+ }
+
+ register_sysrq_key('S', &sysrq_sched_ext_reset_op);
+ register_sysrq_key('D', &sysrq_sched_ext_dump_op);
+ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
+}
+
+
+/********************************************************************************
+ * Helpers that can be called from the BPF scheduler.
+ */
+#include <linux/btf_ids.h>
+
+__bpf_kfunc_start_defs();
+
+/*
+ * Returns %true if built-in idle tracking is enabled. Otherwise raises
+ * scx_ops_error() and returns %false.
+ */
+static bool check_builtin_idle_enabled(void)
+{
+	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
+		scx_ops_error("built-in idle tracking is disabled");
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @is_idle: out parameter indicating whether the returned CPU is idle
+ *
+ * Can only be called from ops.select_cpu() if the built-in CPU selection is
+ * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
+ * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ *
+ * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
+ * currently idle and thus a good candidate for direct dispatching. If
+ * @prev_cpu is invalid, built-in idle tracking is disabled or the call isn't
+ * allowed from the current context, @prev_cpu is returned with *@is_idle
+ * cleared. The same happens on !CONFIG_SMP kernels.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
+				       u64 wake_flags, bool *is_idle)
+{
+	if (!ops_cpu_valid(prev_cpu, NULL) ||
+	    !check_builtin_idle_enabled() ||
+	    !scx_kf_allowed(SCX_KF_SELECT_CPU))
+		goto prev_cpu;
+
+#ifdef CONFIG_SMP
+	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
+#endif
+
+prev_cpu:
+	*is_idle = false;
+	return prev_cpu;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
+
+/* kfunc ID set wrapping scx_kfunc_ids_select_cpu. */
+static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_select_cpu,
+};
+
+/*
+ * Common checks for the DSQ insertion kfuncs. Returns %true if the insertion
+ * may proceed. Returns %false if the call isn't allowed from the current
+ * context, or - after raising scx_ops_error() - if @p is NULL or @enq_flags
+ * contains bits from __SCX_ENQ_INTERNAL_MASK.
+ */
+static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
+{
+	bool ok = false;
+
+	if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
+		return false;
+
+	lockdep_assert_irqs_disabled();
+
+	if (unlikely(!p)) {
+		scx_ops_error("called with NULL task");
+	} else if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
+		scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
+	} else {
+		ok = true;
+	}
+
+	return ok;
+}
+
+/*
+ * Record the insertion of @p into @dsq_id. If this CPU has a
+ * direct_dispatch_task set, the request is passed to mark_direct_dispatch().
+ * Otherwise it is appended to this CPU's dispatch buffer, raising
+ * scx_ops_error() if the buffer already holds scx_dsp_max_batch entries.
+ */
+static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
+				  u64 enq_flags)
+{
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+	struct task_struct *ddsp_task = __this_cpu_read(direct_dispatch_task);
+	struct scx_dsp_buf_ent ent;
+
+	if (ddsp_task) {
+		mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
+		return;
+	}
+
+	if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
+		scx_ops_error("dispatch buffer overflow");
+		return;
+	}
+
+	ent = (struct scx_dsp_buf_ent){
+		.task = p,
+		.qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
+		.dsq_id = dsq_id,
+		.enq_flags = enq_flags,
+	};
+	dspc->buf[dspc->cursor++] = ent;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
+ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to
+ * call this function spuriously. Can be called from ops.enqueue(),
+ * ops.select_cpu(), and ops.dispatch().
+ *
+ * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
+ * and @p must match the task being enqueued.
+ *
+ * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p
+ * will be directly inserted into the corresponding dispatch queue after
+ * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be
+ * inserted into the local DSQ of the CPU returned by ops.select_cpu().
+ * @enq_flags are OR'd with the enqueue flags on the enqueue path before the
+ * task is inserted.
+ *
+ * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
+ * and this function can be called up to ops.dispatch_max_batch times to insert
+ * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
+ * remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
+ *
+ * This function doesn't have any locking restrictions and may be called under
+ * BPF locks (in the future when BPF introduces more flexible locking).
+ *
+ * @p is allowed to run for @slice. The scheduling path is triggered on slice
+ * exhaustion. If zero, the current residual slice is maintained, except that
+ * an already exhausted slice is bumped to 1. If %SCX_SLICE_INF, @p never
+ * expires and the BPF scheduler must kick the CPU with scx_bpf_kick_cpu() to
+ * trigger scheduling.
+ */
+__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
+				    u64 enq_flags)
+{
+	if (!scx_dsq_insert_preamble(p, enq_flags))
+		return;
+
+	/* explicit @slice wins; otherwise keep the residual slice, minimum 1 */
+	p->scx.slice = slice ?: (p->scx.slice ?: 1);
+
+	scx_dsq_insert_commit(p, dsq_id, enq_flags);
+}
+
+/*
+ * For backward compatibility, will be removed in v6.15. Warns once about the
+ * rename and forwards all arguments to scx_bpf_dsq_insert().
+ */
+__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
+				  u64 enq_flags)
+{
+	/*
+	 * Terminate the message with a newline so that the record is complete
+	 * on its own instead of staying open for continuation.
+	 */
+	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()\n");
+	scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags);
+}
+
+/**
+ * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
+ * @p: task_struct to insert
+ * @dsq_id: DSQ to insert into
+ * @slice: duration @p can run for in nsecs, 0 to keep the current value
+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id.
+ * Tasks queued into the priority queue are ordered by @vtime. All other aspects
+ * are identical to scx_bpf_dsq_insert().
+ *
+ * @vtime ordering is according to time_before64() which considers wrapping. A
+ * numerically larger vtime may indicate an earlier position in the ordering and
+ * vice-versa.
+ *
+ * A DSQ can only be used as a FIFO or priority queue at any given time and this
+ * function must not be called on a DSQ which already has one or more FIFO tasks
+ * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and
+ * SCX_DSQ_GLOBAL) cannot be used as priority queues.
+ */
+__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
+					  u64 slice, u64 vtime, u64 enq_flags)
+{
+	if (!scx_dsq_insert_preamble(p, enq_flags))
+		return;
+
+	/* explicit @slice wins; otherwise keep the residual slice, minimum 1 */
+	p->scx.slice = slice ?: (p->scx.slice ?: 1);
+	p->scx.dsq_vtime = vtime;
+
+	scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
+/*
+ * Deprecated alias of scx_bpf_dsq_insert_vtime(), kept for backward
+ * compatibility and will be removed in v6.15. Logs a rename warning via
+ * printk_deferred_once() and forwards all arguments unchanged.
+ */
+__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
+ u64 slice, u64 vtime, u64 enq_flags)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()");
+ scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags);
+}
+
+__bpf_kfunc_end_defs();
+
+/*
+ * BTF ID set listing the DSQ insertion kfuncs together with their deprecated
+ * scx_bpf_dispatch*() aliases. Every entry is flagged KF_RCU.
+ */
+BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
+
+/* kfunc set descriptor wrapping the ID set above */
+static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_enqueue_dispatch,
+};
+
+/*
+ * Shared implementation behind scx_bpf_dsq_move() and scx_bpf_dsq_move_vtime().
+ *
+ * Move @p from the DSQ being iterated by @kit to the DSQ identified by @dsq_id,
+ * applying any slice/vtime override stashed in @kit first. Once a move has been
+ * attempted, the override flags in @kit are cleared whether or not it
+ * succeeded.
+ *
+ * Returns %true if @p was moved. Returns %false if the call is not allowed from
+ * the current context or if @p is no longer eligible on the source DSQ.
+ */
+static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
+ struct task_struct *p, u64 dsq_id, u64 enq_flags)
+{
+ struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
+ struct rq *this_rq, *src_rq, *locked_rq;
+ bool dispatched = false;
+ bool in_balance;
+ unsigned long flags;
+
+ if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
+ return false;
+
+ /*
+ * Can be called from either ops.dispatch() locking this_rq() or any
+ * context where no rq lock is held. If latter, lock @p's task_rq which
+ * we'll likely need anyway.
+ */
+ src_rq = task_rq(p);
+
+ local_irq_save(flags);
+ this_rq = this_rq();
+ in_balance = this_rq->scx.flags & SCX_RQ_IN_BALANCE;
+
+ if (in_balance) {
+ /* this_rq is already locked; switch over to src_rq if different */
+ if (this_rq != src_rq) {
+ raw_spin_rq_unlock(this_rq);
+ raw_spin_rq_lock(src_rq);
+ }
+ } else {
+ raw_spin_rq_lock(src_rq);
+ }
+
+ /*
+ * If the BPF scheduler keeps calling this function repeatedly, it can
+ * cause similar live-lock conditions as consume_dispatch_q(). Insert a
+ * breather if necessary.
+ */
+ scx_ops_breather(src_rq);
+
+ /* if we bail out below, src_rq is the rq lock that is still held */
+ locked_rq = src_rq;
+ raw_spin_lock(&src_dsq->lock);
+
+ /*
+ * Did someone else get to it? @p could have already left $src_dsq, got
+ * re-enqueued, or be in the process of being consumed by someone else.
+ */
+ if (unlikely(p->scx.dsq != src_dsq ||
+ u32_before(kit->cursor.priv, p->scx.dsq_seq) ||
+ p->scx.holding_cpu >= 0) ||
+ WARN_ON_ONCE(src_rq != task_rq(p))) {
+ raw_spin_unlock(&src_dsq->lock);
+ goto out;
+ }
+
+ /* @p is still on $src_dsq and stable, determine the destination */
+ dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
+
+ /*
+ * Apply vtime and slice updates before moving so that the new time is
+ * visible before inserting into $dst_dsq. @p is still on $src_dsq but
+ * this is safe as we're locking it.
+ */
+ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME)
+ p->scx.dsq_vtime = kit->vtime;
+ if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE)
+ p->scx.slice = kit->slice;
+
+ /* execute move; the helper reports which rq is locked afterwards */
+ locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq);
+ dispatched = true;
+out:
+ /* hand the locking state back the way the caller had it on entry */
+ if (in_balance) {
+ if (this_rq != locked_rq) {
+ raw_spin_rq_unlock(locked_rq);
+ raw_spin_rq_lock(this_rq);
+ }
+ } else {
+ raw_spin_rq_unlock_irqrestore(locked_rq, flags);
+ }
+
+ /* overrides are one-shot: drop them regardless of the outcome */
+ kit->cursor.flags &= ~(__SCX_DSQ_ITER_HAS_SLICE |
+ __SCX_DSQ_ITER_HAS_VTIME);
+ return dispatched;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
+ *
+ * Can only be called from ops.dispatch().
+ */
+__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
+{
+	/* remaining slots = batch limit minus this CPU's dispatch cursor */
+	if (scx_kf_allowed(SCX_KF_DISPATCH))
+		return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
+
+	/* not callable from this context: report no slots */
+	return 0;
+}
+
+/**
+ * scx_bpf_dispatch_cancel - Cancel the latest dispatch
+ *
+ * Cancel the latest dispatch. Can be called multiple times to cancel further
+ * dispatches. Can only be called from ops.dispatch().
+ */
+__bpf_kfunc void scx_bpf_dispatch_cancel(void)
+{
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+
+	if (!scx_kf_allowed(SCX_KF_DISPATCH))
+		return;
+
+	/* nothing buffered to cancel: flag the scheduler error */
+	if (!(dspc->cursor > 0)) {
+		scx_ops_error("dispatch buffer underflow");
+		return;
+	}
+
+	/* drop the most recently buffered dispatch */
+	dspc->cursor--;
+}
+
+/**
+ * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
+ * @dsq_id: DSQ to move task from
+ *
+ * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
+ * local DSQ for execution. Can only be called from ops.dispatch().
+ *
+ * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
+ * before trying to move from the specified DSQ. It may also grab rq locks and
+ * thus can't be called under any BPF locks.
+ *
+ * Returns %true if a task has been moved, %false if there isn't any task to
+ * move.
+ */
+__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
+{
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+	struct scx_dispatch_q *src;
+
+	if (!scx_kf_allowed(SCX_KF_DISPATCH))
+		return false;
+
+	/* push out buffered dispatches before looking at the source DSQ */
+	flush_dispatch_buf(dspc->rq);
+
+	src = find_user_dsq(dsq_id);
+	if (unlikely(!src)) {
+		scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
+		return false;
+	}
+
+	if (!consume_dispatch_q(dspc->rq, src))
+		return false;
+
+	/*
+	 * A successfully consumed task can be dequeued before it starts
+	 * running while the CPU is trying to migrate other dispatched tasks.
+	 * Bump nr_tasks to tell balance_scx() to retry on empty local DSQ.
+	 */
+	dspc->nr_tasks++;
+	return true;
+}
+
+/*
+ * Deprecated alias of scx_bpf_dsq_move_to_local(), kept for backward
+ * compatibility and will be removed in v6.15. Logs a rename warning via
+ * printk_deferred_once() and returns whatever the new function returns.
+ */
+__bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()");
+ return scx_bpf_dsq_move_to_local(dsq_id);
+}
+
+/**
+ * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
+ * @it__iter: DSQ iterator in progress
+ * @slice: duration the moved task can run for in nsecs
+ *
+ * Override the slice of the next task that will be moved from @it__iter using
+ * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous
+ * slice duration is kept.
+ */
+__bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
+ u64 slice)
+{
+ /* access the BPF-facing iterator through its kernel-side layout */
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
+
+ /* stash the override; scx_dsq_move() applies it and clears the flag */
+ kit->slice = slice;
+ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
+}
+
+/*
+ * Deprecated alias of scx_bpf_dsq_move_set_slice(), kept for backward
+ * compatibility and will be removed in v6.15. Logs a rename warning via
+ * printk_deferred_once() and forwards all arguments unchanged.
+ */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
+ struct bpf_iter_scx_dsq *it__iter, u64 slice)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()");
+ scx_bpf_dsq_move_set_slice(it__iter, slice);
+}
+
+/**
+ * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
+ * @it__iter: DSQ iterator in progress
+ * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ
+ *
+ * Override the vtime of the next task that will be moved from @it__iter using
+ * scx_bpf_dsq_move_vtime(). If this function is not called, the previous
+ * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the
+ * override is ignored and cleared.
+ */
+__bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
+ u64 vtime)
+{
+ /* access the BPF-facing iterator through its kernel-side layout */
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter;
+
+ /* stash the override; scx_dsq_move() clears the flag after the move */
+ kit->vtime = vtime;
+ kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
+}
+
+/*
+ * Deprecated alias of scx_bpf_dsq_move_set_vtime(), kept for backward
+ * compatibility and will be removed in v6.15. Logs a rename warning via
+ * printk_deferred_once() and forwards all arguments unchanged.
+ */
+__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
+ struct bpf_iter_scx_dsq *it__iter, u64 vtime)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()");
+ scx_bpf_dsq_move_set_vtime(it__iter, vtime);
+}
+
+/**
+ * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
+ * @it__iter: DSQ iterator in progress
+ * @p: task to transfer
+ * @dsq_id: DSQ to move @p to
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Transfer @p which is on the DSQ currently iterated by @it__iter to the DSQ
+ * specified by @dsq_id. All DSQs - local DSQs, global DSQ and user DSQs - can
+ * be the destination.
+ *
+ * For the transfer to be successful, @p must still be on the DSQ and have been
+ * queued before the DSQ iteration started. This function doesn't care whether
+ * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have
+ * been queued before the iteration started.
+ *
+ * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update.
+ *
+ * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq
+ * lock (e.g. BPF timers or SYSCALL programs).
+ *
+ * Returns %true if @p has been consumed, %false if @p had already been consumed
+ * or dequeued.
+ */
+__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ /* plain move: @enq_flags is passed through to scx_dsq_move() as-is */
+ return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
+ p, dsq_id, enq_flags);
+}
+
+/*
+ * Deprecated alias of scx_bpf_dsq_move(), kept for backward compatibility and
+ * will be removed in v6.15. Logs a rename warning via printk_deferred_once()
+ * and returns whatever the new function returns.
+ */
+__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()");
+ return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags);
+}
+
+/**
+ * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
+ * @it__iter: DSQ iterator in progress
+ * @p: task to transfer
+ * @dsq_id: DSQ to move @p to
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Transfer @p which is on the DSQ currently iterated by @it__iter to the
+ * priority queue of the DSQ specified by @dsq_id. The destination must be a
+ * user DSQ as only user DSQs support priority queue.
+ *
+ * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice()
+ * and scx_bpf_dsq_move_set_vtime() to update.
+ *
+ * All other aspects are identical to scx_bpf_dsq_move(). See
+ * scx_bpf_dsq_insert_vtime() for more information on @vtime.
+ */
+__bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
+ struct task_struct *p, u64 dsq_id,
+ u64 enq_flags)
+{
+ /* same as scx_bpf_dsq_move() but with SCX_ENQ_DSQ_PRIQ OR-ed in */
+ return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter,
+ p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
+/*
+ * Deprecated alias of scx_bpf_dsq_move_vtime(), kept for backward compatibility
+ * and will be removed in v6.15. Logs a rename warning via
+ * printk_deferred_once() and returns whatever the new function returns.
+ *
+ * The warning names this function, scx_bpf_dispatch_vtime_from_dsq(), so that
+ * the message matches the symbol a BPF scheduler actually calls.
+ */
+__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
+						 struct task_struct *p, u64 dsq_id,
+						 u64 enq_flags)
+{
+	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()");
+	return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags);
+}
+
+__bpf_kfunc_end_defs();
+
+/*
+ * BTF ID set listing the dispatch-path kfuncs: slot accounting, cancel,
+ * move-to-local and the scx_bpf_dsq_move*() family, together with their
+ * deprecated aliases. The kfuncs taking a task pointer are flagged KF_RCU.
+ */
+BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
+BTF_ID_FLAGS(func, scx_bpf_consume)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
+
+/* kfunc set descriptor wrapping the ID set above */
+static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_dispatch,
+};
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
+ * processed tasks. Can only be called from ops.cpu_release().
+ */
+__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
+{
+ /* private staging list for the tasks pulled off the local DSQ */
+ LIST_HEAD(tasks);
+ u32 nr_enqueued = 0;
+ struct rq *rq;
+ struct task_struct *p, *n;
+
+ if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
+ return 0;
+
+ rq = cpu_rq(smp_processor_id());
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The BPF scheduler may choose to dispatch tasks back to
+ * @rq->scx.local_dsq. Move all candidate tasks off to a private list
+ * first to avoid processing the same tasks repeatedly.
+ */
+ list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
+ scx.dsq_list.node) {
+ /*
+ * If @p is being migrated, @p's current CPU may not agree with
+ * its allowed CPUs and the migration_cpu_stop is about to
+ * deactivate and re-activate @p anyway. Skip re-enqueueing.
+ *
+ * While racing sched property changes may also dequeue and
+ * re-enqueue a migrating task while its current CPU and allowed
+ * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
+ * the current local DSQ for running tasks and thus are not
+ * visible to the BPF scheduler.
+ */
+ if (p->migration_pending)
+ continue;
+
+ dispatch_dequeue(rq, p);
+ list_add_tail(&p->scx.dsq_list.node, &tasks);
+ }
+
+ /*
+ * Second pass: unlink each staged task and enqueue it again with
+ * %SCX_ENQ_REENQ, counting how many were processed.
+ */
+ list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
+ list_del_init(&p->scx.dsq_list.node);
+ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+ nr_enqueued++;
+ }
+
+ return nr_enqueued;
+}
+
+__bpf_kfunc_end_defs();
+
+/* BTF ID set holding the single ops.cpu_release() kfunc */
+BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local)
+BTF_KFUNCS_END(scx_kfunc_ids_cpu_release)
+
+/* kfunc set descriptor wrapping the ID set above */
+static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_cpu_release,
+};
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_create_dsq - Create a custom DSQ
+ * @dsq_id: DSQ to create
+ * @node: NUMA node to allocate from
+ *
+ * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
+ * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
+ */
+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
+{
+	/* @node must be a valid node ID or the NUMA_NO_NODE wildcard */
+	bool beyond_last = node >= (int)nr_node_ids;
+	bool bad_negative = node < 0 && node != NUMA_NO_NODE;
+
+	if (unlikely(beyond_last || bad_negative))
+		return -EINVAL;
+
+	/* map create_dsq()'s pointer-or-error result to 0 / -errno */
+	return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
+}
+
+__bpf_kfunc_end_defs();
+
+/*
+ * BTF ID set containing scx_bpf_create_dsq() (flagged KF_SLEEPABLE) plus the
+ * scx_bpf_dsq_move*() family and their deprecated aliases. The move kfuncs are
+ * also listed in scx_kfunc_ids_dispatch.
+ */
+BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
+
+/* kfunc set descriptor wrapping the ID set above */
+static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_unlocked,
+};
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU
+ * @cpu: cpu to kick
+ * @flags: %SCX_KICK_* flags
+ *
+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
+ * trigger rescheduling on a busy CPU. This can be called from any online
+ * scx_ops operation and the actual kicking is performed asynchronously through
+ * an irq work.
+ */
+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+{
+ struct rq *this_rq;
+ unsigned long irq_flags;
+
+ /* invalid @cpu: nothing to kick */
+ if (!ops_cpu_valid(cpu, NULL))
+ return;
+
+ local_irq_save(irq_flags);
+
+ this_rq = this_rq();
+
+ /*
+ * While bypassing for PM ops, IRQ handling may not be online which can
+ * lead to irq_work_queue() malfunction such as infinite busy wait for
+ * IRQ status update. Suppress kicking.
+ */
+ if (scx_rq_bypassing(this_rq))
+ goto out;
+
+ /*
+ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
+ * rq locks. We can probably be smarter and avoid bouncing if called
+ * from ops which don't hold a rq lock.
+ */
+ if (flags & SCX_KICK_IDLE) {
+ struct rq *target_rq = cpu_rq(cpu);
+
+ /* the error is reported but the idle kick below still proceeds */
+ if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
+ scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+
+ /*
+ * Best effort: only when the target rq lock can be taken without
+ * waiting do we check whether the kick can be skipped. If the
+ * trylock fails, fall through and request the kick anyway.
+ */
+ if (raw_spin_rq_trylock(target_rq)) {
+ if (can_skip_idle_kick(target_rq)) {
+ raw_spin_rq_unlock(target_rq);
+ goto out;
+ }
+ raw_spin_rq_unlock(target_rq);
+ }
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
+ } else {
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
+
+ /* PREEMPT and WAIT are recorded in their own per-rq masks */
+ if (flags & SCX_KICK_PREEMPT)
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
+ if (flags & SCX_KICK_WAIT)
+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait);
+ }
+
+ /* the request recorded in the masks above is handled from irq work */
+ irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
+out:
+ local_irq_restore(irq_flags);
+}
+
+/**
+ * scx_bpf_dsq_nr_queued - Return the number of queued tasks
+ * @dsq_id: id of the DSQ
+ *
+ * Return the number of tasks in the DSQ matching @dsq_id. If not found,
+ * -%ENOENT is returned.
+ */
+__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
+{
+	/* stays -ENOENT unless one of the lookups below succeeds */
+	s32 ret = -ENOENT;
+
+	preempt_disable();
+
+	if (dsq_id == SCX_DSQ_LOCAL) {
+		/* local DSQ of the CPU we are running on */
+		ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
+	} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
+		/* local DSQ of the CPU encoded in @dsq_id */
+		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
+
+		if (ops_cpu_valid(cpu, NULL))
+			ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
+	} else {
+		struct scx_dispatch_q *dsq = find_user_dsq(dsq_id);
+
+		if (dsq)
+			ret = READ_ONCE(dsq->nr);
+	}
+
+	preempt_enable();
+	return ret;
+}
+
+/**
+ * scx_bpf_destroy_dsq - Destroy a custom DSQ
+ * @dsq_id: DSQ to destroy
+ *
+ * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with
+ * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is
+ * empty and no further tasks are dispatched to it. Ignored if called on a DSQ
+ * which doesn't exist. Can be called from any online scx_ops operations.
+ */
+__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
+{
+ /* thin kfunc wrapper: all the work is done by destroy_dsq() */
+ destroy_dsq(dsq_id);
+}
+
+/**
+ * bpf_iter_scx_dsq_new - Create a DSQ iterator
+ * @it: iterator to initialize
+ * @dsq_id: DSQ to iterate
+ * @flags: %SCX_DSQ_ITER_*
+ *
+ * Initialize BPF iterator @it which can be used with bpf_for_each() to walk
+ * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
+ * tasks which are already queued when this function is invoked.
+ *
+ * Return: 0 on success, -%EINVAL if @flags has bits outside
+ * %__SCX_DSQ_ITER_USER_FLAGS, -%ENOENT if find_user_dsq() finds no DSQ for
+ * @dsq_id.
+ */
+__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
+				     u64 flags)
+{
+	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+
+	/* the kernel-side layout must fit inside the BPF-facing struct */
+	BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
+		     sizeof(struct bpf_iter_scx_dsq));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
+		     __alignof__(struct bpf_iter_scx_dsq));
+
+	/*
+	 * bpf_iter_scx_dsq_next() and bpf_iter_scx_dsq_destroy() key off
+	 * @kit->dsq and can run on @it even when this function fails. Clear it
+	 * before any error return so that they never act on the uninitialized
+	 * contents of @it.
+	 */
+	kit->dsq = NULL;
+
+	if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
+		return -EINVAL;
+
+	kit->dsq = find_user_dsq(dsq_id);
+	if (!kit->dsq)
+		return -ENOENT;
+
+	/*
+	 * Start with an unlinked cursor and snapshot the DSQ's sequence number;
+	 * bpf_iter_scx_dsq_next() skips tasks queued after this point.
+	 */
+	INIT_LIST_HEAD(&kit->cursor.node);
+	kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags;
+	kit->cursor.priv = READ_ONCE(kit->dsq->seq);
+
+	return 0;
+}
+
+/**
+ * bpf_iter_scx_dsq_next - Progress a DSQ iterator
+ * @it: iterator to progress
+ *
+ * Return the next task. See bpf_iter_scx_dsq_new().
+ */
+__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+ bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV;
+ struct task_struct *p;
+ unsigned long flags;
+
+ /* no DSQ attached (lookup failed in new() or iterator destroyed) */
+ if (!kit->dsq)
+ return NULL;
+
+ raw_spin_lock_irqsave(&kit->dsq->lock, flags);
+
+ /*
+ * An unlinked cursor passes NULL as the current position, presumably
+ * making nldsq_next_task() start from the DSQ's edge. Otherwise the
+ * cursor is dressed up as the dsq_list of a task so it can serve as the
+ * current position.
+ * NOTE(review): the container_of() result is not a real task; confirm
+ * nldsq_next_task() only touches its scx.dsq_list.
+ */
+ if (list_empty(&kit->cursor.node))
+ p = NULL;
+ else
+ p = container_of(&kit->cursor, struct task_struct, scx.dsq_list);
+
+ /*
+ * Only tasks which were queued before the iteration started are
+ * visible. This bounds BPF iterations and guarantees that vtime never
+ * jumps in the other direction while iterating.
+ */
+ do {
+ p = nldsq_next_task(kit->dsq, p, rev);
+ } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq)));
+
+ /*
+ * Re-park the cursor next to the task being returned: just before it
+ * when iterating in reverse, just after it otherwise. When the walk is
+ * over, unlink the cursor from the DSQ.
+ */
+ if (p) {
+ if (rev)
+ list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node);
+ else
+ list_move(&kit->cursor.node, &p->scx.dsq_list.node);
+ } else {
+ list_del_init(&kit->cursor.node);
+ }
+
+ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
+
+ return p;
+}
+
+/**
+ * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator
+ * @it: iterator to destroy
+ *
+ * Undo bpf_iter_scx_dsq_new().
+ */
+__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
+{
+ struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+
+ /* no DSQ attached: nothing to undo */
+ if (!kit->dsq)
+ return;
+
+ /* a still-linked cursor must be removed from the DSQ under its lock */
+ if (!list_empty(&kit->cursor.node)) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&kit->dsq->lock, flags);
+ list_del_init(&kit->cursor.node);
+ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
+ }
+ /* mark the iterator dead so later next()/destroy() calls are no-ops */
+ kit->dsq = NULL;
+}
+
+__bpf_kfunc_end_defs();
+
+/*
+ * Format @fmt with the packed arguments in @data (@data__sz bytes) into
+ * @line_buf of @line_size bytes, using @data_buf as scratch space for a copy of
+ * the arguments. Returns bstr_printf()'s result on success or a negative errno,
+ * in which case the failure has also been reported through scx_ops_error().
+ */
+static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
+			 char *fmt, unsigned long long *data, u32 data__sz)
+{
+	struct bpf_bprintf_data bpd = { .get_bin_args = true };
+	s32 len;
+
+	/*
+	 * @data must be a whole number of 8-byte slots, no more than
+	 * MAX_BPRINTF_VARARGS of them, and non-NULL unless empty.
+	 */
+	if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
+	    (data__sz && !data)) {
+		scx_ops_error("invalid data=%p and data__sz=%u",
+			      (void *)data, data__sz);
+		return -EINVAL;
+	}
+
+	/* copy the arguments into the scratch buffer without faulting */
+	len = copy_from_kernel_nofault(data_buf, data, data__sz);
+	if (len < 0) {
+		scx_ops_error("failed to read data fields (%d)", len);
+		return len;
+	}
+
+	len = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, &bpd);
+	if (len < 0) {
+		scx_ops_error("format preparation failed (%d)", len);
+		return len;
+	}
+
+	/* render, then release the prepared state whatever the outcome */
+	len = bstr_printf(line_buf, line_size, fmt, bpd.bin_args);
+	bpf_bprintf_cleanup(&bpd);
+	if (len < 0)
+		scx_ops_error("(\"%s\", %p, %u) failed to format",
+			      fmt, data, data__sz);
+
+	return len;
+}
+
+/*
+ * Convenience wrapper around __bstr_format() which formats into @buf->line
+ * using @buf->data as the scratch area. Returns __bstr_format()'s result.
+ */
+static s32 bstr_format(struct scx_bstr_buf *buf,
+ char *fmt, unsigned long long *data, u32 data__sz)
+{
+ return __bstr_format(buf->data, buf->line, sizeof(buf->line),
+ fmt, data, data__sz);
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
+ * @exit_code: Exit value to pass to user space via struct scx_exit_info.
+ * @fmt: error message format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
+ * disabling.
+ */
+__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
+				   unsigned long long *data, u32 data__sz)
+{
+	unsigned long irq_flags;
+	s32 len;
+
+	/* scx_exit_bstr_buf is filled and consumed under its lock */
+	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, irq_flags);
+
+	len = bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz);
+	if (len >= 0)
+		scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
+				  scx_exit_bstr_buf.line);
+
+	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, irq_flags);
+}
+
+/**
+ * scx_bpf_error_bstr - Indicate fatal error
+ * @fmt: error message format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * Indicate that the BPF scheduler encountered a fatal error and initiate ops
+ * disabling.
+ */
+__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
+				    u32 data__sz)
+{
+	unsigned long irq_flags;
+	s32 len;
+
+	/* scx_exit_bstr_buf is filled and consumed under its lock */
+	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, irq_flags);
+
+	len = bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz);
+	if (len >= 0)
+		scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
+				  scx_exit_bstr_buf.line);
+
+	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, irq_flags);
+}
+
+/**
+ * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
+ * @fmt: format string
+ * @data: format string parameters packaged using ___bpf_fill() macro
+ * @data__sz: @data len, must end in '__sz' for the verifier
+ *
+ * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
+ * dump_task() to generate extra debug dump specific to the BPF scheduler.
+ *
+ * The extra dump may be multiple lines. A single line may be split over
+ * multiple calls. The last line is automatically terminated.
+ */
+__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
+ u32 data__sz)
+{
+ struct scx_dump_data *dd = &scx_dump_data;
+ struct scx_bstr_buf *buf = &dd->buf;
+ s32 ret;
+
+ /* only the CPU recorded in the dump state may append to the dump */
+ if (raw_smp_processor_id() != dd->cpu) {
+ scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ return;
+ }
+
+ /* append the formatted string to the line buf */
+ ret = __bstr_format(buf->data, buf->line + dd->cursor,
+ sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
+ if (ret < 0) {
+ /* report the formatting failure inside the dump itself */
+ dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
+ dd->prefix, fmt, data, data__sz, ret);
+ return;
+ }
+
+ /*
+ * Advance the cursor but never past the end of the line buf, presumably
+ * because the formatter can report more than what fit.
+ */
+ dd->cursor += ret;
+ dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));
+
+ /* nothing buffered yet, so nothing to flush */
+ if (!dd->cursor)
+ return;
+
+ /*
+ * If the line buf overflowed or ends in a newline, flush it into the
+ * dump. This is to allow the caller to generate a single line over
+ * multiple calls. As ops_dump_flush() can also handle multiple lines in
+ * the line buf, the only case which can lead to an unexpected
+ * truncation is when the caller keeps generating newlines in the middle
+ * instead of the end consecutively. Don't do that.
+ */
+ if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
+ ops_dump_flush();
+}
+
+/**
+ * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
+ * @cpu: CPU of interest
+ *
+ * Return the maximum relative capacity of @cpu in relation to the most
+ * performant CPU in the system. The return value is in the range [1,
+ * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
+{
+	/* an invalid @cpu is reported as full capacity */
+	if (!ops_cpu_valid(cpu, NULL))
+		return SCX_CPUPERF_ONE;
+
+	return arch_scale_cpu_capacity(cpu);
+}
+
+/**
+ * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
+ * @cpu: CPU of interest
+ *
+ * Return the current relative performance of @cpu in relation to its maximum.
+ * The return value is in the range [1, %SCX_CPUPERF_ONE].
+ *
+ * The current performance level of a CPU in relation to the maximum performance
+ * available in the system can be calculated as follows:
+ *
+ * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE
+ *
+ * The result is in the range [1, %SCX_CPUPERF_ONE].
+ */
+__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
+{
+	/* an invalid @cpu is reported as running at full performance */
+	if (!ops_cpu_valid(cpu, NULL))
+		return SCX_CPUPERF_ONE;
+
+	return arch_scale_freq_capacity(cpu);
+}
+
+/**
+ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
+ * @cpu: CPU of interest
+ * @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ *
+ * Set the target performance level of @cpu to @perf. @perf is in linear
+ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
+ * schedutil cpufreq governor chooses the target frequency.
+ *
+ * The actual performance level chosen, CPU grouping, and the overhead and
+ * latency of the operations are dependent on the hardware and cpufreq driver in
+ * use. Consult hardware and cpufreq documentation for more information. The
+ * current performance level can be monitored using scx_bpf_cpuperf_cur().
+ */
+__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
+{
+	struct rq *rq;
+
+	/* targets above %SCX_CPUPERF_ONE are a scheduler error */
+	if (unlikely(perf > SCX_CPUPERF_ONE)) {
+		scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+		return;
+	}
+
+	if (!ops_cpu_valid(cpu, NULL))
+		return;
+
+	/* record the new target on @cpu's rq ... */
+	rq = cpu_rq(cpu);
+	rq->scx.cpuperf_target = perf;
+
+	/* ... and notify cpufreq of the update for that rq */
+	rcu_read_lock_sched_notrace();
+	cpufreq_update_util(rq, 0);
+	rcu_read_unlock_sched_notrace();
+}
+
+/**
+ * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
+ *
+ * All valid CPU IDs in the system are smaller than the returned value.
+ */
+__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
+{
+ /* expose the kernel's nr_cpu_ids value to BPF unchanged */
+ return nr_cpu_ids;
+}
+
+/**
+ * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
+ *
+ * The global cpu_possible_mask is returned directly; this function itself
+ * takes no reference count. Pair with scx_bpf_put_cpumask().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
+{
+ return cpu_possible_mask;
+}
+
+/**
+ * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
+ *
+ * The global cpu_online_mask is returned directly; this function itself takes
+ * no reference count. Pair with scx_bpf_put_cpumask().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
+{
+ return cpu_online_mask;
+}
+
+/**
+ * scx_bpf_put_cpumask - Release a possible/online cpumask
+ * @cpumask: cpumask to release
+ */
+__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
+{
+ /*
+ * Empty function body because we aren't actually acquiring or releasing
+ * a reference to a global cpumask, which is read-only in the caller and
+ * is never released. The acquire / release semantics here are just used
+ * to make the cpumask a trusted pointer in the caller.
+ */
+}
+
+/**
+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
+ * per-CPU cpumask.
+ *
+ * Returns an empty cpumask (cpu_none_mask) if idle tracking is not enabled, or
+ * running on a UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
+{
+ if (!check_builtin_idle_enabled())
+ return cpu_none_mask;
+
+ /* idle_masks is only consulted on CONFIG_SMP builds */
+#ifdef CONFIG_SMP
+ return idle_masks.cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
+ * per-physical-core cpumask. Can be used to determine if an entire physical
+ * core is free.
+ *
+ * Returns an empty cpumask (cpu_none_mask) if idle tracking is not enabled, or
+ * running on a UP kernel. If SMT is not active, the per-CPU idle cpumask is
+ * returned instead.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
+{
+ if (!check_builtin_idle_enabled())
+ return cpu_none_mask;
+
+ /* idle_masks is only consulted on CONFIG_SMP builds */
+#ifdef CONFIG_SMP
+ if (sched_smt_active())
+ return idle_masks.smt;
+ else
+ return idle_masks.cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
+ * either the percpu, or SMT idle-tracking cpumask.
+ * @idle_mask: &cpumask being released (not touched by the implementation)
+ */
+__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
+{
+ /*
+ * Empty function body because we aren't actually acquiring or releasing
+ * a reference to a global idle cpumask, which is read-only in the
+ * caller and is never released. The acquire / release semantics here
+ * are just used to make the cpumask a trusted pointer in the caller.
+ */
+}
+
+/**
+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
+ * @cpu: cpu to test and clear idle for
+ *
+ * Returns %true if @cpu was idle and its idle state was successfully cleared.
+ * %false otherwise.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ */
+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
+{
+	/* built-in idle tracking is checked first, then @cpu itself */
+	if (!check_builtin_idle_enabled())
+		return false;
+
+	if (!ops_cpu_valid(cpu, NULL))
+		return false;
+
+	return test_and_clear_cpu_idle(cpu);
+}
+
+/**
+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
+ * number on success. -%EBUSY if no matching cpu was found.
+ *
+ * Idle CPU tracking may race against CPU scheduling state transitions. For
+ * example, this function may return -%EBUSY as CPUs are transitioning into the
+ * idle state. If the caller then assumes that there will be dispatch events on
+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
+ * event in the near future.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
+				      u64 flags)
+{
+	if (check_builtin_idle_enabled())
+		return scx_pick_idle_cpu(cpus_allowed, flags);
+
+	/* the built-in idle check failed: report that nothing is idle */
+	return -EBUSY;
+}
+
+/**
+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked cpu
+ * number (which is not necessarily idle) if @cpus_allowed is not empty.
+ * -%EBUSY is returned if @cpus_allowed is empty.
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ *
+ * The idle pick is attempted only while the scx_builtin_idle_enabled static
+ * branch is on. The fallback pick is made with cpumask_any_distribute(); a
+ * result at or beyond nr_cpu_ids is what gets reported as -%EBUSY.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
+ u64 flags)
+{
+ s32 cpu;
+
+ if (static_branch_likely(&scx_builtin_idle_enabled)) {
+ cpu = scx_pick_idle_cpu(cpus_allowed, flags);
+ if (cpu >= 0)
+ return cpu;
+ }
+
+ cpu = cpumask_any_distribute(cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ else
+ return -EBUSY;
+}
+
+/**
+ * scx_bpf_task_running - Is task currently running?
+ * @p: task of interest
+ *
+ * Returns %true when @p is the ->curr task of task_rq(@p), %false otherwise.
+ */
+__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
+{
+ return task_rq(p)->curr == p;
+}
+
+/**
+ * scx_bpf_task_cpu - CPU a task is currently associated with
+ * @p: task of interest
+ *
+ * Returns the value of task_cpu(@p).
+ */
+__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
+{
+ return task_cpu(p);
+}
+
+/**
+ * scx_bpf_cpu_rq - Fetch the rq of a CPU
+ * @cpu: CPU of the rq
+ *
+ * Returns cpu_rq(@cpu), or %NULL when @cpu does not pass ops_cpu_valid().
+ * Callers must therefore be prepared for a %NULL result.
+ */
+__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
+{
+ if (!ops_cpu_valid(cpu, NULL))
+ return NULL;
+
+ return cpu_rq(cpu);
+}
+
+/**
+ * scx_bpf_task_cgroup - Return the sched cgroup of a task
+ * @p: task of interest
+ *
+ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
+ * from the scheduler's POV. SCX operations should use this function to
+ * determine @p's current cgroup as, unlike following @p->cgroups,
+ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
+ * rq-locked operations. Can be called on the parameter tasks of rq-locked
+ * operations. The restriction guarantees that @p's rq is locked by the caller.
+ *
+ * Returns a cgroup on which cgroup_get() has been called. If the
+ * scx_kf_allowed_on_arg_tasks() check fails, that cgroup is
+ * &cgrp_dfl_root.cgrp rather than the cgroup derived from
+ * @p->sched_task_group.
+ */
+#ifdef CONFIG_CGROUP_SCHED
+__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
+{
+ struct task_group *tg = p->sched_task_group;
+ struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
+
+ if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
+ goto out;
+
+ cgrp = tg_cgrp(tg);
+
+out:
+ cgroup_get(cgrp);
+ return cgrp;
+}
+#endif
+
+/**
+ * scx_bpf_now - Returns a high-performance monotonically non-decreasing
+ * clock for the current CPU. The clock returned is in nanoseconds.
+ *
+ * It provides the following properties:
+ *
+ * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently
+ * to account for execution time and track tasks' runtime properties.
+ * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which
+ * eventually reads a hardware timestamp counter -- is neither performant nor
+ * scalable. scx_bpf_now() aims to provide a high-performance clock by
+ * using the rq clock in the scheduler core whenever possible.
+ *
+ * 2) High enough resolution for the BPF scheduler use cases: In most BPF
+ * scheduler use cases, the required clock resolution is lower than the most
+ * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically
+ * uses the rq clock in the scheduler core whenever it is valid. It considers
+ * that the rq clock is valid from the time the rq clock is updated
+ * (update_rq_clock) until the rq is unlocked (rq_unpin_lock).
+ *
+ * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now()
+ * guarantees the clock never goes backward when comparing them in the same
+ * CPU. On the other hand, when comparing clocks in different CPUs, there
+ * is no such guarantee -- the clock can go backward. It provides a
+ * monotonically *non-decreasing* clock so that it would provide the same
+ * clock values in two different scx_bpf_now() calls in the same CPU
+ * during the same period of when the rq clock is valid.
+ *
+ * Returns rq->scx.clock of this_rq() when %SCX_RQ_CLK_VALID is set in
+ * rq->scx.flags, and sched_clock_cpu() for that rq's CPU otherwise. The rq
+ * lookup and the clock read both happen with preemption disabled.
+ */
+__bpf_kfunc u64 scx_bpf_now(void)
+{
+ struct rq *rq;
+ u64 clock;
+
+ preempt_disable();
+
+ rq = this_rq();
+ if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
+ /*
+ * If the rq clock is valid, use the cached rq clock.
+ *
+ * Note that scx_bpf_now() is re-entrant between a process
+ * context and an interrupt context (e.g., timer interrupt).
+ * However, we don't need to consider the race between them
+ * because such race is not observable from a caller.
+ */
+ clock = READ_ONCE(rq->scx.clock);
+ } else {
+ /*
+ * Otherwise, return a fresh rq clock.
+ *
+ * The rq clock is updated outside of the rq lock.
+ * In this case, keep the updated rq clock invalid so the next
+ * kfunc call outside the rq lock gets a fresh rq clock.
+ */
+ clock = sched_clock_cpu(cpu_of(rq));
+ }
+
+ preempt_enable();
+
+ return clock;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_any) /* kfunc IDs referenced by scx_kfunc_set_any */
+BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
+BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
+BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
+#ifdef CONFIG_CGROUP_SCHED
+BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) /* guarded like the kfunc's own definition */
+#endif
+BTF_ID_FLAGS(func, scx_bpf_now)
+BTF_KFUNCS_END(scx_kfunc_ids_any)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_any = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_any, /* the BTF ID set declared just above */
+};
+
+static int __init scx_init(void)
+{
+ int ret;
+
+ /*
+ * kfunc registration can't be done from init_sched_ext_class() as
+ * register_btf_kfunc_id_set() needs most of the system to be up.
+ *
+ * Some kfuncs are context-sensitive and can only be called from
+ * specific SCX ops. They are grouped into BTF sets accordingly.
+ * Unfortunately, BPF currently doesn't have a way of enforcing such
+ * restrictions. Eventually, the verifier should be able to enforce
+ * them. For now, register them the same and make each kfunc explicitly
+ * check using scx_kf_allowed().
+ *
+ * scx_kfunc_set_unlocked is registered for STRUCT_OPS and SYSCALL
+ * programs and scx_kfunc_set_any for STRUCT_OPS, TRACING and SYSCALL
+ * programs; the remaining sets are registered for STRUCT_OPS only.
+ * The || chain stops at the first non-zero return value, which is
+ * then logged and returned.
+ *
+ * NOTE(review): none of the failure paths in this function undo the
+ * registrations that already succeeded; each one logs and returns.
+ */
+ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_select_cpu)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_enqueue_dispatch)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_dispatch)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_cpu_release)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_unlocked)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &scx_kfunc_set_unlocked)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &scx_kfunc_set_any)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &scx_kfunc_set_any)) ||
+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+ &scx_kfunc_set_any))) {
+ pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret);
+ return ret;
+ }
+
+ ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
+ if (ret) {
+ pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
+ return ret;
+ }
+
+ ret = register_pm_notifier(&scx_pm_notifier);
+ if (ret) {
+ pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret);
+ return ret;
+ }
+
+ scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj);
+ if (!scx_kset) { /* no error code is available here, so -ENOMEM is reported */
+ pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n");
+ return -ENOMEM;
+ }
+
+ ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);
+ if (ret < 0) {
+ pr_err("sched_ext: Failed to add global attributes\n");
+ return ret;
+ }
+
+ return 0;
+}
+__initcall(scx_init);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
new file mode 100644
index 000000000000..1079b56b0f7a
--- /dev/null
+++ b/kernel/sched/ext.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#ifdef CONFIG_SCHED_CLASS_EXT
+
+void scx_tick(struct rq *rq);
+void init_scx_entity(struct sched_ext_entity *scx);
+void scx_pre_fork(struct task_struct *p);
+int scx_fork(struct task_struct *p);
+void scx_post_fork(struct task_struct *p);
+void scx_cancel_fork(struct task_struct *p);
+bool scx_can_stop_tick(struct rq *rq);
+void scx_rq_activate(struct rq *rq);
+void scx_rq_deactivate(struct rq *rq);
+int scx_check_setscheduler(struct task_struct *p, int policy);
+bool task_should_scx(int policy);
+void init_sched_ext_class(void);
+
+static inline u32 scx_cpuperf_target(s32 cpu)
+{
+ if (scx_enabled()) /* the per-rq target is only read while scx_enabled() */
+ return cpu_rq(cpu)->scx.cpuperf_target;
+ else
+ return 0; /* same value the !CONFIG_SCHED_CLASS_EXT stub returns */
+}
+
+static inline bool task_on_scx(const struct task_struct *p)
+{
+ return scx_enabled() && p->sched_class == &ext_sched_class; /* both conditions must hold */
+}
+
+#ifdef CONFIG_SCHED_CORE
+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi);
+#endif
+
+#else /* CONFIG_SCHED_CLASS_EXT */
+
+static inline void scx_tick(struct rq *rq) {}
+static inline void scx_pre_fork(struct task_struct *p) {}
+static inline int scx_fork(struct task_struct *p) { return 0; }
+static inline void scx_post_fork(struct task_struct *p) {}
+static inline void scx_cancel_fork(struct task_struct *p) {}
+static inline u32 scx_cpuperf_target(s32 cpu) { return 0; }
+static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
+static inline void scx_rq_activate(struct rq *rq) {}
+static inline void scx_rq_deactivate(struct rq *rq) {}
+static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
+static inline bool task_on_scx(const struct task_struct *p) { return false; }
+static inline void init_sched_ext_class(void) {}
+
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
+#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
+
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+ if (scx_enabled()) /* forward to the out-of-line helper only while scx_enabled() */
+ __scx_update_idle(rq, idle, do_notify);
+}
+#else
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} /* no-op stub */
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+#ifdef CONFIG_EXT_GROUP_SCHED
+int scx_tg_online(struct task_group *tg);
+void scx_tg_offline(struct task_group *tg);
+int scx_cgroup_can_attach(struct cgroup_taskset *tset);
+void scx_cgroup_move_task(struct task_struct *p);
+void scx_cgroup_finish_attach(void);
+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
+void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
+void scx_group_set_idle(struct task_group *tg, bool idle);
+#else /* CONFIG_EXT_GROUP_SCHED */
+static inline int scx_tg_online(struct task_group *tg) { return 0; }
+static inline void scx_tg_offline(struct task_group *tg) {}
+static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
+static inline void scx_cgroup_move_task(struct task_struct *p) {}
+static inline void scx_cgroup_finish_attach(void) {}
+static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
+static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
+static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
+#endif /* CONFIG_EXT_GROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..c798d2795243 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,25 +20,43 @@
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
-#include "sched.h"
-
-#include <trace/events/sched.h>
+#include <linux/energy_model.h>
+#include <linux/mmap_lock.h>
+#include <linux/hugetlb_inline.h>
+#include <linux/jiffies.h>
+#include <linux/mm_api.h>
+#include <linux/highmem.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/prio.h>
+
+#include <linux/cpuidle.h>
+#include <linux/interrupt.h>
+#include <linux/memory-tiers.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/ratelimit.h>
+#include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>
+
+#include <asm/switch_to.h>
+
+#include <uapi/linux/sched/types.h>
-/*
- * Targeted preemption latency for CPU-bound tasks:
- *
- * NOTE: this latency value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS are of variable length
- * and have no persistent notion like in traditional, time-slice
- * based scheduling concepts.
- *
- * (to see the precise effective timeslice length of your workload,
- * run vmstat and monitor the context-switches (cs) field)
- *
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_latency = 6000000ULL;
-static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
/*
* The initial- and re-scaling of tunables is configurable
@@ -46,55 +64,26 @@ static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
* Options are:
*
* SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
*
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*/
-enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
-
-/*
- * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
- */
-static unsigned int sched_nr_latency = 8;
-
-/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
- */
-unsigned int sysctl_sched_child_runs_first __read_mostly;
-
-/*
- * SCHED_OTHER wake-up granularity.
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- *
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_base_slice = 750000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
-int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
- int _shift = 0;
-
- if (kstrtoint(str, 0, &_shift))
- pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
-
- sched_thermal_decay_shift = clamp(_shift, 0, 10);
+ pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
@@ -115,6 +104,13 @@ int __weak arch_asym_cpu_priority(int cpu)
*/
#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
+/*
+ * The margin used when comparing CPU capacities:
+ * is 'cap1' noticeably greater than 'cap2'?
+ *
+ * Evaluates to true when cap1 * 1024 > cap2 * 1078, i.e. when cap1 exceeds
+ * cap2 by more than a factor of 1078/1024.
+ *
+ * (default: ~5%)
+ */
+#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -128,7 +124,44 @@ int __weak arch_asym_cpu_priority(int cpu)
*
* (default: 5 msec, units: microseconds)
*/
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+#endif
+
+#ifdef CONFIG_SYSCTL
+static const struct ctl_table sched_fair_sysctls[] = {
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .procname = "sched_cfs_bandwidth_slice_us",
+ .data = &sysctl_sched_cfs_bandwidth_slice,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax, /* NOTE(review): signed-int handler on an unsigned int -- confirm intended */
+ .extra1 = SYSCTL_ONE, /* presumably the lowest accepted value -- confirm against the handler */
+ },
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing_promote_rate_limit_MBps",
+ .data = &sysctl_numa_balancing_promote_rate_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax, /* NOTE(review): signed-int handler on an unsigned int -- confirm intended */
+ .extra1 = SYSCTL_ZERO, /* presumably the lowest accepted value -- confirm against the handler */
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+};
+
+static int __init sched_fair_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_fair_sysctls); /* table is registered under "kernel" */
+ return 0; /* always reports success */
+}
+late_initcall(sched_fair_sysctl_init);
#endif
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
@@ -185,9 +218,7 @@ static void update_sysctl(void)
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
- SET_SYSCTL(sched_min_granularity);
- SET_SYSCTL(sched_latency);
- SET_SYSCTL(sched_wakeup_granularity);
+ SET_SYSCTL(sched_base_slice);
#undef SET_SYSCTL
}
@@ -231,27 +262,40 @@ static void __update_inv_weight(struct load_weight *lw)
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
u64 fact = scale_load_down(weight);
+ u32 fact_hi = (u32)(fact >> 32);
int shift = WMULT_SHIFT;
+ int fs;
__update_inv_weight(lw);
- if (unlikely(fact >> 32)) {
- while (fact >> 32) {
- fact >>= 1;
- shift--;
- }
+ if (unlikely(fact_hi)) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
}
fact = mul_u32_u32(fact, lw->inv_weight);
- while (fact >> 32) {
- fact >>= 1;
- shift--;
+ fact_hi = (u32)(fact >> 32);
+ if (fact_hi) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
}
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+/*
+ * delta /= w
+ *
+ * Entities whose load weight equals NICE_0_LOAD get @delta back unchanged;
+ * for any other weight @delta is rescaled through
+ * __calc_delta(delta, NICE_0_LOAD, &se->load).
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}
const struct sched_class fair_sched_class;
@@ -260,46 +304,11 @@ const struct sched_class fair_sched_class;
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- SCHED_WARN_ON(!entity_is_task(se));
- return container_of(se, struct task_struct, se);
-}
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return grp->my_q;
-}
-
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (!path)
- return;
-
- if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
- autogroup_path(cfs_rq->tg, path, len);
- else if (cfs_rq && cfs_rq->tg->css.cgroup)
- cgroup_path(cfs_rq->tg->css.cgroup, path, len);
- else
- strlcpy(path, "(null)", len);
-}
-
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -375,8 +384,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
/*
* With cfs_rq being unthrottled/throttled during an enqueue,
- * it can happen the tmp_alone_branch points the a leaf that
- * we finally want to del. In this case, tmp_alone_branch moves
+ * it can happen the tmp_alone_branch points to the leaf that
+ * we finally want to delete. In this case, tmp_alone_branch moves
* to the prev element but it will point to rq->leaf_cfs_rq_list
* at the end of the enqueue.
*/
@@ -393,7 +402,7 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq)
SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
+/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
leaf_cfs_rq_list)
@@ -408,7 +417,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse)
return NULL;
}
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
+static inline struct sched_entity *parent_entity(const struct sched_entity *se)
{
return se->parent;
}
@@ -445,40 +454,27 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
}
}
-#else /* !CONFIG_FAIR_GROUP_SCHED */
-
-static inline struct task_struct *task_of(struct sched_entity *se)
+static int tg_is_idle(struct task_group *tg)
{
- return container_of(se, struct task_struct, se);
+ return tg->idle > 0;
}
-#define for_each_sched_entity(se) \
- for (; se; se = NULL)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
- return &task_rq(p)->cfs;
+ return cfs_rq->idle > 0;
}
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+static int se_is_idle(struct sched_entity *se)
{
- struct task_struct *p = task_of(se);
- struct rq *rq = task_rq(p);
-
- return &rq->cfs;
+ if (entity_is_task(se))
+ return task_has_idle_policy(task_of(se));
+ return cfs_rq_is_idle(group_cfs_rq(se));
}
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return NULL;
-}
+#else /* !CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (path)
- strlcpy(path, "(null)", len);
-}
+#define for_each_sched_entity(se) \
+ for (; se; se = NULL)
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
@@ -506,6 +502,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
+static inline int tg_is_idle(struct task_group *tg)
+{
+ return 0; /* !CONFIG_FAIR_GROUP_SCHED stub: never reports idle */
+}
+
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+{
+ return 0; /* !CONFIG_FAIR_GROUP_SCHED stub: never reports idle */
+}
+
+static int se_is_idle(struct sched_entity *se)
+{
+ return task_has_idle_policy(task_of(se)); /* !CONFIG_FAIR_GROUP_SCHED: decided by the task's policy only */
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
@@ -515,7 +526,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
-static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
+static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
@@ -524,7 +535,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
return max_vruntime;
}
-static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
+static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
@@ -533,17 +544,222 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
-static inline int entity_before(struct sched_entity *a,
- struct sched_entity *b)
+static inline bool entity_before(const struct sched_entity *a,
+ const struct sched_entity *b)
{
- return (s64)(a->vruntime - b->vruntime) < 0;
+ /*
+ * Tiebreak on vruntime seems unnecessary since it can
+ * hardly happen.
+ */
+ return (s64)(a->deadline - b->deadline) < 0;
}
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
+static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return (s64)(se->vruntime - cfs_rq->min_vruntime); /* signed offset of se->vruntime from cfs_rq->min_vruntime */
+}
+
+#define __node_2_se(node) \
+ rb_entry((node), struct sched_entity, run_node)
+
+/*
+ * Compute virtual time from the per-task service numbers:
+ *
+ * Fair schedulers conserve lag:
+ *
+ * \Sum lag_i = 0
+ *
+ * Where lag_i is given by:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * Where S is the ideal service time and V is its virtual time counterpart.
+ * Therefore:
+ *
+ * \Sum lag_i = 0
+ * \Sum w_i * (V - v_i) = 0
+ * \Sum w_i * V - w_i * v_i = 0
+ *
+ * From which we can solve an expression for V in v_i (which we have in
+ * se->vruntime):
+ *
+ * \Sum v_i * w_i \Sum v_i * w_i
+ * V = -------------- = --------------
+ * \Sum w_i W
+ *
+ * Specifically, this is the weighted average of all entity virtual runtimes.
+ *
+ * [[ NOTE: this is only equal to the ideal scheduler under the condition
+ * that join/leave operations happen at lag_i = 0, otherwise the
+ * virtual time has non-contiguous motion equivalent to:
+ *
+ * V +-= lag_i / W
+ *
+ * Also see the comment in place_entity() that deals with this. ]]
+ *
+ * However, since v_i is u64, and the multiplication could easily overflow
+ * transform it into a relative form that uses smaller quantities:
+ *
+ * Substitute: v_i == (v_i - v0) + v0
+ *
+ * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
+ * V = ---------------------------- = --------------------- + v0
+ * W W
+ *
+ * Which we track using:
+ *
+ * v0 := cfs_rq->min_vruntime
+ * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
+ * \Sum w_i := cfs_rq->avg_load
+ *
+ * Since min_vruntime is a monotonically increasing variable that closely
+ * tracks the per-task service, these deltas: (v_i - v0), will be in the order
+ * of the maximal (virtual) lag induced in the system due to quantisation.
+ *
+ * Also, we use scale_load_down() to reduce the size.
+ *
+ * As measured, the max (key * weight) value was ~44 bits for a kernel build.
+ */
+static void
+avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight); /* w_i, scaled down */
+ s64 key = entity_key(cfs_rq, se); /* v_i - v0 */
+
+ cfs_rq->avg_vruntime += key * weight; /* add this entity's term to \Sum (v_i - v0) * w_i */
+ cfs_rq->avg_load += weight; /* add this entity's term to \Sum w_i */
+}
+
+static void
+avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ unsigned long weight = scale_load_down(se->load.weight); /* w_i, scaled down */
+ s64 key = entity_key(cfs_rq, se); /* v_i - v0 */
+
+ cfs_rq->avg_vruntime -= key * weight; /* remove this entity's term from \Sum (v_i - v0) * w_i */
+ cfs_rq->avg_load -= weight; /* remove this entity's term from \Sum w_i */
+}
+
+static inline
+void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+{
+ /*
+ * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load
+ *
+ * i.e. when the reference point v moves forward by @delta, every
+ * tracked key (v_i - v) shrinks by @delta, so the weighted sum drops
+ * by @delta * avg_load.
+ */
+ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+}
+
+/*
+ * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
+ * For this to be so, the result of this function must have a left bias.
+ */
+u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ if (load) {
+ /* sign flips effective floor / ceiling */
+ if (avg < 0)
+ avg -= (load - 1);
+ avg = div_s64(avg, load);
+ }
+
+ return cfs_rq->min_vruntime + avg;
+}
+
+/*
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to double the slice length, with a minimum of TICK_NSEC
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ * -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
+ *
+ * The limit is passed through calc_delta_fair() before the computed vlag
+ * (avg_vruntime() minus se->vruntime) is clamped to [-limit, limit] and
+ * stored in se->vlag. Warns if @se is not on_rq.
+ */
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 vlag, limit;
+
+ SCHED_WARN_ON(!se->on_rq);
+ vlag = avg_vruntime(cfs_rq) - se->vruntime;
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+ se->vlag = clamp(vlag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * i.e. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ * \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ * \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
+ * to the loss in precision caused by the division.
+ *
+ * Hence the comparison below is done on the multiplied-out form, with
+ * cfs_rq->curr folded into both sums when it is on_rq.
+ */
+static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+}
+
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return vruntime_eligible(cfs_rq, se->vruntime); /* eligibility is decided on se->vruntime alone */
+}
+
+static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ u64 min_vruntime = cfs_rq->min_vruntime;
+ /*
+ * open coded max_vruntime() to allow updating avg_vruntime
+ *
+ * Only a strictly positive signed delta advances the value, and
+ * avg_vruntime_update() is told about that same delta. The new value
+ * is returned rather than stored in cfs_rq->min_vruntime here.
+ */
+ s64 delta = (s64)(vruntime - min_vruntime);
+ if (delta > 0) {
+ avg_vruntime_update(cfs_rq, delta);
+ min_vruntime = vruntime;
+ }
+ return min_vruntime;
+}
+
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *se = __pick_root_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
u64 vruntime = cfs_rq->min_vruntime;
if (curr) {
@@ -553,60 +769,108 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
curr = NULL;
}
- if (leftmost) { /* non-empty tree */
- struct sched_entity *se;
- se = rb_entry(leftmost, struct sched_entity, run_node);
-
+ if (se) {
if (!curr)
- vruntime = se->vruntime;
+ vruntime = se->min_vruntime;
else
- vruntime = min_vruntime(vruntime, se->vruntime);
+ vruntime = min_vruntime(vruntime, se->min_vruntime);
}
/* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+ cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
+}
+
+static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *root = __pick_root_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 min_slice = ~0ULL; /* largest u64; returned as-is when neither source below applies */
+
+ if (curr && curr->on_rq) /* curr contributes its own slice only while on_rq */
+ min_slice = curr->slice;
+
+ if (root) /* the tree contributes through the root's min_slice field */
+ min_slice = min(min_slice, root->min_slice);
+
+ return min_slice;
+}
+
+static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
+{
+ return entity_before(__node_2_se(a), __node_2_se(b)); /* ordering is delegated to entity_before() */
+}
+
+#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) { /* a missing child leaves @se untouched */
+ struct sched_entity *rse = __node_2_se(node);
+ if (vruntime_gt(min_vruntime, se, rse)) /* child subtree holds a smaller min_vruntime */
+ se->min_vruntime = rse->min_vruntime;
+ }
+}
+
+static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) { /* a missing child leaves @se untouched */
+ struct sched_entity *rse = __node_2_se(node);
+ if (rse->min_slice < se->min_slice) /* child subtree holds a smaller min_slice */
+ se->min_slice = rse->min_slice;
+ }
+}
/*
- * Enqueue an entity into the rb-tree:
+ * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
-static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct sched_entity *entry;
- bool leftmost = true;
+ u64 old_min_vruntime = se->min_vruntime;
+ u64 old_min_slice = se->min_slice;
+ struct rb_node *node = &se->run_node;
- /*
- * Find the right place in the rbtree:
- */
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_entity, run_node);
- /*
- * We dont care about collisions. Nodes with
- * the same key stay together.
- */
- if (entity_before(se, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
+ se->min_vruntime = se->vruntime;
+ __min_vruntime_update(se, node->rb_right);
+ __min_vruntime_update(se, node->rb_left);
- rb_link_node(&se->run_node, parent, link);
- rb_insert_color_cached(&se->run_node,
- &cfs_rq->tasks_timeline, leftmost);
+ se->min_slice = se->slice;
+ __min_slice_update(se, node->rb_right);
+ __min_slice_update(se, node->rb_left);
+
+ return se->min_vruntime == old_min_vruntime &&
+ se->min_slice == old_min_slice;
+}
+
+RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
+ run_node, min_vruntime, min_vruntime_update);
+
+/*
+ * Enqueue an entity into the rb-tree:
+ */
+static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ avg_vruntime_add(cfs_rq, se);
+ se->min_vruntime = se->vruntime;
+ se->min_slice = se->slice;
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ __entity_less, &min_vruntime_cb);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &min_vruntime_cb);
+ avg_vruntime_sub(cfs_rq, se);
+}
+
+struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
+
+ if (!root)
+ return NULL;
+
+ return __node_2_se(root);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
@@ -616,17 +880,91 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
if (!left)
return NULL;
- return rb_entry(left, struct sched_entity, run_node);
+ return __node_2_se(left);
}
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ * 1) the task must be eligible (must be owed service)
+ *
+ * 2) from those tasks that meet 1), we select the one
+ * with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on deadline, but also functions as a
+ * heap based on the vruntime by keeping:
+ *
+ * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
+ *
+ * Which allows tree pruning through eligibility.
+ */
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
- struct rb_node *next = rb_next(&se->run_node);
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ struct sched_entity *best = NULL;
- if (!next)
- return NULL;
+ /*
+ * We can safely skip eligibility check if there is only one entity
+ * in this cfs_rq, saving some cycles.
+ */
+ if (cfs_rq->nr_queued == 1)
+ return curr && curr->on_rq ? curr : se;
+
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+ curr = NULL;
+
+ /*
+ * Once selected, run a task until it either becomes non-eligible or
+ * until it gets a new slice. See the HACK in set_next_entity().
+ */
+ if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ return curr;
+
+ /* Pick the leftmost entity if it's eligible */
+ if (se && entity_eligible(cfs_rq, se)) {
+ best = se;
+ goto found;
+ }
- return rb_entry(next, struct sched_entity, run_node);
+ /* Heap search for the EEVD entity */
+ while (node) {
+ struct rb_node *left = node->rb_left;
+
+ /*
+ * Eligible entities in left subtree are always better
+ * choices, since they have earlier deadlines.
+ */
+ if (left && vruntime_eligible(cfs_rq,
+ __node_2_se(left)->min_vruntime)) {
+ node = left;
+ continue;
+ }
+
+ se = __node_2_se(node);
+
+ /*
+ * The left subtree either is empty or has no eligible
+ * entity, so check the current node since it is the one
+ * with earliest deadline that might be eligible.
+ */
+ if (entity_eligible(cfs_rq, se)) {
+ best = se;
+ break;
+ }
+
+ node = node->rb_right;
+ }
+found:
+ if (!best || (curr && entity_before(curr, best)))
+ best = curr;
+
+ return best;
}
#ifdef CONFIG_SCHED_DEBUG
@@ -637,99 +975,55 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
if (!last)
return NULL;
- return rb_entry(last, struct sched_entity, run_node);
+ return __node_2_se(last);
}
/**************************************************************
* Scheduling class statistics methods:
*/
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+#ifdef CONFIG_SMP
+int sched_update_scaling(void)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
unsigned int factor = get_update_sysctl_factor();
- if (ret || !write)
- return ret;
-
- sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
- sysctl_sched_min_granularity);
-
#define WRT_SYSCTL(name) \
(normalized_sysctl_##name = sysctl_##name / (factor))
- WRT_SYSCTL(sched_min_granularity);
- WRT_SYSCTL(sched_latency);
- WRT_SYSCTL(sched_wakeup_granularity);
+ WRT_SYSCTL(sched_base_slice);
#undef WRT_SYSCTL
return 0;
}
#endif
+#endif
-/*
- * delta /= w
- */
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-{
- if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-
- return delta;
-}
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (sched_nr_latency) we have to stretch
- * this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
+ * this is probably good enough.
*/
-static u64 __sched_period(unsigned long nr_running)
+static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (unlikely(nr_running > sched_nr_latency))
- return nr_running * sysctl_sched_min_granularity;
- else
- return sysctl_sched_latency;
-}
-
-/*
- * We calculate the wall-time slice from the period by taking a part
- * proportional to the weight.
- *
- * s = p*P[w/rw]
- */
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
-
- for_each_sched_entity(se) {
- struct load_weight *load;
- struct load_weight lw;
-
- cfs_rq = cfs_rq_of(se);
- load = &cfs_rq->load;
+ if ((s64)(se->vruntime - se->deadline) < 0)
+ return false;
- if (unlikely(!se->on_rq)) {
- lw = cfs_rq->load;
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * sysctl_sched_base_slice.
+ */
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
- update_load_add(&lw, se->load.weight);
- load = &lw;
- }
- slice = __calc_delta(slice, se->load.weight, load);
- }
- return slice;
-}
+ /*
+ * EEVDF: vd_i = ve_i + r_i / w_i
+ */
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
-/*
- * We calculate the vruntime slice of a to-be-inserted task.
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
+ /*
+ * The task has consumed its request, reschedule.
+ */
+ return true;
}
#include "pelt.h"
@@ -755,16 +1049,15 @@ void init_entity_runnable_average(struct sched_entity *se)
if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
- /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
+ /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
-static void attach_entity_cfs_rq(struct sched_entity *se);
-
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
*
- * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1)
+ * * se_weight(se)
*
* However, in many cases, the above util_avg does not give a desired
* value. Moreover, the sum of the util_avgs may be divergent, such
@@ -794,20 +1087,6 @@ void post_init_entity_util_avg(struct task_struct *p)
long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
- if (cap > 0) {
- if (cfs_rq->avg.util_avg != 0) {
- sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
- sa->util_avg /= (cfs_rq->avg.load_avg + 1);
-
- if (sa->util_avg > cap)
- sa->util_avg = cap;
- } else {
- sa->util_avg = cap;
- }
- }
-
- sa->runnable_avg = sa->util_avg;
-
if (p->sched_class != &fair_sched_class) {
/*
* For !fair tasks do:
@@ -823,7 +1102,19 @@ void post_init_entity_util_avg(struct task_struct *p)
return;
}
- attach_entity_cfs_rq(se);
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se_weight(se);
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ }
+
+ sa->runnable_avg = sa->util_avg;
}
#else /* !CONFIG_SMP */
@@ -833,177 +1124,209 @@ void init_entity_runnable_average(struct sched_entity *se)
void post_init_entity_util_avg(struct task_struct *p)
{
}
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_SMP */
+static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
+{
+ u64 now = rq_clock_task(rq);
+ s64 delta_exec;
+
+ delta_exec = now - curr->exec_start;
+ if (unlikely(delta_exec <= 0))
+ return delta_exec;
+
+ curr->exec_start = now;
+ curr->sum_exec_runtime += delta_exec;
+
+ if (schedstat_enabled()) {
+ struct sched_statistics *stats;
+
+ stats = __schedstats_from_se(curr);
+ __schedstat_set(stats->exec_max,
+ max(delta_exec, stats->exec_max));
+ }
+
+ return delta_exec;
+}
+
+static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
+{
+ trace_sched_stat_runtime(p, delta_exec);
+ account_group_exec_runtime(p, delta_exec);
+ cgroup_account_cputime(p, delta_exec);
+}
+
+static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (curr->vlag == curr->deadline)
+ return false;
+
+ return !entity_eligible(cfs_rq, curr);
+}
+
+static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (pse->slice >= se->slice)
+ return false;
+
+ if (!entity_eligible(cfs_rq, pse))
+ return false;
+
+ if (entity_before(pse, se))
+ return true;
+
+ if (!entity_eligible(cfs_rq, se))
+ return true;
+
+ return false;
+}
+
+/*
+ * Used by other classes to account runtime.
+ */
+s64 update_curr_common(struct rq *rq)
+{
+ struct task_struct *donor = rq->donor;
+ s64 delta_exec;
+
+ delta_exec = update_curr_se(rq, &donor->se);
+ if (likely(delta_exec > 0))
+ update_curr_task(donor, delta_exec);
+
+ return delta_exec;
+}
+
/*
* Update the current task's runtime statistics.
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_clock_task(rq_of(cfs_rq));
- u64 delta_exec;
+ struct rq *rq = rq_of(cfs_rq);
+ s64 delta_exec;
+ bool resched;
if (unlikely(!curr))
return;
- delta_exec = now - curr->exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ delta_exec = update_curr_se(rq, curr);
+ if (unlikely(delta_exec <= 0))
return;
- curr->exec_start = now;
-
- schedstat_set(curr->statistics.exec_max,
- max(delta_exec, curr->statistics.exec_max));
-
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq->exec_clock, delta_exec);
-
curr->vruntime += calc_delta_fair(delta_exec, curr);
+ resched = update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
- struct task_struct *curtask = task_of(curr);
+ struct task_struct *p = task_of(curr);
+
+ update_curr_task(p, delta_exec);
- trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
- cgroup_account_cputime(curtask, delta_exec);
- account_group_exec_runtime(curtask, delta_exec);
+ /*
+ * If the fair_server is active, we need to account for the
+ * fair_server time whether or not the task is running on
+ * behalf of fair_server:
+ * - If the task is running on behalf of fair_server, we need
+ * to limit its time based on the assigned runtime.
+ * - A fair task that runs outside of fair_server should still
+ * be charged against fair_server, so that the server can account
+ * for this time and possibly avoid running this period.
+ */
+ if (dl_server_active(&rq->fair_server))
+ dl_server_update(&rq->fair_server, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
+
+ if (cfs_rq->nr_queued == 1)
+ return;
+
+ if (resched || did_preempt_short(cfs_rq, curr)) {
+ resched_curr_lazy(rq);
+ clear_buddies(cfs_rq, curr);
+ }
}
static void update_curr_fair(struct rq *rq)
{
- update_curr(cfs_rq_of(&rq->curr->se));
+ update_curr(cfs_rq_of(&rq->donor->se));
}
static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 wait_start, prev_wait_start;
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
if (!schedstat_enabled())
return;
- wait_start = rq_clock(rq_of(cfs_rq));
- prev_wait_start = schedstat_val(se->statistics.wait_start);
+ stats = __schedstats_from_se(se);
- if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
- likely(wait_start > prev_wait_start))
- wait_start -= prev_wait_start;
+ if (entity_is_task(se))
+ p = task_of(se);
- __schedstat_set(se->statistics.wait_start, wait_start);
+ __update_stats_wait_start(rq_of(cfs_rq), p, stats);
}
static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct task_struct *p;
- u64 delta;
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
if (!schedstat_enabled())
return;
- delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
+ stats = __schedstats_from_se(se);
- if (entity_is_task(se)) {
+ /*
+ * When sched_schedstat changes from 0 to 1, some sched entities
+ * may already be on the runqueue, so their stats->wait_start
+ * will still be 0. That would make the delta wrong, so we need
+ * to avoid this scenario.
+ */
+ if (unlikely(!schedstat_val(stats->wait_start)))
+ return;
+
+ if (entity_is_task(se))
p = task_of(se);
- if (task_on_rq_migrating(p)) {
- /*
- * Preserve migrating task's wait time so wait_start
- * time stamp can be adjusted to accumulate wait time
- * prior to migration.
- */
- __schedstat_set(se->statistics.wait_start, delta);
- return;
- }
- trace_sched_stat_wait(p, delta);
- }
- __schedstat_set(se->statistics.wait_max,
- max(schedstat_val(se->statistics.wait_max), delta));
- __schedstat_inc(se->statistics.wait_count);
- __schedstat_add(se->statistics.wait_sum, delta);
- __schedstat_set(se->statistics.wait_start, 0);
+ __update_stats_wait_end(rq_of(cfs_rq), p, stats);
}
static inline void
-update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ struct sched_statistics *stats;
struct task_struct *tsk = NULL;
- u64 sleep_start, block_start;
if (!schedstat_enabled())
return;
- sleep_start = schedstat_val(se->statistics.sleep_start);
- block_start = schedstat_val(se->statistics.block_start);
+ stats = __schedstats_from_se(se);
if (entity_is_task(se))
tsk = task_of(se);
- if (sleep_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
- __schedstat_set(se->statistics.sleep_max, delta);
-
- __schedstat_set(se->statistics.sleep_start, 0);
- __schedstat_add(se->statistics.sum_sleep_runtime, delta);
-
- if (tsk) {
- account_scheduler_latency(tsk, delta >> 10, 1);
- trace_sched_stat_sleep(tsk, delta);
- }
- }
- if (block_start) {
- u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
-
- if ((s64)delta < 0)
- delta = 0;
-
- if (unlikely(delta > schedstat_val(se->statistics.block_max)))
- __schedstat_set(se->statistics.block_max, delta);
-
- __schedstat_set(se->statistics.block_start, 0);
- __schedstat_add(se->statistics.sum_sleep_runtime, delta);
-
- if (tsk) {
- if (tsk->in_iowait) {
- __schedstat_add(se->statistics.iowait_sum, delta);
- __schedstat_inc(se->statistics.iowait_count);
- trace_sched_stat_iowait(tsk, delta);
- }
-
- trace_sched_stat_blocked(tsk, delta);
-
- /*
- * Blocking time is in units of nanosecs, so shift by
- * 20 to get a milliseconds-range estimation of the
- * amount of time that the task spent sleeping:
- */
- if (unlikely(prof_on == SLEEP_PROFILING)) {
- profile_hits(SLEEP_PROFILING,
- (void *)get_wchan(tsk),
- delta >> 20);
- }
- account_scheduler_latency(tsk, delta >> 10, 0);
- }
- }
+ __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
}
/*
* Task is being enqueued - update stats:
*/
static inline void
-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled())
return;
@@ -1013,14 +1336,14 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* a dequeue/enqueue event is a NOP)
*/
if (se != cfs_rq->curr)
- update_stats_wait_start(cfs_rq, se);
+ update_stats_wait_start_fair(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP)
- update_stats_enqueue_sleeper(cfs_rq, se);
+ update_stats_enqueue_sleeper_fair(cfs_rq, se);
}
static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
if (!schedstat_enabled())
@@ -1031,16 +1354,19 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* waiting task:
*/
if (se != cfs_rq->curr)
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end_fair(cfs_rq, se);
if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
+ unsigned int state;
- if (tsk->state & TASK_INTERRUPTIBLE)
- __schedstat_set(se->statistics.sleep_start,
+ /* XXX racy against TTWU */
+ state = READ_ONCE(tsk->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(tsk->stats.sleep_start,
rq_clock(rq_of(cfs_rq)));
- if (tsk->state & TASK_UNINTERRUPTIBLE)
- __schedstat_set(se->statistics.block_start,
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(tsk->stats.block_start,
rq_clock(rq_of(cfs_rq)));
}
}
@@ -1061,6 +1387,50 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ int sibling;
+
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
+
+ if (!idle_cpu(sibling))
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+#ifdef CONFIG_NUMA
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long
+adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
+{
+ /*
+ * Allow a NUMA imbalance if busy CPUs is less than the maximum
+ * threshold. Above this threshold, individual tasks may be contending
+ * for both memory bandwidth and any shared HT resources. This is an
+ * approximation as the number of running tasks may not be related to
+ * the number of busy CPUs due to sched_setaffinity.
+ */
+ if (dst_running > imb_numa_nr)
+ return imbalance;
+
+ /*
+ * Allow a small imbalance based on a simple pair of communicating
+ * tasks that remain local when the destination is lightly loaded.
+ */
+ if (imbalance <= NUMA_IMBALANCE_MIN)
+ return 0;
+
+ return imbalance;
+}
+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_NUMA_BALANCING
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
@@ -1076,6 +1446,9 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/* A page whose hint page fault latency is below this threshold (in ms) is considered hot */
+unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+
struct numa_group {
refcount_t refcount;
@@ -1088,11 +1461,12 @@ struct numa_group {
unsigned long total_faults;
unsigned long max_faults_cpu;
/*
+ * faults[] array is split into two regions: faults_mem and faults_cpu.
+ *
* Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted
* more by CPU use than by memory faults.
*/
- unsigned long *faults_cpu;
unsigned long faults[];
};
@@ -1103,7 +1477,7 @@ struct numa_group {
static struct numa_group *deref_task_numa_group(struct task_struct *p)
{
return rcu_dereference_check(p->numa_group, p == current ||
- (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+ (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
}
static struct numa_group *deref_curr_numa_group(struct task_struct *p)
@@ -1133,7 +1507,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
return rss / nr_scan_pages;
}
-/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560
static unsigned int task_scan_min(struct task_struct *p)
@@ -1266,8 +1640,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
- return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
- group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+ return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
+ group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
}
static inline unsigned long group_faults_priv(struct numa_group *ng)
@@ -1308,10 +1682,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng)
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
- int maxdist, bool task)
+ int lim_dist, bool task)
{
unsigned long score = 0;
- int node;
+ int node, max_dist;
/*
* All nodes are directly connected, and the same distance
@@ -1320,9 +1694,11 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
if (sched_numa_topology_type == NUMA_DIRECT)
return 0;
+ /* sched_max_numa_distance may be changed in parallel. */
+ max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
- * which should be ok given the number of nodes rarely exceeds 8.
+ * which should be OK given the number of nodes rarely exceeds 8.
*/
for_each_online_node(node) {
unsigned long faults;
@@ -1332,7 +1708,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* The furthest away nodes in the system are not interesting
* for placement; nid was already counted.
*/
- if (dist == sched_max_numa_distance || node == nid)
+ if (dist >= max_dist || node == nid)
continue;
/*
@@ -1342,8 +1718,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* "hoplimit", only nodes closer by than "hoplimit" are part
* of each group. Skip other nodes.
*/
- if (sched_numa_topology_type == NUMA_BACKPLANE &&
- dist >= maxdist)
+ if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
continue;
/* Add up the faults from nearby nodes. */
@@ -1361,8 +1736,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* This seems to result in good task placement.
*/
if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
- faults *= (sched_max_numa_distance - dist);
- faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ faults *= (max_dist - dist);
+ faults /= (max_dist - LOCAL_DISTANCE);
}
score += faults;
@@ -1416,15 +1791,169 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
-bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+/*
+ * If memory tiering mode is enabled, the cpupid of a slow memory page
+ * is used to record the scan time instead of the CPU and PID. When
+ * tiering mode is disabled at run time, the scan time (in cpupid) will
+ * be interpreted as CPU and PID. So the CPU needs to be checked to
+ * avoid an out-of-bounds array access.
+ */
+static inline bool cpupid_valid(int cpupid)
+{
+ return cpupid_to_cpu(cpupid) < nr_cpu_ids;
+}
+
+/*
+ * For memory tiering mode, if there are enough free pages in the fast
+ * memory node (more than the "enough" watermark computed here), all
+ * recently accessed slow memory pages will be migrated to the fast
+ * memory node without considering the hot threshold, to take full
+ * advantage of the fast memory capacity.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+ int z;
+ unsigned long enough_wmark;
+
+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
+ pgdat->node_present_pages >> 4);
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_watermark_ok(zone, 0,
+ promo_wmark_pages(zone) + enough_wmark,
+ ZONE_MOVABLE, 0))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * For memory tiering mode, when page tables are scanned, the scan
+ * time is recorded in struct page in addition to making the page
+ * PROT_NONE, for slow memory pages. So when the page is accessed, the
+ * hint page fault handler calculates the hint page fault latency
+ * as:
+ *
+ * hint page fault latency = hint page fault time - scan time
+ *
+ * The smaller the hint page fault latency, the higher the possibility
+ * for the page to be hot.
+ */
+static int numa_hint_fault_latency(struct folio *folio)
+{
+ int last_time, time;
+
+ time = jiffies_to_msecs(jiffies);
+ last_time = folio_xchg_access_time(folio, time);
+
+ return (time - last_time) & PAGE_ACCESS_TIME_MASK;
+}
+
+/*
+ * For memory tiering mode, too high a promotion/demotion throughput
+ * may hurt application latency. So we provide a mechanism to rate
+ * limit the number of pages that we attempt to promote.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long nr_cand;
+ unsigned int now, start;
+
+ now = jiffies_to_msecs(jiffies);
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ start = pgdat->nbp_rl_start;
+ if (now - start > MSEC_PER_SEC &&
+ cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+ pgdat->nbp_rl_nr_cand = nr_cand;
+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+ return true;
+ return false;
+}
+
+#define NUMA_MIGRATION_ADJUST_STEPS 16
+
+static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
+ unsigned long rate_limit,
+ unsigned int ref_th)
+{
+ unsigned int now, start, th_period, unit_th, th;
+ unsigned long nr_cand, ref_cand, diff_cand;
+
+ now = jiffies_to_msecs(jiffies);
+ th_period = sysctl_numa_balancing_scan_period_max;
+ start = pgdat->nbp_th_start;
+ if (now - start > th_period &&
+ cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+ ref_cand = rate_limit *
+ sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+ unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
+ th = pgdat->nbp_threshold ? : ref_th;
+ if (diff_cand > ref_cand * 11 / 10)
+ th = max(th - unit_th, unit_th);
+ else if (diff_cand < ref_cand * 9 / 10)
+ th = min(th + unit_th, ref_th * 2);
+ pgdat->nbp_th_nr_cand = nr_cand;
+ pgdat->nbp_threshold = th;
+ }
+}
+
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
int src_nid, int dst_cpu)
{
struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ /*
+ * Cannot migrate to memoryless nodes.
+ */
+ if (!node_state(dst_nid, N_MEMORY))
+ return false;
+
+ /*
+ * The pages in slow memory node should be migrated according
+ * to hot/cold instead of private/shared.
+ */
+ if (folio_use_access_time(folio)) {
+ struct pglist_data *pgdat;
+ unsigned long rate_limit;
+ unsigned int latency, th, def_th;
+
+ pgdat = NODE_DATA(dst_nid);
+ if (pgdat_free_space_enough(pgdat)) {
+ /* workload changed, reset hot threshold */
+ pgdat->nbp_threshold = 0;
+ return true;
+ }
+
+ def_th = sysctl_numa_balancing_hot_threshold;
+ rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+ (20 - PAGE_SHIFT);
+ numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
+
+ th = pgdat->nbp_threshold ? : def_th;
+ latency = numa_hint_fault_latency(folio);
+ if (latency >= th)
+ return false;
+
+ return !numa_promotion_rate_limit(pgdat, rate_limit,
+ folio_nr_pages(folio));
+ }
+
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
+
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
+ return false;
/*
* Allow first faults or private faults to migrate immediately early in
@@ -1506,6 +2035,7 @@ enum numa_type {
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
+ unsigned long runnable;
unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
@@ -1515,28 +2045,12 @@ struct numa_stats {
int idle_cpu;
};
-static inline bool is_core_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- int sibling;
-
- for_each_cpu(sibling, cpu_smt_mask(cpu)) {
- if (cpu == sibling)
- continue;
-
- if (!idle_cpu(cpu))
- return false;
- }
-#endif
-
- return true;
-}
-
struct task_numa_env {
struct task_struct *p;
int src_cpu, src_nid;
int dst_cpu, dst_nid;
+ int imb_numa_nr;
struct numa_stats src_stats, dst_stats;
@@ -1549,19 +2063,20 @@ struct task_numa_env {
};
static unsigned long cpu_load(struct rq *rq);
-static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
+static unsigned long cpu_runnable(struct rq *rq);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
struct numa_stats *ns)
{
if ((ns->nr_running > ns->weight) &&
- ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
return node_overloaded;
if ((ns->nr_running < ns->weight) ||
- ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
return node_has_spare;
return node_fully_busy;
@@ -1569,11 +2084,11 @@ numa_type numa_classify(unsigned int imbalance_pct,
#ifdef CONFIG_SCHED_SMT
/* Forward declarations of select_idle_sibling helpers */
-static inline bool test_idle_cores(int cpu, bool def);
+static inline bool test_idle_cores(int cpu);
static inline int numa_idle_core(int idle_core, int cpu)
{
if (!static_branch_likely(&sched_smt_present) ||
- idle_core >= 0 || !test_idle_cores(cpu, false))
+ idle_core >= 0 || !test_idle_cores(cpu))
return idle_core;
/*
@@ -1612,11 +2127,12 @@ static void update_numa_stats(struct task_numa_env *env,
struct rq *rq = cpu_rq(cpu);
ns->load += cpu_load(rq);
- ns->util += cpu_util(cpu);
- ns->nr_running += rq->cfs.h_nr_running;
+ ns->runnable += cpu_runnable(rq);
+ ns->util += cpu_util_cfs(cpu);
+ ns->nr_running += rq->cfs.h_nr_runnable;
ns->compute_capacity += capacity_of(cpu);
- if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+ if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
if (READ_ONCE(rq->numa_migrate_on) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
@@ -1648,7 +2164,7 @@ static void task_numa_assign(struct task_numa_env *env,
int start = env->dst_cpu;
/* Find alternative idle CPU. */
- for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
if (cpu == env->best_cpu || !idle_cpu(cpu) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
continue;
@@ -1788,6 +2304,15 @@ static bool task_numa_compare(struct task_numa_env *env,
*/
cur_ng = rcu_dereference(cur->numa_group);
if (cur_ng == p_ng) {
+ /*
+ * Do not swap within a group or between tasks that have
+ * no group if there is spare capacity. Swapping does
+ * not address the load imbalance and helps one task at
+ * the cost of punishing another.
+ */
+ if (env->dst_stats.node_type == node_has_spare)
+ goto unlock;
+
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
/*
@@ -1927,7 +2452,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
- imbalance = adjust_numa_imbalance(imbalance, src_running);
+ imbalance = adjust_numa_imbalance(imbalance, dst_running,
+ env->imb_numa_nr);
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -1992,8 +2518,10 @@ static int task_numa_migrate(struct task_struct *p)
*/
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
- if (sd)
+ if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ env.imb_numa_nr = sd->imb_numa_nr;
+ }
rcu_read_unlock();
/*
@@ -2028,7 +2556,7 @@ static int task_numa_migrate(struct task_struct *p)
*/
ng = deref_curr_numa_group(p);
if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -2116,7 +2644,7 @@ static void numa_migrate_preferred(struct task_struct *p)
}
/*
- * Find out how many nodes on the workload is actively running on. Do this by
+ * Find out how many nodes the workload is actively running on. Do this by
* tracking the nodes from which NUMA hinting faults are triggered. This can
* be different from the set of nodes where the workload's memory is currently
* located.
@@ -2126,13 +2654,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group)
unsigned long faults, max_faults = 0;
int nid, active_nodes = 0;
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults > max_faults)
max_faults = faults;
}
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults * ACTIVE_NODE_FRACTION > max_faults)
active_nodes++;
@@ -2170,7 +2698,7 @@ static void update_task_scan_period(struct task_struct *p,
/*
* If there were no record hinting faults then either the task is
- * completely idle or all activity is areas that are not of interest
+ * completely idle or all activity is in areas that are not of interest
* to automatic numa balancing. Related to that, if there were failed
* migration then it implies we are migrating too quickly or the local
* node is overloaded. In either case, scan slower
@@ -2286,7 +2814,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
dist = sched_max_numa_distance;
- for_each_online_node(node) {
+ for_each_node_state(node, N_CPU) {
score = group_weight(p, node, dist);
if (score > max_score) {
max_score = score;
@@ -2305,7 +2833,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
* inside the highest scoring group of nodes. The nodemask tricks
* keep the complexity of the search down.
*/
- nodes = node_online_map;
+ nodes = node_states[N_CPU];
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
nodemask_t max_group = NODE_MASK_NONE;
@@ -2427,7 +2955,7 @@ static void task_numa_placement(struct task_struct *p)
* is at the beginning of the numa_faults array.
*/
ng->faults[mem_idx] += diff;
- ng->faults_cpu[mem_idx] += f_diff;
+ ng->faults[cpu_idx] += f_diff;
ng->total_faults += diff;
group_faults += ng->faults[mem_idx];
}
@@ -2444,6 +2972,9 @@ static void task_numa_placement(struct task_struct *p)
}
}
+ /* Cannot migrate task to CPU-less node */
+ max_nid = numa_nearest_node(max_nid, N_CPU);
+
if (ng) {
numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
@@ -2481,7 +3012,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (unlikely(!deref_curr_numa_group(p))) {
unsigned int size = sizeof(struct numa_group) +
- 4*nr_node_ids*sizeof(unsigned long);
+ NR_NUMA_HINT_FAULT_STATS *
+ nr_node_ids * sizeof(unsigned long);
grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!grp)
@@ -2492,9 +3024,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
grp->gid = p->pid;
- /* Second half of the array tracks nids where faults happen */
- grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
- nr_node_ids;
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults[i];
@@ -2551,7 +3080,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!join)
return;
- BUG_ON(irqs_disabled());
+ WARN_ON_ONCE(irqs_disabled());
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
@@ -2578,7 +3107,7 @@ no_join:
}
/*
- * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * Get rid of NUMA statistics associated with a task (either current or dead).
* If @final is set, the task is dead and has reached refcount zero, so we can
* safely free all relevant data structures. Otherwise, there might be
* concurrent reads from places like load balancing and procfs, and we should
@@ -2636,6 +3165,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (!p->mm)
return;
+ /*
+ * NUMA faults statistics are unnecessary for the slow memory
+ * node for memory tiering mode.
+ */
+ if (!node_is_toptier(mem_node) &&
+ (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
+ !cpupid_valid(last_cpupid)))
+ return;
+
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
@@ -2706,6 +3244,45 @@ static void reset_ptenuma_scan(struct task_struct *p)
p->mm->numa_scan_offset = 0;
}
+static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned long pids;
+ /*
+ * Allow unconditional access for the first two scans, so that all the
+ * pages of VMAs get prot_none faults introduced irrespective of accesses.
+ * This is also done to avoid any side effect of task scanning
+ * amplifying the unfairness of disjoint set of VMAs' access.
+ */
+ if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
+ return true;
+
+ pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
+ if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+ return true;
+
+ /*
+ * Complete a scan that has already started regardless of PID access, or
+ * some VMAs may never be scanned in multi-threaded applications:
+ */
+ if (mm->numa_scan_offset > vma->vm_start) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
+ return true;
+ }
+
+ /*
+ * This vma has not been accessed for a while, and if the number
+ * of threads in the same process is low, which means no other
+ * threads can help scan this vma, force a vma scan.
+ */
+ if (READ_ONCE(mm->numa_scan_seq) >
+ (vma->numab_state->prev_scan_seq + get_nr_threads(current)))
+ return true;
+
+ return false;
+}
+
+#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
+
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
@@ -2720,6 +3297,9 @@ static void task_numa_work(struct callback_head *work)
unsigned long start, end;
unsigned long nr_pte_updates = 0;
long pages, virtpages;
+ struct vma_iterator vmi;
+ bool vma_pids_skipped;
+ bool vma_pids_forced = false;
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
@@ -2753,7 +3333,7 @@ static void task_numa_work(struct callback_head *work)
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
- if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
return;
/*
@@ -2762,7 +3342,6 @@ static void task_numa_work(struct callback_head *work)
*/
p->node_stamp += 2 * TICK_NSEC;
- start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
virtpages = pages * 8; /* Scan up to this much virtual space */
@@ -2772,34 +3351,118 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
- vma = find_vma(mm, start);
+
+ /*
+ * VMAs are skipped if the current PID has not trapped a fault within
+ * the VMA recently. Allow scanning to be forced if there is no
+ * suitable VMA remaining.
+ */
+ vma_pids_skipped = false;
+
+retry_pids:
+ start = mm->numa_scan_offset;
+ vma_iter_init(&vmi, mm, start);
+ vma = vma_next(&vmi);
if (!vma) {
reset_ptenuma_scan(p);
start = 0;
- vma = mm->mmap;
+ vma_iter_set(&vmi, start);
+ vma = vma_next(&vmi);
}
- for (; vma; vma = vma->vm_next) {
+
+ for (; vma; vma = vma_next(&vmi)) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
continue;
}
/*
* Shared library pages mapped by multiple processes are not
* migrated as it is expected they are cache replicated. Avoid
- * hinting faults in read-only file-backed mappings or the vdso
+ * hinting faults in read-only file-backed mappings or the vDSO
* as migrating the pages will be of marginal benefit.
*/
if (!vma->vm_mm ||
- (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
continue;
+ }
/*
* Skip inaccessible VMAs to avoid any confusion between
- * PROT_NONE and NUMA hinting ptes
+ * PROT_NONE and NUMA hinting PTEs
*/
- if (!vma_is_accessible(vma))
+ if (!vma_is_accessible(vma)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
continue;
+ }
+
+ /* Initialise new per-VMA NUMAB state. */
+ if (!vma->numab_state) {
+ struct vma_numab_state *ptr;
+
+ ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ continue;
+
+ if (cmpxchg(&vma->numab_state, NULL, ptr)) {
+ kfree(ptr);
+ continue;
+ }
+
+ vma->numab_state->start_scan_seq = mm->numa_scan_seq;
+
+ vma->numab_state->next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+
+ /* Reset happens after 4 times scan delay of scan start */
+ vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+
+ /*
+ * Ensure prev_scan_seq does not match numa_scan_seq,
+ * to prevent VMAs being skipped prematurely on the
+ * first scan:
+ */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
+ }
+
+ /*
+ * Scanning the VMAs of short-lived tasks adds more overhead. So
+ * delay the scan for new VMAs.
+ */
+ if (mm->numa_scan_seq && time_before(jiffies,
+ vma->numab_state->next_scan)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
+ continue;
+ }
+
+ /* RESET access PIDs regularly for old VMAs. */
+ if (mm->numa_scan_seq &&
+ time_after(jiffies, vma->numab_state->pids_active_reset)) {
+ vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+ vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
+ vma->numab_state->pids_active[1] = 0;
+ }
+
+ /* Do not rescan VMAs twice within the same sequence. */
+ if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
+ mm->numa_scan_offset = vma->vm_end;
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
+ continue;
+ }
+
+ /*
+ * Do not scan the VMA if task has not accessed it, unless no other
+ * VMA candidate exists.
+ */
+ if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
+ vma_pids_skipped = true;
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
+ continue;
+ }
do {
start = max(start, vma->vm_start);
@@ -2810,7 +3473,7 @@ static void task_numa_work(struct callback_head *work)
/*
* Try to scan sysctl_numa_balancing_size worth of
* hpages that have at least one present PTE that
- * is not already pte-numa. If the VMA contains
+ * is not already PTE-numa. If the VMA contains
* areas that are unused or already full of prot_numa
* PTEs, scan up to virtpages, to skip through those
* areas faster.
@@ -2825,6 +3488,26 @@ static void task_numa_work(struct callback_head *work)
cond_resched();
} while (end != vma->vm_end);
+
+ /* VMA scan is complete, do not scan until next sequence. */
+ vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
+
+ /*
+ * Only force scan within one VMA at a time, to limit the
+ * cost of scanning a potentially uninteresting VMA.
+ */
+ if (vma_pids_forced)
+ break;
+ }
+
+ /*
+ * If no VMAs are remaining and VMAs were skipped due to the PID
+ * not accessing the VMA previously, then force a scan to ensure
+ * forward progress:
+ */
+ if (!vma && !vma_pids_forced && vma_pids_skipped) {
+ vma_pids_forced = true;
+ goto retry_pids;
}
out:
@@ -2867,9 +3550,12 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
p->node_stamp = 0;
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_migrate_retry = 0;
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
+ p->numa_pages_migrated = 0;
+ p->total_numa_faults = 0;
RCU_INIT_POINTER(p->numa_group, NULL);
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
@@ -2907,7 +3593,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
/*
* We don't care about NUMA placement if we don't have memory.
*/
- if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
+ if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
/*
@@ -2925,7 +3611,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan))
- task_work_add(curr, work, true);
+ task_work_add(curr, work, TWA_RESUME);
}
}
@@ -2994,7 +3680,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_add(&se->group_node, &rq->cfs_tasks);
}
#endif
- cfs_rq->nr_running++;
+ cfs_rq->nr_queued++;
}
static void
@@ -3007,7 +3693,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_del_init(&se->group_node);
}
#endif
- cfs_rq->nr_running--;
+ cfs_rq->nr_queued--;
}
/*
@@ -3071,6 +3757,9 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
#else
static inline void
@@ -3079,44 +3768,74 @@ static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
+ bool curr = cfs_rq->curr == se;
+
if (se->on_rq) {
/* commit outstanding execution time */
- if (cfs_rq->curr == se)
- update_curr(cfs_rq);
- account_entity_dequeue(cfs_rq, se);
+ update_curr(cfs_rq);
+ update_entity_lag(cfs_rq, se);
+ se->deadline -= se->vruntime;
+ se->rel_deadline = 1;
+ if (!curr)
+ __dequeue_entity(cfs_rq, se);
+ update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
+ /*
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+ * we need to scale se->vlag when w_i changes.
+ */
+ se->vlag = div_s64(se->vlag * se->load.weight, weight);
+ if (se->rel_deadline)
+ se->deadline = div_s64(se->deadline * se->load.weight, weight);
+
update_load_set(&se->load, weight);
#ifdef CONFIG_SMP
do {
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+ u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
#endif
enqueue_load_avg(cfs_rq, se);
- if (se->on_rq)
- account_entity_enqueue(cfs_rq, se);
+ if (se->on_rq) {
+ update_load_add(&cfs_rq->load, se->load.weight);
+ place_entity(cfs_rq, se, 0);
+ if (!curr)
+ __enqueue_entity(cfs_rq, se);
+ /*
+ * The entity's vruntime has been adjusted, so let's check
+ * whether the rq-wide min_vruntime needs to be updated too. Since
+ * the calculations above require a stable min_vruntime rather
+ * than an up-to-date one, we do the update at the end of the
+ * reweight process.
+ */
+ update_min_vruntime(cfs_rq);
+ }
}
-void reweight_task(struct task_struct *p, int prio)
+static void reweight_task_fair(struct rq *rq, struct task_struct *p,
+ const struct load_weight *lw)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct load_weight *load = &se->load;
- unsigned long weight = scale_load(sched_prio_to_weight[prio]);
- reweight_entity(cfs_rq, se, weight);
- load->inv_weight = sched_prio_to_wmult[prio];
+ reweight_entity(cfs_rq, se, lw->weight);
+ load->inv_weight = lw->inv_weight;
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
/*
@@ -3128,7 +3847,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (1)
- * \Sum grq->load.weight
+ * \Sum grq->load.weight
*
* Now, because computing that sum is prohibitively expensive to compute (been
* there, done that) we approximate it with this average stuff. The average
@@ -3142,7 +3861,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->avg.load_avg
* ge->load.weight = ------------------------------ (3)
- * tg->load_avg
+ * tg->load_avg
*
* Where: tg->load_avg ~= \Sum grq->avg.load_avg
*
@@ -3158,7 +3877,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- = tg->weight (4)
- * grp->load.weight
+ * grp->load.weight
*
* That is, the sum collapses because all other CPUs are idle; the UP scenario.
*
@@ -3177,7 +3896,7 @@ void reweight_task(struct task_struct *p, int prio)
*
* tg->weight * grq->load.weight
* ge->load.weight = ----------------------------- (6)
- * tg_load_avg'
+ * tg_load_avg'
*
* Where:
*
@@ -3227,8 +3946,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-
/*
* Recomputes the group entity based on the current state of its group
* runqueue.
@@ -3238,7 +3955,11 @@ static void update_cfs_group(struct sched_entity *se)
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long shares;
- if (!gcfs_rq)
+ /*
+ * When a group becomes empty, preserve its weight. This matters for
+ * DELAY_DEQUEUE.
+ */
+ if (!gcfs_rq || !gcfs_rq->load.weight)
return;
if (throttled_hierarchy(gcfs_rq))
@@ -3246,14 +3967,11 @@ static void update_cfs_group(struct sched_entity *se)
#ifndef CONFIG_SMP
shares = READ_ONCE(gcfs_rq->tg->shares);
-
- if (likely(se->load.weight == shares))
- return;
#else
- shares = calc_group_shares(gcfs_rq);
+ shares = calc_group_shares(gcfs_rq);
#endif
-
- reweight_entity(cfs_rq_of(se), se, shares);
+ if (unlikely(se->load.weight != shares))
+ reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -3279,18 +3997,87 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
- * See cpu_util().
+ * See cpu_util_cfs().
*/
cpufreq_update_util(rq, flags);
}
}
#ifdef CONFIG_SMP
+static inline bool load_avg_is_decayed(struct sched_avg *sa)
+{
+ if (sa->load_sum)
+ return false;
+
+ if (sa->util_sum)
+ return false;
+
+ if (sa->runnable_sum)
+ return false;
+
+ /*
+ * _avg must be null when _sum are null because _avg = _sum / divider
+ * Make sure that rounding and/or propagation of PELT values never
+ * break this.
+ */
+ SCHED_WARN_ON(sa->load_avg ||
+ sa->util_avg ||
+ sa->runnable_avg);
+
+ return true;
+}
+
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return u64_u32_load_copy(cfs_rq->avg.last_update_time,
+ cfs_rq->last_update_time_copy);
+}
#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
+ * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
+ * bottom-up, we only have to test whether the cfs_rq before us on the list
+ * is our child.
+ * If cfs_rq is not on the list, test whether a child needs it to be added to
+ * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
+ */
+static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
+{
+ struct cfs_rq *prev_cfs_rq;
+ struct list_head *prev;
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (cfs_rq->on_list) {
+ prev = cfs_rq->leaf_cfs_rq_list.prev;
+ } else {
+ prev = rq->tmp_alone_branch;
+ }
+
+ if (prev == &rq->leaf_cfs_rq_list)
+ return false;
+
+ prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
+
+ return (prev_cfs_rq->tg->parent == cfs_rq->tg);
+}
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load.weight)
+ return false;
+
+ if (!load_avg_is_decayed(&cfs_rq->avg))
+ return false;
+
+ if (child_cfs_rq_on_list(cfs_rq))
+ return false;
+
+ return true;
+}
+
/**
* update_tg_load_avg - update the tg's load avg
* @cfs_rq: the cfs_rq whose avg changed
- * @force: update regardless of how small the difference
*
* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
* However, because tg->load_avg is a global value there are performance
@@ -3302,9 +4089,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
*
* Updating tg's load_avg is necessary before update_cfs_share().
*/
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
- long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ long delta;
+ u64 now;
/*
* No need to update load_avg for root_task_group as it is not used.
@@ -3312,12 +4100,69 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
if (cfs_rq->tg == &root_task_group)
return;
- if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ /* rq has been offline and doesn't contribute to the share anymore: */
+ if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+ return;
+
+ /*
+ * For migration-heavy workloads, access to tg->load_avg can be
+ * unbounded. Limit the update rate to at most once per ms.
+ */
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
+ return;
+
+ delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+ cfs_rq->last_update_tg_load_avg = now;
}
}
+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+ long delta;
+ u64 now;
+
+ /*
+ * No need to update load_avg for root_task_group, as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ delta = 0 - cfs_rq->tg_load_avg_contrib;
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = 0;
+ cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* CPU offline callback: */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+ struct task_group *tg;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The rq clock has already been updated in
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ clear_tg_load_avg(cfs_rq);
+ }
+ rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
+}
+
/*
* Called within set_task_rq() right before setting a task's CPU. The
* caller only guarantees p->pi_lock is held; no other assumptions,
@@ -3342,32 +4187,13 @@ void set_task_rq_fair(struct sched_entity *se,
if (!(se->avg.last_update_time && prev))
return;
-#ifndef CONFIG_64BIT
- {
- u64 p_last_update_time_copy;
- u64 n_last_update_time_copy;
-
- do {
- p_last_update_time_copy = prev->load_last_update_time_copy;
- n_last_update_time_copy = next->load_last_update_time_copy;
-
- smp_rmb();
-
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
+ p_last_update_time = cfs_rq_last_update_time(prev);
+ n_last_update_time = cfs_rq_last_update_time(next);
- } while (p_last_update_time != p_last_update_time_copy ||
- n_last_update_time != n_last_update_time_copy);
- }
-#else
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
-#endif
__update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
-
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
@@ -3435,51 +4261,66 @@ void set_task_rq_fair(struct sched_entity *se,
* XXX: only do this for the part of runnable > running ?
*
*/
-
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ u32 new_sum, divider;
+
+ /* Nothing to update */
+ if (!delta_avg)
+ return;
+
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ divider = get_pelt_divider(&cfs_rq->avg);
- /* Nothing to update */
- if (!delta)
- return;
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * divider;
+ new_sum = se->avg.util_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.util_sum;
+ se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ add_positive(&cfs_rq->avg.util_avg, delta_avg);
+ add_positive(&cfs_rq->avg.util_sum, delta_sum);
+
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ u32 new_sum, divider;
+
+ /* Nothing to update */
+ if (!delta_avg)
+ return;
+
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
-
- /* Nothing to update */
- if (!delta)
- return;
+ divider = get_pelt_divider(&cfs_rq->avg);
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
- se->avg.runnable_sum = se->avg.runnable_avg * divider;
+ new_sum = se->avg.runnable_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
+ se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
}
static inline void
@@ -3500,7 +4341,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ divider = get_pelt_divider(&cfs_rq->avg);
if (runnable_sum >= 0) {
/*
@@ -3515,7 +4356,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* assuming all tasks are equally runnable.
*/
if (scale_load_down(gcfs_rq->load.weight)) {
- load_sum = div_s64(gcfs_rq->avg.load_sum,
+ load_sum = div_u64(gcfs_rq->avg.load_sum,
scale_load_down(gcfs_rq->load.weight));
}
@@ -3532,16 +4373,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
- load_sum = (s64)se_weight(se) * runnable_sum;
- load_avg = div_s64(load_sum, divider);
+ load_sum = se_weight(se) * runnable_sum;
+ load_avg = div_u64(load_sum, divider);
- delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
delta_avg = load_avg - se->avg.load_avg;
+ if (!delta_avg)
+ return;
+
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
add_positive(&cfs_rq->avg.load_avg, delta_avg);
add_positive(&cfs_rq->avg.load_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3610,7 +4457,9 @@ static inline bool skip_blocked_update(struct sched_entity *se)
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
+
+static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
@@ -3621,18 +4470,100 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_NO_HZ_COMMON
+static inline void migrate_se_pelt_lag(struct sched_entity *se)
+{
+ u64 throttled = 0, now, lut;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+ bool is_idle;
+
+ if (load_avg_is_decayed(&se->avg))
+ return;
+
+ cfs_rq = cfs_rq_of(se);
+ rq = rq_of(cfs_rq);
+
+ rcu_read_lock();
+ is_idle = is_idle_task(rcu_dereference(rq->curr));
+ rcu_read_unlock();
+
+ /*
+ * The lag estimation comes with a cost we don't want to pay all the
+ * time. Hence, limiting to the case where the source CPU is idle and
+ * we know we are at the greatest risk to have an outdated clock.
+ */
+ if (!is_idle)
+ return;
+
+ /*
+ * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
+ *
+ * last_update_time (the cfs_rq's last_update_time)
+ * = cfs_rq_clock_pelt()@cfs_rq_idle
+ * = rq_clock_pelt()@cfs_rq_idle
+ * - cfs->throttled_clock_pelt_time@cfs_rq_idle
+ *
+ * cfs_idle_lag (delta between rq's update and cfs_rq's update)
+ * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
+ *
+ * rq_idle_lag (delta between now and rq's update)
+ * = sched_clock_cpu() - rq_clock()@rq_idle
+ *
+ * We can then write:
+ *
+ * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
+ * sched_clock_cpu() - rq_clock()@rq_idle
+ * Where:
+ * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
+ * rq_clock()@rq_idle is rq->clock_idle
+ * cfs->throttled_clock_pelt_time@cfs_rq_idle
+ * is cfs_rq->throttled_pelt_idle
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
+ /* The clock has been stopped for throttling */
+ if (throttled == U64_MAX)
+ return;
+#endif
+ now = u64_u32_load(rq->clock_pelt_idle);
+ /*
+ * Paired with _update_idle_rq_clock_pelt(). It ensures that, in the worst
+ * case, we observe the old clock_pelt_idle value and the new clock_idle,
+ * which leads to an underestimation. The opposite would lead to an
+ * overestimation.
+ */
+ smp_rmb();
+ lut = cfs_rq_last_update_time(cfs_rq);
+
+ now -= throttled;
+ if (now < lut)
+ /*
+ * cfs_rq->avg.last_update_time is more recent than our
+ * estimation, let's use it.
+ */
+ now = lut;
+ else
+ now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
+
+ __update_load_avg_blocked_se(now, se);
+}
+#else
+static void migrate_se_pelt_lag(struct sched_entity *se) {}
+#endif
+
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_pelt()
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
- * avg. The immediate corollary is that all (fair) tasks must be attached, see
- * post_init_entity_util_avg().
+ * avg. The immediate corollary is that all (fair) tasks must be attached.
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
- * Returns true if the load decayed or we removed load.
+ * Return: true if the load decayed or we removed load.
*
* Since both these conditions indicate a changed cfs_rq->avg.load we should
* call update_tg_load_avg() when this function returns true.
@@ -3646,7 +4577,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
if (cfs_rq->removed.nr) {
unsigned long r;
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
@@ -3658,14 +4589,31 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
r = removed_load;
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * divider);
+ /* See sa->util_sum below */
+ sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
r = removed_util;
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * divider);
+ /*
+ * Because of rounding, se->util_sum might end up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum become null whereas
+ * cfs->util_avg is not.
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
sub_positive(&sa->runnable_sum, r * divider);
+ /* See sa->util_sum above */
+ sa->runnable_sum = max_t(u32, sa->runnable_sum,
+ sa->runnable_avg * PELT_MIN_DIVIDER);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -3678,12 +4626,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
}
decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
-
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->load_last_update_time_copy = sa->last_update_time;
-#endif
-
+ u64_u32_store_copy(sa->last_update_time,
+ cfs_rq->last_update_time_copy,
+ sa->last_update_time);
return decayed;
}
@@ -3701,7 +4646,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
/*
* When we attach the @se to the @cfs_rq, we must align the decay
@@ -3723,11 +4668,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
se->avg.runnable_sum = se->avg.runnable_avg * divider;
- se->avg.load_sum = divider;
- if (se_weight(se)) {
- se->avg.load_sum =
- div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
- }
+ se->avg.load_sum = se->avg.load_avg * divider;
+ if (se_weight(se) < se->avg.load_sum)
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
+ else
+ se->avg.load_sum = 1;
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
@@ -3755,8 +4700,15 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -3771,6 +4723,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
#define UPDATE_TG 0x1
#define SKIP_AGE_LOAD 0x2
#define DO_ATTACH 0x4
+#define DO_DETACH 0x8
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -3780,7 +4733,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/*
* Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
+ * track group sched_entity load average for task_h_load calculation in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cfs_rq, se);
@@ -3798,37 +4751,23 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* IOW we're enqueueing a task on a new CPU.
*/
attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
+ } else if (flags & DO_DETACH) {
+ /*
+ * DO_DETACH means we're here from dequeue_entity()
+ * and we are migrating task out of the CPU.
+ */
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq);
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
if (flags & UPDATE_TG)
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
}
}
-#ifndef CONFIG_64BIT
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- u64 last_update_time_copy;
- u64 last_update_time;
-
- do {
- last_update_time_copy = cfs_rq->load_last_update_time_copy;
- smp_rmb();
- last_update_time = cfs_rq->avg.last_update_time;
- } while (last_update_time != last_update_time_copy);
-
- return last_update_time;
-}
-#else
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->avg.last_update_time;
-}
-#endif
-
/*
* Synchronize entity load avg of dequeued entity without locking
* the previous rq.
@@ -3853,8 +4792,8 @@ static void remove_entity_load_avg(struct sched_entity *se)
/*
* tasks cannot exit without having gone through wake_up_new_task() ->
- * post_init_entity_util_avg() which will have added things to the
- * cfs_rq, so we can remove unconditionally.
+ * enqueue_task_fair() which will have added things to the cfs_rq,
+ * so we can remove unconditionally.
*/
sync_entity_load_avg(se);
@@ -3877,38 +4816,27 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
}
-static inline unsigned long _task_util_est(struct task_struct *p)
+static inline unsigned long task_runnable(struct task_struct *p)
{
- struct util_est ue = READ_ONCE(p->se.avg.util_est);
-
- return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+ return READ_ONCE(p->se.avg.runnable_avg);
}
-static inline unsigned long task_util_est(struct task_struct *p)
+static inline unsigned long _task_util_est(struct task_struct *p)
{
- return max(task_util(p), _task_util_est(p));
+ return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
}
-#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p)
-{
- return clamp(task_util_est(p),
- uclamp_eff_value(p, UCLAMP_MIN),
- uclamp_eff_value(p, UCLAMP_MAX));
-}
-#else
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long task_util_est(struct task_struct *p)
{
- return task_util_est(p);
+ return max(task_util(p), _task_util_est(p));
}
-#endif
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
@@ -3919,38 +4847,39 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
return;
/* Update root cfs_rq's estimated utilization */
- enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued = cfs_rq->avg.util_est;
enqueued += _task_util_est(p);
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
-}
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
-/*
- * Check if a (signed) value is within a specified (unsigned) margin,
- * based on the observation that:
- *
- * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
- *
- * NOTE: this only works when value + maring < INT_MAX.
- */
-static inline bool within_margin(int value, int margin)
-{
- return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+ trace_sched_util_est_cfs_tp(cfs_rq);
}
-static void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
{
- long last_ewma_diff;
- struct util_est ue;
- int cpu;
+ unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Update root cfs_rq's estimated utilization */
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+ enqueued = cfs_rq->avg.util_est;
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
+
+ trace_sched_util_est_cfs_tp(cfs_rq);
+}
+
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
+
+static inline void util_est_update(struct cfs_rq *cfs_rq,
+ struct task_struct *p,
+ bool task_sleep)
+{
+ unsigned int ewma, dequeued, last_ewma_diff;
+
+ if (!sched_feat(UTIL_EST))
+ return;
/*
* Skip update of task's estimated utilization when the task has not
@@ -3959,82 +4888,232 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
if (!task_sleep)
return;
+ /* Get current estimate of utilization */
+ ewma = READ_ONCE(p->se.avg.util_est);
+
/*
* If the PELT values haven't changed since enqueue time,
* skip the util_est update.
*/
- ue = p->se.avg.util_est;
- if (ue.enqueued & UTIL_AVG_UNCHANGED)
+ if (ewma & UTIL_AVG_UNCHANGED)
return;
+ /* Get utilization at dequeue */
+ dequeued = task_util(p);
+
/*
* Reset EWMA on utilization increases, the moving average is used only
* to smooth utilization decreases.
*/
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
- if (sched_feat(UTIL_EST_FASTUP)) {
- if (ue.ewma < ue.enqueued) {
- ue.ewma = ue.enqueued;
- goto done;
- }
+ if (ewma <= dequeued) {
+ ewma = dequeued;
+ goto done;
}
/*
- * Skip update of task's estimated utilization when its EWMA is
+ * Skip update of task's estimated utilization when its members are
* already ~1% close to its last activation value.
*/
- last_ewma_diff = ue.enqueued - ue.ewma;
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
- return;
+ last_ewma_diff = ewma - dequeued;
+ if (last_ewma_diff < UTIL_EST_MARGIN)
+ goto done;
/*
* To avoid overestimation of actual task utilization, skip updates if
* we cannot grant there is idle time in this CPU.
*/
- cpu = cpu_of(rq_of(cfs_rq));
- if (task_util(p) > capacity_orig_of(cpu))
+ if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
return;
/*
+ * To avoid underestimating task utilization, skip updates of EWMA if
+ * we cannot guarantee that the thread got all the CPU time it wanted.
+ */
+ if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
+ goto done;
+
+
+ /*
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
- * of the task size. This is done by storing the current PELT value
- * as ue.enqueued and by using this value to update the Exponential
- * Weighted Moving Average (EWMA):
+ * of the task size. This is done by using this value to update the
+ * Exponential Weighted Moving Average (EWMA):
*
* ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
* = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
* = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
- * = w * ( last_ewma_diff ) + ewma(t-1)
- * = w * (last_ewma_diff + ewma(t-1) / w)
+ * = w * ( -last_ewma_diff ) + ewma(t-1)
+ * = w * (-last_ewma_diff + ewma(t-1) / w)
*
* Where 'w' is the weight of new samples, which is configured to be
* 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
*/
- ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
- ue.ewma += last_ewma_diff;
- ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+ ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ewma -= last_ewma_diff;
+ ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
- WRITE_ONCE(p->se.avg.util_est, ue);
+ ewma |= UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(p->se.avg.util_est, ewma);
+
+ trace_sched_util_est_se_tp(&p->se);
}
-static inline int task_fits_capacity(struct task_struct *p, long capacity)
+static inline unsigned long get_actual_cpu_capacity(int cpu)
{
- return fits_capacity(uclamp_task_util(p), capacity);
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+
+ capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+
+ return capacity;
+}
+
+static inline int util_fits_cpu(unsigned long util,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max,
+ int cpu)
+{
+ unsigned long capacity = capacity_of(cpu);
+ unsigned long capacity_orig;
+ bool fits, uclamp_max_fits;
+
+ /*
+ * Check if the real util fits without any uclamp boost/cap applied.
+ */
+ fits = fits_capacity(util, capacity);
+
+ if (!uclamp_is_used())
+ return fits;
+
+ /*
+ * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
+ * uclamp_max. We only care about capacity pressure (by using
+ * capacity_of()) for comparing against the real util.
+ *
+ * If a task is boosted to 1024 for example, we don't want a tiny
+ * pressure to skew the check whether it fits a CPU or not.
+ *
+ * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
+ * should fit a little cpu even if there's some pressure.
+ *
+ * Only exception is for HW or cpufreq pressure since it has a direct impact
+ * on available OPP of the system.
+ *
+ * We honour it for uclamp_min only as a drop in performance level
+ * could result in not getting the requested minimum performance level.
+ *
+ * For uclamp_max, we can tolerate a drop in performance level as the
+ * goal is to cap the task. So it's okay if it's getting less.
+ */
+ capacity_orig = arch_scale_cpu_capacity(cpu);
+
+ /*
+ * We want to force a task to fit a cpu as implied by uclamp_max.
+ * But we do have some corner cases to cater for..
+ *
+ *
+ * C=z
+ * | ___
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | |
+ * | | | | | | | (util somewhere in this region)
+ * | | | | | | |
+ * | | | | | | |
+ * +----------------------------------------
+ * CPU0 CPU1 CPU2
+ *
+ * In the above example if a task is capped to a specific performance
+ * point, y, then when:
+ *
+ * * util = 80% of x then it does not fit on CPU0 and should migrate
+ * to CPU1
+ * * util = 80% of y then it is forced to fit on CPU1 to honour
+ * uclamp_max request.
+ *
+ * which is what we're enforcing here. A task always fits if
+ * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
+ * the normal upmigration rules should still apply.
+ *
+ * Only exception is when we are on max capacity, then we need to be
+ * careful not to block overutilized state. This is so because:
+ *
+ * 1. There's no concept of capping at max_capacity! We can't go
+ * beyond this performance level anyway.
+ * 2. The system is being saturated when we're operating near
+ * max capacity, it doesn't make sense to block overutilized.
+ */
+ uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
+ uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
+ fits = fits || uclamp_max_fits;
+
+ /*
+ *
+ * C=z
+ * | ___ (region a, capped, util >= uclamp_max)
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
+ * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
+ * | | | | | | |
+ * | | | | | | | (region c, boosted, util < uclamp_min)
+ * +----------------------------------------
+ * CPU0 CPU1 CPU2
+ *
+ * a) If util > uclamp_max, then we're capped, we don't care about
+ * actual fitness value here. We only care if uclamp_max fits
+ * capacity without taking margin/pressure into account.
+ * See comment above.
+ *
+ * b) If uclamp_min <= util <= uclamp_max, then the normal
+ * fits_capacity() rules apply. Except we need to ensure that we
+ * enforce we remain within uclamp_max, see comment above.
+ *
+ * c) If util < uclamp_min, then we are boosted. Same as (b) but we
+ * need to take into account the boosted value fits the CPU without
+ * taking margin/pressure into account.
+ *
+ * Cases (a) and (b) are handled in the 'fits' variable already. We
+ * just need to consider an extra check for case (c) after ensuring we
+ * handle the case uclamp_min > uclamp_max.
+ */
+ uclamp_min = min(uclamp_min, uclamp_max);
+ if (fits && (util < uclamp_min) &&
+ (uclamp_min > get_actual_cpu_capacity(cpu)))
+ return -1;
+
+ return fits;
+}
+
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
+{
+ unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ unsigned long util = task_util_est(p);
+ /*
+ * Return true only if the cpu fully fits the task requirements, which
+ * include the utilization but also the performance hints.
+ */
+ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
- return;
+ int cpu = cpu_of(rq);
- if (!p) {
- rq->misfit_task_load = 0;
+ if (!sched_asym_cpucap_active())
return;
- }
- if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ /*
+ * Affinity allows us to go somewhere higher? Or are we on biggest
+ * available CPU already? Or do we fit into this CPU ?
+ */
+ if (!p || (p->nr_cpus_allowed == 1) ||
+ (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+ task_fits_cpu(p, cpu)) {
+
rq->misfit_task_load = 0;
return;
}
@@ -4048,9 +5127,15 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
#else /* CONFIG_SMP */
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ return !cfs_rq->nr_queued;
+}
+
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
#define DO_ATTACH 0x0
+#define DO_DETACH 0x0
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
@@ -4064,7 +5149,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
+static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
@@ -4073,178 +5158,210 @@ static inline void
util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
- bool task_sleep) {}
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
+ bool task_sleep) {}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
+void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
{
-#ifdef CONFIG_SCHED_DEBUG
- s64 d = se->vruntime - cfs_rq->min_vruntime;
-
- if (d < 0)
- d = -d;
+ struct sched_entity *se = &p->se;
- if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq->nr_spread_over);
-#endif
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ se->custom_slice = 1;
+ se->slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ se->custom_slice = 0;
+ se->slice = sysctl_sched_base_slice;
+ }
}
static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 vruntime = cfs_rq->min_vruntime;
+ u64 vslice, vruntime = avg_vruntime(cfs_rq);
+ s64 lag = 0;
+
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
+ vslice = calc_delta_fair(se->slice, se);
/*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
+ * Due to how V is constructed as the weighted average of entities,
+ * adding tasks with positive lag, or removing tasks with negative lag
+ * will move 'time' backwards, this can screw around with the lag of
+ * other tasks.
+ *
+ * EEVDF: placement strategy #1 / #2
*/
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
+ struct sched_entity *curr = cfs_rq->curr;
+ unsigned long load;
- /* sleeps up to a single latency don't count. */
- if (!initial) {
- unsigned long thresh = sysctl_sched_latency;
+ lag = se->vlag;
/*
- * Halve their sleep time's effect, to allow
- * for a gentler effect of sleepers:
+ * If we want to place a task and preserve lag, we have to
+ * consider the effect of the new entity on the weighted
+ * average and compensate for this, otherwise lag can quickly
+ * evaporate.
+ *
+ * Lag is defined as:
+ *
+ * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * To avoid the 'w_i' term all over the place, we only track
+ * the virtual lag:
+ *
+ * vl_i = V - v_i <=> v_i = V - vl_i
+ *
+ * And we take V to be the weighted average of all v:
+ *
+ * V = (\Sum w_j*v_j) / W
+ *
+ * Where W is: \Sum w_j
+ *
+ * Then, the weighted average after adding an entity with lag
+ * vl_i is given by:
+ *
+ * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
+ * = (W*V + w_i*(V - vl_i)) / (W + w_i)
+ * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
+ * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
+ * = V - w_i*vl_i / (W + w_i)
+ *
+ * And the actual lag after adding an entity with vl_i is:
+ *
+ * vl'_i = V' - v_i
+ * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
+ * = vl_i - w_i*vl_i / (W + w_i)
+ *
+ * Which is strictly less than vl_i. So in order to preserve lag
+ * we should inflate the lag before placement such that the
+ * effective lag after placement comes out right.
+ *
+ * As such, invert the above relation for vl'_i to get the vl_i
+ * we need to use such that the lag after placement is the lag
+ * we computed before dequeue.
+ *
+ * vl'_i = vl_i - w_i*vl_i / (W + w_i)
+ * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
+ *
+ * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
+ * = W*vl_i
+ *
+ * vl_i = (W + w_i)*vl'_i / W
*/
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
- thresh >>= 1;
+ load = cfs_rq->avg_load;
+ if (curr && curr->on_rq)
+ load += scale_load_down(curr->load.weight);
- vruntime -= thresh;
+ lag *= load + scale_load_down(se->load.weight);
+ if (WARN_ON_ONCE(!load))
+ load = 1;
+ lag = div_s64(lag, load);
}
- /* ensure we never gain time by being placed backwards. */
- se->vruntime = max_vruntime(se->vruntime, vruntime);
-}
-
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+ se->vruntime = vruntime - lag;
-static inline void check_schedstat_required(void)
-{
-#ifdef CONFIG_SCHEDSTATS
- if (schedstat_enabled())
+ if (se->rel_deadline) {
+ se->deadline += se->vruntime;
+ se->rel_deadline = 0;
return;
-
- /* Force schedstat enabled if a dependent tracepoint is active */
- if (trace_sched_stat_wait_enabled() ||
- trace_sched_stat_sleep_enabled() ||
- trace_sched_stat_iowait_enabled() ||
- trace_sched_stat_blocked_enabled() ||
- trace_sched_stat_runtime_enabled()) {
- printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
- "stat_blocked and stat_runtime require the "
- "kernel parameter schedstats=enable or "
- "kernel.sched_schedstats=1\n");
}
-#endif
+
+ /*
+ * When joining the competition; the existing tasks will be,
+ * on average, halfway through their slice, as such start tasks
+ * off with half a slice to ease into the competition.
+ */
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
+ vslice /= 2;
+
+ /*
+ * EEVDF: vd_i = ve_i + r_i/w_i
+ */
+ se->deadline = se->vruntime + vslice;
}
-static inline bool cfs_bandwidth_used(void);
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
-/*
- * MIGRATION
- *
- * dequeue
- * update_curr()
- * update_min_vruntime()
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way the vruntime transition between RQs is done when both
- * min_vruntime are up-to-date.
- *
- * WAKEUP (remote)
- *
- * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
- * vruntime -= min_vruntime
- *
- * enqueue
- * update_curr()
- * update_min_vruntime()
- * vruntime += min_vruntime
- *
- * this way we don't have the most up-to-date min_vruntime on the originating
- * CPU and an up-to-date min_vruntime on the destination CPU.
- */
+static void
+requeue_delayed_entity(struct sched_entity *se);
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;
/*
* If we're the current task, we must renormalise before calling
* update_curr().
*/
- if (renorm && curr)
- se->vruntime += cfs_rq->min_vruntime;
+ if (curr)
+ place_entity(cfs_rq, se, flags);
update_curr(cfs_rq);
/*
- * Otherwise, renormalise after, such that we're placed at the current
- * moment in time, instead of some random moment in the past. Being
- * placed in the past could significantly boost this task to the
- * fairness detriment of existing tasks.
- */
- if (renorm && !curr)
- se->vruntime += cfs_rq->min_vruntime;
-
- /*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_runnable of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
se_update_runnable(se);
+ /*
+ * XXX update_load_avg() above will have attached us to the pelt sum;
+ * but update_cfs_group() here will re-adjust the weight and have to
+ * undo/redo all that. Seems wasteful.
+ */
update_cfs_group(se);
+
+ /*
+ * XXX now that the entity has been re-weighted, and its lag adjusted,
+ * we can place the entity.
+ */
+ if (!curr)
+ place_entity(cfs_rq, se, flags);
+
account_entity_enqueue(cfs_rq, se);
- if (flags & ENQUEUE_WAKEUP)
- place_entity(cfs_rq, se, 0);
+ /* Entity has migrated, no longer consider this task hot */
+ if (flags & ENQUEUE_MIGRATED)
+ se->exec_start = 0;
check_schedstat_required();
- update_stats_enqueue(cfs_rq, se, flags);
- check_spread(cfs_rq, se);
+ update_stats_enqueue_fair(cfs_rq, se, flags);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- /*
- * When bandwidth control is enabled, cfs might have been removed
- * because of a parent been throttled but cfs->nr_running > 1. Try to
- * add it unconditionnally.
- */
- if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
- list_add_leaf_cfs_rq(cfs_rq);
-
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
-}
-
-static void __clear_buddies_last(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last != se)
- break;
+ if (!throttled_hierarchy(cfs_rq)) {
+ list_add_leaf_cfs_rq(cfs_rq);
+ } else {
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct rq *rq = rq_of(cfs_rq);
- cfs_rq->last = NULL;
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+#endif
+ }
}
}
@@ -4259,68 +5376,122 @@ static void __clear_buddies_next(struct sched_entity *se)
}
}
-static void __clear_buddies_skip(struct sched_entity *se)
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ if (cfs_rq->next == se)
+ __clear_buddies_next(se);
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
+static void set_delayed(struct sched_entity *se)
+{
+ se->sched_delayed = 1;
+
+ /*
+ * Delayed se of cfs_rq have no tasks queued on them.
+ * Do not adjust h_nr_runnable since dequeue_entities()
+ * will account it for blocked tasks.
+ */
+ if (!entity_is_task(se))
+ return;
+
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip != se)
- break;
- cfs_rq->skip = NULL;
+ cfs_rq->h_nr_runnable--;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
}
}
-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void clear_delayed(struct sched_entity *se)
{
- if (cfs_rq->last == se)
- __clear_buddies_last(se);
+ se->sched_delayed = 0;
- if (cfs_rq->next == se)
- __clear_buddies_next(se);
+ /*
+ * Delayed se of cfs_rq have no tasks queued on them.
+ * Do not adjust h_nr_runnable since a dequeue has
+ * already accounted for it or an enqueue of a task
+ * below it will account for it in enqueue_task_fair().
+ */
+ if (!entity_is_task(se))
+ return;
- if (cfs_rq->skip == se)
- __clear_buddies_skip(se);
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ cfs_rq->h_nr_runnable++;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
}
-static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+{
+ clear_delayed(se);
+ if (sched_feat(DELAY_ZERO) && se->vlag > 0)
+ se->vlag = 0;
+}
-static void
+static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- /*
- * Update run-time statistics of the 'current'.
- */
+ bool sleep = flags & DEQUEUE_SLEEP;
+ int action = UPDATE_TG;
+
update_curr(cfs_rq);
+ clear_buddies(cfs_rq, se);
+
+ if (flags & DEQUEUE_DELAYED) {
+ SCHED_WARN_ON(!se->sched_delayed);
+ } else {
+ bool delay = sleep;
+ /*
+ * DELAY_DEQUEUE relies on spurious wakeups, special task
+ * states must not suffer spurious wakeups, exempt them.
+ */
+ if (flags & DEQUEUE_SPECIAL)
+ delay = false;
+
+ SCHED_WARN_ON(delay && se->sched_delayed);
+
+ if (sched_feat(DELAY_DEQUEUE) && delay &&
+ !entity_eligible(cfs_rq, se)) {
+ update_load_avg(cfs_rq, se, 0);
+ set_delayed(se);
+ return false;
+ }
+ }
+
+ if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
+ action |= DO_DETACH;
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Subtract its load from the cfs_rq->runnable_avg.
+ * - For group_entity, update its runnable_weight to reflect the new
+ * h_nr_runnable of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_load_avg(cfs_rq, se, action);
se_update_runnable(se);
- update_stats_dequeue(cfs_rq, se, flags);
+ update_stats_dequeue_fair(cfs_rq, se, flags);
- clear_buddies(cfs_rq, se);
+ update_entity_lag(cfs_rq, se);
+ if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
+ se->deadline -= se->vruntime;
+ se->rel_deadline = 1;
+ }
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
- /*
- * Normalize after update_curr(); which will also have moved
- * min_vruntime if @se is the one holding it back. But before doing
- * update_min_vruntime() again, which will discount @se's position and
- * can move min_vruntime forward still more.
- */
- if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
-
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
@@ -4330,55 +5501,25 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Now advance min_vruntime if @se was the entity holding it back,
* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
* put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- ie. we'll be penalized.
+ * further than we started -- i.e. we'll be penalized.
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
-}
-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- unsigned long ideal_runtime, delta_exec;
- struct sched_entity *se;
- s64 delta;
+ if (flags & DEQUEUE_DELAYED)
+ finish_delayed_dequeue_entity(se);
- ideal_runtime = sched_slice(cfs_rq, curr);
- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime) {
- resched_curr(rq_of(cfs_rq));
- /*
- * The current task ran long enough, ensure it doesn't get
- * re-elected due to buddy favours.
- */
- clear_buddies(cfs_rq, curr);
- return;
- }
-
- /*
- * Ensure that a task that missed wakeup preemption by a
- * narrow margin doesn't have to wait for a full slice.
- * This also mitigates buddy induced latencies under load.
- */
- if (delta_exec < sysctl_sched_min_granularity)
- return;
+ if (cfs_rq->nr_queued == 0)
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
- se = __pick_first_entity(cfs_rq);
- delta = curr->vruntime - se->vruntime;
-
- if (delta < 0)
- return;
-
- if (delta > ideal_runtime)
- resched_curr(rq_of(cfs_rq));
+ return true;
}
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ clear_buddies(cfs_rq, se);
+
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
@@ -4386,31 +5527,39 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
- update_stats_wait_end(cfs_rq, se);
+ update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
+ /*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+ se->vlag = se->deadline;
}
update_stats_curr_start(cfs_rq, se);
+ SCHED_WARN_ON(cfs_rq->curr);
cfs_rq->curr = se;
/*
* Track our maximum slice length, if the CPU's load is at
- * least twice that of our own weight (i.e. dont track it
+ * least twice that of our own weight (i.e. don't track it
* when there are only lesser-weight tasks around):
*/
if (schedstat_enabled() &&
rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
- schedstat_set(se->statistics.slice_max,
- max((u64)schedstat_val(se->statistics.slice_max),
- se->sum_exec_runtime - se->prev_sum_exec_runtime));
+ struct sched_statistics *stats;
+
+ stats = __schedstats_from_se(se);
+ __schedstat_set(stats->slice_max,
+ max((u64)stats->slice_max,
+ se->sum_exec_runtime - se->prev_sum_exec_runtime));
}
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
/*
* Pick the next process, keeping these things in mind, in this order:
@@ -4420,53 +5569,28 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
- struct sched_entity *left = __pick_first_entity(cfs_rq);
struct sched_entity *se;
/*
- * If curr is set we have to see if its left of the leftmost entity
- * still in the tree, provided there was anything in the tree at all.
- */
- if (!left || (curr && entity_before(curr, left)))
- left = curr;
-
- se = left; /* ideally we run the leftmost entity */
-
- /*
- * Avoid running the skip buddy, if running something else can
- * be done without getting too unfair.
+ * Picking the ->next buddy will affect latency but not fairness.
*/
- if (cfs_rq->skip == se) {
- struct sched_entity *second;
-
- if (se == curr) {
- second = __pick_first_entity(cfs_rq);
- } else {
- second = __pick_next_entity(se);
- if (!second || (curr && entity_before(curr, second)))
- second = curr;
- }
-
- if (second && wakeup_preempt_entity(second, left) < 1)
- se = second;
+ if (sched_feat(PICK_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+ /* ->next will never be delayed */
+ SCHED_WARN_ON(cfs_rq->next->sched_delayed);
+ return cfs_rq->next;
}
- /*
- * Prefer last buddy, try to return the CPU to a preempted task.
- */
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
- se = cfs_rq->last;
-
- /*
- * Someone really wants this to run. If it's not unfair, run it.
- */
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
- se = cfs_rq->next;
-
- clear_buddies(cfs_rq, se);
-
+ se = pick_eevdf(cfs_rq);
+ if (se->sched_delayed) {
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ /*
+ * Must not reference @se again, see __block_task().
+ */
+ return NULL;
+ }
return se;
}
@@ -4484,15 +5608,14 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
- check_spread(cfs_rq, prev);
-
if (prev->on_rq) {
- update_stats_wait_start(cfs_rq, prev);
+ update_stats_wait_start_fair(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
+ SCHED_WARN_ON(cfs_rq->curr != prev);
cfs_rq->curr = NULL;
}
@@ -4516,19 +5639,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
return;
}
- /*
- * don't let the period tick interfere with the hrtick preemption
- */
- if (!sched_feat(DOUBLE_TICK) &&
- hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
- return;
#endif
-
- if (cfs_rq->nr_running > 1)
- check_preempt_tick(cfs_rq, curr);
}
@@ -4588,8 +5702,20 @@ static inline u64 sched_cfs_bandwidth_slice(void)
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
- if (cfs_b->quota != RUNTIME_INF)
- cfs_b->runtime = cfs_b->quota;
+ s64 runtime;
+
+ if (unlikely(cfs_b->quota == RUNTIME_INF))
+ return;
+
+ cfs_b->runtime += cfs_b->quota;
+ runtime = cfs_b->runtime_snap - cfs_b->runtime;
+ if (runtime > 0) {
+ cfs_b->burst_time += runtime;
+ cfs_b->nr_burst++;
+ }
+
+ cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
+ cfs_b->runtime_snap = cfs_b->runtime;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4700,12 +5826,23 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
- cfs_rq->throttled_clock_task;
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
- /* Add cfs_rq with already running entity in the list */
- if (cfs_rq->nr_running >= 1)
+ /* Add cfs_rq with load or one or more already running entities to the list */
+ if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
+
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+
+ cfs_rq->throttled_clock_self = 0;
+
+ if (SCHED_WARN_ON((s64)delta < 0))
+ delta = 0;
+
+ cfs_rq->throttled_clock_self_time += delta;
+ }
}
return 0;
@@ -4718,8 +5855,12 @@ static int tg_throttle_down(struct task_group *tg, void *data)
/* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
+
+ SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_queued)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
}
cfs_rq->throttle_count++;
@@ -4731,7 +5872,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta, dequeue = 1;
+ long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+ long rq_h_nr_queued = rq->cfs.h_nr_queued;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -4761,37 +5903,73 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- task_delta = cfs_rq->h_nr_running;
- idle_task_delta = cfs_rq->idle_h_nr_running;
+ queued_delta = cfs_rq->h_nr_queued;
+ runnable_delta = cfs_rq->h_nr_runnable;
+ idle_delta = cfs_rq->h_nr_idle;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ int flags;
+
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
- break;
+ goto done;
- if (dequeue) {
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
- } else {
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
+ /*
+ * Abuse SPECIAL to avoid delayed dequeue in this instance.
+ * This avoids teaching dequeue_entities() about throttled
+ * entities and keeps things relatively simple.
+ */
+ flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
+ if (se->sched_delayed)
+ flags |= DEQUEUE_DELAYED;
+ dequeue_entity(qcfs_rq, se, flags);
+
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_delta = cfs_rq->h_nr_queued;
+
+ qcfs_rq->h_nr_queued -= queued_delta;
+ qcfs_rq->h_nr_runnable -= runnable_delta;
+ qcfs_rq->h_nr_idle -= idle_delta;
+
+ if (qcfs_rq->load.weight) {
+ /* Avoid re-evaluating load for this entity: */
+ se = parent_entity(se);
+ break;
}
+ }
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ /* throttled entity or throttle-on-deactivate */
+ if (!se->on_rq)
+ goto done;
- qcfs_rq->h_nr_running -= task_delta;
- qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ update_load_avg(qcfs_rq, se, 0);
+ se_update_runnable(se);
- if (qcfs_rq->load.weight)
- dequeue = 0;
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_delta = cfs_rq->h_nr_queued;
+
+ qcfs_rq->h_nr_queued -= queued_delta;
+ qcfs_rq->h_nr_runnable -= runnable_delta;
+ qcfs_rq->h_nr_idle -= idle_delta;
}
- if (!se)
- sub_nr_running(rq, task_delta);
+ /* At this point se is NULL and we are at root level*/
+ sub_nr_running(rq, queued_delta);
+ /* Stop the fair server if throttling resulted in no runnable tasks */
+ if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
+ dl_server_stop(&rq->fair_server);
+done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
+ SCHED_WARN_ON(cfs_rq->throttled_clock);
+ if (cfs_rq->nr_queued)
+ cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -4800,7 +5978,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta;
+ long queued_delta, runnable_delta, idle_delta;
+ long rq_h_nr_queued = rq->cfs.h_nr_queued;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -4809,93 +5988,196 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
update_rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ if (cfs_rq->throttled_clock) {
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ cfs_rq->throttled_clock = 0;
+ }
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
- if (!cfs_rq->load.weight)
- return;
+ if (!cfs_rq->load.weight) {
+ if (!cfs_rq->on_list)
+ return;
+ /*
+ * Nothing to run but something to decay (on_list)?
+ * Complete the branch.
+ */
+ for_each_sched_entity(se) {
+ if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
+ break;
+ }
+ goto unthrottle_throttle;
+ }
- task_delta = cfs_rq->h_nr_running;
- idle_task_delta = cfs_rq->idle_h_nr_running;
+ queued_delta = cfs_rq->h_nr_queued;
+ runnable_delta = cfs_rq->h_nr_runnable;
+ idle_delta = cfs_rq->h_nr_idle;
for_each_sched_entity(se) {
- if (se->on_rq)
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+
+ /* Handle any unfinished DELAY_DEQUEUE business first. */
+ if (se->sched_delayed) {
+ int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
+
+ dequeue_entity(qcfs_rq, se, flags);
+ } else if (se->on_rq)
break;
- cfs_rq = cfs_rq_of(se);
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+ enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
+
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_delta = cfs_rq->h_nr_queued;
- cfs_rq->h_nr_running += task_delta;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_queued += queued_delta;
+ qcfs_rq->h_nr_runnable += runnable_delta;
+ qcfs_rq->h_nr_idle += idle_delta;
/* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
}
for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_load_avg(qcfs_rq, se, UPDATE_TG);
se_update_runnable(se);
- cfs_rq->h_nr_running += task_delta;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_delta = cfs_rq->h_nr_queued;
+ qcfs_rq->h_nr_queued += queued_delta;
+ qcfs_rq->h_nr_runnable += runnable_delta;
+ qcfs_rq->h_nr_idle += idle_delta;
/* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
}
+ /* Start the fair server if un-throttling resulted in new runnable tasks */
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
+ dl_server_start(&rq->fair_server);
+
/* At this point se is NULL and we are at root level*/
- add_nr_running(rq, task_delta);
+ add_nr_running(rq, queued_delta);
unthrottle_throttle:
+ assert_list_leaf_cfs_rq(rq);
+
+ /* Determine whether we need to wake up potentially idle CPU: */
+ if (rq->curr == rq->idle && rq->cfs.nr_queued)
+ resched_curr(rq);
+}
+
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
+{
+ struct cfs_rq *cursor, *tmp;
+ struct rq *rq = arg;
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+
/*
- * The cfs_rq_throttled() breaks in the above iteration can result in
- * incomplete leaf list maintenance, resulting in triggering the
- * assertion below.
+ * Iterating over the list can trigger several calls to
+ * update_rq_clock() in unthrottle_cfs_rq().
+ * Do it once and skip the potential next ones.
*/
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ update_rq_clock(rq);
+ rq_clock_start_loop_update(rq);
- if (list_add_leaf_cfs_rq(cfs_rq))
- break;
+ /*
+ * Since we hold rq lock we're safe from concurrent manipulation of
+ * the CSD list. However, this RCU critical section annotates the
+ * fact that we pair with sched_free_group_rcu(), so that we cannot
+ * race with group being freed in the window between removing it
+ * from the list and advancing to the next entry in the list.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+ throttled_csd_list) {
+ list_del_init(&cursor->throttled_csd_list);
+
+ if (cfs_rq_throttled(cursor))
+ unthrottle_cfs_rq(cursor);
}
- assert_list_leaf_cfs_rq(rq);
+ rcu_read_unlock();
- /* Determine whether we need to wake up potentially idle CPU: */
- if (rq->curr == rq->idle && rq->cfs.nr_running)
- resched_curr(rq);
+ rq_clock_stop_loop_update(rq);
+ rq_unlock(rq, &rf);
}
-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
- struct cfs_rq *cfs_rq;
+ struct rq *rq = rq_of(cfs_rq);
+ bool first;
+
+ if (rq == this_rq()) {
+ unthrottle_cfs_rq(cfs_rq);
+ return;
+ }
+
+ /* Already enqueued */
+ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ return;
+
+ first = list_empty(&rq->cfsb_csd_list);
+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+ if (first)
+ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ lockdep_assert_rq_held(rq_of(cfs_rq));
+
+ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ cfs_rq->runtime_remaining <= 0))
+ return;
+
+ __unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+{
+ int this_cpu = smp_processor_id();
u64 runtime, remaining = 1;
+ bool throttled = false;
+ struct cfs_rq *cfs_rq, *tmp;
+ struct rq_flags rf;
+ struct rq *rq;
+ LIST_HEAD(local_unthrottle);
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
- struct rq *rq = rq_of(cfs_rq);
- struct rq_flags rf;
+ rq = rq_of(cfs_rq);
+
+ if (!remaining) {
+ throttled = true;
+ break;
+ }
rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
- /* By the above check, this should never be true */
+ /* Already queued for async unthrottle */
+ if (!list_empty(&cfs_rq->throttled_csd_list))
+ goto next;
+
+ /* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
@@ -4909,16 +6191,44 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
cfs_rq->runtime_remaining += runtime;
/* we check whether we're throttled above */
- if (cfs_rq->runtime_remaining > 0)
- unthrottle_cfs_rq(cfs_rq);
+ if (cfs_rq->runtime_remaining > 0) {
+ if (cpu_of(rq) != this_cpu) {
+ unthrottle_cfs_rq_async(cfs_rq);
+ } else {
+ /*
+ * We currently only expect to be unthrottling
+ * a single cfs_rq locally.
+ */
+ SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ list_add_tail(&cfs_rq->throttled_csd_list,
+ &local_unthrottle);
+ }
+ } else {
+ throttled = true;
+ }
next:
rq_unlock_irqrestore(rq, &rf);
+ }
- if (!remaining)
- break;
+ list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
+ throttled_csd_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ rq_lock_irqsave(rq, &rf);
+
+ list_del_init(&cfs_rq->throttled_csd_list);
+
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+
+ rq_unlock_irqrestore(rq, &rf);
}
+ SCHED_WARN_ON(!list_empty(&local_unthrottle));
+
rcu_read_unlock();
+
+ return throttled;
}
/*
@@ -4938,6 +6248,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
cfs_b->nr_periods += overrun;
+ /* Refill extra burst quota even if cfs_b->idle */
+ __refill_cfs_bandwidth_runtime(cfs_b);
+
/*
* idle depends on !throttled (for the case of a large deficit), and if
* we're going inactive then everything else can be deferred
@@ -4945,8 +6258,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
if (cfs_b->idle && !throttled)
goto out_deactivate;
- __refill_cfs_bandwidth_runtime(cfs_b);
-
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
@@ -4962,10 +6273,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- distribute_cfs_runtime(cfs_b);
+ throttled = distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
- throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
/*
@@ -4999,7 +6308,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
- u64 remaining;
+ s64 remaining;
/* if the call-back is running a quota refresh is already occurring */
if (hrtimer_callback_running(refresh_timer))
@@ -5007,7 +6316,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
/* is a quota refresh about to occur? */
remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
- if (remaining < min_expire)
+ if (remaining < (s64)min_expire)
return 1;
return 0;
@@ -5060,7 +6369,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
- if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -5093,15 +6402,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
return;
distribute_cfs_runtime(cfs_b);
-
- raw_spin_lock_irqsave(&cfs_b->lock, flags);
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
/*
* When a group wakes up we want to make sure that its quota is not already
* expired/exceeded, otherwise it may be allowed to steal additional ticks of
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
@@ -5136,7 +6442,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
pcfs_rq = tg->parent->cfs_rq[cpu];
cfs_rq->throttle_count = pcfs_rq->throttle_count;
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
@@ -5199,6 +6505,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (new < max_cfs_quota_period) {
cfs_b->period = ns_to_ktime(new);
cfs_b->quota *= 2;
+ cfs_b->burst *= 2;
pr_warn_ratelimited(
"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
@@ -5224,16 +6531,22 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
{
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
+ cfs_b->burst = 0;
+ cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
cfs_b->period_timer.function = sched_cfs_period_timer;
+
+ /* Add a random offset so that timers interleave */
+ hrtimer_set_expires(&cfs_b->period_timer,
+ get_random_u32_below(cfs_b->period));
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
cfs_b->slack_started = false;
@@ -5243,6 +6556,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+ INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5259,12 +6573,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ int __maybe_unused i;
+
/* init_cfs_bandwidth() was not called */
if (!cfs_b->throttled_cfs_rq.next)
return;
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
+
+ /*
+ * It is possible that we still have some cfs_rq's pending on a CSD
+ * list, though this race is very rare. In order for this to occur, we
+ * must have raced with the last task leaving the group while there
+ * exist throttled cfs_rq(s), and the period_timer must have queued the
+ * CSD item but the remote cpu has not yet processed it. To handle this,
+ * we can simply flush all pending CSD work inline here. We're
+ * guaranteed at this point that no additional cfs_rq of this group can
+ * join a CSD list.
+ */
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ if (list_empty(&rq->cfsb_csd_list))
+ continue;
+
+ local_irq_save(flags);
+ __cfsb_csd_unthrottle(rq);
+ local_irq_restore(flags);
+ }
+#endif
}
/*
@@ -5274,12 +6614,12 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* bits doesn't do much.
*/
-/* cpu online calback */
+/* cpu online callback */
static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5298,7 +6638,18 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
+
+ // Do not unthrottle for an active CPU
+ if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask))
+ return;
+
+ /*
+ * The rq clock has already been updated in the
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5308,29 +6659,68 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
continue;
/*
- * clock_task is not advancing so we just need to make sure
- * there's some valid quota amount
- */
- cfs_rq->runtime_remaining = 1;
- /*
* Offline rq is schedulable till CPU is completely disabled
* in take_cpu_down(), so we prevent new cfs throttling here.
*/
cfs_rq->runtime_enabled = 0;
- if (cfs_rq_throttled(cfs_rq))
- unthrottle_cfs_rq(cfs_rq);
+ if (!cfs_rq_throttled(cfs_rq))
+ continue;
+
+ /*
+ * clock_task is not advancing so we just need to make sure
+ * there's some valid quota amount
+ */
+ cfs_rq->runtime_remaining = 1;
+ unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
-}
-#else /* CONFIG_CFS_BANDWIDTH */
+ rq_clock_stop_loop_update(rq);
+}
-static inline bool cfs_bandwidth_used(void)
+bool cfs_task_bw_constrained(struct task_struct *p)
{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ if (!cfs_bandwidth_used())
+ return false;
+
+ if (cfs_rq->runtime_enabled ||
+ tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
+ return true;
+
return false;
}
+#ifdef CONFIG_NO_HZ_FULL
+/* called from pick_next_task_fair() */
+static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq);
+
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ if (rq->nr_running != 1)
+ return;
+
+ /*
+ * We know there is only one task runnable and we've just picked it. The
+ * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
+ * be otherwise able to stop the tick. Just need to check if we are using
+ * bandwidth control.
+ */
+ if (cfs_task_bw_constrained(p))
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+#endif
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
@@ -5353,9 +6743,8 @@ static inline int throttled_lb_pair(struct task_group *tg,
return 0;
}
-void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif
@@ -5366,9 +6755,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
-
+#ifdef CONFIG_CGROUP_SCHED
+bool cfs_task_bw_constrained(struct task_struct *p)
+{
+ return false;
+}
+#endif
#endif /* CONFIG_CFS_BANDWIDTH */
+#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
+static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
+#endif
+
/**************************************************
* CFS operations on tasks:
*/
@@ -5377,17 +6775,16 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
SCHED_WARN_ON(task_rq(p) != rq);
- if (rq->cfs.h_nr_running > 1) {
- u64 slice = sched_slice(cfs_rq, se);
+ if (rq->cfs.h_nr_queued > 1) {
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
s64 delta = slice - ran;
if (delta < 0) {
- if (rq->curr == p)
+ if (task_current_donor(rq, p))
resched_curr(rq);
return;
}
@@ -5402,13 +6799,12 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
*/
static void hrtick_update(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *donor = rq->donor;
- if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
+ if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class)
return;
- if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, donor);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
@@ -5422,28 +6818,55 @@ static inline void hrtick_update(struct rq *rq)
#endif
#ifdef CONFIG_SMP
-static inline unsigned long cpu_util(int cpu);
-
static inline bool cpu_overutilized(int cpu)
{
- return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
+ unsigned long rq_util_min, rq_util_max;
+
+ if (!sched_energy_enabled())
+ return false;
+
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+ /* Return true only if the utilization doesn't fit CPU's capacity */
+ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
+}
+
+/*
+ * The overutilized value makes sense only if EAS is enabled
+ */
+static inline bool is_rd_overutilized(struct root_domain *rd)
+{
+ return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
}
-static inline void update_overutilized_status(struct rq *rq)
+static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
{
- if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
- WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
- }
+ if (!sched_energy_enabled())
+ return;
+
+ WRITE_ONCE(rd->overutilized, flag);
+ trace_sched_overutilized_tp(rd, flag);
+}
+
+static inline void check_update_overutilized_status(struct rq *rq)
+{
+ /*
+ * overutilized field is used for load balancing decisions only
+ * if energy aware scheduler is being used
+ */
+
+ if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
+ set_rd_overutilized(rq->rd, 1);
}
#else
-static inline void update_overutilized_status(struct rq *rq) { }
+static inline void check_update_overutilized_status(struct rq *rq) { }
#endif
/* Runqueue only has SCHED_IDLE tasks enqueued */
static int sched_idle_rq(struct rq *rq)
{
- return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
rq->nr_running);
}
@@ -5454,6 +6877,37 @@ static int sched_idle_cpu(int cpu)
}
#endif
+static void
+requeue_delayed_entity(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * se->sched_delayed should imply: se->on_rq == 1.
+ * Because a delayed entity is one that is still on
+ * the runqueue competing until eligibility.
+ */
+ SCHED_WARN_ON(!se->sched_delayed);
+ SCHED_WARN_ON(!se->on_rq);
+
+ if (sched_feat(DELAY_ZERO)) {
+ update_entity_lag(cfs_rq, se);
+ if (se->vlag > 0) {
+ cfs_rq->nr_queued--;
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ se->vlag = 0;
+ place_entity(cfs_rq, se, 0);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_queued++;
+ }
+ }
+
+ update_load_avg(cfs_rq, se, 0);
+ clear_delayed(se);
+}
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5464,7 +6918,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
- int idle_h_nr_running = task_has_idle_policy(p);
+ int h_nr_idle = task_has_idle_policy(p);
+ int h_nr_runnable = 1;
+ int task_new = !(flags & ENQUEUE_WAKEUP);
+ int rq_h_nr_queued = rq->cfs.h_nr_queued;
+ u64 slice = 0;
/*
* The code below (indirectly) updates schedutil which looks at
@@ -5472,7 +6930,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
- util_est_enqueue(&rq->cfs, p);
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+ util_est_enqueue(&rq->cfs, p);
+
+ if (flags & ENQUEUE_DELAYED) {
+ requeue_delayed_entity(se);
+ return;
+ }
/*
* If in_iowait is set, the code below may not trigger any cpufreq
@@ -5482,14 +6946,35 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
+ if (task_new && se->sched_delayed)
+ h_nr_runnable = 0;
+
for_each_sched_entity(se) {
- if (se->on_rq)
+ if (se->on_rq) {
+ if (se->sched_delayed)
+ requeue_delayed_entity(se);
break;
+ }
cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Basically set the slice of group entries to the min_slice of
+ * their respective cfs_rq. This ensures the group can service
+ * its entities in the desired time-frame.
+ */
+ if (slice) {
+ se->slice = slice;
+ se->custom_slice = 1;
+ }
enqueue_entity(cfs_rq, se, flags);
+ slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running++;
- cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;
+
+ if (cfs_rq_is_idle(cfs_rq))
+ h_nr_idle = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
@@ -5505,19 +6990,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
se_update_runnable(se);
update_cfs_group(se);
- cfs_rq->h_nr_running++;
- cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ se->slice = slice;
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;
+
+ if (cfs_rq_is_idle(cfs_rq))
+ h_nr_idle = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
+ }
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
+ /* Account for idle runtime */
+ if (!rq->nr_running)
+ dl_server_update_idle_time(rq, rq->curr);
+ dl_server_start(&rq->fair_server);
}
/* At this point se is NULL and we are at root level*/
@@ -5537,25 +7029,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* into account, but that is not straightforward to implement,
* and the following generally works well enough in practice.
*/
- if (flags & ENQUEUE_WAKEUP)
- update_overutilized_status(rq);
+ if (!task_new)
+ check_update_overutilized_status(rq);
enqueue_throttle:
- if (cfs_bandwidth_used()) {
- /*
- * When bandwidth control is enabled; the cfs_rq_throttled()
- * breaks in the above iteration can result in incomplete
- * leaf list maintenance, resulting in triggering the assertion
- * below.
- */
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(cfs_rq))
- break;
- }
- }
-
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -5564,31 +7041,63 @@ enqueue_throttle:
static void set_next_buddy(struct sched_entity *se);
/*
- * The dequeue_task method is called before nr_running is
- * decreased. We remove the task from the rbtree and
- * update the fair scheduling stats:
+ * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
+ * failing half-way through and resume the dequeue later.
+ *
+ * Returns:
+ * -1 - dequeue delayed
+ * 0 - dequeue throttled
+ * 1 - dequeue complete
*/
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se;
- int task_sleep = flags & DEQUEUE_SLEEP;
- int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
+ int rq_h_nr_queued = rq->cfs.h_nr_queued;
+ bool task_sleep = flags & DEQUEUE_SLEEP;
+ bool task_delayed = flags & DEQUEUE_DELAYED;
+ struct task_struct *p = NULL;
+ int h_nr_idle = 0;
+ int h_nr_queued = 0;
+ int h_nr_runnable = 0;
+ struct cfs_rq *cfs_rq;
+ u64 slice = 0;
+
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ h_nr_queued = 1;
+ h_nr_idle = task_has_idle_policy(p);
+ if (task_sleep || task_delayed || !se->sched_delayed)
+ h_nr_runnable = 1;
+ } else {
+ cfs_rq = group_cfs_rq(se);
+ slice = cfs_rq_min_slice(cfs_rq);
+ }
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, flags);
- cfs_rq->h_nr_running--;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ if (!dequeue_entity(cfs_rq, se, flags)) {
+ if (p && &p->se == se)
+ return -1;
+
+ break;
+ }
+
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;
+
+ if (cfs_rq_is_idle(cfs_rq))
+ h_nr_idle = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
+ return 0;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
+ slice = cfs_rq_min_slice(cfs_rq);
+
/* Avoid re-evaluating load for this entity: */
se = parent_entity(se);
/*
@@ -5600,6 +7109,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
}
flags |= DEQUEUE_SLEEP;
+ flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
}
for_each_sched_entity(se) {
@@ -5609,32 +7119,76 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
se_update_runnable(se);
update_cfs_group(se);
- cfs_rq->h_nr_running--;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ se->slice = slice;
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;
+
+ if (cfs_rq_is_idle(cfs_rq))
+ h_nr_idle = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
-
+ return 0;
}
-dequeue_throttle:
- if (!se)
- sub_nr_running(rq, 1);
+ sub_nr_running(rq, h_nr_queued);
+
+ if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
+ dl_server_stop(&rq->fair_server);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
- util_est_dequeue(&rq->cfs, p, task_sleep);
+ if (p && task_delayed) {
+ SCHED_WARN_ON(!task_sleep);
+ SCHED_WARN_ON(p->on_rq != 1);
+
+ /* Fix-up what dequeue_task_fair() skipped */
+ hrtick_update(rq);
+
+ /*
+ * Fix-up what block_task() skipped.
+ *
+ * Must be last, @p might not be valid after this.
+ */
+ __block_task(rq, p);
+ }
+
+ return 1;
+}
+
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+ util_est_dequeue(&rq->cfs, p);
+
+ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+ if (dequeue_entities(rq, &p->se, flags) < 0)
+ return false;
+
+ /*
+ * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
+ */
+
hrtick_update(rq);
+ return true;
}
#ifdef CONFIG_SMP
-/* Working cpumask for: load_balance, load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
+static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -5642,6 +7196,7 @@ static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
int has_blocked; /* Idle CPUS has blocked load */
+ int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
@@ -5792,6 +7347,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
if (sync && cpu_rq(this_cpu)->nr_running == 1)
return this_cpu;
+ if (available_idle_cpu(prev_cpu))
+ return prev_cpu;
+
return nr_cpumask_bits;
}
@@ -5849,23 +7407,23 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
- schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
- if (target == nr_cpumask_bits)
+ schedstat_inc(p->stats.nr_wakeups_affine_attempts);
+ if (target != this_cpu)
return prev_cpu;
schedstat_inc(sd->ttwu_move_affine);
- schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ schedstat_inc(p->stats.nr_wakeups_affine);
return target;
}
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
/*
- * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
+ * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
*/
static int
-find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -5880,11 +7438,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+ struct rq *rq = cpu_rq(i);
+
+ if (!sched_core_cookie_match(rq, p))
+ continue;
+
if (sched_idle_cpu(i))
return i;
if (available_idle_cpu(i)) {
- struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
/*
@@ -5917,7 +7479,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
-static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
int cpu, int prev_cpu, int sd_flag)
{
int new_cpu = cpu;
@@ -5942,13 +7504,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
continue;
}
- group = find_idlest_group(sd, p, cpu);
+ group = sched_balance_find_dst_group(sd, p, cpu);
if (!group) {
sd = sd->child;
continue;
}
- new_cpu = find_idlest_group_cpu(group, p, cpu);
+ new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
/* Now try balancing at a lower domain level of 'cpu': */
sd = sd->child;
@@ -5970,6 +7532,15 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return new_cpu;
}
+static inline int __select_idle_cpu(int cpu, struct task_struct *p)
+{
+ if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
+ sched_cpu_cookie_match(cpu_rq(cpu), p))
+ return cpu;
+
+ return -1;
+}
+
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
@@ -5983,7 +7554,7 @@ static inline void set_idle_cores(int cpu, int val)
WRITE_ONCE(sds->has_idle_cores, val);
}
-static inline bool test_idle_cores(int cpu, bool def)
+static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
@@ -5991,7 +7562,7 @@ static inline bool test_idle_cores(int cpu, bool def)
if (sds)
return READ_ONCE(sds->has_idle_cores);
- return def;
+ return false;
}
/*
@@ -6007,7 +7578,7 @@ void __update_idle_core(struct rq *rq)
int cpu;
rcu_read_lock();
- if (test_idle_cores(core, true))
+ if (test_idle_cores(core))
goto unlock;
for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -6028,54 +7599,49 @@ unlock:
* there are no idle cores left in the system; tracked through
* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
*/
-static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu;
-
- if (!static_branch_likely(&sched_smt_present))
- return -1;
-
- if (!test_idle_cores(target, false))
- return -1;
-
- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
-
- for_each_cpu_wrap(core, cpus, target) {
- bool idle = true;
+ bool idle = true;
+ int cpu;
- for_each_cpu(cpu, cpu_smt_mask(core)) {
- if (!available_idle_cpu(cpu)) {
- idle = false;
- break;
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+ if (!available_idle_cpu(cpu)) {
+ idle = false;
+ if (*idle_cpu == -1) {
+ if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
+ *idle_cpu = cpu;
+ break;
+ }
+ continue;
}
+ break;
}
- cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
-
- if (idle)
- return core;
+ if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
+ *idle_cpu = cpu;
}
- /*
- * Failed to find an idle core; stop looking for one.
- */
- set_idle_cores(target, 0);
+ if (idle)
+ return core;
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
return -1;
}
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu;
- if (!static_branch_likely(&sched_smt_present))
- return -1;
-
- for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
+ if (cpu == target)
+ continue;
+ /*
+ * Check if the CPU is in the LLC scheduling domain of @target.
+ * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
+ */
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
@@ -6086,12 +7652,21 @@ static int select_idle_smt(struct task_struct *p, int target)
#else /* CONFIG_SCHED_SMT */
-static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static inline void set_idle_cores(int cpu, int val)
{
- return -1;
}
-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline bool test_idle_cores(int cpu)
+{
+ return false;
+}
+
+static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
+{
+ return __select_idle_cpu(core, p);
+}
+
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
@@ -6103,52 +7678,68 @@ static inline int select_idle_smt(struct task_struct *p, int target)
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
{
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- struct sched_domain *this_sd;
- u64 avg_cost, avg_idle;
- u64 time;
- int this = smp_processor_id();
- int cpu, nr = INT_MAX;
-
- this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
- if (!this_sd)
- return -1;
-
- /*
- * Due to large variance we need a large fuzz factor; hackbench in
- * particularly is sensitive here.
- */
- avg_idle = this_rq()->avg_idle / 512;
- avg_cost = this_sd->avg_scan_cost + 1;
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
+ int i, cpu, idle_cpu = -1, nr = INT_MAX;
+ struct sched_domain_shared *sd_share;
- if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
- return -1;
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- if (sched_feat(SIS_PROP)) {
- u64 span_avg = sd->span_weight * avg_idle;
- if (span_avg > 4*avg_cost)
- nr = div_u64(span_avg, avg_cost);
- else
- nr = 4;
+ if (sched_feat(SIS_UTIL)) {
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+ if (sd_share) {
+ /* because --nr <= 0 is the condition to stop the scan */
+ nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+ /* overloaded LLC is unlikely to have idle cpu/core */
+ if (nr == 1)
+ return -1;
+ }
}
- time = cpu_clock(this);
+ if (static_branch_unlikely(&sched_cluster_active)) {
+ struct sched_group *sg = sd->groups;
+
+ if (sg->flags & SD_CLUSTER) {
+ for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
+ if (!cpumask_test_cpu(cpu, cpus))
+ continue;
+
+ if (has_idle_core) {
+ i = select_idle_core(p, cpu, cpus, &idle_cpu);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+ } else {
+ if (--nr <= 0)
+ return -1;
+ idle_cpu = __select_idle_cpu(cpu, p);
+ if ((unsigned int)idle_cpu < nr_cpumask_bits)
+ return idle_cpu;
+ }
+ }
+ cpumask_andnot(cpus, cpus, sched_group_span(sg));
+ }
+ }
- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+ for_each_cpu_wrap(cpu, cpus, target + 1) {
+ if (has_idle_core) {
+ i = select_idle_core(p, cpu, cpus, &idle_cpu);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
- for_each_cpu_wrap(cpu, cpus, target) {
- if (!--nr)
- return -1;
- if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
- break;
+ } else {
+ if (--nr <= 0)
+ return -1;
+ idle_cpu = __select_idle_cpu(cpu, p);
+ if ((unsigned int)idle_cpu < nr_cpumask_bits)
+ break;
+ }
}
- time = cpu_clock(this) - time;
- update_avg(&this_sd->avg_scan_cost, time);
+ if (has_idle_core)
+ set_idle_cores(target, false);
- return cpu;
+ return idle_cpu;
}
/*
@@ -6159,71 +7750,109 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
- unsigned long best_cap = 0;
+ unsigned long task_util, util_min, util_max, best_cap = 0;
+ int fits, best_fits = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
- sync_entity_load_avg(&p->se);
-
- cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
+
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (task_fits_capacity(p, cpu_cap))
+
+ fits = util_fits_cpu(task_util, util_min, util_max, cpu);
+
+ /* This CPU fits with all requirements */
+ if (fits > 0)
return cpu;
+ /*
+ * Only the min performance hint (i.e. uclamp_min) doesn't fit.
+ * Look for the CPU with best capacity.
+ */
+ else if (fits < 0)
+ cpu_cap = get_actual_cpu_capacity(cpu);
- if (cpu_cap > best_cap) {
+ /*
+ * First, select CPU which fits better (-1 being better than 0).
+ * Then, select the one with best capacity at same level.
+ */
+ if ((fits < best_fits) ||
+ ((fits == best_fits) && (cpu_cap > best_cap))) {
best_cap = cpu_cap;
best_cpu = cpu;
+ best_fits = fits;
}
}
return best_cpu;
}
+static inline bool asym_fits_cpu(unsigned long util,
+ unsigned long util_min,
+ unsigned long util_max,
+ int cpu)
+{
+ if (sched_asym_cpucap_active())
+ /*
+ * Return true only if the cpu fully fits the task requirements
+ * which include the utilization and the performance hints.
+ */
+ return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+
+ return true;
+}
+
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
+ bool has_idle_core = false;
struct sched_domain *sd;
- int i, recent_used_cpu;
+ unsigned long task_util, util_min, util_max;
+ int i, recent_used_cpu, prev_aff = -1;
/*
- * For asymmetric CPU capacity systems, our domain of interest is
- * sd_asym_cpucapacity rather than sd_llc.
+ * On an asymmetric system, update task utilization because we will check
+ * that the task fits with CPU's capacity.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
- /*
- * On an asymmetric CPU capacity system where an exclusive
- * cpuset defines a symmetric island (i.e. one unique
- * capacity_orig value through the cpuset), the key will be set
- * but the CPUs within that cpuset will not have a domain with
- * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
- * capacity path.
- */
- if (!sd)
- goto symmetric;
-
- i = select_idle_capacity(p, sd, target);
- return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ if (sched_asym_cpucap_active()) {
+ sync_entity_load_avg(&p->se);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
}
-symmetric:
- if (available_idle_cpu(target) || sched_idle_cpu(target))
+ /*
+ * per-cpu select_rq_mask usage
+ */
+ lockdep_assert_irqs_disabled();
+
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ asym_fits_cpu(task_util, util_min, util_max, target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)))
- return prev;
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
+
+ if (!static_branch_unlikely(&sched_cluster_active) ||
+ cpus_share_resources(prev, target))
+ return prev;
+
+ prev_aff = prev;
+ }
/*
* Allow a per-cpu kthread to stack with the wakee if the
@@ -6234,95 +7863,197 @@ symmetric:
* pattern is IO completions.
*/
if (is_per_cpu_kthread(current) &&
+ in_task() &&
prev == smp_processor_id() &&
- this_rq()->nr_running <= 1) {
+ this_rq()->nr_running <= 1 &&
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
return prev;
}
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
+ p->recent_used_cpu = prev;
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
+ cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
+ asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
+
+ if (!static_branch_unlikely(&sched_cluster_active) ||
+ cpus_share_resources(recent_used_cpu, target))
+ return recent_used_cpu;
+
+ } else {
+ recent_used_cpu = -1;
+ }
+
+ /*
+ * For asymmetric CPU capacity systems, our domain of interest is
+ * sd_asym_cpucapacity rather than sd_llc.
+ */
+ if (sched_asym_cpucap_active()) {
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
/*
- * Replace recent_used_cpu with prev as it is a potential
- * candidate for the next wake:
+ * On an asymmetric CPU capacity system where an exclusive
+ * cpuset defines a symmetric island (i.e. one unique
+ * capacity_orig value through the cpuset), the key will be set
+ * but the CPUs within that cpuset will not have a domain with
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
+ * capacity path.
*/
- p->recent_used_cpu = prev;
- return recent_used_cpu;
+ if (sd) {
+ i = select_idle_capacity(p, sd, target);
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ }
}
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;
- i = select_idle_core(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ if (sched_smt_active()) {
+ has_idle_core = test_idle_cores(target);
- i = select_idle_cpu(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
+ if (!has_idle_core && cpus_share_cache(prev, target)) {
+ i = select_idle_smt(p, sd, prev);
+ if ((unsigned int)i < nr_cpumask_bits)
+ return i;
+ }
+ }
- i = select_idle_smt(p, target);
+ i = select_idle_cpu(p, sd, has_idle_core, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
+ /*
+ * For cluster machines which have lower sharing cache like L2 or
+ * LLC Tag, we tend to find an idle CPU in the target's cluster
+ * first. But prev_cpu or recent_used_cpu may also be a good candidate,
+ * use them if possible when no idle CPU found in select_idle_cpu().
+ */
+ if ((unsigned int)prev_aff < nr_cpumask_bits)
+ return prev_aff;
+ if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+ return recent_used_cpu;
+
return target;
}
/**
- * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
- * @cpu: the CPU to get the utilization of
- *
- * The unit of the return value must be the one of capacity so we can compare
- * the utilization with the capacity of the CPU that is available for CFS task
- * (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * The estimated utilization of a CPU is defined to be the maximum between its
- * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
- * currently RUNNABLE on that CPU.
- * This allows to properly represent the expected utilization of a CPU which
- * has just got a big task running since a long sleep period. At the same time
- * however it preserves the benefits of the "blocked utilization" in
- * describing the potential for other tasks waking up on the same CPU.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- *
- * Return: the (estimated) utilization for the specified CPU
- */
-static inline unsigned long cpu_util(int cpu)
+ * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for
+ * @p: task for which the CPU utilization should be predicted or NULL
+ * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
+ * @boost: 1 to enable boosting, otherwise 0
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
+ * CPU contention for CFS tasks can be detected by CPU runnable > CPU
+ * utilization. Boosting is implemented in cpu_util() so that internal
+ * users (e.g. EAS) can use it next to external users (e.g. schedutil),
+ * latter via cpu_util_cfs_boost().
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Boosted) (estimated) utilization for the specified CPU.
+ */
+static unsigned long
+cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
{
- struct cfs_rq *cfs_rq;
- unsigned int util;
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+ unsigned long runnable;
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
+ if (boost) {
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+ util = max(util, runnable);
+ }
+
+ /*
+ * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
+ * contribution. If @p migrates from another CPU to @cpu add its
+ * contribution. In all the other cases @cpu is not impacted by the
+ * migration so its util_avg is already correct.
+ */
+ if (p && task_cpu(p) == cpu && dst_cpu != cpu)
+ lsub_positive(&util, task_util(p));
+ else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
+ util += task_util(p);
- if (sched_feat(UTIL_EST))
- util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+ if (sched_feat(UTIL_EST)) {
+ unsigned long util_est;
+
+ util_est = READ_ONCE(cfs_rq->avg.util_est);
- return min_t(unsigned long, util, capacity_orig_of(cpu));
+ /*
+ * During wake-up @p isn't enqueued yet and doesn't contribute
+ * to any cpu_rq(cpu)->cfs.avg.util_est.
+ * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
+ * has been enqueued.
+ *
+ * During exec (@dst_cpu = -1) @p is enqueued and does
+ * contribute to cpu_rq(cpu)->cfs.avg.util_est.
+ * Remove it to "simulate" cpu_util without @p's contribution.
+ *
+ * Despite the task_on_rq_queued(@p) check there is still a
+ * small window for a possible race when an exec
+ * select_task_rq_fair() races with LB's detach_task().
+ *
+ * detach_task()
+ * deactivate_task()
+ * p->on_rq = TASK_ON_RQ_MIGRATING;
+ * -------------------------------- A
+ * dequeue_task() \
+ * dequeue_task_fair() + Race Time
+ * util_est_dequeue() /
+ * -------------------------------- B
+ *
+ * The additional check "current == p" is required to further
+ * reduce the race window.
+ */
+ if (dst_cpu == cpu)
+ util_est += _task_util_est(p);
+ else if (p && unlikely(task_on_rq_queued(p) || current == p))
+ lsub_positive(&util_est, _task_util_est(p));
+
+ util = max(util, util_est);
+ }
+
+ return min(util, arch_scale_cpu_capacity(cpu));
+}
+
+unsigned long cpu_util_cfs(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 0);
+}
+
+unsigned long cpu_util_cfs_boost(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 1);
}
/*
@@ -6340,168 +8071,255 @@ static inline unsigned long cpu_util(int cpu)
*/
static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
- struct cfs_rq *cfs_rq;
- unsigned int util;
-
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util(cpu);
+ p = NULL;
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
+ return cpu_util(cpu, p, -1, 0);
+}
- /* Discount task's util from CPU's util */
- lsub_positive(&util, task_util(p));
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the IRQ utilization.
+ *
+ * The DL bandwidth number OTOH is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max)
+{
+ unsigned long util, irq, scale;
+ struct rq *rq = cpu_rq(cpu);
+
+ scale = arch_scale_cpu_capacity(cpu);
/*
- * Covered cases:
- *
- * a) if *p is the only task sleeping on this CPU, then:
- * cpu_util (== task_util) > util_est (== 0)
- * and thus we return:
- * cpu_util_without = (cpu_util - task_util) = 0
- *
- * b) if other tasks are SLEEPING on this CPU, which is now exiting
- * IDLE, then:
- * cpu_util >= task_util
- * cpu_util > util_est (== 0)
- * and thus we discount *p's blocked utilization to return:
- * cpu_util_without = (cpu_util - task_util) >= 0
- *
- * c) if other tasks are RUNNABLE on that CPU and
- * util_est > cpu_util
- * then we use util_est since it returns a more restrictive
- * estimation of the spare capacity on that CPU, by just
- * considering the expected utilization of tasks already
- * runnable on that CPU.
- *
- * Cases a) and b) are covered by the above code, while case c) is
- * covered by the following code when estimated utilization is
- * enabled.
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
*/
- if (sched_feat(UTIL_EST)) {
- unsigned int estimated =
- READ_ONCE(cfs_rq->avg.util_est.enqueued);
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+ if (min) {
/*
- * Despite the following checks we still have a small window
- * for a possible race, when an execl's select_task_rq_fair()
- * races with LB's detach_task():
- *
- * detach_task()
- * p->on_rq = TASK_ON_RQ_MIGRATING;
- * ---------------------------------- A
- * deactivate_task() \
- * dequeue_task() + RaceTime
- * util_est_dequeue() /
- * ---------------------------------- B
- *
- * The additional check on "current == p" it's required to
- * properly fix the execl regression and it helps in further
- * reducing the chances for the above race.
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time from the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
*/
- if (unlikely(task_on_rq_queued(p) || current == p))
- lsub_positive(&estimated, _task_util_est(p));
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
- util = max(util, estimated);
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
}
/*
- * Utilization (estimated) can exceed the CPU capacity, thus let's
- * clamp to the maximum CPU capacity to ensure consistency with
- * the cpu_util call.
+ * Because the time spent on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ util += cpu_util_dl(rq);
+
+ /*
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
+ */
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+
+ if (util >= scale)
+ return scale;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * IRQ metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ * max - irq
+ * U' = irq + --------- * U
+ * max
*/
- return min_t(unsigned long, util, capacity_orig_of(cpu));
+ util = scale_irq_capacity(util, irq, scale);
+ util += irq;
+
+ return min(scale, util);
+}
+
+unsigned long sched_cpu_util(int cpu)
+{
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
}
/*
- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
- * to @dst_cpu.
+ * energy_env - Utilization landscape for energy estimation.
+ * @task_busy_time: Utilization contribution by the task for which we test the
+ * placement. Given by eenv_task_busy_time().
+ * @pd_busy_time: Utilization of the whole perf domain without the task
+ * contribution. Given by eenv_pd_busy_time().
+ * @cpu_cap: Maximum CPU capacity for the perf domain.
+ * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
*/
-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+struct energy_env {
+ unsigned long task_busy_time;
+ unsigned long pd_busy_time;
+ unsigned long cpu_cap;
+ unsigned long pd_cap;
+};
+
+/*
+ * Compute the task busy time for compute_energy(). This time cannot be
+ * injected directly into effective_cpu_util() because of the IRQ scaling.
+ * The latter only makes sense with the most recent CPUs where the task has
+ * run.
+ */
+static inline void eenv_task_busy_time(struct energy_env *eenv,
+ struct task_struct *p, int prev_cpu)
{
- struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
- unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+ unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
+ unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
- /*
- * If @p migrates from @cpu to another, remove its contribution. Or,
- * if @p migrates from another CPU to @cpu, add its contribution. In
- * the other cases, @cpu is not impacted by the migration, so the
- * util_avg should already be correct.
- */
- if (task_cpu(p) == cpu && dst_cpu != cpu)
- sub_positive(&util, task_util(p));
- else if (task_cpu(p) != cpu && dst_cpu == cpu)
- util += task_util(p);
+ if (unlikely(irq >= max_cap))
+ busy_time = max_cap;
+ else
+ busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
- if (sched_feat(UTIL_EST)) {
- util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+ eenv->task_busy_time = busy_time;
+}
- /*
- * During wake-up, the task isn't enqueued yet and doesn't
- * appear in the cfs_rq->avg.util_est.enqueued of any rq,
- * so just add it (if needed) to "simulate" what will be
- * cpu_util() after the task has been enqueued.
- */
- if (dst_cpu == cpu)
- util_est += _task_util_est(p);
+/*
+ * Compute the perf_domain (PD) busy time for compute_energy(). Based on the
+ * utilization of each CPU in @pd_cpus, it however doesn't take into account
+ * clamping since the ratio (utilization / cpu_capacity) is already enough to
+ * scale the EM reported power consumption at the (possibly clamped)
+ * cpu_capacity.
+ *
+ * The contribution of the task @p for which we want to estimate the
+ * energy cost is removed (by cpu_util()) and must be calculated
+ * separately (see eenv_task_busy_time). This ensures:
+ *
+ * - A stable PD utilization, no matter which CPU of that PD we want to place
+ * the task on.
+ *
+ * - A fair comparison between CPUs as the task contribution (task_util())
+ * will always be the same no matter which CPU utilization we rely on
+ * (util_avg or util_est).
+ *
+ * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
+ * exceed @eenv->pd_cap.
+ */
+static inline void eenv_pd_busy_time(struct energy_env *eenv,
+ struct cpumask *pd_cpus,
+ struct task_struct *p)
+{
+ unsigned long busy_time = 0;
+ int cpu;
- util = max(util, util_est);
+ for_each_cpu(cpu, pd_cpus) {
+ unsigned long util = cpu_util(cpu, p, -1, 0);
+
+ busy_time += effective_cpu_util(cpu, util, NULL, NULL);
}
- return min(util, capacity_orig_of(cpu));
+ eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
}
/*
- * compute_energy(): Estimates the energy that @pd would consume if @p was
- * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of @pd's CPUs after the task migration, and uses the Energy Model
- * to compute what would be the energy if we decided to actually migrate that
- * task.
+ * Compute the maximum utilization for compute_energy() when the task @p
+ * is placed on the cpu @dst_cpu.
+ *
+ * Returns the maximum utilization among @pd_cpus. This utilization can't
+ * exceed @eenv->cpu_cap.
*/
-static long
-compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+static inline unsigned long
+eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+ struct task_struct *p, int dst_cpu)
{
- struct cpumask *pd_mask = perf_domain_span(pd);
- unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
- unsigned long max_util = 0, sum_util = 0;
+ unsigned long max_util = 0;
int cpu;
- /*
- * The capacity state of CPUs of the current rd can be driven by CPUs
- * of another rd if they belong to the same pd. So, account for the
- * utilization of these CPUs too by masking pd with cpu_online_mask
- * instead of the rd span.
- *
- * If an entire pd is outside of the current rd, it will not appear in
- * its pd list and will not be accounted by compute_energy().
- */
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
- struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
-
- /*
- * Busy time computation: utilization clamping is not
- * required since the ratio (sum_util / cpu_capacity)
- * is already enough to scale the EM reported power
- * consumption at the (eventually clamped) cpu_capacity.
- */
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- ENERGY_UTIL, NULL);
+ for_each_cpu(cpu, pd_cpus) {
+ struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
+ unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+ unsigned long eff_util, min, max;
/*
* Performance domain frequency: utilization clamping
* must be considered since it affects the selection
* of the performance domain frequency.
- * NOTE: in case RT tasks are running, by default the
- * FREQUENCY_UTIL's utilization can be max OPP.
+ * NOTE: in case RT tasks are running, by default the min
+ * utilization can be max OPP.
*/
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
+ eff_util = effective_cpu_util(cpu, util, &min, &max);
+
+ /* Task's uclamp can modify min and max value */
+ if (tsk && uclamp_is_used()) {
+ min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
+
+ /*
+ * If there is no active max uclamp constraint,
+ * directly use task's one, otherwise keep max.
+ */
+ if (uclamp_rq_is_idle(cpu_rq(cpu)))
+ max = uclamp_eff_value(p, UCLAMP_MAX);
+ else
+ max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
+ }
+
+ eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
+ max_util = max(max_util, eff_util);
}
- return em_pd_energy(pd->em_pd, max_util, sum_util);
+ return min(max_util, eenv->cpu_cap);
+}
+
+/*
+ * compute_energy(): Use the Energy Model to estimate the energy that @pd would
+ * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
+ * contribution is ignored.
+ */
+static inline unsigned long
+compute_energy(struct energy_env *eenv, struct perf_domain *pd,
+ struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
+{
+ unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+ unsigned long busy_time = eenv->pd_busy_time;
+ unsigned long energy;
+
+ if (dst_cpu >= 0)
+ busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
+
+ energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+
+ trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
+
+ return energy;
}
/*
@@ -6536,7 +8354,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* NOTE: Forkees are not accepted in the energy-aware wake-up path because
* they don't have any useful utilization data yet and it's not possible to
* forecast their impact on energy consumption. Consequently, they will be
- * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
* to be energy-inefficient in some use-cases. The alternative would be to
* bias new tasks towards specific types of CPUs first, or to try to infer
* their util_avg from the parent task, but those heuristics could hurt
@@ -6545,17 +8363,23 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
*/
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
- unsigned long cpu_cap, util, base_energy = 0;
- int cpu, best_energy_cpu = prev_cpu;
+ unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
+ unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
+ struct root_domain *rd = this_rq()->rd;
+ int cpu, best_energy_cpu, target = -1;
+ int prev_fits = -1, best_fits = -1;
+ unsigned long best_actual_cap = 0;
+ unsigned long prev_actual_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
+ struct energy_env eenv;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
- if (!pd || READ_ONCE(rd->overutilized))
- goto fail;
+ if (!pd)
+ goto unlock;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6565,113 +8389,198 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
- goto fail;
+ goto unlock;
+
+ target = prev_cpu;
sync_entity_load_avg(&p->se);
- if (!task_util_est(p))
+ if (!task_util_est(p) && p_util_min == 0)
goto unlock;
+ eenv_task_busy_time(&eenv, p, prev_cpu);
+
for (; pd; pd = pd->next) {
- unsigned long cur_delta, spare_cap, max_spare_cap = 0;
- unsigned long base_energy_pd;
+ unsigned long util_min = p_util_min, util_max = p_util_max;
+ unsigned long cpu_cap, cpu_actual_cap, util;
+ long prev_spare_cap = -1, max_spare_cap = -1;
+ unsigned long rq_util_min, rq_util_max;
+ unsigned long cur_delta, base_energy;
int max_spare_cap_cpu = -1;
+ int fits, max_fits = -1;
- /* Compute the 'base' energy of the pd, without @p */
- base_energy_pd = compute_energy(p, -1, pd);
- base_energy += base_energy_pd;
+ cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
+
+ if (cpumask_empty(cpus))
+ continue;
+
+ /* Account external pressure for the energy estimation */
+ cpu = cpumask_first(cpus);
+ cpu_actual_cap = get_actual_cpu_capacity(cpu);
+
+ eenv.cpu_cap = cpu_actual_cap;
+ eenv.pd_cap = 0;
+
+ for_each_cpu(cpu, cpus) {
+ struct rq *rq = cpu_rq(cpu);
+
+ eenv.pd_cap += cpu_actual_cap;
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- util = cpu_util_next(cpu, p, cpu);
+ util = cpu_util(cpu, p, cpu, 0);
cpu_cap = capacity_of(cpu);
- spare_cap = cpu_cap - util;
/*
* Skip CPUs that cannot satisfy the capacity request.
* IOW, placing the task there would make the CPU
* overutilized. Take uclamp into account to see how
* much capacity we can get out of the CPU; this is
- * aligned with schedutil_cpu_util().
+ * aligned with sched_cpu_util().
*/
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
- if (!fits_capacity(util, cpu_cap))
+ if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
+ /*
+ * Open code uclamp_rq_util_with() except for
+ * the clamp() part. I.e.: apply max aggregation
+ * only. util_fits_cpu() logic needs to
+ * operate on non-clamped util but must use the
+ * max-aggregated uclamp_{min, max}.
+ */
+ rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);
+
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
+ }
+
+ fits = util_fits_cpu(util, util_min, util_max, cpu);
+ if (!fits)
continue;
- /* Always use prev_cpu as a candidate. */
+ lsub_positive(&cpu_cap, util);
+
if (cpu == prev_cpu) {
- prev_delta = compute_energy(p, prev_cpu, pd);
- prev_delta -= base_energy_pd;
- best_delta = min(best_delta, prev_delta);
+ /* Always use prev_cpu as a candidate. */
+ prev_spare_cap = cpu_cap;
+ prev_fits = fits;
+ } else if ((fits > max_fits) ||
+ ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
+ /*
+ * Find the CPU with the maximum spare capacity
+ * among the remaining CPUs in the performance
+ * domain.
+ */
+ max_spare_cap = cpu_cap;
+ max_spare_cap_cpu = cpu;
+ max_fits = fits;
}
+ }
+
+ if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
+ continue;
+
+ eenv_pd_busy_time(&eenv, cpus, p);
+ /* Compute the 'base' energy of the pd, without @p */
+ base_energy = compute_energy(&eenv, pd, cpus, p, -1);
+
+ /* Evaluate the energy impact of using prev_cpu. */
+ if (prev_spare_cap > -1) {
+ prev_delta = compute_energy(&eenv, pd, cpus, p,
+ prev_cpu);
+ /* CPU utilization has changed */
+ if (prev_delta < base_energy)
+ goto unlock;
+ prev_delta -= base_energy;
+ prev_actual_cap = cpu_actual_cap;
+ best_delta = min(best_delta, prev_delta);
+ }
+
+ /* Evaluate the energy impact of using max_spare_cap_cpu. */
+ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
+ /* Current best energy cpu fits better */
+ if (max_fits < best_fits)
+ continue;
/*
- * Find the CPU with the maximum spare capacity in
- * the performance domain
+ * Both don't fit performance hint (i.e. uclamp_min)
+ * but best energy cpu has better capacity.
*/
- if (spare_cap > max_spare_cap) {
- max_spare_cap = spare_cap;
- max_spare_cap_cpu = cpu;
- }
- }
+ if ((max_fits < 0) &&
+ (cpu_actual_cap <= best_actual_cap))
+ continue;
- /* Evaluate the energy impact of using this CPU. */
- if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
- cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
- cur_delta -= base_energy_pd;
- if (cur_delta < best_delta) {
- best_delta = cur_delta;
- best_energy_cpu = max_spare_cap_cpu;
- }
+ cur_delta = compute_energy(&eenv, pd, cpus, p,
+ max_spare_cap_cpu);
+ /* CPU utilization has changed */
+ if (cur_delta < base_energy)
+ goto unlock;
+ cur_delta -= base_energy;
+
+ /*
+ * Both fit for the task but best energy cpu has lower
+ * energy impact.
+ */
+ if ((max_fits > 0) && (best_fits > 0) &&
+ (cur_delta >= best_delta))
+ continue;
+
+ best_delta = cur_delta;
+ best_energy_cpu = max_spare_cap_cpu;
+ best_fits = max_fits;
+ best_actual_cap = cpu_actual_cap;
}
}
-unlock:
rcu_read_unlock();
- /*
- * Pick the best CPU if prev_cpu cannot be used, or if it saves at
- * least 6% of the energy used by prev_cpu.
- */
- if (prev_delta == ULONG_MAX)
- return best_energy_cpu;
-
- if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
- return best_energy_cpu;
+ if ((best_fits > prev_fits) ||
+ ((best_fits > 0) && (best_delta < prev_delta)) ||
+ ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
+ target = best_energy_cpu;
- return prev_cpu;
+ return target;
-fail:
+unlock:
rcu_read_unlock();
- return -1;
+ return target;
}
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
- * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
+ * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
* Balances load by selecting the idlest CPU in the idlest group, or under
* certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
*
* Returns the target CPU number.
- *
- * preempt must be disabled.
*/
static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
struct sched_domain *tmp, *sd = NULL;
int cpu = smp_processor_id();
int new_cpu = prev_cpu;
int want_affine = 0;
- int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+ /* SD_flags and WF_flags share the first nibble */
+ int sd_flag = wake_flags & 0xF;
- if (sd_flag & SD_BALANCE_WAKE) {
+ /*
+ * required for stable ->cpus_ptr
+ */
+ lockdep_assert_held(&p->pi_lock);
+ if (wake_flags & WF_TTWU) {
record_wakee(p);
- if (sched_energy_enabled()) {
+ if ((wake_flags & WF_CURRENT_CPU) &&
+ cpumask_test_cpu(cpu, p->cpus_ptr))
+ return cpu;
+
+ if (!is_rd_overutilized(this_rq()->rd)) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
return new_cpu;
@@ -6696,6 +8605,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
break;
}
+ /*
+ * Usually only true for WF_EXEC and WF_FORK, as sched_domains
+ * usually do not have SD_BALANCE_WAKE set. That means wakeup
+ * will usually go to the fast path.
+ */
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)
@@ -6704,22 +8618,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (unlikely(sd)) {
/* Slow path */
- new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
- } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+ new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ } else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
-
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-
- if (want_affine)
- current->recent_used_cpu = cpu;
}
rcu_read_unlock();
return new_cpu;
}
-static void detach_entity_cfs_rq(struct sched_entity *se);
-
/*
* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
@@ -6727,181 +8635,126 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
*/
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
- /*
- * As blocked tasks retain absolute vruntime the migration needs to
- * deal with this by subtracting the old and adding the new
- * min_vruntime -- the latter is done by enqueue_entity() when placing
- * the task on the new runqueue.
- */
- if (p->state == TASK_WAKING) {
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
-
- do {
- min_vruntime_copy = cfs_rq->min_vruntime_copy;
- smp_rmb();
- min_vruntime = cfs_rq->min_vruntime;
- } while (min_vruntime != min_vruntime_copy);
-#else
- min_vruntime = cfs_rq->min_vruntime;
-#endif
+ struct sched_entity *se = &p->se;
- se->vruntime -= min_vruntime;
- }
+ if (!task_on_rq_migrating(p)) {
+ remove_entity_load_avg(se);
- if (p->on_rq == TASK_ON_RQ_MIGRATING) {
/*
- * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
- * rq->lock and can modify state directly.
- */
- lockdep_assert_held(&task_rq(p)->lock);
- detach_entity_cfs_rq(&p->se);
-
- } else {
- /*
- * We are supposed to update the task to "current" time, then
- * its up to date and ready to go to new CPU/cfs_rq. But we
- * have difficulty in getting what current time is, so simply
- * throw away the out-of-date time. This will result in the
- * wakee task is less decayed, but giving the wakee more load
- * sounds not bad.
+ * Here, the task's PELT values have been updated according to
+ * the current rq's clock. But if that clock hasn't been
+ * updated in a while, a substantial idle time will be missed,
+ * leading to an inflation after wake-up on the new rq.
+ *
+ * Estimate the missing time from the cfs_rq last_update_time
+ * and update sched_avg to improve the PELT continuity after
+ * migration.
*/
- remove_entity_load_avg(&p->se);
+ migrate_se_pelt_lag(se);
}
/* Tell new CPU we are migrated */
- p->se.avg.last_update_time = 0;
-
- /* We have migrated, no longer consider this task hot */
- p->se.exec_start = 0;
+ se->avg.last_update_time = 0;
update_scan_period(p, new_cpu);
}
static void task_dead_fair(struct task_struct *p)
{
- remove_entity_load_avg(&p->se);
-}
-
-static int
-balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
-{
- if (rq->nr_running)
- return 1;
+ struct sched_entity *se = &p->se;
- return newidle_balance(rq, rf) != 0;
-}
-#endif /* CONFIG_SMP */
+ if (se->sched_delayed) {
+ struct rq_flags rf;
+ struct rq *rq;
-static unsigned long wakeup_gran(struct sched_entity *se)
-{
- unsigned long gran = sysctl_sched_wakeup_granularity;
+ rq = task_rq_lock(p, &rf);
+ if (se->sched_delayed) {
+ update_rq_clock(rq);
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ }
+ task_rq_unlock(rq, p, &rf);
+ }
- /*
- * Since its curr running now, convert the gran from real-time
- * to virtual-time in his units.
- *
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- return calc_delta_fair(gran, se);
+ remove_entity_load_avg(se);
}
/*
- * Should 'se' preempt 'curr'.
- *
- * |s1
- * |s2
- * |s3
- * g
- * |<--->|c
- *
- * w(c, s1) = -1
- * w(c, s2) = 0
- * w(c, s3) = 1
- *
+ * Set the max capacity the task is allowed to run at for misfit detection.
*/
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+static void set_task_max_allowed_capacity(struct task_struct *p)
{
- s64 gran, vdiff = curr->vruntime - se->vruntime;
+ struct asym_cap_data *entry;
- if (vdiff <= 0)
- return -1;
+ if (!sched_asym_cpucap_active())
+ return;
- gran = wakeup_gran(se);
- if (vdiff > gran)
- return 1;
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ cpumask_t *cpumask;
- return 0;
+ cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ p->max_allowed_capacity = entry->capacity;
+ break;
+ }
+ rcu_read_unlock();
}
-static void set_last_buddy(struct sched_entity *se)
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
+ set_cpus_allowed_common(p, ctx);
+ set_task_max_allowed_capacity(p);
+}
- for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
- return;
- cfs_rq_of(se)->last = se;
- }
+static int
+balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ if (sched_fair_runnable(rq))
+ return 1;
+
+ return sched_balance_newidle(rq, rf) != 0;
}
+#else
+static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
+#endif /* CONFIG_SMP */
static void set_next_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
-
for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq))
return;
+ if (se_is_idle(se))
+ return;
cfs_rq_of(se)->next = se;
}
}
-static void set_skip_buddy(struct sched_entity *se)
-{
- for_each_sched_entity(se)
- cfs_rq_of(se)->skip = se;
-}
-
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
- struct task_struct *curr = rq->curr;
- struct sched_entity *se = &curr->se, *pse = &p->se;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int scale = cfs_rq->nr_running >= sched_nr_latency;
- int next_buddy_marked = 0;
+ struct task_struct *donor = rq->donor;
+ struct sched_entity *se = &donor->se, *pse = &p->se;
+ struct cfs_rq *cfs_rq = task_cfs_rq(donor);
+ int cse_is_idle, pse_is_idle;
if (unlikely(se == pse))
return;
/*
* This is possible from callers such as attach_tasks(), in which we
- * unconditionally check_prempt_curr() after an enqueue (which may have
+ * unconditionally wakeup_preempt() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
return;
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
set_next_buddy(pse);
- next_buddy_marked = 1;
}
/*
@@ -6914,122 +8767,120 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
* prevents us from potentially nominating it as a false LAST_BUDDY
* below.
*/
- if (test_tsk_need_resched(curr))
+ if (test_tsk_need_resched(rq->curr))
+ return;
+
+ if (!sched_feat(WAKEUP_PREEMPTION))
return;
- /* Idle tasks are by definition preempted by non-idle tasks. */
- if (unlikely(task_has_idle_policy(curr)) &&
- likely(!task_has_idle_policy(p)))
+ find_matching_se(&se, &pse);
+ WARN_ON_ONCE(!pse);
+
+ cse_is_idle = se_is_idle(se);
+ pse_is_idle = se_is_idle(pse);
+
+ /*
+ * Preempt an idle entity in favor of a non-idle entity (and don't preempt
+ * in the inverse case).
+ */
+ if (cse_is_idle && !pse_is_idle)
goto preempt;
+ if (cse_is_idle != pse_is_idle)
+ return;
/*
- * Batch and idle tasks do not preempt non-idle tasks (their preemption
- * is driven by the tick):
+ * BATCH and IDLE tasks do not preempt others.
*/
- if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
+ if (unlikely(!normal_policy(p->policy)))
return;
- find_matching_se(&se, &pse);
- update_curr(cfs_rq_of(se));
- BUG_ON(!pse);
- if (wakeup_preempt_entity(se, pse) == 1) {
- /*
- * Bias pick_next to pick the sched entity that is
- * triggering this preemption.
- */
- if (!next_buddy_marked)
- set_next_buddy(pse);
+ cfs_rq = cfs_rq_of(se);
+ update_curr(cfs_rq);
+ /*
+ * If @p has a shorter slice than current and @p is eligible, override
+ * current's slice protection in order to allow preemption.
+ *
+ * Note that even if @p does not turn out to be the most eligible
+ * task at this moment, current's slice protection will be lost.
+ */
+ if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
+ se->vlag = se->deadline + 1;
+
+ /*
+ * If @p has become the most eligible task, force preemption.
+ */
+ if (pick_eevdf(cfs_rq) == pse)
goto preempt;
- }
return;
preempt:
- resched_curr(rq);
- /*
- * Only set the backward buddy when the current task is still
- * on the rq. This can happen when a wakeup gets interleaved
- * with schedule on the ->pre_schedule() or idle_balance()
- * point, either of which can * drop the rq lock.
- *
- * Also, during early boot the idle thread is in the fair class,
- * for obvious reasons its a bad idea to schedule back to it.
- */
- if (unlikely(!se->on_rq || curr == rq->idle))
- return;
+ resched_curr_lazy(rq);
+}
+
+static struct task_struct *pick_task_fair(struct rq *rq)
+{
+ struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+
+again:
+ cfs_rq = &rq->cfs;
+ if (!cfs_rq->nr_queued)
+ return NULL;
+
+ do {
+ /* Might not have done put_prev_entity() */
+ if (cfs_rq->curr && cfs_rq->curr->on_rq)
+ update_curr(cfs_rq);
+
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto again;
+
+ se = pick_next_entity(rq, cfs_rq);
+ if (!se)
+ goto again;
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
- set_last_buddy(se);
+ return task_of(se);
}
+static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
+
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
struct task_struct *p;
int new_tasks;
again:
- if (!sched_fair_runnable(rq))
+ p = pick_task_fair(rq);
+ if (!p)
goto idle;
+ se = &p->se;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (!prev || prev->sched_class != &fair_sched_class)
+ if (prev->sched_class != &fair_sched_class)
goto simple;
+ __put_prev_set_next_dl_server(rq, prev, p);
+
/*
* Because of the set_next_buddy() in dequeue_task_fair() it is rather
* likely that a next task is from the same cgroup as the current.
*
* Therefore attempt to avoid putting and setting the entire cgroup
* hierarchy, only change the part that actually changes.
- */
-
- do {
- struct sched_entity *curr = cfs_rq->curr;
-
- /*
- * Since we got here without doing put_prev_entity() we also
- * have to consider cfs_rq->curr. If it is still a runnable
- * entity, update_curr() will update its vruntime, otherwise
- * forget we've ever seen it.
- */
- if (curr) {
- if (curr->on_rq)
- update_curr(cfs_rq);
- else
- curr = NULL;
-
- /*
- * This call to check_cfs_rq_runtime() will do the
- * throttle and dequeue its entity in the parent(s).
- * Therefore the nr_running test will indeed
- * be correct.
- */
- if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
- cfs_rq = &rq->cfs;
-
- if (!cfs_rq->nr_running)
- goto idle;
-
- goto simple;
- }
- }
-
- se = pick_next_entity(cfs_rq, curr);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
-
- p = task_of(se);
-
- /*
+ *
* Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the
* least amount of cfs_rqs.
*/
if (prev != p) {
struct sched_entity *pse = &prev->se;
+ struct cfs_rq *cfs_rq;
while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
@@ -7047,47 +8898,25 @@ again:
put_prev_entity(cfs_rq, pse);
set_next_entity(cfs_rq, se);
- }
- goto done;
-simple:
-#endif
- if (prev)
- put_prev_task(rq, prev);
-
- do {
- se = pick_next_entity(cfs_rq, NULL);
- set_next_entity(cfs_rq, se);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
+ __set_next_task_fair(rq, p, true);
+ }
- p = task_of(se);
+ return p;
-done: __maybe_unused;
-#ifdef CONFIG_SMP
- /*
- * Move the next running task to the front of
- * the list, so our cfs_tasks list becomes MRU
- * one.
- */
- list_move(&p->se.group_node, &rq->cfs_tasks);
+simple:
#endif
-
- if (hrtick_enabled(rq))
- hrtick_start_fair(rq, p);
-
- update_misfit_status(p, rq);
-
+ put_prev_set_next_task(rq, prev, p);
return p;
idle:
if (!rf)
return NULL;
- new_tasks = newidle_balance(rq, rf);
+ new_tasks = sched_balance_newidle(rq, rf);
/*
- * Because newidle_balance() releases (and re-acquires) rq->lock, it is
+ * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
@@ -7106,15 +8935,34 @@ idle:
return NULL;
}
-static struct task_struct *__pick_next_task_fair(struct rq *rq)
+static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+{
+ return pick_next_task_fair(rq, prev, NULL);
+}
+
+static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
+{
+ return !!dl_se->rq->cfs.nr_queued;
+}
+
+static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
+{
+ return pick_task_fair(dl_se->rq);
+}
+
+void fair_server_init(struct rq *rq)
{
- return pick_next_task_fair(rq, NULL, NULL);
+ struct sched_dl_entity *dl_se = &rq->fair_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
}
/*
* Account for a descheduled task:
*/
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
@@ -7127,8 +8975,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
/*
* sched_yield() is very simple
- *
- * The magic of dealing with the ->skip buddy is in pick_next_entity.
*/
static void yield_task_fair(struct rq *rq)
{
@@ -7144,24 +8990,22 @@ static void yield_task_fair(struct rq *rq)
clear_buddies(cfs_rq, se);
- if (curr->policy != SCHED_BATCH) {
- update_rq_clock(rq);
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
- /*
- * Tell update_rq_clock() that we've just updated,
- * so we don't do microscopic update in schedule()
- * and double the fastpath cost.
- */
- rq_clock_skip_update(rq);
- }
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq_clock_skip_update(rq);
- set_skip_buddy(se);
+ se->deadline += calc_delta_fair(se->slice, se);
}
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -7169,7 +9013,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
return false;
- /* Tell the scheduler that we'd really like pse to run next. */
+ /* Tell the scheduler that we'd really like se to run next. */
set_next_buddy(se);
yield_task_fair(rq);
@@ -7227,7 +9071,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* topology where each level pairs two lower groups (or better). This results
* in O(log n) layers. Furthermore we reduce the number of CPUs going up the
* tree to only the first of the previous level and we decrease the frequency
- * of load-balance at each level inv. proportional to the number of CPUs in
+ * of load-balance at each level inversely proportional to the number of CPUs in
* the groups.
*
* This yields:
@@ -7316,11 +9160,16 @@ enum group_type {
*/
group_fully_busy,
/*
- * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
- * and must be migrated to a more powerful CPU.
+ * One task doesn't fit with CPU's capacity and must be migrated to a
+ * more powerful CPU.
*/
group_misfit_task,
/*
+ * Balance SMT group that's fully busy. Can benefit from migrating
+ * a task on SMT with busy sibling to another CPU on idle core.
+ */
+ group_smt_balance,
+ /*
* SD_ASYM_PACKING only: One local CPU with higher capacity is available,
* and the task should be migrated to it instead of running on the
* current CPU.
@@ -7349,8 +9198,7 @@ enum migration_type {
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
-#define LBF_NOHZ_STATS 0x10
-#define LBF_NOHZ_AGAIN 0x20
+#define LBF_ACTIVE_LB 0x10
struct lb_env {
struct sched_domain *sd;
@@ -7386,7 +9234,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
if (p->sched_class != &fair_sched_class)
return 0;
@@ -7394,16 +9242,27 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (unlikely(task_has_idle_policy(p)))
return 0;
+ /* SMT siblings share cache */
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
+ return 0;
+
/*
* Buddy candidates are cache hot:
*/
if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
- (&p->se == cfs_rq_of(&p->se)->next ||
- &p->se == cfs_rq_of(&p->se)->last))
+ (&p->se == cfs_rq_of(&p->se)->next))
return 1;
if (sysctl_sched_migration_cost == -1)
return 1;
+
+ /*
+ * Don't migrate task if the task's cookie does not match
+ * with the destination CPU's core cookie.
+ */
+ if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
+ return 1;
+
if (sysctl_sched_migration_cost == 0)
return 0;
@@ -7414,43 +9273,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
#ifdef CONFIG_NUMA_BALANCING
/*
- * Returns 1, if task migration degrades locality
- * Returns 0, if task migration improves locality i.e migration preferred.
- * Returns -1, if task migration is not affected by locality.
+ * Returns a positive value, if task migration degrades locality.
+ * Returns 0, if task migration is not affected by locality.
+ * Returns a negative value, if task migration improves locality, i.e. migration is preferred.
*/
-static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
if (!static_branch_likely(&sched_numa_balancing))
- return -1;
+ return 0;
if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- return -1;
+ return 0;
src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);
if (src_nid == dst_nid)
- return -1;
+ return 0;
/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid) {
if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
return 1;
else
- return -1;
+ return 0;
}
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
- return 0;
+ return -1;
/* Leaving a core idle is often worse than degrading locality. */
if (env->idle == CPU_IDLE)
- return -1;
+ return 0;
dist = node_distance(src_nid, dst_nid);
if (numa_group) {
@@ -7461,41 +9320,85 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
dst_weight = task_weight(p, dst_nid, dist);
}
- return dst_weight < src_weight;
+ return src_weight - dst_weight;
}
#else
-static inline int migrate_degrades_locality(struct task_struct *p,
+static inline long migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
- return -1;
+ return 0;
}
#endif
/*
+ * Check whether the task is ineligible on the destination cpu
+ *
+ * When the PLACE_LAG scheduling feature is enabled and
+ * dst_cfs_rq->nr_queued is non-zero, if the task
+ * is ineligible, it will also be ineligible when
+ * it is migrated to the destination cpu.
+ */
+static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu)
+{
+ struct cfs_rq *dst_cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu];
+#else
+ dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
+#endif
+ if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued &&
+ !entity_eligible(task_cfs_rq(p), &p->se))
+ return 1;
+
+ return 0;
+}
+
+/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
- int tsk_cache_hot;
+ long degrades, hot;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
+ if (p->sched_task_hot)
+ p->sched_task_hot = 0;
/*
* We do not migrate tasks that are:
- * 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU.
+ * 1) delayed dequeued unless we migrate load, or
+ * 2) throttled_lb_pair, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU.
*/
+ if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
+ return 0;
+
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
+ /*
+ * We want to prioritize the migration of eligible tasks.
+ * For ineligible tasks we soft-limit them and only allow
+ * them to migrate when nr_balance_failed is non-zero to
+ * avoid load-balancing trying very hard to balance the load.
+ */
+ if (!env->sd->nr_balance_failed &&
+ task_is_ineligible_on_dst_cpu(p, env->dst_cpu))
+ return 0;
+
+ /* Disregard percpu kthreads; they are where they need to be. */
+ if (kthread_is_per_cpu(p))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
- schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
+ schedstat_inc(p->stats.nr_failed_migrations_affine);
env->flags |= LBF_SOME_PINNED;
@@ -7504,10 +9407,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
- * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
- * already computed one in current iteration.
+ * Avoid computing new_dst_cpu
+ * - for NEWLY_IDLE
+ * - if we have already computed one in current iteration
+ * - if it's an active balance
*/
- if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
+ if (env->idle == CPU_NEWLY_IDLE ||
+ env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
return 0;
/* Prevent to re-select dst_cpu via env's CPUs: */
@@ -7522,34 +9428,37 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0;
}
- /* Record that we found atleast one task that could run on dst_cpu */
+ /* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (task_running(env->src_rq, p)) {
- schedstat_inc(p->se.statistics.nr_failed_migrations_running);
+ if (task_on_cpu(env->src_rq, p)) {
+ schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
/*
* Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
- */
- tsk_cache_hot = migrate_degrades_locality(p, env);
- if (tsk_cache_hot == -1)
- tsk_cache_hot = task_hot(p, env);
-
- if (tsk_cache_hot <= 0 ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- if (tsk_cache_hot == 1) {
- schedstat_inc(env->sd->lb_hot_gained[env->idle]);
- schedstat_inc(p->se.statistics.nr_forced_migrations);
- }
+ * 1) active balance
+ * 2) destination numa is preferred
+ * 3) task is cache cold, or
+ * 4) too many balance attempts have failed.
+ */
+ if (env->flags & LBF_ACTIVE_LB)
+ return 1;
+
+ degrades = migrate_degrades_locality(p, env);
+ if (!degrades)
+ hot = task_hot(p, env);
+ else
+ hot = degrades > 0;
+
+ if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+ if (hot)
+ p->sched_task_hot = 1;
return 1;
}
- schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
+ schedstat_inc(p->stats.nr_failed_migrations_hot);
return 0;
}
@@ -7558,7 +9467,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
*/
static void detach_task(struct task_struct *p, struct lb_env *env)
{
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
+
+ if (p->sched_task_hot) {
+ p->sched_task_hot = 0;
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ schedstat_inc(p->stats.nr_forced_migrations);
+ }
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
@@ -7574,7 +9489,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
list_for_each_entry_reverse(p,
&env->src_rq->cfs_tasks, se.group_node) {
@@ -7595,8 +9510,6 @@ static struct task_struct *detach_one_task(struct lb_env *env)
return NULL;
}
-static const unsigned int sched_nr_migrate_break = 32;
-
/*
* detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
@@ -7610,7 +9523,16 @@ static int detach_tasks(struct lb_env *env)
struct task_struct *p;
int detached = 0;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_rq_held(env->src_rq);
+
+ /*
+ * Source run queue has been emptied by another CPU, clear
+ * LBF_ALL_PINNED flag as we will not test any task.
+ */
+ if (env->src_rq->nr_running <= 1) {
+ env->flags &= ~LBF_ALL_PINNED;
+ return 0;
+ }
if (env->imbalance <= 0)
return 0;
@@ -7620,11 +9542,9 @@ static int detach_tasks(struct lb_env *env)
* We don't want to steal all, otherwise we may be treated likewise,
* which could at worst lead to a livelock crash.
*/
- if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+ if (env->idle && env->src_rq->nr_running <= 1)
break;
- p = list_last_entry(tasks, struct task_struct, se.group_node);
-
env->loop++;
/* We've more or less seen every task there is, call it quits */
if (env->loop > env->loop_max)
@@ -7632,11 +9552,13 @@ static int detach_tasks(struct lb_env *env)
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
- env->loop_break += sched_nr_migrate_break;
+ env->loop_break += SCHED_NR_MIGRATE_BREAK;
env->flags |= LBF_NEED_BREAK;
break;
}
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
+
if (!can_migrate_task(p, env))
goto next;
@@ -7661,8 +9583,7 @@ static int detach_tasks(struct lb_env *env)
* scheduler fails to find a good waiting task to
* migrate.
*/
- if (load/2 > env->imbalance &&
- env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= load;
@@ -7671,7 +9592,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_util:
util = task_util_est(p);
- if (util > env->imbalance)
+ if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= util;
@@ -7683,7 +9604,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_misfit:
/* This is not a misfit task */
- if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ if (task_fits_cpu(p, env->src_cpu))
goto next;
env->imbalance = 0;
@@ -7714,6 +9635,9 @@ static int detach_tasks(struct lb_env *env)
continue;
next:
+ if (p->sched_task_hot)
+ schedstat_inc(p->stats.nr_failed_migrations_hot);
+
list_move(&p->se.group_node, tasks);
}
@@ -7732,11 +9656,11 @@ next:
*/
static void attach_task(struct rq *rq, struct task_struct *p)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
- BUG_ON(task_rq(p) != rq);
+ WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
/*
@@ -7790,81 +9714,56 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
static inline bool others_have_blocked(struct rq *rq)
{
- if (READ_ONCE(rq->avg_rt.util_avg))
+ if (cpu_util_rt(rq))
return true;
- if (READ_ONCE(rq->avg_dl.util_avg))
+ if (cpu_util_dl(rq))
return true;
- if (thermal_load_avg(rq))
+ if (hw_load_avg(rq))
return true;
-#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
- if (READ_ONCE(rq->avg_irq.util_avg))
+ if (cpu_util_irq(rq))
return true;
-#endif
return false;
}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_blocked_load_tick(struct rq *rq)
{
- rq->last_blocked_load_update_tick = jiffies;
+ WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
+}
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
if (!has_blocked)
rq->has_blocked_load = 0;
}
#else
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_tick(struct rq *rq) {}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
static bool __update_blocked_others(struct rq *rq, bool *done)
{
- const struct sched_class *curr_class;
- u64 now = rq_clock_pelt(rq);
- unsigned long thermal_pressure;
- bool decayed;
+ bool updated;
/*
* update_load_avg() can call cpufreq_update_util(). Make sure that RT,
* DL and IRQ signals have been updated before updating CFS.
*/
- curr_class = rq->curr->sched_class;
-
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
-
- decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
- update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
- update_irq_load_avg(rq, 0);
+ updated = update_other_load_avgs(rq);
if (others_have_blocked(rq))
*done = false;
- return decayed;
+ return updated;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load.weight)
- return false;
-
- if (cfs_rq->avg.load_sum)
- return false;
-
- if (cfs_rq->avg.util_sum)
- return false;
-
- if (cfs_rq->avg.runnable_sum)
- return false;
-
- return true;
-}
-
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct cfs_rq *cfs_rq, *pos;
@@ -7879,7 +9778,10 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
struct sched_entity *se;
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
+
+ if (cfs_rq->nr_queued == 0)
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
if (cfs_rq == &rq->cfs)
decayed = true;
@@ -7888,7 +9790,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
- update_load_avg(cfs_rq_of(se), se, 0);
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
/*
* There can be a lot of idle CPU cgroups. Don't let fully
@@ -7970,13 +9872,14 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif
-static void update_blocked_averages(int cpu)
+static void sched_balance_update_blocked_averages(int cpu)
{
bool decayed = false, done = true;
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
+ update_blocked_load_tick(rq);
update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
@@ -7988,24 +9891,25 @@ static void update_blocked_averages(int cpu)
rq_unlock_irqrestore(rq, &rf);
}
-/********** Helpers for find_busiest_group ************************/
+/********** Helpers for sched_balance_find_src_group ************************/
/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
+ * sg_lb_stats - stats of a sched_group required for load-balancing:
*/
struct sg_lb_stats {
- unsigned long avg_load; /*Avg load across the CPUs of the group */
- unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long group_capacity;
- unsigned long group_util; /* Total utilization over the CPUs of the group */
- unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
- unsigned int sum_nr_running; /* Nr of tasks running in the group */
- unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
- unsigned int idle_cpus;
+ unsigned long avg_load; /* Avg load over the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long group_capacity; /* Capacity over the CPUs of the group */
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
+ unsigned int sum_nr_running; /* Nr of all tasks running in the group */
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
+ unsigned int idle_cpus; /* Nr of idle CPUs in the group */
unsigned int group_weight;
enum group_type group_type;
- unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
- unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+ unsigned int group_smt_balance; /* Task on busy SMT be moved */
+ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -8013,19 +9917,18 @@ struct sg_lb_stats {
};
/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
+ * sd_lb_stats - stats of a sched_domain required for load-balancing:
*/
struct sd_lb_stats {
- struct sched_group *busiest; /* Busiest group in this sd */
- struct sched_group *local; /* Local group in this sd */
- unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_capacity; /* Total capacity of all groups in sd */
- unsigned long avg_load; /* Average load across all groups in sd */
- unsigned int prefer_sibling; /* tasks should go to sibling first */
-
- struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
- struct sg_lb_stats local_stat; /* Statistics of the local group */
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *local; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+ unsigned int prefer_sibling; /* Tasks should go to sibling first */
+
+ struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */
+ struct sg_lb_stats local_stat; /* Statistics of the local group */
};
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
@@ -8049,10 +9952,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
};
}
-static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long scale_rt_capacity(int cpu)
{
+ unsigned long max = get_actual_cpu_capacity(cpu);
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
@@ -8064,12 +9967,9 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
/*
* avg_rt.util_avg and avg_dl.util_avg track binary signals
* (running and not running) with weights 0 and 1024 respectively.
- * avg_thermal.load_avg tracks thermal pressure and the weighted
- * average uses the actual delta max capacity(load).
*/
- used = READ_ONCE(rq->avg_rt.util_avg);
- used += READ_ONCE(rq->avg_dl.util_avg);
- used += thermal_load_avg(rq);
+ used = cpu_util_rt(rq);
+ used += cpu_util_dl(rq);
if (unlikely(used >= max))
return 1;
@@ -8081,15 +9981,15 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = scale_rt_capacity(sd, cpu);
+ unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
- cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
-
if (!capacity)
capacity = 1;
cpu_rq(cpu)->cpu_capacity = capacity;
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
sdg->sgc->max_capacity = capacity;
@@ -8159,19 +10059,13 @@ static inline int
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
return ((rq->cpu_capacity * sd->imbalance_pct) <
- (rq->cpu_capacity_orig * 100));
+ (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}
-/*
- * Check whether a rq has a misfit task and if it looks like we can actually
- * help that task: we can migrate the task to a CPU of higher capacity, or
- * the task's current CPU is heavily pressured.
- */
-static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq)
{
- return rq->misfit_task_load &&
- (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
- check_cpu_capacity(rq, sd));
+ return rq->misfit_task_load;
}
/*
@@ -8195,7 +10089,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
*
* When this is so detected; this group becomes a candidate for busiest; see
* update_sd_pick_busiest(). And calculate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditions to allow it
+ * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it
* to create an effective group imbalance.
*
* This is a somewhat tricky proposition since the next run might not find the
@@ -8211,7 +10105,7 @@ static inline int sg_imbalanced(struct sched_group *group)
/*
* group_has_capacity returns true if the group has spare capacity that could
* be used by some tasks.
- * We consider that a group has spare capacity if the * number of task is
+ * We consider that a group has spare capacity if the number of tasks is
* smaller than the number of CPUs or if the utilization is lower than the
* available capacity for CFS tasks.
* For the latter, we use a threshold to stabilize the state, to take into
@@ -8262,26 +10156,6 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
return false;
}
-/*
- * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
- * per-CPU capacity than sched_group ref.
- */
-static inline bool
-group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
-{
- return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
-}
-
-/*
- * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
- * per-CPU capacity_orig than sched_group ref.
- */
-static inline bool
-group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
-{
- return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
-}
-
static inline enum
group_type group_classify(unsigned int imbalance_pct,
struct sched_group *group,
@@ -8296,6 +10170,9 @@ group_type group_classify(unsigned int imbalance_pct,
if (sgs->group_asym_packing)
return group_asym_packing;
+ if (sgs->group_smt_balance)
+ return group_smt_balance;
+
if (sgs->group_misfit_task_load)
return group_misfit_task;
@@ -8305,70 +10182,180 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
}
-static bool update_nohz_stats(struct rq *rq, bool force)
+/**
+ * sched_use_asym_prio - Check whether asym_packing priority must be used
+ * @sd: The scheduling domain of the load balancing
+ * @cpu: A CPU
+ *
+ * Always use CPU priority when balancing load between SMT siblings. When
+ * balancing load between cores, it is not sufficient that @cpu is idle. Only
+ * use CPU priority if the whole core is idle.
+ *
+ * Returns: True if the priority of @cpu must be followed. False otherwise.
+ */
+static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
{
-#ifdef CONFIG_NO_HZ_COMMON
- unsigned int cpu = rq->cpu;
+ if (!(sd->flags & SD_ASYM_PACKING))
+ return false;
- if (!rq->has_blocked_load)
+ if (!sched_smt_active())
+ return true;
+
+ return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
+}
+
+static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
+{
+ /*
+ * First check if @dst_cpu can do asym_packing load balance. Only do it
+ * if it has higher priority than @src_cpu.
+ */
+ return sched_use_asym_prio(sd, dst_cpu) &&
+ sched_asym_prefer(dst_cpu, src_cpu);
+}
+
+/**
+ * sched_group_asym - Check if the destination CPU can do asym_packing balance
+ * @env: The load balancing environment
+ * @sgs: Load-balancing statistics of the candidate busiest group
+ * @group: The candidate busiest group
+ *
+ * @env::dst_cpu can do asym_packing if it has higher priority than the
+ * preferred CPU of @group.
+ *
+ * Return: true if @env::dst_cpu can do asym_packing load balance. False
+ * otherwise.
+ */
+static inline bool
+sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
+{
+ /*
+ * CPU priorities do not make sense for SMT cores with more than one
+ * busy sibling.
+ */
+ if ((group->flags & SD_SHARE_CPUCAPACITY) &&
+ (sgs->group_weight - sgs->idle_cpus != 1))
return false;
- if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
+}
+
+/* One group has more than one SMT CPU while the other group does not */
+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
+ struct sched_group *sg2)
+{
+ if (!sg1 || !sg2)
return false;
- if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
- return true;
+ return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
+ (sg2->flags & SD_SHARE_CPUCAPACITY);
+}
- update_blocked_averages(cpu);
+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ if (!env->idle)
+ return false;
+
+ /*
+ * For SMT source group, it is better to move a task
+ * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
+ * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
+ * will not be on.
+ */
+ if (group->flags & SD_SHARE_CPUCAPACITY &&
+ sgs->sum_h_nr_running > 1)
+ return true;
- return rq->has_blocked_load;
-#else
return false;
-#endif
+}
+
+static inline long sibling_imbalance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *local)
+{
+ int ncores_busiest, ncores_local;
+ long imbalance;
+
+ if (!env->idle || !busiest->sum_nr_running)
+ return 0;
+
+ ncores_busiest = sds->busiest->cores;
+ ncores_local = sds->local->cores;
+
+ if (ncores_busiest == ncores_local) {
+ imbalance = busiest->sum_nr_running;
+ lsub_positive(&imbalance, local->sum_nr_running);
+ return imbalance;
+ }
+
+ /* Balance such that nr_running/ncores ratio is the same on both groups */
+ imbalance = ncores_local * busiest->sum_nr_running;
+ lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
+ /* Normalize imbalance and do rounding on normalization */
+ imbalance = 2 * imbalance + ncores_local + ncores_busiest;
+ imbalance /= ncores_local + ncores_busiest;
+
+ /* Take advantage of resource in an empty sched group */
+ if (imbalance <= 1 && local->sum_nr_running == 0 &&
+ busiest->sum_nr_running > 1)
+ imbalance = 2;
+
+ return imbalance;
+}
+
+static inline bool
+sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
+{
+ /*
+ * When there is more than 1 task, the group_overloaded case already
+ * takes care of cpu with reduced capacity
+ */
+ if (rq->cfs.h_nr_runnable != 1)
+ return false;
+
+ return check_cpu_capacity(rq, sd);
}
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
+ * @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
- * @sg_status: Holds flag indicating the status of the sched_group
+ * @sg_overloaded: sched_group is overloaded
+ * @sg_overutilized: sched_group is overutilized
*/
static inline void update_sg_lb_stats(struct lb_env *env,
+ struct sd_lb_stats *sds,
struct sched_group *group,
struct sg_lb_stats *sgs,
- int *sg_status)
+ bool *sg_overloaded,
+ bool *sg_overutilized)
{
- int i, nr_running, local_group;
+ int i, nr_running, local_group, sd_flags = env->sd->flags;
+ bool balancing_at_rd = !env->sd->parent;
memset(sgs, 0, sizeof(*sgs));
- local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+ local_group = group == sds->local;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ unsigned long load = cpu_load(rq);
- if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
- env->flags |= LBF_NOHZ_AGAIN;
-
- sgs->group_load += cpu_load(rq);
- sgs->group_util += cpu_util(i);
+ sgs->group_load += load;
+ sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
- if (nr_running > 1)
- *sg_status |= SG_OVERLOAD;
-
if (cpu_overutilized(i))
- *sg_status |= SG_OVERUTILIZED;
+ *sg_overutilized = 1;
-#ifdef CONFIG_NUMA_BALANCING
- sgs->nr_numa_running += rq->nr_numa_running;
- sgs->nr_preferred_running += rq->nr_preferred_running;
-#endif
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -8378,29 +10365,46 @@ static inline void update_sg_lb_stats(struct lb_env *env,
continue;
}
+ /* Overload indicator is only updated at root domain */
+ if (balancing_at_rd && nr_running > 1)
+ *sg_overloaded = 1;
+
+#ifdef CONFIG_NUMA_BALANCING
+ /* Only fbq_classify_group() uses this to classify NUMA groups */
+ if (sd_flags & SD_NUMA) {
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+ }
+#endif
if (local_group)
continue;
- /* Check for a misfit task on the cpu */
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- sgs->group_misfit_task_load < rq->misfit_task_load) {
- sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOAD;
+ if (sd_flags & SD_ASYM_CPUCAPACITY) {
+ /* Check for a misfit task on the cpu */
+ if (sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *sg_overloaded = 1;
+ }
+ } else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
+ /* Check for a task running on a CPU with reduced capacity */
+ if (sgs->group_misfit_task_load < load)
+ sgs->group_misfit_task_load = load;
}
}
- /* Check if dst CPU is idle and preferred to this group */
- if (env->sd->flags & SD_ASYM_PACKING &&
- env->idle != CPU_NOT_IDLE &&
- sgs->sum_h_nr_running &&
- sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
- sgs->group_asym_packing = 1;
- }
-
sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
+ /* Check if dst CPU is idle and preferred to this group */
+ if (!local_group && env->idle && sgs->sum_h_nr_running &&
+ sched_group_asym(env, sgs, group))
+ sgs->group_asym_packing = 1;
+
+ /* Check for loaded SMT group to be balanced to dst CPU */
+ if (!local_group && smt_balance(env, sgs, group))
+ sgs->group_smt_balance = 1;
+
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
/* Computing avg_load makes sense only when group is overloaded */
@@ -8439,8 +10443,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* CPUs in the group should either be possible to resolve
* internally or be covered by avg_load imbalance (eventually).
*/
- if (sgs->group_type == group_misfit_task &&
- (!group_smaller_max_cpu_capacity(sg, sds->local) ||
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type == group_misfit_task) &&
+ (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;
@@ -8458,9 +10463,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
switch (sgs->group_type) {
case group_overloaded:
/* Select the overloaded group with highest avg_load. */
- if (sgs->avg_load <= busiest->avg_load)
- return false;
- break;
+ return sgs->avg_load > busiest->avg_load;
case group_imbalanced:
/*
@@ -8471,18 +10474,24 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_asym_packing:
/* Prefer to move from lowest priority CPU's work */
- if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
- return false;
- break;
+ return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
case group_misfit_task:
/*
* If we have more than one misfit sg go with the biggest
* misfit.
*/
- if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
- return false;
- break;
+ return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
+
+ case group_smt_balance:
+ /*
+ * Check whether either SMT group has spare CPUs, to choose
+ * between has_spare and fully_busy handling.
+ */
+ if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
+ goto has_spare;
+
+ fallthrough;
case group_fully_busy:
/*
@@ -8493,15 +10502,40 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* contention when accessing shared HW resources.
*
* XXX for now avg_load is not computed and always 0 so we
- * select the 1st one.
+ * select the 1st one, except if @sg is composed of SMT
+ * siblings.
*/
- if (sgs->avg_load <= busiest->avg_load)
+
+ if (sgs->avg_load < busiest->avg_load)
return false;
+
+ if (sgs->avg_load == busiest->avg_load) {
+ /*
+ * SMT sched groups need more help than non-SMT groups.
+ * If @sg happens to also be SMT, either choice is good.
+ */
+ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+ return false;
+ }
+
break;
case group_has_spare:
/*
- * Select not overloaded group with lowest number of idle cpus
+ * Do not pick sg with SMT CPUs over sg with pure CPUs,
+ * as we do not want to pull task off SMT core with one task
+ * and make the core idle.
+ */
+ if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
+ if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
+ return false;
+ else
+ return true;
+ }
+has_spare:
+
+ /*
+ * Select not overloaded group with lowest number of idle CPUs
* and highest number of running tasks. We could also compare
* the spare capacity which is more stable but it can end up
* that the group has less spare capacity but finally more idle
@@ -8524,7 +10558,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
*/
if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
(sgs->group_type <= group_fully_busy) &&
- (group_smaller_min_cpu_capacity(sds->local, sg)))
+ (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
return false;
return true;
@@ -8599,10 +10633,8 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
* be computed and tested before calling idle_cpu_without().
*/
-#ifdef CONFIG_SMP
if (rq->ttwu_pending)
return 0;
-#endif
return 1;
}
@@ -8623,6 +10655,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
memset(sgs, 0, sizeof(*sgs));
+ /* Assume that task can't fit any CPU of the group */
+ if (sd->flags & SD_ASYM_CPUCAPACITY)
+ sgs->group_misfit_task_load = 1;
+
for_each_cpu(i, sched_group_span(group)) {
struct rq *rq = cpu_rq(i);
unsigned int local;
@@ -8631,7 +10667,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_util += cpu_util_without(i, p);
sgs->group_runnable += cpu_runnable_without(rq, p);
local = task_running_on_cpu(i, p);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;
nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
@@ -8642,12 +10678,12 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
if (!nr_running && idle_cpu_without(i, p))
sgs->idle_cpus++;
- }
+ /* Check if task fits in the CPU */
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
+ sgs->group_misfit_task_load &&
+ task_fits_cpu(p, i))
+ sgs->group_misfit_task_load = 0;
- /* Check if task fits in the group */
- if (sd->flags & SD_ASYM_CPUCAPACITY &&
- !task_fits_capacity(p, group->sgc->max_capacity)) {
- sgs->group_misfit_task_load = 1;
}
sgs->group_capacity = group->sgc->capacity;
@@ -8692,6 +10728,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those types are not used in the slow wakeup path */
return false;
@@ -8703,8 +10740,14 @@ static bool update_pick_idlest(struct sched_group *idlest,
case group_has_spare:
/* Select group with most idle CPUs */
- if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
+ if (idlest_sgs->idle_cpus > sgs->idle_cpus)
return false;
+
+ /* Select group with lowest group_util */
+ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
+ idlest_sgs->group_util <= sgs->group_util)
+ return false;
+
break;
}
@@ -8712,13 +10755,13 @@ static bool update_pick_idlest(struct sched_group *idlest,
}
/*
- * find_idlest_group() finds and returns the least busy CPU group within the
+ * sched_balance_find_dst_group() finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
struct sg_lb_stats local_sgs, tmp_sgs;
@@ -8729,9 +10772,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
.group_type = group_overloaded,
};
- imbalance = scale_load_down(NICE_0_LOAD) *
- (sd->imbalance_pct-100) / 100;
-
do {
int local_group;
@@ -8740,6 +10780,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
p->cpus_ptr))
continue;
+ /* Skip over this group if no cookie matched */
+ if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
+ continue;
+
local_group = cpumask_test_cpu(this_cpu,
sched_group_span(group));
@@ -8785,6 +10829,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
switch (local_sgs.group_type) {
case group_overloaded:
case group_fully_busy:
+
+ /* Calculate allowed imbalance based on load */
+ imbalance = scale_load_down(NICE_0_LOAD) *
+ (sd->imbalance_pct-100) / 100;
+
/*
* When comparing groups across NUMA domains, it's possible for
* the local domain to be very lightly loaded relative to the
@@ -8811,6 +10860,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those type are not used in the slow wakeup path */
return NULL;
@@ -8821,7 +10871,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
break;
case group_has_spare:
+#ifdef CONFIG_NUMA
if (sd->flags & SD_NUMA) {
+ int imb_numa_nr = sd->imb_numa_nr;
#ifdef CONFIG_NUMA_BALANCING
int idlest_cpu;
/*
@@ -8834,16 +10886,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest;
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
/*
- * Otherwise, keep the task on this node to stay close
- * its wakeup source and improve locality. If there is
- * a real need of migration, periodic load balance will
+ * Otherwise, keep the task close to the wakeup source
+ * and improve locality if the number of running tasks
+ * would remain below threshold where an imbalance is
+ * allowed while accounting for the possibility the
+ * task is pinned to a subset of CPUs. If there is a
+ * real need of migration, periodic load balance will
* take care of it.
*/
- if (local_sgs.idle_cpus)
+ if (p->nr_cpus_allowed != NR_CPUS) {
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
+
+ cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
+ imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
+ }
+
+ imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
+ if (!adjust_numa_imbalance(imbalance,
+ local_sgs.sum_nr_running + 1,
+ imb_numa_nr)) {
return NULL;
+ }
}
+#endif /* CONFIG_NUMA */
/*
* Select group with highest number of idle CPUs. We could also
@@ -8859,6 +10926,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
return idlest;
}
+static void update_idle_cpu_scan(struct lb_env *env,
+ unsigned long sum_util)
+{
+ struct sched_domain_shared *sd_share;
+ int llc_weight, pct;
+ u64 x, y, tmp;
+ /*
+ * Update the number of CPUs to scan in LLC domain, which could
+ * be used as a hint in select_idle_cpu(). The update of sd_share
+ * could be expensive because it is within a shared cache line.
+ * So the write of this hint only occurs during periodic load
+ * balancing, rather than CPU_NEWLY_IDLE, because the latter
+ * can fire way more frequently than the former.
+ */
+ if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
+ return;
+
+ llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
+ if (env->sd->span_weight != llc_weight)
+ return;
+
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+ if (!sd_share)
+ return;
+
+ /*
+ * The number of CPUs to search drops as sum_util increases, when
+ * sum_util hits 85% or above, the scan stops.
+ * The reason to choose 85% as the threshold is because this is the
+ * imbalance_pct(117) when a LLC sched group is overloaded.
+ *
+ * let y = SCHED_CAPACITY_SCALE - p * x^2 [1]
+ * and y'= y / SCHED_CAPACITY_SCALE
+ *
+ * x is the ratio of sum_util compared to the CPU capacity:
+ * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
+ * y' is the ratio of CPUs to be scanned in the LLC domain,
+ * and the number of CPUs to scan is calculated by:
+ *
+ * nr_scan = llc_weight * y' [2]
+ *
+ * When x hits the threshold of overloaded, AKA, when
+ * x = 100 / pct, y drops to 0. According to [1],
+ * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
+ *
+ * Scale x by SCHED_CAPACITY_SCALE:
+ * x' = sum_util / llc_weight; [3]
+ *
+ * and finally [1] becomes:
+ * y = SCHED_CAPACITY_SCALE -
+ * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4]
+ *
+ */
+ /* equation [3] */
+ x = sum_util;
+ do_div(x, llc_weight);
+
+ /* equation [4] */
+ pct = env->sd->imbalance_pct;
+ tmp = x * x * pct * pct;
+ do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
+ tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
+ y = SCHED_CAPACITY_SCALE - tmp;
+
+ /* equation [2] */
+ y *= llc_weight;
+ do_div(y, SCHED_CAPACITY_SCALE);
+ if ((int)y != sd_share->nr_idle_scan)
+ WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
+}
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -8867,16 +11005,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- int sg_status = 0;
-
-#ifdef CONFIG_NO_HZ_COMMON
- if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
- env->flags |= LBF_NOHZ_STATS;
-#endif
+ unsigned long sum_util = 0;
+ bool sg_overloaded = 0, sg_overutilized = 0;
do {
struct sg_lb_stats *sgs = &tmp_sgs;
@@ -8892,70 +11025,44 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sg, sgs, &sg_status);
-
- if (local_group)
- goto next_group;
-
+ update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
- if (update_sd_pick_busiest(env, sds, sg, sgs)) {
+ if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
}
-next_group:
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
+ sum_util += sgs->group_util;
sg = sg->next;
} while (sg != env->sd->groups);
- /* Tag domain that child domain prefers tasks go to siblings first */
- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+ /*
+ * Indicate that the child domain of the busiest group prefers tasks
+ * go to a child's sibling domains first. NB the flags of a sched group
+ * are those of the child domain.
+ */
+ if (sds->busiest)
+ sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
-#ifdef CONFIG_NO_HZ_COMMON
- if ((env->flags & LBF_NOHZ_AGAIN) &&
- cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
-
- WRITE_ONCE(nohz.next_blocked,
- jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
- }
-#endif
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) {
- struct root_domain *rd = env->dst_rq->rd;
-
/* update overload indicator if we are at root domain */
- WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+ set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
/* Update over-utilization (tipping point, U >= 0) indicator */
- WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
- } else if (sg_status & SG_OVERUTILIZED) {
- struct root_domain *rd = env->dst_rq->rd;
-
- WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
- trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
+ } else if (sg_overutilized) {
+ set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
}
-}
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
-{
- unsigned int imbalance_min;
-
- /*
- * Allow a small imbalance based on a simple pair of communicating
- * tasks that remain local when the source domain is almost idle.
- */
- imbalance_min = 2;
- if (src_nr_running <= imbalance_min)
- return 0;
-
- return imbalance;
+ update_idle_cpu_scan(env, sum_util);
}
/**
@@ -8972,9 +11079,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
busiest = &sds->busiest_stat;
if (busiest->group_type == group_misfit_task) {
- /* Set imbalance to allow misfit tasks to be balanced. */
- env->migration_type = migrate_misfit;
- env->imbalance = 1;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ } else {
+ /*
+ * Set load imbalance to allow moving task from cpu
+ * with reduced capacity.
+ */
+ env->migration_type = migrate_load;
+ env->imbalance = busiest->group_misfit_task_load;
+ }
return;
}
@@ -8988,6 +11104,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return;
}
+ if (busiest->group_type == group_smt_balance) {
+ /* Reduce number of tasks sharing CPU capacity */
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ return;
+ }
+
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
@@ -9005,7 +11128,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* emptying busiest.
*/
if (local->group_type == group_has_spare) {
- if (busiest->group_type > group_fully_busy) {
+ if ((busiest->group_type > group_fully_busy) &&
+ !(env->sd->flags & SD_SHARE_LLC)) {
/*
* If busiest is overloaded, try to fill spare
* capacity. This might end up creating spare capacity
@@ -9025,7 +11149,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* waiting task in this overloaded busiest group. Let's
* try to pull it.
*/
- if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+ if (env->idle && env->imbalance == 0) {
env->migration_type = migrate_task;
env->imbalance = 1;
}
@@ -9034,29 +11158,34 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
if (busiest->group_weight == 1 || sds->prefer_sibling) {
- unsigned int nr_diff = busiest->sum_nr_running;
/*
* When prefer sibling, evenly spread running tasks on
* groups.
*/
env->migration_type = migrate_task;
- lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff >> 1;
+ env->imbalance = sibling_imbalance(env, sds, busiest, local);
} else {
/*
* If there is no overload, we just want to even the number of
- * idle cpus.
+ * idle CPUs.
*/
env->migration_type = migrate_task;
- env->imbalance = max_t(long, 0, (local->idle_cpus -
- busiest->idle_cpus) >> 1);
+ env->imbalance = max_t(long, 0,
+ (local->idle_cpus - busiest->idle_cpus));
}
+#ifdef CONFIG_NUMA
/* Consider allowing a small imbalance between NUMA groups */
- if (env->sd->flags & SD_NUMA)
+ if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- busiest->sum_nr_running);
+ local->sum_nr_running + 1,
+ env->sd->imb_numa_nr);
+ }
+#endif
+
+ /* Number of tasks to move to restore balance */
+ env->imbalance >>= 1;
return;
}
@@ -9074,8 +11203,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
local->group_capacity;
- sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
- sds->total_capacity;
/*
* If the local group is more loaded than the selected
* busiest group don't try to pull any tasks.
@@ -9084,6 +11211,19 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
env->imbalance = 0;
return;
}
+
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ sds->total_capacity;
+
+ /*
+ * If the local group is more loaded than the average system
+ * load, don't try to pull any tasks.
+ */
+ if (local->avg_load >= sds->avg_load) {
+ env->imbalance = 0;
+ return;
+ }
+
}
/*
@@ -9101,7 +11241,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
) / SCHED_CAPACITY_SCALE;
}
-/******* find_busiest_group() helpers end here *********************/
+/******* sched_balance_find_src_group() helpers end here *********************/
/*
* Decision matrix according to the local and busiest group type:
@@ -9109,7 +11249,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
* has_spare nr_idle balanced N/A N/A balanced balanced
* fully_busy nr_idle nr_idle N/A N/A balanced balanced
- * misfit_task force N/A N/A N/A force force
+ * misfit_task force N/A N/A N/A N/A N/A
* asym_packing force force N/A N/A force force
* imbalanced force force N/A N/A force force
* overloaded force force N/A N/A force avg_load
@@ -9124,17 +11264,16 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
/**
- * find_busiest_group - Returns the busiest group within the sched_domain
+ * sched_balance_find_src_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
+ * @env: The load balancing environment.
*
* Also calculates the amount of runnable load which should be moved
* to restore balance.
*
- * @env: The load balancing environment.
- *
* Return: - The busiest group if imbalance exists.
*/
-static struct sched_group *find_busiest_group(struct lb_env *env)
+static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
{
struct sg_lb_stats *local, *busiest;
struct sd_lb_stats sds;
@@ -9147,24 +11286,20 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);
- if (sched_energy_enabled()) {
- struct root_domain *rd = env->dst_rq->rd;
-
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- goto out_balanced;
- }
-
- local = &sds.local_stat;
- busiest = &sds.busiest_stat;
-
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest)
goto out_balanced;
+ busiest = &sds.busiest_stat;
+
/* Misfit tasks should be dealt with regardless of the avg load */
if (busiest->group_type == group_misfit_task)
goto force_balance;
+ if (!is_rd_overutilized(env->dst_rq->rd) &&
+ rcu_dereference(env->dst_rq->rd->pd))
+ goto out_balanced;
+
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
goto force_balance;
@@ -9177,6 +11312,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
+ local = &sds.local_stat;
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -9216,22 +11352,32 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto out_balanced;
}
- /* Try to move all excess tasks to child's sibling domain */
+ /*
+ * Try to move all excess tasks to a sibling domain of the busiest
+ * group's child domain.
+ */
if (sds.prefer_sibling && local->group_type == group_has_spare &&
- busiest->sum_nr_running > local->sum_nr_running + 1)
+ sibling_imbalance(env, &sds, busiest, local) > 1)
goto force_balance;
if (busiest->group_type != group_overloaded) {
- if (env->idle == CPU_NOT_IDLE)
+ if (!env->idle) {
/*
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
* busy, let another idle CPU try to pull task.
*/
goto out_balanced;
+ }
+
+ if (busiest->group_type == group_smt_balance &&
+ smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
+ /* Let non SMT CPU pull from SMT CPU sharing with sibling */
+ goto force_balance;
+ }
if (busiest->group_weight > 1 &&
- local->idle_cpus <= (busiest->idle_cpus + 1))
+ local->idle_cpus <= (busiest->idle_cpus + 1)) {
/*
* If the busiest group is not overloaded
* and there is no imbalance between this and busiest
@@ -9242,12 +11388,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* there is more than 1 CPU per group.
*/
goto out_balanced;
+ }
- if (busiest->sum_h_nr_running == 1)
+ if (busiest->sum_h_nr_running == 1) {
/*
* busiest doesn't have any tasks waiting to run
*/
goto out_balanced;
+ }
}
force_balance:
@@ -9261,9 +11409,9 @@ out_balanced:
}
/*
- * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
+ * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
*/
-static struct rq *find_busiest_queue(struct lb_env *env,
+static struct rq *sched_balance_find_src_rq(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
@@ -9301,8 +11449,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
+ nr_running = rq->cfs.h_nr_runnable;
+ if (!nr_running)
+ continue;
+
capacity = capacity_of(i);
- nr_running = rq->cfs.h_nr_running;
/*
* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
@@ -9311,10 +11462,20 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* average load.
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- capacity_of(env->dst_cpu) < capacity &&
+ !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
nr_running == 1)
continue;
+ /*
+ * Make sure we only pull tasks from a CPU of lower priority
+ * when balancing between SMT siblings.
+ *
+ * If balancing between cores, let lower priority CPUs help
+ * SMT cores with more than one busy sibling.
+ */
+ if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1)
+ continue;
+
switch (env->migration_type) {
case migrate_load:
/*
@@ -9348,7 +11509,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
- util = cpu_util(cpu_of(rq));
+ util = cpu_util_cfs_boost(i);
/*
* Don't try to pull utilization from a CPU with one
@@ -9399,30 +11560,55 @@ static inline bool
asym_active_balance(struct lb_env *env)
{
/*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
+ * ASYM_PACKING needs to force migrate tasks from busy but lower
+ * priority CPUs in order to pack all tasks in the highest priority
+ * CPUs. When done between cores, do it only if the whole core is
+ * idle.
+ *
+ * If @env::src_cpu is an SMT core with busy siblings, let
+ * the lower priority @env::dst_cpu help it. Do not follow
+ * CPU priority.
*/
- return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu);
+ return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+ !sched_use_asym_prio(env->sd, env->src_cpu));
}
static inline bool
-voluntary_active_balance(struct lb_env *env)
+imbalanced_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ /*
+ * The imbalanced case includes the case of pinned tasks preventing a fair
+ * distribution of the load on the system but also the even distribution of the
+ * threads on a system with spare capacity
+ */
+ if ((env->migration_type == migrate_task) &&
+ (sd->nr_balance_failed > sd->cache_nice_tries+2))
+ return 1;
+
+ return 0;
+}
+
+static int need_active_balance(struct lb_env *env)
{
struct sched_domain *sd = env->sd;
if (asym_active_balance(env))
return 1;
+ if (imbalanced_active_balance(env))
+ return 1;
+
/*
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
* It's worth migrating the task if the src_cpu's capacity is reduced
* because of other sched_class or IRQs if more capacity stays
* available on dst_cpu.
*/
- if ((env->idle != CPU_NOT_IDLE) &&
- (env->src_rq->cfs.h_nr_running == 1)) {
+ if (env->idle &&
+ (env->src_rq->cfs.h_nr_runnable == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
@@ -9434,22 +11620,13 @@ voluntary_active_balance(struct lb_env *env)
return 0;
}
-static int need_active_balance(struct lb_env *env)
-{
- struct sched_domain *sd = env->sd;
-
- if (voluntary_active_balance(env))
- return 1;
-
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
-}
-
static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
+ struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
struct sched_group *sg = env->sd->groups;
- int cpu;
+ int cpu, idle_smt = -1;
/*
* Ensure the balancing environment is consistent; can happen
@@ -9461,28 +11638,83 @@ static int should_we_balance(struct lb_env *env)
/*
* In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
+ *
+ * However, we bail out if we already have tasks or a wakeup pending,
+ * to optimize wakeup latency.
*/
- if (env->idle == CPU_NEWLY_IDLE)
+ if (env->idle == CPU_NEWLY_IDLE) {
+ if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
+ return 0;
return 1;
+ }
+ cpumask_copy(swb_cpus, group_balance_mask(sg));
/* Try to find first idle CPU */
- for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+ for_each_cpu_and(cpu, swb_cpus, env->cpus) {
if (!idle_cpu(cpu))
continue;
- /* Are we the first idle CPU? */
+ /*
+ * Don't balance to idle SMT in busy core right away when
+ * balancing cores, but remember the first idle SMT CPU for
+ * later consideration. Find CPU on an idle core first.
+ */
+ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
+ if (idle_smt == -1)
+ idle_smt = cpu;
+ /*
+ * If the core is not idle, and the first idle SMT sibling
+ * has been found, then it's not necessary to check the other
+ * SMT siblings for idleness:
+ */
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
+#endif
+ continue;
+ }
+
+ /*
+ * Are we the first idle core in a non-SMT domain or higher,
+ * or the first idle CPU in a SMT domain?
+ */
return cpu == env->dst_cpu;
}
+ /* Are we the first idle CPU with busy siblings? */
+ if (idle_smt != -1)
+ return idle_smt == env->dst_cpu;
+
/* Are we the first CPU of this group ? */
return group_balance_cpu(sg) == env->dst_cpu;
}
+static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd,
+ enum cpu_idle_type idle)
+{
+ if (!schedstat_enabled())
+ return;
+
+ switch (env->migration_type) {
+ case migrate_load:
+ __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance);
+ break;
+ case migrate_util:
+ __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance);
+ break;
+ case migrate_task:
+ __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance);
+ break;
+ case migrate_misfit:
+ __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
+ break;
+ }
+}
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
-static int load_balance(int this_cpu, struct rq *this_rq,
+static int sched_balance_rq(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
@@ -9492,14 +11724,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct rq *busiest;
struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
-
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_span(sd->groups),
+ .dst_grpmask = group_balance_mask(sd->groups),
.idle = idle,
- .loop_break = sched_nr_migrate_break,
+ .loop_break = SCHED_NR_MIGRATE_BREAK,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
@@ -9515,34 +11746,35 @@ redo:
goto out_balanced;
}
- group = find_busiest_group(&env);
+ group = sched_balance_find_src_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
goto out_balanced;
}
- busiest = find_busiest_queue(&env, group);
+ busiest = sched_balance_find_src_rq(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
goto out_balanced;
}
- BUG_ON(busiest == env.dst_rq);
+ WARN_ON_ONCE(busiest == env.dst_rq);
- schedstat_add(sd->lb_imbalance[idle], env.imbalance);
+ update_lb_imbalance_stat(&env, sd, idle);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
ld_moved = 0;
+ /* Clear this flag as soon as we find a pullable task */
+ env.flags |= LBF_ALL_PINNED;
if (busiest->nr_running > 1) {
/*
- * Attempt to move tasks. If find_busiest_group has found
+ * Attempt to move tasks. If sched_balance_find_src_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- env.flags |= LBF_ALL_PINNED;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
@@ -9591,7 +11823,7 @@ more_balance:
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
- * given_cpu) causing exceess load to be moved to given_cpu.
+ * given_cpu) causing excess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
@@ -9605,7 +11837,7 @@ more_balance:
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
/*
* Go back to "more_balance" rather than "redo" since we
@@ -9637,7 +11869,7 @@ more_balance:
*/
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
+ env.loop_break = SCHED_NR_MIGRATE_BREAK;
goto redo;
}
goto out_all_pinned;
@@ -9651,14 +11883,18 @@ more_balance:
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
+ *
+ * Similarly for migrate_misfit which is not related to
+ * load/util migration, don't pollute nr_balance_failed.
*/
- if (idle != CPU_NEWLY_IDLE)
+ if (idle != CPU_NEWLY_IDLE &&
+ env.migration_type != migrate_misfit)
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
unsigned long flags;
- raw_spin_lock_irqsave(&busiest->lock, flags);
+ raw_spin_rq_lock_irqsave(busiest, flags);
/*
* Don't kick the active_load_balance_cpu_stop,
@@ -9666,12 +11902,13 @@ more_balance:
* moved to this_cpu:
*/
if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
- raw_spin_unlock_irqrestore(&busiest->lock,
- flags);
- env.flags |= LBF_ALL_PINNED;
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
goto out_one_pinned;
}
+ /* Record that we found at least one task that could run on this_cpu */
+ env.flags &= ~LBF_ALL_PINNED;
+
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
@@ -9682,32 +11919,23 @@ more_balance:
busiest->push_cpu = this_cpu;
active_balance = 1;
}
- raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ preempt_disable();
+ raw_spin_rq_unlock_irqrestore(busiest, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
-
- /* We've kicked active balancing, force task migration. */
- sd->nr_balance_failed = sd->cache_nice_tries+1;
+ preempt_enable();
}
- } else
+ } else {
sd->nr_balance_failed = 0;
+ }
- if (likely(!active_balance) || voluntary_active_balance(&env)) {
+ if (likely(!active_balance) || need_active_balance(&env)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
- } else {
- /*
- * If we've begun active balancing, start to back off. This
- * case may not be covered by the all_pinned logic if there
- * is only 1 task on the busy runqueue (because we don't call
- * detach_tasks).
- */
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval *= 2;
}
goto out;
@@ -9739,12 +11967,17 @@ out_one_pinned:
ld_moved = 0;
/*
- * newidle_balance() disregards balance intervals, so we could
+ * sched_balance_newidle() disregards balance intervals, so we could
* repeatedly reach this code, which would lead to balance_interval
- * skyrocketting in a short amount of time. Skip the balance_interval
+ * skyrocketing in a short amount of time. Skip the balance_interval
* increase logic to avoid that.
+ *
+ * Similarly, a misfit migration is not necessarily an indication of
+ * the system being busy, so it does not require lb to back off to let
+ * it settle down.
*/
- if (env.idle == CPU_NEWLY_IDLE)
+ if (env.idle == CPU_NEWLY_IDLE ||
+ env.migration_type == migrate_misfit)
goto out;
/* tune up the balancing interval */
@@ -9766,6 +11999,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
+
+ /*
+ * Reduce likelihood of busy balancing at higher domains racing with
+ * balancing at lower domains by preventing their balancing periods
+ * from being multiples of each other.
+ */
+ if (cpu_busy)
+ interval -= 1;
+
interval = clamp(interval, 1UL, max_load_balance_interval);
return interval;
@@ -9823,7 +12065,7 @@ static int active_load_balance_cpu_stop(void *data)
* we need to fix it. Originally reported by
* Bjorn Helgaas on a 128-CPU setup.
*/
- BUG_ON(busiest_rq == target_rq);
+ WARN_ON_ONCE(busiest_rq == target_rq);
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
@@ -9840,13 +12082,7 @@ static int active_load_balance_cpu_stop(void *data)
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
- /*
- * can_migrate_task() doesn't need to compute new_dst_cpu
- * for active balancing. Since we have CPU_IDLE, but no
- * @dst_grpmask we need to make that test go away with lying
- * about DST_PINNED.
- */
- .flags = LBF_DST_PINNED,
+ .flags = LBF_ACTIVE_LB,
};
schedstat_inc(sd->alb_count);
@@ -9874,10 +12110,23 @@ out_unlock:
return 0;
}
-static DEFINE_SPINLOCK(balancing);
+/*
+ * This flag serializes load-balancing passes over large domains
+ * (above the NODE topology level) - only one load-balancing instance
+ * may run at a time, to reduce overhead on very large systems with
+ * lots of CPUs and large NUMA distances.
+ *
+ * - Note that load-balancing passes triggered while another one
+ * is executing are skipped and not re-tried.
+ *
+ * - Also note that this does not serialize sched_balance_domains()
+ * execution, as non-SD_SERIALIZE domains will still be
+ * load-balanced in parallel.
+ */
+static atomic_t sched_balance_running = ATOMIC_INIT(0);
/*
- * Scale the max load_balance interval with the number of CPUs in the system.
+ * Scale the max sched_balance_rq interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
void update_max_interval(void)
@@ -9885,13 +12134,37 @@ void update_max_interval(void)
max_load_balance_interval = HZ*num_online_cpus()/10;
}
+static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
+{
+ if (cost > sd->max_newidle_lb_cost) {
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ sd->max_newidle_lb_cost = cost;
+ sd->last_decay_max_lb_cost = jiffies;
+ } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
+ /*
+ * Decay the newidle max times by ~1% per second to ensure that
+ * it is not outdated and the current max cost is actually
+ * shorter.
+ */
+ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
+ sd->last_decay_max_lb_cost = jiffies;
+
+ return true;
+ }
+
+ return false;
+}
+
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in init_sched_domains.
*/
-static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
+static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
@@ -9908,14 +12181,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
for_each_domain(cpu, sd) {
/*
* Decay the newidle max times here because this is a regular
- * visit to all the domains. Decay ~1% per second.
+ * visit to all the domains.
*/
- if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
- sd->max_newidle_lb_cost =
- (sd->max_newidle_lb_cost * 253) / 256;
- sd->next_decay_max_lb_cost = jiffies + HZ;
- need_decay = 1;
- }
+ need_decay = update_newidle_cost(sd, 0);
max_cost += sd->max_newidle_lb_cost;
/*
@@ -9933,25 +12201,25 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
- if (!spin_trylock(&balancing))
+ if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
goto out;
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
+ if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
/*
* The LBF_DST_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
- idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
- busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ idle = idle_cpu(cpu);
+ busy = !idle && !sched_idle_cpu(cpu);
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
if (need_serialize)
- spin_unlock(&balancing);
+ atomic_set_release(&sched_balance_running, 0);
out:
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
@@ -9973,22 +12241,9 @@ out:
* When the cpu is attached to null domain for ex, it will not be
* updated.
*/
- if (likely(update_next_balance)) {
+ if (likely(update_next_balance))
rq->next_balance = next_balance;
-#ifdef CONFIG_NO_HZ_COMMON
- /*
- * If this CPU has been elected to perform the nohz idle
- * balance. Other idle CPUs have already rebalanced with
- * nohz_idle_balance() and nohz.next_balance has been
- * updated accordingly. This CPU is now running the idle load
- * balance for itself and we need to update the
- * nohz.next_balance accordingly.
- */
- if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
- nohz.next_balance = rq->next_balance;
-#endif
- }
}
static inline int on_null_domain(struct rq *rq)
@@ -9998,40 +12253,58 @@ static inline int on_null_domain(struct rq *rq)
#ifdef CONFIG_NO_HZ_COMMON
/*
- * idle load balancing details
- * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * NOHZ idle load balancing (ILB) details:
+ *
+ * - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
- * anywhere yet.
*/
-
static inline int find_new_ilb(void)
{
- int ilb;
+ const struct cpumask *hk_mask;
+ int ilb_cpu;
+
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
+
+ for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
+
+ if (ilb_cpu == smp_processor_id())
+ continue;
- for_each_cpu_and(ilb, nohz.idle_cpus_mask,
- housekeeping_cpumask(HK_FLAG_MISC)) {
- if (idle_cpu(ilb))
- return ilb;
+ if (idle_cpu(ilb_cpu))
+ return ilb_cpu;
}
- return nr_cpu_ids;
+ return -1;
}
/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
+ * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
+ * SMP function call (IPI).
+ *
+ * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
+ * (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
- nohz.next_balance++;
+ /*
+ * Increase nohz.next_balance only if a full ilb is triggered, but
+ * not if we only update stats.
+ */
+ if (flags & NOHZ_BALANCE_KICK)
+ nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
+ if (ilb_cpu < 0)
+ return;
- if (ilb_cpu >= nr_cpu_ids)
+ /*
+ * Don't bother if no new NOHZ balance work items for ilb_cpu,
+ * i.e. all bits in flags are already set in ilb_cpu.
+ */
+ if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags)
return;
/*
@@ -10044,7 +12317,7 @@ static void kick_ilb(unsigned int flags)
/*
* This way we generate an IPI on the target CPU which
- * is idle. And the softirq performing nohz idle load balance
+ * is idle, and the softirq performing NOHZ idle load balancing
* will be run before returning from the IPI.
*/
smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
@@ -10073,7 +12346,7 @@ static void nohz_balancer_kick(struct rq *rq)
/*
* None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
+ * balancing:
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
return;
@@ -10086,7 +12359,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto out;
if (rq->nr_running >= 2) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto out;
}
@@ -10095,12 +12368,11 @@ static void nohz_balancer_kick(struct rq *rq)
sd = rcu_dereference(rq->sd);
if (sd) {
/*
- * If there's a CFS task and the current CPU has reduced
- * capacity; kick the ILB to see if there's a better CPU to run
- * on.
+ * If there's a runnable CFS task and the current CPU has reduced
+ * capacity, kick the ILB to see if there's a better CPU to run on:
*/
- if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10111,10 +12383,13 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_PACKING; see if there's a more preferred CPU
* currently idle; in which case, kick the ILB to move tasks
* around.
+ *
+ * When balancing between cores, all the SMT siblings of the
+ * preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
- if (sched_asym_prefer(i, cpu)) {
- flags = NOHZ_KICK_MASK;
+ if (sched_asym(sd, i, cpu)) {
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
@@ -10126,8 +12401,8 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
* to run the misfit task on.
*/
- if (check_misfit_status(rq, sd)) {
- flags = NOHZ_KICK_MASK;
+ if (check_misfit_status(rq)) {
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -10145,22 +12420,25 @@ static void nohz_balancer_kick(struct rq *rq)
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
- * increase the overall cache use), we need some less-loaded LLC
- * domain to pull some load. Likewise, we may need to spread
+ * increase the overall cache utilization), we need a less-loaded LLC
+ * domain to pull some load from. Likewise, we may need to spread
* load within the current LLC domain (e.g. packed SMT cores but
* other CPUs are idle). We can't really know from here how busy
- * the others are - so just get a nohz balance going if it looks
+ * the others are - so just get a NOHZ balance going if it looks
* like this LLC domain has tasks we could move.
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
- flags = NOHZ_KICK_MASK;
+ flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
}
unlock:
rcu_read_unlock();
out:
+ if (READ_ONCE(nohz.needs_update))
+ flags |= NOHZ_NEXT_KICK;
+
if (flags)
kick_ilb(flags);
}
@@ -10225,10 +12503,6 @@ void nohz_balance_enter_idle(int cpu)
if (!cpu_active(cpu))
return;
- /* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
- return;
-
/*
* Can be set safely without rq->lock held
* If a clear happens, it will have evaluated last additions because
@@ -10257,29 +12531,45 @@ void nohz_balance_enter_idle(int cpu)
/*
* Ensures that if nohz_idle_balance() fails to observe our
* @idle_cpus_mask store, it must observe the @has_blocked
- * store.
+ * and @needs_update stores.
*/
smp_mb__after_atomic();
set_cpu_sd_state_idle(cpu);
+ WRITE_ONCE(nohz.needs_update, 1);
out:
/*
* Each time a cpu enter idle, we assume that it has blocked load and
- * enable the periodic update of the load of idle cpus
+ * enable the periodic update of the load of idle CPUs
*/
WRITE_ONCE(nohz.has_blocked, 1);
}
+static bool update_nohz_stats(struct rq *rq)
+{
+ unsigned int cpu = rq->cpu;
+
+ if (!rq->has_blocked_load)
+ return false;
+
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return false;
+
+ if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
+ return true;
+
+ sched_balance_update_blocked_averages(cpu);
+
+ return rq->has_blocked_load;
+}
+
/*
- * Internal function that runs load balance for all idle cpus. The load balance
+ * Internal function that runs load balance for all idle CPUs. The load balance
* can be a simple update of blocked load or a complete load balance with
* tasks movement depending of flags.
- * The function returns false if the loop has stopped before running
- * through all idle CPUs.
*/
-static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
- enum cpu_idle_type idle)
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
{
/* Earliest time when we have to do rebalance again */
unsigned long now = jiffies;
@@ -10288,7 +12578,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
int balance_cpu;
- int ret = false;
struct rq *rq;
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
@@ -10296,12 +12585,17 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
/*
* We assume there will be no idle load after this update and clear
* the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trig another update of idle load.
+ * set the has_blocked flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
+ *
+ * Same applies to idle_cpus_mask vs needs_update.
*/
- WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.has_blocked, 0);
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 0);
/*
* Ensures that if we miss the CPU, we must see the has_blocked
@@ -10309,8 +12603,12 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
*/
smp_mb();
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
- if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+ /*
+ * Start with the next CPU after this_cpu so we will end with this_cpu and give
+ * other idle CPUs a chance to pull load.
+ */
+ for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
+ if (!idle_cpu(balance_cpu))
continue;
/*
@@ -10318,14 +12616,18 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
* work being done for other CPUs. Next load
* balancing owner will pick it up.
*/
- if (need_resched()) {
- has_blocked_load = true;
+ if (!idle_cpu(this_cpu) && need_resched()) {
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load = true;
+ if (flags & NOHZ_NEXT_KICK)
+ WRITE_ONCE(nohz.needs_update, 1);
goto abort;
}
rq = cpu_rq(balance_cpu);
- has_blocked_load |= update_nohz_stats(rq, true);
+ if (flags & NOHZ_STATS_KICK)
+ has_blocked_load |= update_nohz_stats(rq);
/*
* If time for next balance is due,
@@ -10339,7 +12641,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
- rebalance_domains(rq, CPU_IDLE);
+ sched_balance_domains(rq, CPU_IDLE);
}
if (time_after(next_balance, rq->next_balance)) {
@@ -10348,26 +12650,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
}
}
- /* Newly idle CPU doesn't need an update */
- if (idle != CPU_NEWLY_IDLE) {
- update_blocked_averages(this_cpu);
- has_blocked_load |= this_rq->has_blocked_load;
- }
-
- if (flags & NOHZ_BALANCE_KICK)
- rebalance_domains(this_rq, CPU_IDLE);
-
- WRITE_ONCE(nohz.next_blocked,
- now + msecs_to_jiffies(LOAD_AVG_PERIOD));
-
- /* The full idle balance loop has been done */
- ret = true;
-
-abort:
- /* There is still blocked load, enable periodic update */
- if (has_blocked_load)
- WRITE_ONCE(nohz.has_blocked, 1);
-
/*
* next_balance will be updated only when there is a need.
* When the CPU is attached to null domain for ex, it will not be
@@ -10376,12 +12658,19 @@ abort:
if (likely(update_next_balance))
nohz.next_balance = next_balance;
- return ret;
+ if (flags & NOHZ_STATS_KICK)
+ WRITE_ONCE(nohz.next_blocked,
+ now + msecs_to_jiffies(LOAD_AVG_PERIOD));
+
+abort:
+ /* There is still blocked load, enable periodic update */
+ if (has_blocked_load)
+ WRITE_ONCE(nohz.has_blocked, 1);
}
/*
* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * rebalancing for all the CPUs for whom scheduler ticks are stopped.
*/
static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
@@ -10395,21 +12684,43 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (idle != CPU_IDLE)
return false;
- _nohz_idle_balance(this_rq, flags, idle);
+ _nohz_idle_balance(this_rq, flags);
return true;
}
-static void nohz_newidle_balance(struct rq *this_rq)
+/*
+ * Check if we need to directly run the ILB for updating blocked load before
+ * entering idle state. Here we run ILB directly without issuing IPIs.
+ *
+ * Note that when this function is called, the tick may not be stopped on
+ * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and
+ * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
+ * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle
+ * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
+ * called from this function on (this) CPU that's not yet in the mask. That's
+ * OK because the goal of nohz_run_idle_balance() is to run ILB only for
+ * updating the blocked load of already idle CPUs without waking up one of
+ * those idle CPUs and outside the preempt disable / IRQ off phase of the local
+ * cpu about to enter idle, because it can take a long time.
+ */
+void nohz_run_idle_balance(int cpu)
{
- int this_cpu = this_rq->cpu;
+ unsigned int flags;
+
+ flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
/*
- * This CPU doesn't want to be disturbed by scheduler
- * housekeeping
+ * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
+ * (i.e. NOHZ_STATS_KICK set) and will do the same.
*/
- if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
- return;
+ if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
+ _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
+}
+
+static void nohz_newidle_balance(struct rq *this_rq)
+{
+ int this_cpu = this_rq->cpu;
/* Will wake up very soon. No time for doing anything else*/
if (this_rq->avg_idle < sysctl_sched_migration_cost)
@@ -10420,16 +12731,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
- raw_spin_unlock(&this_rq->lock);
/*
- * This CPU is going to be idle and blocked load of idle CPUs
- * need to be updated. Run the ilb locally as it is a good
- * candidate for ilb instead of waking up another idle CPU.
- * Kick an normal ilb if we failed to do the update.
+ * Set the need to trigger ILB in order to update blocked load
+ * before entering idle state.
*/
- if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
- kick_ilb(NOHZ_STATS_KICK);
- raw_spin_lock(&this_rq->lock);
+ atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
}
#else /* !CONFIG_NO_HZ_COMMON */
@@ -10444,7 +12750,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
/*
- * idle_balance is called by schedule() if this_cpu is about to become
+ * sched_balance_newidle is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*
* Returns:
@@ -10452,18 +12758,28 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
-static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
+ int continue_balancing = 1;
+ u64 t0, t1, curr_cost = 0;
struct sched_domain *sd;
int pulled_task = 0;
- u64 curr_cost = 0;
update_misfit_status(NULL, this_rq);
+
/*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
+ * There is a task waiting to run. No need to search for one.
+ * Return 0; the task will be enqueued when switching to idle.
+ */
+ if (this_rq->ttwu_pending)
+ return 0;
+
+ /*
+ * We must set idle_stamp _before_ calling sched_balance_rq()
+ * for CPU_NEWLY_IDLE, such that we measure this duration
+ * as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);
@@ -10481,82 +12797,83 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
*/
rq_unpin_lock(this_rq, rf);
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !READ_ONCE(this_rq->rd->overload)) {
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+
+ if (!get_rd_overloaded(this_rq->rd) ||
+ (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
- nohz_newidle_balance(this_rq);
-
goto out;
}
+ rcu_read_unlock();
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
+
+ t0 = sched_clock_cpu(this_cpu);
+ sched_balance_update_blocked_averages(this_cpu);
- update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
- int continue_balancing = 1;
- u64 t0, domain_cost;
+ u64 domain_cost;
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, &next_balance);
+ update_next_balance(sd, &next_balance);
+
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
break;
- }
if (sd->flags & SD_BALANCE_NEWIDLE) {
- t0 = sched_clock_cpu(this_cpu);
- pulled_task = load_balance(this_cpu, this_rq,
+ pulled_task = sched_balance_rq(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
- domain_cost = sched_clock_cpu(this_cpu) - t0;
- if (domain_cost > sd->max_newidle_lb_cost)
- sd->max_newidle_lb_cost = domain_cost;
+ t1 = sched_clock_cpu(this_cpu);
+ domain_cost = t1 - t0;
+ update_newidle_cost(sd, domain_cost);
curr_cost += domain_cost;
+ t0 = t1;
}
- update_next_balance(sd, &next_balance);
-
/*
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
*/
- if (pulled_task || this_rq->nr_running > 0)
+ if (pulled_task || !continue_balancing)
break;
}
rcu_read_unlock();
- raw_spin_lock(&this_rq->lock);
+ raw_spin_rq_lock(this_rq);
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
-out:
/*
* While browsing the domains, we released the rq lock, a task could
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task)
+ if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
+ /* Is there a task of a high priority class? */
+ if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
+ pulled_task = -1;
+
+out:
/* Move the next balance forward */
if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
- pulled_task = -1;
-
if (pulled_task)
this_rq->idle_stamp = 0;
+ else
+ nohz_newidle_balance(this_rq);
rq_repin_lock(this_rq, rf);
@@ -10564,19 +12881,21 @@ out:
}
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ * This softirq handler is triggered via SCHED_SOFTIRQ from two places:
+ *
+ * - directly from the local sched_tick() for periodic load balancing
+ *
+ * - indirectly from a remote sched_tick() for NOHZ idle balancing
+ * through the SMP cross-call nohz_csd_func()
*/
-static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
+static __latent_entropy void sched_balance_softirq(void)
{
struct rq *this_rq = this_rq();
- enum cpu_idle_type idle = this_rq->idle_balance ?
- CPU_IDLE : CPU_NOT_IDLE;
-
+ enum cpu_idle_type idle = this_rq->idle_balance;
/*
- * If this CPU has a pending nohz_balance_kick, then do the
+ * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
* balancing on behalf of the other idle CPUs whose ticks are
- * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ * stopped. Do nohz_idle_balance *before* sched_balance_domains to
* give the idle CPUs a chance to load balance. Else we may
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
@@ -10585,17 +12904,20 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
return;
/* normal load balance */
- update_blocked_averages(this_rq->cpu);
- rebalance_domains(this_rq, idle);
+ sched_balance_update_blocked_averages(this_rq->cpu);
+ sched_balance_domains(this_rq, idle);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
-void trigger_load_balance(struct rq *rq)
+void sched_balance_trigger(struct rq *rq)
{
- /* Don't need to rebalance while attached to NULL domain */
- if (unlikely(on_null_domain(rq)))
+ /*
+ * Don't need to rebalance while attached to NULL domain or
+ * runqueue CPU is not active
+ */
+ if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
return;
if (time_after_eq(jiffies, rq->next_balance))
@@ -10617,10 +12939,140 @@ static void rq_offline_fair(struct rq *rq)
/* Ensure any throttled groups are reachable by pick_next_task */
unthrottle_offline_cfs_rqs(rq);
+
+ /* Ensure that we remove rq contribution to group share: */
+ clear_tg_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_CORE
+static inline bool
+__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
+{
+ u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
+
+ return (rtime * min_nr_tasks > slice);
+}
+
+#define MIN_NR_TASKS_DURING_FORCEIDLE 2
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ /*
+ * If runqueue has only one task which used up its slice and
+ * if the sibling is forced idle, then trigger schedule to
+ * give forced idle task a chance.
+ *
+ * sched_slice() considers only this active rq and it gets the
+ * whole slice. But during force idle, we have siblings acting
+ * like a single runqueue and hence we need to consider runnable
+ * tasks on this CPU and the forced idle CPU. Ideally, we should
+ * go through the forced idle rq, but that would be a perf hit.
+ * We can assume that the forced idle CPU has at least
+ * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
+ * if we need to give up the CPU.
+ */
+ if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 &&
+ __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
+ resched_curr(rq);
+}
+
+/*
+ * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ */
+static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
+ bool forceidle)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (forceidle) {
+ if (cfs_rq->forceidle_seq == fi_seq)
+ break;
+ cfs_rq->forceidle_seq = fi_seq;
+ }
+
+ cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ }
+}
+
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
+{
+ struct sched_entity *se = &p->se;
+
+ if (p->sched_class != &fair_sched_class)
+ return;
+
+ se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
+}
+
+bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
+ bool in_fi)
+{
+ struct rq *rq = task_rq(a);
+ const struct sched_entity *sea = &a->se;
+ const struct sched_entity *seb = &b->se;
+ struct cfs_rq *cfs_rqa;
+ struct cfs_rq *cfs_rqb;
+ s64 delta;
+
+ SCHED_WARN_ON(task_rq(b)->core != rq->core);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Find an se in the hierarchy for tasks a and b, such that the se's
+ * are immediate siblings.
+ */
+ while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
+ int sea_depth = sea->depth;
+ int seb_depth = seb->depth;
+
+ if (sea_depth >= seb_depth)
+ sea = parent_entity(sea);
+ if (sea_depth <= seb_depth)
+ seb = parent_entity(seb);
+ }
+
+ se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
+ se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
+
+ cfs_rqa = sea->cfs_rq;
+ cfs_rqb = seb->cfs_rq;
+#else
+ cfs_rqa = &task_rq(a)->cfs;
+ cfs_rqb = &task_rq(b)->cfs;
+#endif
+
+ /*
+ * Find delta after normalizing se's vruntime with its cfs_rq's
+ * min_vruntime_fi, which would have been updated in prior calls
+ * to se_fi_update().
+ */
+ delta = (s64)(sea->vruntime - seb->vruntime) +
+ (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+
+ return delta > 0;
+}
+
+static int task_is_throttled_fair(struct task_struct *p, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq = task_group(p)->cfs_rq[cpu];
+#else
+ cfs_rq = &cpu_rq(cpu)->cfs;
+#endif
+ return throttled_hierarchy(cfs_rq);
+}
+#else
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
+#endif
+
/*
* scheduler tick hitting a task of our scheduling class.
*
@@ -10643,7 +13095,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);
update_misfit_status(curr, rq);
- update_overutilized_status(task_rq(curr));
+ check_update_overutilized_status(task_rq(curr));
+
+ task_tick_core(rq, curr);
}
/*
@@ -10653,33 +13107,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se, *curr;
- struct rq *rq = this_rq();
- struct rq_flags rf;
-
- rq_lock(rq, &rf);
- update_rq_clock(rq);
-
- cfs_rq = task_cfs_rq(current);
- curr = cfs_rq->curr;
- if (curr) {
- update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
- }
- place_entity(cfs_rq, se, 1);
-
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- /*
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
- }
-
- se->vruntime -= cfs_rq->min_vruntime;
- rq_unlock(rq, &rf);
+ set_task_max_allowed_capacity(p);
}
/*
@@ -10692,7 +13120,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
- if (rq->cfs.nr_running == 1)
+ if (rq->cfs.nr_queued == 1)
return;
/*
@@ -10700,39 +13128,11 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (rq->curr == p) {
+ if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
} else
- check_preempt_curr(rq, p, 0);
-}
-
-static inline bool vruntime_normalized(struct task_struct *p)
-{
- struct sched_entity *se = &p->se;
-
- /*
- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
- * the dequeue_entity(.flags=0) will already have normalized the
- * vruntime.
- */
- if (p->on_rq)
- return true;
-
- /*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - A forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - A task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- */
- if (!se->sum_exec_runtime ||
- (p->state == TASK_WAKING && p->sched_remote_wakeup))
- return true;
-
- return false;
+ wakeup_preempt(rq, p, 0);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -10742,7 +13142,13 @@ static inline bool vruntime_normalized(struct task_struct *p)
*/
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ return;
+
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
se = se->parent;
@@ -10750,10 +13156,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
+ update_load_avg(cfs_rq, se, UPDATE_TG);
+
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
}
}
#else
@@ -10764,10 +13173,21 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+#ifdef CONFIG_SMP
+ /*
+ * In case the task sched_avg hasn't been attached:
+ * - A forked task which hasn't been woken up by wake_up_new_task().
+ * - A task which has been woken up by try_to_wake_up() but is
+ * waiting for actually being woken up by sched_ttwu_pending().
+ */
+ if (!se->avg.last_update_time)
+ return;
+#endif
+
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
@@ -10775,34 +13195,16 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- if (!vruntime_normalized(p)) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
- }
detach_entity_cfs_rq(se);
}
@@ -10810,12 +13212,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
attach_entity_cfs_rq(se);
-
- if (!vruntime_normalized(p))
- se->vruntime += cfs_rq->min_vruntime;
}
static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -10825,27 +13223,26 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ SCHED_WARN_ON(p->se.sched_delayed);
+
attach_task_cfs_rq(p);
+ set_task_max_allowed_capacity(p);
+
if (task_on_rq_queued(p)) {
/*
* We were most likely switched from sched_rt, so
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
- if (rq->curr == p)
+ if (task_current_donor(rq, p))
resched_curr(rq);
else
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
}
-/* Account for a task changing its policy or group.
- *
- * This routine is mostly called to set cfs_rq->curr field when a task
- * migrates between groups/classes.
- */
-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
@@ -10858,6 +13255,27 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
list_move(&se->group_node, &rq->cfs_tasks);
}
#endif
+ if (!first)
+ return;
+
+ SCHED_WARN_ON(se->sched_delayed);
+
+ if (hrtick_enabled_fair(rq))
+ hrtick_start_fair(rq, p);
+
+ update_misfit_status(p, rq);
+ sched_fair_update_stop_tick(rq, p);
+}
+
+/*
+ * Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+{
+ struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -10866,60 +13284,43 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
/* ensure bandwidth has been allocated on our new cfs_rq */
account_cfs_rq_runtime(cfs_rq, 0);
}
+
+ __set_next_task_fair(rq, p, first);
}
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifndef CONFIG_64BIT
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_set_group_fair(struct task_struct *p)
+static void task_change_group_fair(struct task_struct *p)
{
- struct sched_entity *se = &p->se;
-
- set_task_rq(p, task_cpu(p));
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-}
+ /*
+ * We couldn't detach or attach a forked task which
+ * hasn't been woken up by wake_up_new_task().
+ */
+ if (READ_ONCE(p->__state) == TASK_NEW)
+ return;
-static void task_move_group_fair(struct task_struct *p)
-{
detach_task_cfs_rq(p);
- set_task_rq(p, task_cpu(p));
#ifdef CONFIG_SMP
/* Tell se's cfs_rq has been changed -- migrated */
p->se.avg.last_update_time = 0;
#endif
+ set_task_rq(p, task_cpu(p));
attach_task_cfs_rq(p);
}
-static void task_change_group_fair(struct task_struct *p, int type)
-{
- switch (type) {
- case TASK_SET_GROUP:
- task_set_group_fair(p);
- break;
-
- case TASK_MOVE_GROUP:
- task_move_group_fair(p);
- break;
- }
-}
-
void free_fair_sched_group(struct task_group *tg)
{
int i;
- destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
@@ -10946,7 +13347,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
tg->shares = NICE_0_LOAD;
- init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
for_each_possible_cpu(i) {
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
@@ -10954,7 +13355,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!cfs_rq)
goto err;
- se = kzalloc_node(sizeof(struct sched_entity),
+ se = kzalloc_node(sizeof(struct sched_entity_stats),
GFP_KERNEL, cpu_to_node(i));
if (!se)
goto err_free_rq;
@@ -10992,26 +13393,35 @@ void online_fair_sched_group(struct task_group *tg)
void unregister_fair_sched_group(struct task_group *tg)
{
- unsigned long flags;
- struct rq *rq;
int cpu;
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
for_each_possible_cpu(cpu) {
- if (tg->se[cpu])
- remove_entity_load_avg(tg->se[cpu]);
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ struct sched_entity *se = tg->se[cpu];
+ struct rq *rq = cpu_rq(cpu);
+
+ if (se) {
+ if (se->sched_delayed) {
+ guard(rq_lock_irqsave)(rq);
+ if (se->sched_delayed) {
+ update_rq_clock(rq);
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ }
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
+ remove_entity_load_avg(se);
+ }
/*
* Only empty task groups can be destroyed; so we can speculatively
* check on_list without danger of it being re-added.
*/
- if (!tg->cfs_rq[cpu]->on_list)
- continue;
-
- rq = cpu_rq(cpu);
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ if (cfs_rq->on_list) {
+ guard(rq_lock_irqsave)(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
}
}
@@ -11048,10 +13458,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
static DEFINE_MUTEX(shares_mutex);
-int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
+ lockdep_assert_held(&shares_mutex);
+
/*
* We can't change the weight of the root cgroup.
*/
@@ -11060,9 +13472,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
- mutex_lock(&shares_mutex);
if (tg->shares == shares)
- goto done;
+ return 0;
tg->shares = shares;
for_each_possible_cpu(i) {
@@ -11080,22 +13491,87 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
rq_unlock_irqrestore(rq, &rf);
}
-done:
- mutex_unlock(&shares_mutex);
return 0;
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
-
-void free_fair_sched_group(struct task_group *tg) { }
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
- return 1;
+ int ret;
+
+ mutex_lock(&shares_mutex);
+ if (tg_is_idle(tg))
+ ret = -EINVAL;
+ else
+ ret = __sched_group_set_shares(tg, shares);
+ mutex_unlock(&shares_mutex);
+
+ return ret;
}
-void online_fair_sched_group(struct task_group *tg) { }
+int sched_group_set_idle(struct task_group *tg, long idle)
+{
+ int i;
-void unregister_fair_sched_group(struct task_group *tg) { }
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ if (idle < 0 || idle > 1)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->idle == idle) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->idle = idle;
+
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se = tg->se[i];
+ struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+ bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
+ long idle_task_delta;
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+
+ grp_cfs_rq->idle = idle;
+ if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
+ goto next_cpu;
+
+ idle_task_delta = grp_cfs_rq->h_nr_queued -
+ grp_cfs_rq->h_nr_idle;
+ if (!cfs_rq_is_idle(grp_cfs_rq))
+ idle_task_delta *= -1;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!se->on_rq)
+ break;
+
+ cfs_rq->h_nr_idle += idle_task_delta;
+
+ /* Already accounted at parent level and above. */
+ if (cfs_rq_is_idle(cfs_rq))
+ break;
+ }
+
+next_cpu:
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ /* Idle groups have minimum weight. */
+ if (tg_is_idle(tg))
+ __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
+ else
+ __sched_group_set_shares(tg, NICE_0_LOAD);
+
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -11110,7 +13586,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+ rr_interval = NS_TO_JIFFIES(se->slice);
return rr_interval;
}
@@ -11118,15 +13594,16 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
/*
* All the scheduling class methods:
*/
-const struct sched_class fair_sched_class = {
- .next = &idle_sched_class,
+DEFINE_SCHED_CLASS(fair) = {
+
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
- .check_preempt_curr = check_preempt_wakeup,
+ .wakeup_preempt = check_preempt_wakeup_fair,
+ .pick_task = pick_task_fair,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
@@ -11140,12 +13617,13 @@ const struct sched_class fair_sched_class = {
.rq_offline = rq_offline_fair,
.task_dead = task_dead_fair,
- .set_cpus_allowed = set_cpus_allowed_common,
+ .set_cpus_allowed = set_cpus_allowed_fair,
#endif
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
+ .reweight_task = reweight_task_fair,
.prio_changed = prio_changed_fair,
.switched_from = switched_from_fair,
.switched_to = switched_to_fair,
@@ -11158,6 +13636,10 @@ const struct sched_class fair_sched_class = {
.task_change_group = task_change_group_fair,
#endif
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_fair,
+#endif
+
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
@@ -11202,93 +13684,27 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
- open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
-
-#ifdef CONFIG_NO_HZ_COMMON
- nohz.next_balance = jiffies;
- nohz.next_blocked = jiffies;
- zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
-#endif
-#endif /* SMP */
-
-}
+ int i;
-/*
- * Helper functions to facilitate extracting info from tracepoints.
- */
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
+ GFP_KERNEL, cpu_to_node(i));
-const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
-{
-#ifdef CONFIG_SMP
- return cfs_rq ? &cfs_rq->avg : NULL;
-#else
- return NULL;
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
+ INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
-
-char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
-{
- if (!cfs_rq) {
- if (str)
- strlcpy(str, "(null)", len);
- else
- return NULL;
}
- cfs_rq_tg_path(cfs_rq, str, len);
- return str;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
-
-int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
-{
- return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
-
-const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_rt : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
-
-const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_dl : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
+ open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);
-const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
- return rq ? &rq->avg_irq : NULL;
-#else
- return NULL;
+#ifdef CONFIG_NO_HZ_COMMON
+ nohz.next_balance = jiffies;
+ nohz.next_blocked = jiffies;
+ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
-
-int sched_trace_rq_cpu(struct rq *rq)
-{
- return rq ? cpu_of(rq) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
+#endif /* SMP */
-const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
-{
-#ifdef CONFIG_SMP
- return rd ? rd->span : NULL;
-#else
- return NULL;
-#endif
}
-EXPORT_SYMBOL_GPL(sched_trace_rd_span);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7481cd96f391..3c12d9f93331 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -1,16 +1,28 @@
/* SPDX-License-Identifier: GPL-2.0 */
+
/*
- * Only give sleepers 50% of their service deficit. This allows
- * them to run sooner, but does not allow tons of sleepers to
- * rip the spread apart.
+ * Using the avg_vruntime, do the right thing and preserve lag across
+ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
-
+SCHED_FEAT(PLACE_LAG, true)
+/*
+ * Give new tasks half a slice to ease into the competition.
+ */
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
+/*
+ * Preserve relative virtual deadline on 'migration'.
+ */
+SCHED_FEAT(PLACE_REL_DEADLINE, true)
+/*
+ * Inhibit (wakeup) preemption until the current task has either matched the
+ * 0-lag point or until it has exhausted its slice.
+ */
+SCHED_FEAT(RUN_TO_PARITY, true)
/*
- * Place new tasks ahead so that they do not starve already running
- * tasks
+ * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for
+ * current.
*/
-SCHED_FEAT(START_DEBIT, true)
+SCHED_FEAT(PREEMPT_SHORT, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
@@ -20,42 +32,60 @@ SCHED_FEAT(START_DEBIT, true)
SCHED_FEAT(NEXT_BUDDY, false)
/*
- * Prefer to schedule the task that ran last (when we did
- * wake-preempt) as that likely will touch the same data, increases
- * cache locality.
+ * Allow completely ignoring cfs_rq->next; which can be set from various
+ * places:
+ * - NEXT_BUDDY (wakeup preemption)
+ * - yield_to_task()
+ * - cgroup dequeue / pick
*/
-SCHED_FEAT(LAST_BUDDY, true)
+SCHED_FEAT(PICK_BUDDY, true)
/*
- * Consider buddies to be cache hot, decreases the likelyness of a
+ * Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
*/
SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
+ * Delay dequeueing tasks until they get selected or woken.
+ *
+ * By delaying the dequeue for non-eligible tasks, they remain in the
+ * competition and can burn off their negative lag. When they get selected
+ * they'll have positive lag by definition.
+ *
+ * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0.
+ */
+SCHED_FEAT(DELAY_DEQUEUE, true)
+SCHED_FEAT(DELAY_ZERO, true)
+
+/*
* Allow wakeup-time preemption of the current task:
*/
SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
-SCHED_FEAT(DOUBLE_TICK, false)
+SCHED_FEAT(HRTICK_DL, false)
/*
* Decrement CPU capacity based on time not spent running tasks
*/
SCHED_FEAT(NONTASK_CAPACITY, true)
+#ifdef CONFIG_PREEMPT_RT
+SCHED_FEAT(TTWU_QUEUE, false)
+#else
+
/*
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
SCHED_FEAT(TTWU_QUEUE, true)
+#endif
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
-SCHED_FEAT(SIS_AVG_CPU, false)
-SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_UTIL, true)
/*
* Issue a WARN when we do multiple update_rq_clock() calls
@@ -77,7 +107,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
SCHED_FEAT(RT_PUSH_IPI, true)
#endif
-SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(RT_RUNTIME_SHARE, false)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
@@ -89,4 +119,5 @@ SCHED_FEAT(WA_BIAS, true)
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
-SCHED_FEAT(UTIL_EST_FASTUP, true)
+
+SCHED_FEAT(LATENCY_WARN, false)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1ae95b9150d3..2c85c86b455f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -6,9 +6,6 @@
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/
-#include "sched.h"
-
-#include <trace/events/power.h>
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
@@ -54,17 +51,22 @@ __setup("hlt", cpu_idle_nopoll_setup);
static noinline int __cpuidle cpu_idle_poll(void)
{
- rcu_idle_enter();
- trace_cpu_idle_rcuidle(0, smp_processor_id());
- local_irq_enable();
+ instrumentation_begin();
+ trace_cpu_idle(0, smp_processor_id());
stop_critical_timings();
+ ct_cpuidle_enter();
+ raw_local_irq_enable();
while (!tif_need_resched() &&
- (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
+ raw_local_irq_disable();
+
+ ct_cpuidle_exit();
start_critical_timings();
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
- rcu_idle_exit();
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ local_irq_enable();
+ instrumentation_end();
return 1;
}
@@ -73,13 +75,31 @@ static noinline int __cpuidle cpu_idle_poll(void)
void __weak arch_cpu_idle_prepare(void) { }
void __weak arch_cpu_idle_enter(void) { }
void __weak arch_cpu_idle_exit(void) { }
-void __weak arch_cpu_idle_dead(void) { }
+void __weak __noreturn arch_cpu_idle_dead(void) { while (1); }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
- local_irq_enable();
}
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE
+DEFINE_STATIC_KEY_FALSE(arch_needs_tick_broadcast);
+
+static inline void cond_tick_broadcast_enter(void)
+{
+ if (static_branch_unlikely(&arch_needs_tick_broadcast))
+ tick_broadcast_enter();
+}
+
+static inline void cond_tick_broadcast_exit(void)
+{
+ if (static_branch_unlikely(&arch_needs_tick_broadcast))
+ tick_broadcast_exit();
+}
+#else
+static inline void cond_tick_broadcast_enter(void) { }
+static inline void cond_tick_broadcast_exit(void) { }
+#endif
+
/**
* default_idle_call - Default CPU idle routine.
*
@@ -87,13 +107,22 @@ void __weak arch_cpu_idle(void)
*/
void __cpuidle default_idle_call(void)
{
- if (current_clr_polling_and_test()) {
- local_irq_enable();
- } else {
+ instrumentation_begin();
+ if (!current_clr_polling_and_test()) {
+ cond_tick_broadcast_enter();
+ trace_cpu_idle(1, smp_processor_id());
stop_critical_timings();
+
+ ct_cpuidle_enter();
arch_cpu_idle();
+ ct_cpuidle_exit();
+
start_critical_timings();
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ cond_tick_broadcast_exit();
}
+ local_irq_enable();
+ instrumentation_end();
}
static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
@@ -131,7 +160,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*
* NOTE: no locks or semaphores should be used here
*
- * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * On architectures that support TIF_POLLING_NRFLAG, is called with polling
* set, and it returns with polling set. If it ever stops polling, it
* must clear the polling bit.
*/
@@ -143,22 +172,15 @@ static void cpuidle_idle_call(void)
/*
* Check if the idle task must be rescheduled. If it is the
- * case, exit the function after re-enabling the local irq.
+ * case, exit the function after re-enabling the local IRQ.
*/
if (need_resched()) {
local_irq_enable();
return;
}
- /*
- * The RCU framework needs to be told that we are entering an idle
- * section, so no more rcu read side critical sections and one more
- * step to the grace period
- */
-
if (cpuidle_not_available(drv, dev)) {
tick_nohz_idle_stop_tick();
- rcu_idle_enter();
default_idle_call();
goto exit_idle;
@@ -168,7 +190,7 @@ static void cpuidle_idle_call(void)
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
* activity happens here and in interrupts (if any). In that case bypass
- * the cpuidle governor and go stratight for the deepest idle state
+ * the cpuidle governor and go straight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
@@ -178,21 +200,17 @@ static void cpuidle_idle_call(void)
u64 max_latency_ns;
if (idle_should_enter_s2idle()) {
- rcu_idle_enter();
entered_state = call_cpuidle_s2idle(drv, dev);
if (entered_state > 0)
goto exit_idle;
- rcu_idle_exit();
-
max_latency_ns = U64_MAX;
} else {
max_latency_ns = dev->forced_idle_latency_limit_ns;
}
tick_nohz_idle_stop_tick();
- rcu_idle_enter();
next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
call_cpuidle(drv, dev, next_state);
@@ -209,8 +227,6 @@ static void cpuidle_idle_call(void)
else
tick_nohz_idle_retain_tick();
- rcu_idle_enter();
-
entered_state = call_cpuidle(drv, dev, next_state);
/*
* Give the governor an opportunity to reflect on the outcome
@@ -222,12 +238,10 @@ exit_idle:
__current_set_polling();
/*
- * It is up to the idle functions to reenable local interrupts
+ * It is up to the idle functions to re-enable local interrupts
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
-
- rcu_idle_exit();
}
/*
@@ -238,6 +252,12 @@ exit_idle:
static void do_idle(void)
{
int cpu = smp_processor_id();
+
+ /*
+ * Check if we need to update blocked load
+ */
+ nohz_run_idle_balance(cpu);
+
/*
* If the arch has a polling bit, we maintain an invariant:
*
@@ -251,20 +271,49 @@ static void do_idle(void)
tick_nohz_idle_enter();
while (!need_resched()) {
- rmb();
+ /*
+ * Interrupts shouldn't be re-enabled from that point on until
+ * the CPU sleeping instruction is reached. Otherwise an interrupt
+ * may fire and queue a timer that would be ignored until the CPU
+ * wakes from the sleeping instruction. And testing need_resched()
+ * doesn't tell about pending needed timer reprogram.
+ *
+ * Several cases to consider:
+ *
+ * - SLEEP-UNTIL-PENDING-INTERRUPT based instructions such as
+ * "wfi" or "mwait" are fine because they can be entered with
+ * interrupts disabled.
+ *
+ * - sti;mwait() couple is fine because the interrupts are
+ * re-enabled only upon the execution of mwait, leaving no gap
+ * in-between.
+ *
+ * - ROLLBACK based idle handlers with the sleeping instruction
+ * called with interrupts enabled are NOT fine. In this scheme
+ * when the interrupt detects it has interrupted an idle handler,
+ * it rolls back to its beginning which performs the
+ * need_resched() check before re-executing the sleeping
+ * instruction. This can leak a pending needed timer reprogram.
+ * If such a scheme is really mandatory due to the lack of an
+ * appropriate CPU sleeping instruction, then a FAST-FORWARD
+ * must instead be applied: when the interrupt detects it has
+ * interrupted an idle handler, it must resume to the end of
+ * this idle handler so that the generic idle loop is iterated
+ * again to reprogram the tick.
+ */
local_irq_disable();
if (cpu_is_offline(cpu)) {
- tick_nohz_idle_stop_tick();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
arch_cpu_idle_enter();
+ rcu_nocb_flush_deferred_wakeup();
/*
- * In poll mode we reenable interrupts and spin. Also if we
+ * In poll mode we re-enable interrupts and spin. Also if we
* detected in the wakeup from idle path that the tick
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
@@ -300,7 +349,7 @@ static void do_idle(void)
* RCU relies on this call to be done outside of an RCU read-side
* critical section.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
schedule_idle();
if (unlikely(klp_patch_pending(current)))
@@ -341,6 +390,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
WARN_ON_ONCE(!duration_ns);
+ WARN_ON_ONCE(current->mm);
rcu_sleep_check();
preempt_disable();
@@ -348,10 +398,10 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
cpuidle_use_deepest_state(latency_ns);
it.done = 0;
- hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- it.timer.function = idle_inject_timer_fn;
+ hrtimer_setup_on_stack(&it.timer, idle_inject_timer_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD);
hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
while (!READ_ONCE(it.done))
do_idle();
@@ -366,6 +416,7 @@ EXPORT_SYMBOL_GPL(play_idle_precise);
void cpu_startup_entry(enum cpuhp_state state)
{
+ current->flags |= PF_IDLE;
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)
@@ -378,7 +429,7 @@ void cpu_startup_entry(enum cpuhp_state state)
#ifdef CONFIG_SMP
static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int flags)
{
return task_cpu(p); /* IDLE tasks are never migrated */
}
@@ -393,41 +444,43 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* Idle tasks are unconditionally rescheduled:
*/
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_curr(rq);
}
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
+ dl_server_update_idle_time(rq, prev);
+ scx_update_idle(rq, false, true);
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
+ scx_update_idle(rq, true, true);
schedstat_inc(rq->sched_goidle);
+ next->se.exec_start = rq_clock_task(rq);
}
-struct task_struct *pick_next_task_idle(struct rq *rq)
+struct task_struct *pick_task_idle(struct rq *rq)
{
- struct task_struct *next = rq->idle;
-
- set_next_task_idle(rq, next, true);
-
- return next;
+ scx_update_idle(rq, true, false);
+ return rq->idle;
}
/*
* It is not legal to sleep in the idle task - print a warning
* message if some code attempts to do it:
*/
-static void
+static bool
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_rq_lock_irq(rq);
+ return true;
}
/*
@@ -453,11 +506,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
BUG();
}
-static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_idle(struct rq *rq)
{
}
@@ -465,16 +513,16 @@ static void update_curr_idle(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
-const struct sched_class idle_sched_class = {
- /* .next is NULL */
+DEFINE_SCHED_CLASS(idle) = {
+
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
- .check_preempt_curr = check_preempt_curr_idle,
+ .wakeup_preempt = wakeup_preempt_idle,
- .pick_next_task = pick_next_task_idle,
+ .pick_task = pick_task_idle,
.put_prev_task = put_prev_task_idle,
.set_next_task = set_next_task_idle,
@@ -486,8 +534,6 @@ const struct sched_class idle_sched_class = {
.task_tick = task_tick_idle,
- .get_rr_interval = get_rr_interval_idle,
-
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 808244f3ddd9..81bc8b329ef1 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -7,140 +7,189 @@
* Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
*
*/
-#include "sched.h"
+
+enum hk_flags {
+ HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
+ HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
+ HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
+};
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overridden);
-static cpumask_var_t housekeeping_mask;
-static unsigned int housekeeping_flags;
-bool housekeeping_enabled(enum hk_flags flags)
+struct housekeeping {
+ cpumask_var_t cpumasks[HK_TYPE_MAX];
+ unsigned long flags;
+};
+
+static struct housekeeping housekeeping;
+
+bool housekeeping_enabled(enum hk_type type)
{
- return !!(housekeeping_flags & flags);
+ return !!(housekeeping.flags & BIT(type));
}
EXPORT_SYMBOL_GPL(housekeeping_enabled);
-int housekeeping_any_cpu(enum hk_flags flags)
+int housekeeping_any_cpu(enum hk_type type)
{
int cpu;
if (static_branch_unlikely(&housekeeping_overridden)) {
- if (housekeeping_flags & flags) {
- cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
+ if (housekeeping.flags & BIT(type)) {
+ cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
if (cpu < nr_cpu_ids)
return cpu;
- return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+ cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
+ if (likely(cpu < nr_cpu_ids))
+ return cpu;
+ /*
+ * Unless we have another problem this can only happen
+ * at boot time before start_secondary() brings the 1st
+ * housekeeping CPU up.
+ */
+ WARN_ON_ONCE(system_state == SYSTEM_RUNNING ||
+ type != HK_TYPE_TIMER);
}
}
return smp_processor_id();
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
-const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
+const struct cpumask *housekeeping_cpumask(enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- return housekeeping_mask;
+ if (housekeeping.flags & BIT(type))
+ return housekeeping.cpumasks[type];
return cpu_possible_mask;
}
EXPORT_SYMBOL_GPL(housekeeping_cpumask);
-void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
+void housekeeping_affine(struct task_struct *t, enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- set_cpus_allowed_ptr(t, housekeeping_mask);
+ if (housekeeping.flags & BIT(type))
+ set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
}
EXPORT_SYMBOL_GPL(housekeeping_affine);
-bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
+bool housekeeping_test_cpu(int cpu, enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- return cpumask_test_cpu(cpu, housekeeping_mask);
+ if (housekeeping.flags & BIT(type))
+ return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
return true;
}
EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
void __init housekeeping_init(void)
{
- if (!housekeeping_flags)
+ enum hk_type type;
+
+ if (!housekeeping.flags)
return;
static_branch_enable(&housekeeping_overridden);
- if (housekeeping_flags & HK_FLAG_TICK)
+ if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
sched_tick_offload_init();
- /* We need at least one CPU to handle housekeeping work */
- WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+ for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
+ /* We need at least one CPU to handle housekeeping work */
+ WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
+ }
}
-static int __init housekeeping_setup(char *str, enum hk_flags flags)
+static void __init housekeeping_setup_type(enum hk_type type,
+ cpumask_var_t housekeeping_staging)
{
- cpumask_var_t non_housekeeping_mask;
- cpumask_var_t tmp;
- int err;
+
+ alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
+ cpumask_copy(housekeeping.cpumasks[type],
+ housekeeping_staging);
+}
+
+static int __init housekeeping_setup(char *str, unsigned long flags)
+{
+ cpumask_var_t non_housekeeping_mask, housekeeping_staging;
+ unsigned int first_cpu;
+ int err = 0;
+
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) {
+ if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ pr_warn("Housekeeping: nohz unsupported."
+ " Build with CONFIG_NO_HZ_FULL\n");
+ return 0;
+ }
+ }
alloc_bootmem_cpumask_var(&non_housekeeping_mask);
- err = cpulist_parse(str, non_housekeeping_mask);
- if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
+ if (cpulist_parse(str, non_housekeeping_mask) < 0) {
pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
+ goto free_non_housekeeping_mask;
}
- alloc_bootmem_cpumask_var(&tmp);
- if (!housekeeping_flags) {
- alloc_bootmem_cpumask_var(&housekeeping_mask);
- cpumask_andnot(housekeeping_mask,
- cpu_possible_mask, non_housekeeping_mask);
+ alloc_bootmem_cpumask_var(&housekeeping_staging);
+ cpumask_andnot(housekeeping_staging,
+ cpu_possible_mask, non_housekeeping_mask);
- cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
- if (cpumask_empty(tmp)) {
+ first_cpu = cpumask_first_and(cpu_present_mask, housekeeping_staging);
+ if (first_cpu >= nr_cpu_ids || first_cpu >= setup_max_cpus) {
+ __cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
+ __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
+ if (!housekeeping.flags) {
pr_warn("Housekeeping: must include one present CPU, "
"using boot CPU:%d\n", smp_processor_id());
- __cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
- __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
- }
- } else {
- cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
- if (cpumask_empty(tmp))
- __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
- cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
- if (!cpumask_equal(tmp, housekeeping_mask)) {
- pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
- free_bootmem_cpumask_var(tmp);
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
}
}
- free_bootmem_cpumask_var(tmp);
- if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
- if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
- tick_nohz_full_setup(non_housekeeping_mask);
- } else {
- pr_warn("Housekeeping: nohz unsupported."
- " Build with CONFIG_NO_HZ_FULL\n");
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
+ if (cpumask_empty(non_housekeeping_mask))
+ goto free_housekeeping_staging;
+
+ if (!housekeeping.flags) {
+ /* First setup call ("nohz_full=" or "isolcpus=") */
+ enum hk_type type;
+
+ for_each_set_bit(type, &flags, HK_TYPE_MAX)
+ housekeeping_setup_type(type, housekeeping_staging);
+ } else {
+ /* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */
+ enum hk_type type;
+ unsigned long iter_flags = flags & housekeeping.flags;
+
+ for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
+ if (!cpumask_equal(housekeeping_staging,
+ housekeeping.cpumasks[type])) {
+ pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+ goto free_housekeeping_staging;
+ }
}
+
+ iter_flags = flags & ~housekeeping.flags;
+
+ for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)
+ housekeeping_setup_type(type, housekeeping_staging);
}
- housekeeping_flags |= flags;
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE))
+ tick_nohz_full_setup(non_housekeeping_mask);
+ housekeeping.flags |= flags;
+ err = 1;
+
+free_housekeeping_staging:
+ free_bootmem_cpumask_var(housekeeping_staging);
+free_non_housekeeping_mask:
free_bootmem_cpumask_var(non_housekeeping_mask);
- return 1;
+ return err;
}
static int __init housekeeping_nohz_full_setup(char *str)
{
- unsigned int flags;
+ unsigned long flags;
- flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+ flags = HK_FLAG_KERNEL_NOISE;
return housekeeping_setup(str, flags);
}
@@ -148,15 +197,18 @@ __setup("nohz_full=", housekeeping_nohz_full_setup);
static int __init housekeeping_isolcpus_setup(char *str)
{
- unsigned int flags = 0;
+ unsigned long flags = 0;
bool illegal = false;
char *par;
int len;
while (isalpha(*str)) {
+ /*
+ * isolcpus=nohz is equivalent to nohz_full.
+ */
if (!strncmp(str, "nohz,", 5)) {
str += 5;
- flags |= HK_FLAG_TICK;
+ flags |= HK_FLAG_KERNEL_NOISE;
continue;
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index de22da666ac7..c48900b856a2 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,7 +6,6 @@
* figure. It's a silly number but people think it's important. We go through
* great pains to make it work on big machines and tickless kernels.
*/
-#include "sched.h"
/*
* Global load-average calculations
@@ -46,7 +45,7 @@
* again, being late doesn't lose the delta, just wrecks the sample.
*
* - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
- * this would add another cross-CPU cacheline miss and atomic operation
+ * this would add another cross-CPU cache-line miss and atomic operation
* to the wakeup path. Instead we increment on whatever CPU the task ran
* when it went into uninterruptible state and decrement on whatever CPU
* did the wakeup. This means that only the sum of nr_uninterruptible over
@@ -63,7 +62,7 @@ EXPORT_SYMBOL(avenrun); /* should be removed */
/**
* get_avenrun - get the load average array
- * @loads: pointer to dest load array
+ * @loads: pointer to destination load array
* @offset: offset to add
* @shift: shift count to shift the result left
*
@@ -81,7 +80,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
long nr_active, delta = 0;
nr_active = this_rq->nr_running - adjust;
- nr_active += (long)this_rq->nr_uninterruptible;
+ nr_active += (int)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
@@ -189,7 +188,7 @@ calc_load_n(unsigned long load, unsigned long exp,
* w:0 1 1 0 0 1 1 0 0
*
* This ensures we'll fold the old NO_HZ contribution in this window while
- * accumlating the new one.
+ * accumulating the new one.
*
* - When we wake up from NO_HZ during the window, we push up our
* contribution, since we effectively move our sample point to a known
@@ -347,7 +346,7 @@ static inline void calc_global_nohz(void) { }
*
* Called from the global timer code.
*/
-void calc_global_load(unsigned long ticks)
+void calc_global_load(void)
{
unsigned long sample_window;
long active, delta;
@@ -380,7 +379,7 @@ void calc_global_load(unsigned long ticks)
}
/*
- * Called from scheduler_tick() to periodically update this CPU's
+ * Called from sched_tick() to periodically update this CPU's
* active count.
*/
void calc_global_load_tick(struct rq *this_rq)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 168479a7d61b..809194cd779f 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -4,7 +4,134 @@
*
* membarrier system call
*/
-#include "sched.h"
+
+/*
+ * For documentation purposes, here are some membarrier ordering
+ * scenarios to keep in mind:
+ *
+ * A) Userspace thread execution after IPI vs membarrier's memory
+ * barrier before sending the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the start of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU0 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU1 after the IPI-induced memory barrier:
+ *
+ * CPU0 CPU1
+ *
+ * x = 1
+ * membarrier():
+ * a: smp_mb()
+ * b: send IPI IPI-induced mb
+ * c: smp_mb()
+ * r2 = y
+ * y = 1
+ * barrier()
+ * r1 = x
+ *
+ * BUG_ON(r1 == 0 && r2 == 0)
+ *
+ * The write to y and load from x by CPU1 are unordered by the hardware,
+ * so it's possible to have "r1 = x" reordered before "y = 1" at any
+ * point after (b). If the memory barrier at (a) is omitted, then "x = 1"
+ * can be reordered after (a) (although not after (c)), so we get r1 == 0
+ * and r2 == 0. This violates the guarantee that membarrier() is
+ * supposed to provide.
+ *
+ * The timing of the memory barrier at (a) has to ensure that it executes
+ * before the IPI-induced memory barrier on CPU1.
+ *
+ * B) Userspace thread execution before IPI vs membarrier's memory
+ * barrier after completing the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the end of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU1 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU0 after the membarrier():
+ *
+ * CPU0 CPU1
+ *
+ * x = 1
+ * barrier()
+ * y = 1
+ * r2 = y
+ * membarrier():
+ * a: smp_mb()
+ * b: send IPI IPI-induced mb
+ * c: smp_mb()
+ * r1 = x
+ * BUG_ON(r1 == 0 && r2 == 1)
+ *
+ * The writes to x and y are unordered by the hardware, so it's possible to
+ * have "r2 = 1" even though the write to x doesn't execute until (b). If
+ * the memory barrier at (c) is omitted then "r1 = x" can be reordered
+ * before (b) (although not before (a)), so we get "r1 = 0". This violates
+ * the guarantee that membarrier() is supposed to provide.
+ *
+ * The timing of the memory barrier at (c) has to ensure that it executes
+ * after the IPI-induced memory barrier on CPU1.
+ *
+ * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * d: switch to kthread (includes mb)
+ * b: read rq->curr->mm == NULL
+ * e: switch to user (includes mb)
+ * c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (e). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ *
+ * D) exit_mm vs membarrier
+ *
+ * Two thread groups are created, A and B. Thread group B is created by
+ * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD.
+ * Let's assume we have a single thread within each thread group (Thread A
+ * and Thread B). Thread A runs on CPU0, Thread B runs on CPU1.
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * exit_mm():
+ * d: smp_mb()
+ * e: current->mm = NULL
+ * b: read rq->curr->mm == NULL
+ * c: smp_mb()
+ *
+ * Using scenario (B), we can show that (c) needs to be paired with (d).
+ *
+ * E) kthread_{use,unuse}_mm vs membarrier
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * kthread_unuse_mm()
+ * d: smp_mb()
+ * e: current->mm = NULL
+ * b: read rq->curr->mm == NULL
+ * kthread_use_mm()
+ * f: current->mm = mm
+ * g: smp_mb()
+ * c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (g). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ */
/*
* Bitmask made from an "or" of all commands within enum membarrier_cmd,
@@ -18,18 +145,61 @@
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif
+#ifdef CONFIG_RSEQ
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
+#endif
+
#define MEMBARRIER_CMD_BITMASK \
(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_PRIVATE_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
- | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
+ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+ | MEMBARRIER_CMD_GET_REGISTRATIONS)
+
+static DEFINE_MUTEX(membarrier_ipi_mutex);
+#define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex)
static void ipi_mb(void *info)
{
smp_mb(); /* IPIs should be serializing but paranoid. */
}
+static void ipi_sync_core(void *info)
+{
+ /*
+ * The smp_mb() in membarrier after all the IPIs is supposed to
+ * ensure that memory accesses on remote CPUs that occur before the
+ * IPI become visible to membarrier()'s caller -- see scenario B in
+ * the big comment at the top of this file.
+ *
+ * A sync_core() would provide this guarantee, but
+ * sync_core_before_usermode() might end up being deferred until
+ * after membarrier()'s smp_mb().
+ */
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+
+ sync_core_before_usermode();
+}
+
+static void ipi_rseq(void *info)
+{
+ /*
+ * Ensure that all stores done by the calling thread are visible
+ * to the current task before the current task resumes. We could
+ * probably optimize this away on most architectures, but by the
+ * time we've already sent an IPI, the cost of the extra smp_mb()
+ * is negligible.
+ */
+ smp_mb();
+ rseq_preempt(current);
+}
+
static void ipi_sync_rq_state(void *info)
{
struct mm_struct *mm = (struct mm_struct *) info;
@@ -63,6 +233,18 @@ void membarrier_exec_mmap(struct mm_struct *mm)
this_cpu_write(runqueues.membarrier_state, 0);
}
+void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+ struct rq *rq = this_rq();
+ int membarrier_state = 0;
+
+ if (next_mm)
+ membarrier_state = atomic_read(&next_mm->membarrier_state);
+ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+ return;
+ WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+
static int membarrier_global_expedited(void)
{
int cpu;
@@ -72,7 +254,7 @@ static int membarrier_global_expedited(void)
return 0;
/*
- * Matches memory barriers around rq->curr modification in
+ * Matches memory barriers after rq->curr modification in
* scheduler.
*/
smp_mb(); /* system call entry is not a mb. */
@@ -80,6 +262,7 @@ static int membarrier_global_expedited(void)
if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
+ SERIALIZE_IPI();
cpus_read_lock();
rcu_read_lock();
for_each_online_cpu(cpu) {
@@ -101,12 +284,11 @@ static int membarrier_global_expedited(void)
continue;
/*
- * Skip the CPU if it runs a kernel thread. The scheduler
- * leaves the prior task mm in place as an optimization when
- * scheduling a kthread.
+ * Skip the CPU if it runs a kernel thread which is not using
+ * a task mm.
*/
p = rcu_dereference(cpu_rq(cpu)->curr);
- if (p->flags & PF_KTHREAD)
+ if (!p->mm)
continue;
__cpumask_set_cpu(cpu, tmpmask);
@@ -122,74 +304,128 @@ static int membarrier_global_expedited(void)
/*
* Memory barrier on the caller thread _after_ we finished
- * waiting for the last IPI. Matches memory barriers around
+ * waiting for the last IPI. Matches memory barriers before
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
return 0;
}
-static int membarrier_private_expedited(int flags)
+static int membarrier_private_expedited(int flags, int cpu_id)
{
- int cpu;
cpumask_var_t tmpmask;
struct mm_struct *mm = current->mm;
+ smp_call_func_t ipi_func = ipi_mb;
- if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
+ ipi_func = ipi_sync_core;
+ prepare_sync_core_cmd(mm);
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ if (!(atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
+ return -EPERM;
+ ipi_func = ipi_rseq;
} else {
+ WARN_ON_ONCE(flags);
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
return -EPERM;
}
- if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
+ if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
+ (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
return 0;
/*
- * Matches memory barriers around rq->curr modification in
+ * Matches memory barriers after rq->curr modification in
* scheduler.
+ *
+ * On RISC-V, this barrier pairing is also needed for the
+ * SYNC_CORE command when switching between processes, cf.
+ * the inline comments in membarrier_arch_switch_mm().
*/
smp_mb(); /* system call entry is not a mb. */
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
+ SERIALIZE_IPI();
cpus_read_lock();
- rcu_read_lock();
- for_each_online_cpu(cpu) {
+
+ if (cpu_id >= 0) {
struct task_struct *p;
+ if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+ goto out;
+ rcu_read_lock();
+ p = rcu_dereference(cpu_rq(cpu_id)->curr);
+ if (!p || p->mm != mm) {
+ rcu_read_unlock();
+ goto out;
+ }
+ rcu_read_unlock();
+ } else {
+ int cpu;
+
+ rcu_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ p = rcu_dereference(cpu_rq(cpu)->curr);
+ if (p && p->mm == mm)
+ __cpumask_set_cpu(cpu, tmpmask);
+ }
+ rcu_read_unlock();
+ }
+
+ if (cpu_id >= 0) {
/*
- * Skipping the current CPU is OK even through we can be
- * migrated at any point. The current CPU, at the point
- * where we read raw_smp_processor_id(), is ensured to
- * be in program order with respect to the caller
- * thread. Therefore, we can skip this CPU from the
- * iteration.
+ * smp_call_function_single() will call ipi_func() if cpu_id
+ * is the calling CPU.
*/
- if (cpu == raw_smp_processor_id())
- continue;
- p = rcu_dereference(cpu_rq(cpu)->curr);
- if (p && p->mm == mm)
- __cpumask_set_cpu(cpu, tmpmask);
+ smp_call_function_single(cpu_id, ipi_func, NULL, 1);
+ } else {
+ /*
+ * For regular membarrier, we can save a few cycles by
+ * skipping the current cpu -- we're about to do smp_mb()
+ * below, and if we migrate to a different cpu, this cpu
+ * and the new cpu will execute a full barrier in the
+ * scheduler.
+ *
+ * For SYNC_CORE, we do need a barrier on the current cpu --
+ * otherwise, if we are migrated and replaced by a different
+ * task in the same mm just before, during, or after
+ * membarrier, we will end up with some thread in the mm
+ * running without a core sync.
+ *
+ * For RSEQ, don't rseq_preempt() the caller. User code
+ * is not supposed to issue syscalls at all from inside an
+ * rseq critical section.
+ */
+ if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_func, NULL, true);
+ preempt_enable();
+ } else {
+ on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
+ }
}
- rcu_read_unlock();
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
- preempt_enable();
-
- free_cpumask_var(tmpmask);
+out:
+ if (cpu_id < 0)
+ free_cpumask_var(tmpmask);
cpus_read_unlock();
/*
* Memory barrier on the caller thread _after_ we finished
- * waiting for the last IPI. Matches memory barriers around
+ * waiting for the last IPI. Matches memory barriers before
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
@@ -229,11 +465,12 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
/*
* For each cpu runqueue, if the task's mm match @mm, ensure that all
- * @mm's membarrier state set bits are also set in in the runqueue's
+ * @mm's membarrier state set bits are also set in the runqueue's
* membarrier state. This ensures that a runqueue scheduling
* between threads which are users of @mm has its membarrier state
* updated.
*/
+ SERIALIZE_IPI();
cpus_read_lock();
rcu_read_lock();
for_each_online_cpu(cpu) {
@@ -246,9 +483,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
}
rcu_read_unlock();
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
- preempt_enable();
+ on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
free_cpumask_var(tmpmask);
cpus_read_unlock();
@@ -283,11 +518,18 @@ static int membarrier_register_private_expedited(int flags)
set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
ret;
- if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
ready_state =
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ ready_state =
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
+ } else {
+ WARN_ON_ONCE(flags);
}
/*
@@ -299,6 +541,8 @@ static int membarrier_register_private_expedited(int flags)
return 0;
if (flags & MEMBARRIER_FLAG_SYNC_CORE)
set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+ if (flags & MEMBARRIER_FLAG_RSEQ)
+ set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
atomic_or(set_state, &mm->membarrier_state);
ret = sync_runqueues_membarrier_state(mm);
if (ret)
@@ -308,10 +552,51 @@ static int membarrier_register_private_expedited(int flags)
return 0;
}
+static int membarrier_get_registrations(void)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ int registrations_mask = 0, membarrier_state, i;
+ static const int states[] = {
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED |
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY,
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ |
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY
+ };
+ static const int registration_cmds[] = {
+ MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
+ };
+ BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds));
+
+ membarrier_state = atomic_read(&mm->membarrier_state);
+ for (i = 0; i < ARRAY_SIZE(states); ++i) {
+ if (membarrier_state & states[i]) {
+ registrations_mask |= registration_cmds[i];
+ membarrier_state &= ~states[i];
+ }
+ }
+ WARN_ON_ONCE(membarrier_state != 0);
+ return registrations_mask;
+}
+
/**
* sys_membarrier - issue memory barriers on a set of threads
- * @cmd: Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0 for all commands other than
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
+ * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
+ * contains the CPU on which to interrupt (= restart)
+ * the RSEQ critical section.
+ * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
+ * RSEQ CS should be interrupted (@cmd must be
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
*
* If this system call is not implemented, -ENOSYS is returned. If the
* command specified does not exist, not available on the running
@@ -337,10 +622,21 @@ static int membarrier_register_private_expedited(int flags)
* smp_mb() X O O
* sys_membarrier() O O O
*/
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
- if (unlikely(flags))
- return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
+ return -EINVAL;
+ break;
+ default:
+ if (unlikely(flags))
+ return -EINVAL;
+ }
+
+ if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
+ cpu_id = -1;
+
switch (cmd) {
case MEMBARRIER_CMD_QUERY:
{
@@ -362,13 +658,19 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
return membarrier_register_global_expedited();
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
- return membarrier_private_expedited(0);
+ return membarrier_private_expedited(0, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
return membarrier_register_private_expedited(0);
case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
- return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
+ case MEMBARRIER_CMD_GET_REGISTRATIONS:
+ return membarrier_get_registrations();
default:
return -EINVAL;
}
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index b4b1ff96642f..7a8534a2deff 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Per Entity Load Tracking
+ * Per Entity Load Tracking (PELT)
*
* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
@@ -24,12 +24,6 @@
* Author: Vincent Guittot <vincent.guittot@linaro.org>
*/
-#include <linux/sched.h>
-#include "sched.h"
-#include "pelt.h"
-
-#include <trace/events/sched.h>
-
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -83,8 +77,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
return c1 + c2 + c3;
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
@@ -137,7 +129,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
* runnable = running = 0;
*
* clause from ___update_load_sum(); this results in
- * the below usage of @contrib to dissapear entirely,
+ * the below usage of @contrib to disappear entirely,
* so no point in calculating it.
*/
contrib = __accumulate_pelt_segments(periods,
@@ -216,8 +208,8 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* se has been already dequeued but cfs_rq->curr still points to it.
* This means that weight will be 0 but not running for a sched_entity
* but also for a cfs_rq if the latter becomes idle. As an example,
- * this happens during idle_balance() which calls
- * update_blocked_averages().
+ * this happens during sched_balance_newidle() which calls
+ * sched_balance_update_blocked_averages().
*
* Also see the comment in accumulate_sum().
*/
@@ -264,7 +256,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load)
{
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(sa);
/*
* Step 2: update *_avg.
@@ -283,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
- * se_runnable() = grq->h_nr_running
+ * se_runnable() = grq->h_nr_runnable
*
* runnable_sum = se_runnable() * runnable = grq->runnable_sum
* runnable_avg = runnable_sum
@@ -329,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- cfs_rq->h_nr_running,
+ cfs_rq->h_nr_runnable,
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1);
@@ -392,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+#ifdef CONFIG_SCHED_HW_PRESSURE
/*
- * thermal:
+ * hardware:
*
* load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
*
* util_avg and runnable_load_avg are not supported and meaningless.
*
* Unlike rt/dl utilization tracking that track time spent by a cpu
- * running a rt/dl task through util_avg, the average thermal pressure is
- * tracked through load_avg. This is because thermal pressure signal is
+ * running a rt/dl task through util_avg, the average HW pressure is
+ * tracked through load_avg. This is because HW pressure signal is
* time weighted "delta" capacity unlike util_avg which is binary.
* "delta capacity" = actual capacity -
- * capped capacity a cpu due to a thermal event.
+ * capped capacity of a cpu due to a HW event.
*/
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
- if (___update_load_sum(now, &rq->avg_thermal,
+ if (___update_load_sum(now, &rq->avg_hw,
capacity,
capacity,
capacity)) {
- ___update_load_avg(&rq->avg_thermal, 1);
- trace_pelt_thermal_tp(rq);
+ ___update_load_avg(&rq->avg_hw, 1);
+ trace_pelt_hw_tp(rq);
return 1;
}
@@ -425,7 +417,7 @@ int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
- * irq:
+ * IRQ:
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
@@ -440,7 +432,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
int ret = 0;
/*
- * We can't use clock_pelt because irq time is not accounted in
+ * We can't use clock_pelt because IRQ time is not accounted in
* clock_task. Instead we directly scale the running time to
* reflect the real amount of computation
*/
@@ -475,3 +467,23 @@ int update_irq_load_avg(struct rq *rq, u64 running)
return ret;
}
#endif
+
+/*
+ * Load avg and utilization metrics need to be updated periodically and before
+ * consumption. This function updates the metrics for all subsystems except for
+ * the fair class. @rq must be locked and have its clock updated.
+ */
+bool update_other_load_avgs(struct rq *rq)
+{
+ u64 now = rq_clock_pelt(rq);
+ const struct sched_class *curr_class = rq->donor->sched_class;
+ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+
+ lockdep_assert_rq_held(rq);
+
+ /* hw_pressure doesn't care about invariance */
+ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
+ update_irq_load_avg(rq, 0);
+}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index eb034d9f024d..f4f6a0875c66 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -6,22 +6,23 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
+bool update_other_load_avgs(struct rq *rq);
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
+#ifdef CONFIG_SCHED_HW_PRESSURE
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
- return READ_ONCE(rq->avg_thermal.load_avg);
+ return READ_ONCE(rq->avg_hw.load_avg);
}
#else
static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
@@ -37,14 +38,12 @@ update_irq_load_avg(struct rq *rq, u64 running)
}
#endif
-/*
- * When a task is dequeued, its estimated utilization should not be update if
- * its util_avg has not been updated at least once.
- * This flag is used to synchronize util_avg updates with util_est updates.
- * We map this information into the LSB bit of the utilization saved at
- * dequeue time (i.e. util_est.dequeued).
- */
-#define UTIL_AVG_UNCHANGED 0x1
+#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)
+
+static inline u32 get_pelt_divider(struct sched_avg *avg)
+{
+ return PELT_MIN_DIVIDER + avg->period_contrib;
+}
static inline void cfs_se_util_change(struct sched_avg *avg)
{
@@ -53,14 +52,33 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
if (!sched_feat(UTIL_EST))
return;
- /* Avoid store if the flag has been already set */
- enqueued = avg->util_est.enqueued;
+ /* Avoid store if the flag has been already reset */
+ enqueued = avg->util_est;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
/* Reset flag to report util_avg has been updated */
enqueued &= ~UTIL_AVG_UNCHANGED;
- WRITE_ONCE(avg->util_est.enqueued, enqueued);
+ WRITE_ONCE(avg->util_est, enqueued);
+}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ assert_clock_updated(rq);
+
+ return rq->clock_pelt - rq->lost_idle_time;
+}
+
+/* The rq is idle, we can sync to clock_task */
+static inline void _update_idle_rq_clock_pelt(struct rq *rq)
+{
+ rq->clock_pelt = rq_clock_task(rq);
+
+ u64_u32_store(rq->clock_idle, rq_clock(rq));
+ /* Paired with smp_rmb in migrate_se_pelt_lag() */
+ smp_wmb();
+ u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq));
}
/*
@@ -78,8 +96,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
{
if (unlikely(is_idle_task(rq->curr))) {
- /* The rq is idle, we can sync to clock_task */
- rq->clock_pelt = rq_clock_task(rq);
+ _update_idle_rq_clock_pelt(rq);
return;
}
@@ -125,33 +142,40 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
* Reflecting stolen time makes sense only if the idle
* phase would be present at max capacity. As soon as the
* utilization of a rq has reached the maximum value, it is
- * considered as an always runnig rq without idle time to
+ * considered as an always running rq without idle time to
* steal. This potential idle time is considered as lost in
* this case. We keep track of this lost idle time compare to
* rq's clock_task.
*/
if (util_sum >= divider)
rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+
+ _update_idle_rq_clock_pelt(rq);
}
-static inline u64 rq_clock_pelt(struct rq *rq)
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
- lockdep_assert_held(&rq->lock);
- assert_clock_updated(rq);
+ u64 throttled;
- return rq->clock_pelt - rq->lost_idle_time;
+ if (unlikely(cfs_rq->throttle_count))
+ throttled = U64_MAX;
+ else
+ throttled = cfs_rq->throttled_clock_pelt_time;
+
+ u64_u32_store(cfs_rq->throttled_pelt_idle, throttled);
}
-#ifdef CONFIG_CFS_BANDWIDTH
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+ return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
- return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+ return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
#else
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
return rq_clock_pelt(rq_of(cfs_rq));
@@ -179,12 +203,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
}
static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
@@ -206,6 +230,7 @@ update_rq_clock_pelt(struct rq *rq, s64 delta) { }
static inline void
update_idle_rq_clock_pelt(struct rq *rq) { }
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
#endif
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8f45cdb6463b..bb56805e3d47 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Pressure stall information for CPU, memory and IO
*
@@ -34,12 +35,21 @@
* delayed on that resource such that nobody is advancing and the CPU
* goes idle. This leaves both workload and CPU unproductive.
*
- * (Naturally, the FULL state doesn't exist for the CPU resource.)
- *
* SOME = nr_delayed_tasks != 0
- * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
+ * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
+ *
+ * What it means for a task to be productive is defined differently
+ * for each resource. For IO, productive means a running task. For
+ * memory, productive means a running task that isn't a reclaimer. For
+ * CPU, productive means an on-CPU task.
*
- * The percentage of wallclock time spent in those compound stall
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exists at the cgroup level. At the cgroup level,
+ * FULL means all non-idle tasks in the cgroup are delayed on the CPU
+ * resource which is being used by others outside of the cgroup or
+ * throttled by the cgroup cpu.max configuration.
+ *
+ * The percentage of wall clock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,
* where the SOME percentage indicates workload slowdowns and the FULL
* percentage indicates reduced CPU utilization:
@@ -59,7 +69,7 @@
* states, we would have to conclude a CPU SOME pressure number of
* 100%, since *somebody* is waiting on a runqueue at all
* times. However, that is clearly not the amount of contention the
- * workload is experiencing: only one out of 256 possible exceution
+ * workload is experiencing: only one out of 256 possible execution
* threads will be contended at any given time, or about 0.4%.
*
* Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
@@ -73,18 +83,18 @@
* we have to base our calculation on the number of non-idle tasks in
* conjunction with the number of available CPUs, which is the number
* of potential execution threads. SOME becomes then the proportion of
- * delayed tasks to possibe threads, and FULL is the share of possible
+ * delayed tasks to possible threads, and FULL is the share of possible
* threads that are unproductive due to delays:
*
* threads = min(nr_nonidle_tasks, nr_cpus)
* SOME = min(nr_delayed_tasks / threads, 1)
- * FULL = (threads - min(nr_running_tasks, threads)) / threads
+ * FULL = (threads - min(nr_productive_tasks, threads)) / threads
*
* For the 257 number crunchers on 256 CPUs, this yields:
*
* threads = min(257, 256)
* SOME = min(1 / 256, 1) = 0.4%
- * FULL = (256 - min(257, 256)) / 256 = 0%
+ * FULL = (256 - min(256, 256)) / 256 = 0%
*
* For the 1 out of 4 memory-delayed tasks, this yields:
*
@@ -109,7 +119,7 @@
* For each runqueue, we track:
*
* tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
- * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
+ * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
* tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
*
* and then periodically aggregate:
@@ -127,24 +137,10 @@
* sampling of the aggregate task states would be.
*/
-#include "../workqueue_internal.h"
-#include <linux/sched/loadavg.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-#include <linux/seqlock.h>
-#include <linux/uaccess.h>
-#include <linux/cgroup.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/ctype.h>
-#include <linux/file.h>
-#include <linux/poll.h>
-#include <linux/psi.h>
-#include "sched.h"
-
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
+static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
@@ -164,7 +160,6 @@ __setup("psi=", setup_psi);
#define EXP_300s 2034 /* 1/exp(2s/300s) */
/* PSI trigger definitions */
-#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
@@ -179,59 +174,76 @@ struct psi_group psi_system = {
static void psi_avgs_work(struct work_struct *work);
+static void poll_timer_fn(struct timer_list *t);
+
static void group_init(struct psi_group *group)
{
int cpu;
+ group->enabled = true;
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
group->avg_next_update = group->avg_last_update + psi_period;
- INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
- /* Init trigger-related members */
- atomic_set(&group->poll_scheduled, 0);
- mutex_init(&group->trigger_lock);
- INIT_LIST_HEAD(&group->triggers);
- memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
- group->poll_states = 0;
- group->poll_min_period = U32_MAX;
- memset(group->polling_total, 0, sizeof(group->polling_total));
- group->polling_next_update = ULLONG_MAX;
- group->polling_until = 0;
- rcu_assign_pointer(group->poll_kworker, NULL);
+
+ /* Init avg trigger-related members */
+ INIT_LIST_HEAD(&group->avg_triggers);
+ memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers));
+ INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
+
+ /* Init rtpoll trigger-related members */
+ atomic_set(&group->rtpoll_scheduled, 0);
+ mutex_init(&group->rtpoll_trigger_lock);
+ INIT_LIST_HEAD(&group->rtpoll_triggers);
+ group->rtpoll_min_period = U32_MAX;
+ group->rtpoll_next_update = ULLONG_MAX;
+ init_waitqueue_head(&group->rtpoll_wait);
+ timer_setup(&group->rtpoll_timer, poll_timer_fn, 0);
+ rcu_assign_pointer(group->rtpoll_task, NULL);
}
void __init psi_init(void)
{
if (!psi_enable) {
static_branch_enable(&psi_disabled);
+ static_branch_disable(&psi_cgroups_enabled);
return;
}
+ if (!cgroup_psi_enabled())
+ static_branch_disable(&psi_cgroups_enabled);
+
psi_period = jiffies_to_nsecs(PSI_FREQ);
group_init(&psi_system);
}
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static u32 test_states(unsigned int *tasks, u32 state_mask)
{
- switch (state) {
- case PSI_IO_SOME:
- return tasks[NR_IOWAIT];
- case PSI_IO_FULL:
- return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
- case PSI_MEM_SOME:
- return tasks[NR_MEMSTALL];
- case PSI_MEM_FULL:
- return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
- case PSI_CPU_SOME:
- return tasks[NR_RUNNING] > tasks[NR_ONCPU];
- case PSI_NONIDLE:
- return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
- tasks[NR_RUNNING];
- default:
- return false;
+ const bool oncpu = state_mask & PSI_ONCPU;
+
+ if (tasks[NR_IOWAIT]) {
+ state_mask |= BIT(PSI_IO_SOME);
+ if (!tasks[NR_RUNNING])
+ state_mask |= BIT(PSI_IO_FULL);
}
+
+ if (tasks[NR_MEMSTALL]) {
+ state_mask |= BIT(PSI_MEM_SOME);
+ if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING])
+ state_mask |= BIT(PSI_MEM_FULL);
+ }
+
+ if (tasks[NR_RUNNING] > oncpu)
+ state_mask |= BIT(PSI_CPU_SOME);
+
+ if (tasks[NR_RUNNING] && !oncpu)
+ state_mask |= BIT(PSI_CPU_FULL);
+
+ if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING])
+ state_mask |= BIT(PSI_NONIDLE);
+
+ return state_mask;
}
static void get_recent_times(struct psi_group *group, int cpu,
@@ -239,6 +251,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
u32 *pchanged_states)
{
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+ int current_cpu = raw_smp_processor_id();
+ unsigned int tasks[NR_PSI_TASK_COUNTS];
u64 now, state_start;
enum psi_states s;
unsigned int seq;
@@ -253,6 +267,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
+ if (cpu == current_cpu)
+ memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
} while (read_seqcount_retry(&groupc->seq, seq));
/* Calculate state time deltas against the previous snapshot */
@@ -277,6 +293,28 @@ static void get_recent_times(struct psi_group *group, int cpu,
if (delta)
*pchanged_states |= (1 << s);
}
+
+ /*
+ * When collect_percpu_times() is called from avgs_work, we don't want to
+ * re-arm avgs_work when all CPUs are IDLE. But the current CPU running
+ * this avgs_work is never IDLE, because avgs_work can't be shut off.
+ * So for the current CPU, we need to re-arm avgs_work only when
+ * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
+ * we can just check PSI_NONIDLE delta.
+ */
+ if (current_work() == &group->avgs_work.work) {
+ bool reschedule;
+
+ if (cpu == current_cpu)
+ reschedule = tasks[NR_RUNNING] +
+ tasks[NR_IOWAIT] +
+ tasks[NR_MEMSTALL] > 1;
+ else
+ reschedule = *pchanged_states & (1 << PSI_NONIDLE);
+
+ if (reschedule)
+ *pchanged_states |= PSI_STATE_RESCHEDULE;
+ }
}
static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -311,7 +349,7 @@ static void collect_percpu_times(struct psi_group *group,
/*
* Collect the per-cpu time buckets and average them into a
- * single time sample that is normalized to wallclock time.
+ * single time sample that is normalized to wall clock time.
*
* For averaging, each CPU is weighted by its non-idle time in
* the sampling period. This eliminates artifacts from uneven
@@ -354,6 +392,114 @@ static void collect_percpu_times(struct psi_group *group,
*pchanged_states = changed_states;
}
+/* Trigger tracking window manipulations */
+static void window_reset(struct psi_window *win, u64 now, u64 value,
+ u64 prev_growth)
+{
+ win->start_time = now;
+ win->start_value = value;
+ win->prev_growth = prev_growth;
+}
+
+/*
+ * PSI growth tracking window update and growth calculation routine.
+ *
+ * This approximates a sliding tracking window by interpolating
+ * partially elapsed windows using historical growth data from the
+ * previous intervals. This minimizes memory requirements (by not storing
+ * all the intermediate values in the previous window) and simplifies
+ * the calculations. It works well because PSI signal changes only in
+ * positive direction and over relatively small window sizes the growth
+ * is close to linear.
+ */
+static u64 window_update(struct psi_window *win, u64 now, u64 value)
+{
+ u64 elapsed;
+ u64 growth;
+
+ elapsed = now - win->start_time;
+ growth = value - win->start_value;
+ /*
+ * After each tracking window passes win->start_value and
+ * win->start_time get reset and win->prev_growth stores
+ * the average per-window growth of the previous window.
+ * win->prev_growth is then used to interpolate additional
+ * growth from the previous window assuming it was linear.
+ */
+ if (elapsed > win->size)
+ window_reset(win, now, value, growth);
+ else {
+ u32 remaining;
+
+ remaining = win->size - elapsed;
+ growth += div64_u64(win->prev_growth * remaining, win->size);
+ }
+
+ return growth;
+}
+
+static void update_triggers(struct psi_group *group, u64 now,
+ enum psi_aggregators aggregator)
+{
+ struct psi_trigger *t;
+ u64 *total = group->total[aggregator];
+ struct list_head *triggers;
+ u64 *aggregator_total;
+
+ if (aggregator == PSI_AVGS) {
+ triggers = &group->avg_triggers;
+ aggregator_total = group->avg_total;
+ } else {
+ triggers = &group->rtpoll_triggers;
+ aggregator_total = group->rtpoll_total;
+ }
+
+ /*
+ * On subsequent updates, calculate growth deltas and let
+ * watchers know when their specified thresholds are exceeded.
+ */
+ list_for_each_entry(t, triggers, node) {
+ u64 growth;
+ bool new_stall;
+
+ new_stall = aggregator_total[t->state] != total[t->state];
+
+ /* Check for stall activity or a previous threshold breach */
+ if (!new_stall && !t->pending_event)
+ continue;
+ /*
+ * Check for new stall activity, as well as deferred
+ * events that occurred in the last window after the
+ * trigger had already fired (we want to ratelimit
+ * events without dropping any).
+ */
+ if (new_stall) {
+ /* Calculate growth since last update */
+ growth = window_update(&t->win, now, total[t->state]);
+ if (!t->pending_event) {
+ if (growth < t->threshold)
+ continue;
+
+ t->pending_event = true;
+ }
+ }
+ /* Limit event signaling to once per window */
+ if (now < t->last_event_time + t->win.size)
+ continue;
+
+ /* Generate an event */
+ if (cmpxchg(&t->event, 0, 1) == 0) {
+ if (t->of)
+ kernfs_notify(t->of->kn);
+ else
+ wake_up_interruptible(&t->event_wait);
+ }
+ t->last_event_time = now;
+ /* Reset threshold breach flag once event got generated */
+ t->pending_event = false;
+ }
+}
+
static u64 update_averages(struct psi_group *group, u64 now)
{
unsigned long missed_periods = 0;
@@ -412,7 +558,6 @@ static void psi_avgs_work(struct work_struct *work)
struct delayed_work *dwork;
struct psi_group *group;
u32 changed_states;
- bool nonidle;
u64 now;
dwork = to_delayed_work(work);
@@ -423,7 +568,6 @@ static void psi_avgs_work(struct work_struct *work)
now = sched_clock();
collect_percpu_times(group, PSI_AVGS, &changed_states);
- nonidle = changed_states & (1 << PSI_NONIDLE);
/*
* If there is task activity, periodically fold the per-cpu
* times and feed samples into the running averages. If things
@@ -431,10 +575,12 @@ static void psi_avgs_work(struct work_struct *work)
* Once restarted, we'll catch up the running averages in one
* go - see calc_avgs() and missed_periods.
*/
- if (now >= group->avg_next_update)
+ if (now >= group->avg_next_update) {
+ update_triggers(group, now, PSI_AVGS);
group->avg_next_update = update_averages(group, now);
+ }
- if (nonidle) {
+ if (changed_states & PSI_STATE_RESCHEDULE) {
schedule_delayed_work(dwork, nsecs_to_jiffies(
group->avg_next_update - now) + 1);
}
@@ -442,194 +588,161 @@ static void psi_avgs_work(struct work_struct *work)
mutex_unlock(&group->avgs_lock);
}
-/* Trigger tracking window manupulations */
-static void window_reset(struct psi_window *win, u64 now, u64 value,
- u64 prev_growth)
-{
- win->start_time = now;
- win->start_value = value;
- win->prev_growth = prev_growth;
-}
-
-/*
- * PSI growth tracking window update and growth calculation routine.
- *
- * This approximates a sliding tracking window by interpolating
- * partially elapsed windows using historical growth data from the
- * previous intervals. This minimizes memory requirements (by not storing
- * all the intermediate values in the previous window) and simplifies
- * the calculations. It works well because PSI signal changes only in
- * positive direction and over relatively small window sizes the growth
- * is close to linear.
- */
-static u64 window_update(struct psi_window *win, u64 now, u64 value)
-{
- u64 elapsed;
- u64 growth;
-
- elapsed = now - win->start_time;
- growth = value - win->start_value;
- /*
- * After each tracking window passes win->start_value and
- * win->start_time get reset and win->prev_growth stores
- * the average per-window growth of the previous window.
- * win->prev_growth is then used to interpolate additional
- * growth from the previous window assuming it was linear.
- */
- if (elapsed > win->size)
- window_reset(win, now, value, growth);
- else {
- u32 remaining;
-
- remaining = win->size - elapsed;
- growth += div64_u64(win->prev_growth * remaining, win->size);
- }
-
- return growth;
-}
-
-static void init_triggers(struct psi_group *group, u64 now)
+static void init_rtpoll_triggers(struct psi_group *group, u64 now)
{
struct psi_trigger *t;
- list_for_each_entry(t, &group->triggers, node)
+ list_for_each_entry(t, &group->rtpoll_triggers, node)
window_reset(&t->win, now,
group->total[PSI_POLL][t->state], 0);
- memcpy(group->polling_total, group->total[PSI_POLL],
- sizeof(group->polling_total));
- group->polling_next_update = now + group->poll_min_period;
+ memcpy(group->rtpoll_total, group->total[PSI_POLL],
+ sizeof(group->rtpoll_total));
+ group->rtpoll_next_update = now + group->rtpoll_min_period;
}
-static u64 update_triggers(struct psi_group *group, u64 now)
+/* Schedule rtpolling if it's not already scheduled, or unconditionally if @force is set. */
+static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
+ bool force)
{
- struct psi_trigger *t;
- bool new_stall = false;
- u64 *total = group->total[PSI_POLL];
+ struct task_struct *task;
/*
- * On subsequent updates, calculate growth deltas and let
- * watchers know when their specified thresholds are exceeded.
+ * atomic_xchg should be called even when !force to provide a
+ * full memory barrier (see the comment inside psi_rtpoll_work).
*/
- list_for_each_entry(t, &group->triggers, node) {
- u64 growth;
-
- /* Check for stall activity */
- if (group->polling_total[t->state] == total[t->state])
- continue;
-
- /*
- * Multiple triggers might be looking at the same state,
- * remember to update group->polling_total[] once we've
- * been through all of them. Also remember to extend the
- * polling time if we see new stall activity.
- */
- new_stall = true;
-
- /* Calculate growth since last update */
- growth = window_update(&t->win, now, total[t->state]);
- if (growth < t->threshold)
- continue;
-
- /* Limit event signaling to once per window */
- if (now < t->last_event_time + t->win.size)
- continue;
-
- /* Generate an event */
- if (cmpxchg(&t->event, 0, 1) == 0)
- wake_up_interruptible(&t->event_wait);
- t->last_event_time = now;
- }
-
- if (new_stall)
- memcpy(group->polling_total, total,
- sizeof(group->polling_total));
-
- return now + group->poll_min_period;
-}
-
-/*
- * Schedule polling if it's not already scheduled. It's safe to call even from
- * hotpath because even though kthread_queue_delayed_work takes worker->lock
- * spinlock that spinlock is never contended due to poll_scheduled atomic
- * preventing such competition.
- */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
-{
- struct kthread_worker *kworker;
-
- /* Do not reschedule if already scheduled */
- if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+ if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force)
return;
rcu_read_lock();
- kworker = rcu_dereference(group->poll_kworker);
+ task = rcu_dereference(group->rtpoll_task);
/*
* kworker might be NULL in case psi_trigger_destroy races with
* psi_task_change (hotpath) which can't use locks
*/
- if (likely(kworker))
- kthread_queue_delayed_work(kworker, &group->poll_work, delay);
+ if (likely(task))
+ mod_timer(&group->rtpoll_timer, jiffies + delay);
else
- atomic_set(&group->poll_scheduled, 0);
+ atomic_set(&group->rtpoll_scheduled, 0);
rcu_read_unlock();
}
-static void psi_poll_work(struct kthread_work *work)
+static void psi_rtpoll_work(struct psi_group *group)
{
- struct kthread_delayed_work *dwork;
- struct psi_group *group;
+ bool force_reschedule = false;
u32 changed_states;
u64 now;
- dwork = container_of(work, struct kthread_delayed_work, work);
- group = container_of(dwork, struct psi_group, poll_work);
+ mutex_lock(&group->rtpoll_trigger_lock);
- atomic_set(&group->poll_scheduled, 0);
+ now = sched_clock();
- mutex_lock(&group->trigger_lock);
+ if (now > group->rtpoll_until) {
+ /*
+ * We are either about to start or might stop rtpolling if no
+ * state change was recorded. Resetting rtpoll_scheduled leaves
+ * a small window for psi_group_change to sneak in and schedule
+ * an immediate rtpoll_work before we get to rescheduling. One
+ * potential extra wakeup at the end of the rtpolling window
+ * should be negligible and rtpoll_next_update still keeps
+ * updates correctly on schedule.
+ */
+ atomic_set(&group->rtpoll_scheduled, 0);
+ /*
+ * A task change can race with the rtpoll worker that is supposed to
+ * report on it. To avoid missing events, ensure ordering between
+ * rtpoll_scheduled and the task state accesses, such that if the
+ * rtpoll worker misses the state update, the task change is
+ * guaranteed to reschedule the rtpoll worker:
+ *
+ * rtpoll worker:
+ * atomic_set(rtpoll_scheduled, 0)
+ * smp_mb()
+ * LOAD states
+ *
+ * task change:
+ * STORE states
+ * if atomic_xchg(rtpoll_scheduled, 1) == 0:
+ * schedule rtpoll worker
+ *
+ * The atomic_xchg() implies a full barrier.
+ */
+ smp_mb();
+ } else {
+ /* The rtpolling window is not over, keep rescheduling */
+ force_reschedule = true;
+ }
- now = sched_clock();
collect_percpu_times(group, PSI_POLL, &changed_states);
- if (changed_states & group->poll_states) {
- /* Initialize trigger windows when entering polling mode */
- if (now > group->polling_until)
- init_triggers(group, now);
+ if (changed_states & group->rtpoll_states) {
+ /* Initialize trigger windows when entering rtpolling mode */
+ if (now > group->rtpoll_until)
+ init_rtpoll_triggers(group, now);
/*
* Keep the monitor active for at least the duration of the
* minimum tracking window as long as monitor states are
* changing.
*/
- group->polling_until = now +
- group->poll_min_period * UPDATES_PER_WINDOW;
+ group->rtpoll_until = now +
+ group->rtpoll_min_period * UPDATES_PER_WINDOW;
}
- if (now > group->polling_until) {
- group->polling_next_update = ULLONG_MAX;
+ if (now > group->rtpoll_until) {
+ group->rtpoll_next_update = ULLONG_MAX;
goto out;
}
- if (now >= group->polling_next_update)
- group->polling_next_update = update_triggers(group, now);
+ if (now >= group->rtpoll_next_update) {
+ if (changed_states & group->rtpoll_states) {
+ update_triggers(group, now, PSI_POLL);
+ memcpy(group->rtpoll_total, group->total[PSI_POLL],
+ sizeof(group->rtpoll_total));
+ }
+ group->rtpoll_next_update = now + group->rtpoll_min_period;
+ }
- psi_schedule_poll_work(group,
- nsecs_to_jiffies(group->polling_next_update - now) + 1);
+ psi_schedule_rtpoll_work(group,
+ nsecs_to_jiffies(group->rtpoll_next_update - now) + 1,
+ force_reschedule);
out:
- mutex_unlock(&group->trigger_lock);
+ mutex_unlock(&group->rtpoll_trigger_lock);
}
-static void record_times(struct psi_group_cpu *groupc, int cpu,
- bool memstall_tick)
+static int psi_rtpoll_worker(void *data)
+{
+ struct psi_group *group = (struct psi_group *)data;
+
+ sched_set_fifo_low(current);
+
+ while (true) {
+ wait_event_interruptible(group->rtpoll_wait,
+ atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) ||
+ kthread_should_stop());
+ if (kthread_should_stop())
+ break;
+
+ psi_rtpoll_work(group);
+ }
+ return 0;
+}
+
+static void poll_timer_fn(struct timer_list *t)
+{
+ struct psi_group *group = from_timer(group, t, rtpoll_timer);
+
+ atomic_set(&group->rtpoll_wakeup, 1);
+ wake_up_interruptible(&group->rtpoll_wait);
+}
+
+static void record_times(struct psi_group_cpu *groupc, u64 now)
{
u32 delta;
- u64 now;
- now = cpu_clock(cpu);
delta = now - groupc->state_start;
groupc->state_start = now;
@@ -643,27 +756,13 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
groupc->times[PSI_MEM_SOME] += delta;
if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
- else if (memstall_tick) {
- u32 sample;
- /*
- * Since we care about lost potential, a
- * memstall is FULL when there are no other
- * working tasks, but also when the CPU is
- * actively reclaiming and nothing productive
- * could run even if it were runnable.
- *
- * When the timer tick sees a reclaiming CPU,
- * regardless of runnable tasks, sample a FULL
- * tick (or less if it hasn't been a full tick
- * since the last state change).
- */
- sample = min(delta, (u32)jiffies_to_nsecs(1));
- groupc->times[PSI_MEM_FULL] += sample;
- }
}
- if (groupc->state_mask & (1 << PSI_CPU_SOME))
+ if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
groupc->times[PSI_CPU_SOME] += delta;
+ if (groupc->state_mask & (1 << PSI_CPU_FULL))
+ groupc->times[PSI_CPU_FULL] += delta;
+ }
if (groupc->state_mask & (1 << PSI_NONIDLE))
groupc->times[PSI_NONIDLE] += delta;
@@ -674,78 +773,112 @@ static void psi_group_change(struct psi_group *group, int cpu,
bool wake_clock)
{
struct psi_group_cpu *groupc;
- u32 state_mask = 0;
unsigned int t, m;
- enum psi_states s;
+ u32 state_mask;
+ u64 now;
+ lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
- * First we assess the aggregate resource states this CPU's
- * tasks have been in since the last change, and account any
- * SOME and FULL time these may have resulted in.
- *
- * Then we update the task counts according to the state
+ * First we update the task counts according to the state
* change requested through the @clear and @set bits.
+ *
+ * Then if the cgroup PSI stats accounting is enabled, we
+ * assess the aggregate resource states this CPU's tasks
+ * have been in since the last change, and account any
+ * SOME and FULL time these may have resulted in.
*/
write_seqcount_begin(&groupc->seq);
+ now = cpu_clock(cpu);
- record_times(groupc, cpu, false);
+ /*
+ * Start with TSK_ONCPU, which doesn't have a corresponding
+ * task count - it's just a boolean flag directly encoded in
+ * the state mask. Clear, set, or carry the current state if
+ * no changes are requested.
+ */
+ if (unlikely(clear & TSK_ONCPU)) {
+ state_mask = 0;
+ clear &= ~TSK_ONCPU;
+ } else if (unlikely(set & TSK_ONCPU)) {
+ state_mask = PSI_ONCPU;
+ set &= ~TSK_ONCPU;
+ } else {
+ state_mask = groupc->state_mask & PSI_ONCPU;
+ }
+ /*
+ * The rest of the state mask is calculated based on the task
+ * counts. Update those first, then construct the mask.
+ */
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
continue;
- if (groupc->tasks[t] == 0 && !psi_bug) {
+ if (groupc->tasks[t]) {
+ groupc->tasks[t]--;
+ } else if (!psi_bug) {
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
groupc->tasks[3], clear, set);
psi_bug = 1;
}
- groupc->tasks[t]--;
}
for (t = 0; set; set &= ~(1 << t), t++)
if (set & (1 << t))
groupc->tasks[t]++;
- /* Calculate state mask representing active states */
- for (s = 0; s < NR_PSI_STATES; s++) {
- if (test_state(groupc->tasks, s))
- state_mask |= (1 << s);
+ if (!group->enabled) {
+ /*
+ * On the first group change after disabling PSI, conclude
+ * the current state and flush its time. This is unlikely
+ * to matter to the user, but aggregation (get_recent_times)
+ * may have already incorporated the live state into times_prev;
+ * avoid a delta sample underflow when PSI is later re-enabled.
+ */
+ if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+ record_times(groupc, now);
+
+ groupc->state_mask = state_mask;
+
+ write_seqcount_end(&groupc->seq);
+ return;
}
+
+ state_mask = test_states(groupc->tasks, state_mask);
+
+ /*
+ * Since we care about lost potential, a memstall is FULL
+ * when there are no other working tasks, but also when
+ * the CPU is actively reclaiming and nothing productive
+ * could run even if it were runnable. So when the current
+ * task in a cgroup is in_memstall, the corresponding groupc
+ * on that cpu is in PSI_MEM_FULL state.
+ */
+ if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
+ state_mask |= (1 << PSI_MEM_FULL);
+
+ record_times(groupc, now);
+
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
- if (state_mask & group->poll_states)
- psi_schedule_poll_work(group, 1);
+ if (state_mask & group->rtpoll_states)
+ psi_schedule_rtpoll_work(group, 1, false);
if (wake_clock && !delayed_work_pending(&group->avgs_work))
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static inline struct psi_group *task_psi_group(struct task_struct *task)
{
#ifdef CONFIG_CGROUPS
- struct cgroup *cgroup = NULL;
-
- if (!*iter)
- cgroup = task->cgroups->dfl_cgrp;
- else if (*iter == &psi_system)
- return NULL;
- else
- cgroup = cgroup_parent(*iter);
-
- if (cgroup && cgroup_parent(cgroup)) {
- *iter = cgroup;
- return cgroup_psi(cgroup);
- }
-#else
- if (*iter)
- return NULL;
+ if (static_branch_likely(&psi_cgroups_enabled))
+ return cgroup_psi(task_dfl_cgroup(task));
#endif
- *iter = &psi_system;
return &psi_system;
}
@@ -768,27 +901,16 @@ void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
struct psi_group *group;
- bool wake_clock = true;
- void *iter = NULL;
if (!task->pid)
return;
psi_flags_change(task, clear, set);
- /*
- * Periodic aggregation shuts off if there is a period of no
- * task changes, so we wake it back up if necessary. However,
- * don't do this if the task change is the aggregation worker
- * itself going to sleep, or we'll ping-pong forever.
- */
- if (unlikely((clear & TSK_RUNNING) &&
- (task->flags & PF_WQ_WORKER) &&
- wq_worker_last_func(task) == psi_avgs_work))
- wake_clock = false;
-
- while ((group = iterate_groups(task, &iter)))
- psi_group_change(group, cpu, clear, set, wake_clock);
+ group = task_psi_group(task);
+ do {
+ psi_group_change(group, cpu, clear, set, true);
+ } while ((group = group->parent));
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -796,59 +918,124 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
{
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
- void *iter;
if (next->pid) {
psi_flags_change(next, 0, TSK_ONCPU);
/*
- * When moving state between tasks, the group that
- * contains them both does not change: we can stop
- * updating the tree once we reach the first common
- * ancestor. Iterate @next's ancestors until we
- * encounter @prev's state.
+ * Set TSK_ONCPU on @next's cgroups. If @next shares any
+ * ancestors with @prev, those will already have @prev's
+ * TSK_ONCPU bit set, and we can stop the iteration there.
*/
- iter = NULL;
- while ((group = iterate_groups(next, &iter))) {
- if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+ group = task_psi_group(next);
+ do {
+ if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+ PSI_ONCPU) {
common = group;
break;
}
psi_group_change(group, cpu, 0, TSK_ONCPU, true);
- }
+ } while ((group = group->parent));
}
- /*
- * If this is a voluntary sleep, dequeue will have taken care
- * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
- * only need to deal with it during preemption.
- */
- if (sleep)
- return;
-
if (prev->pid) {
- psi_flags_change(prev, TSK_ONCPU, 0);
+ int clear = TSK_ONCPU, set = 0;
+ bool wake_clock = true;
+
+ /*
+ * When we're going to sleep, psi_dequeue() lets us
+ * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
+ * TSK_IOWAIT here, where we can combine it with
+ * TSK_ONCPU and save walking common ancestors twice.
+ */
+ if (sleep) {
+ clear |= TSK_RUNNING;
+ if (prev->in_memstall)
+ clear |= TSK_MEMSTALL_RUNNING;
+ if (prev->in_iowait)
+ set |= TSK_IOWAIT;
+
+ /*
+ * Periodic aggregation shuts off if there is a period of no
+ * task changes, so we wake it back up if necessary. However,
+ * don't do this if the task change is the aggregation worker
+ * itself going to sleep, or we'll ping-pong forever.
+ */
+ if (unlikely((prev->flags & PF_WQ_WORKER) &&
+ wq_worker_last_func(prev) == psi_avgs_work))
+ wake_clock = false;
+ }
- iter = NULL;
- while ((group = iterate_groups(prev, &iter)) && group != common)
- psi_group_change(group, cpu, TSK_ONCPU, 0, true);
+ psi_flags_change(prev, clear, set);
+
+ group = task_psi_group(prev);
+ do {
+ if (group == common)
+ break;
+ psi_group_change(group, cpu, clear, set, wake_clock);
+ } while ((group = group->parent));
+
+ /*
+ * TSK_ONCPU is handled up to the common ancestor. If there are
+ * any other differences between the two tasks (e.g. prev goes
+ * to sleep, or only one task is memstall), finish propagating
+ * those differences all the way up to the root.
+ */
+ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
+ clear &= ~TSK_ONCPU;
+ for (; group; group = group->parent)
+ psi_group_change(group, cpu, clear, set, wake_clock);
+ }
}
}
-void psi_memstall_tick(struct task_struct *task, int cpu)
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
+ int cpu = task_cpu(curr);
struct psi_group *group;
- void *iter = NULL;
+ struct psi_group_cpu *groupc;
+ s64 delta;
+ u64 irq;
+
+ if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
+ return;
- while ((group = iterate_groups(task, &iter))) {
- struct psi_group_cpu *groupc;
+ if (!curr->pid)
+ return;
+
+ lockdep_assert_rq_held(rq);
+ group = task_psi_group(curr);
+ if (prev && task_psi_group(prev) == group)
+ return;
+
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->psi_irq_time);
+ if (delta < 0)
+ return;
+ rq->psi_irq_time = irq;
+
+ do {
+ u64 now;
+
+ if (!group->enabled)
+ continue;
groupc = per_cpu_ptr(group->pcpu, cpu);
+
write_seqcount_begin(&groupc->seq);
- record_times(groupc, cpu, true);
+ now = cpu_clock(cpu);
+
+ record_times(groupc, now);
+ groupc->times[PSI_IRQ_FULL] += delta;
+
write_seqcount_end(&groupc->seq);
- }
+
+ if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
+ psi_schedule_rtpoll_work(group, 1, false);
+ } while ((group = group->parent));
}
+#endif
/**
* psi_memstall_enter - mark the beginning of a memory stall section
@@ -876,10 +1063,11 @@ void psi_memstall_enter(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 1;
- psi_task_change(current, 0, TSK_MEMSTALL);
+ psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
rq_unlock_irq(rq, &rf);
}
+EXPORT_SYMBOL_GPL(psi_memstall_enter);
/**
* psi_memstall_leave - mark the end of an memory stall section
@@ -905,33 +1093,42 @@ void psi_memstall_leave(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 0;
- psi_task_change(current, TSK_MEMSTALL, 0);
+ psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
rq_unlock_irq(rq, &rf);
}
+EXPORT_SYMBOL_GPL(psi_memstall_leave);
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
- if (static_branch_likely(&psi_disabled))
+ if (!static_branch_likely(&psi_cgroups_enabled))
return 0;
- cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
- if (!cgroup->psi.pcpu)
+ cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
+ if (!cgroup->psi)
+ return -ENOMEM;
+
+ cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu);
+ if (!cgroup->psi->pcpu) {
+ kfree(cgroup->psi);
return -ENOMEM;
- group_init(&cgroup->psi);
+ }
+ group_init(cgroup->psi);
+ cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
return 0;
}
void psi_cgroup_free(struct cgroup *cgroup)
{
- if (static_branch_likely(&psi_disabled))
+ if (!static_branch_likely(&psi_cgroups_enabled))
return;
- cancel_delayed_work_sync(&cgroup->psi.avgs_work);
- free_percpu(cgroup->psi.pcpu);
+ cancel_delayed_work_sync(&cgroup->psi->avgs_work);
+ free_percpu(cgroup->psi->pcpu);
/* All triggers must be removed by now */
- WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
+ WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n");
+ kfree(cgroup->psi);
}
/**
@@ -948,11 +1145,11 @@ void psi_cgroup_free(struct cgroup *cgroup)
*/
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{
- unsigned int task_flags = 0;
+ unsigned int task_flags;
struct rq_flags rf;
struct rq *rq;
- if (static_branch_likely(&psi_disabled)) {
+ if (!static_branch_likely(&psi_cgroups_enabled)) {
/*
* Lame to do this here, but the scheduler cannot be locked
* from the outside, so we move cgroups from inside sched/.
@@ -963,15 +1160,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
rq = task_rq_lock(task, &rf);
- if (task_on_rq_queued(task)) {
- task_flags = TSK_RUNNING;
- if (task_current(rq, task))
- task_flags |= TSK_ONCPU;
- } else if (task->in_iowait)
- task_flags = TSK_IOWAIT;
-
- if (task->in_memstall)
- task_flags |= TSK_MEMSTALL;
+ /*
+ * We may race with schedule() dropping the rq lock between
+ * deactivating prev and switching to next. Because the psi
+ * updates from the deactivation are deferred to the switch
+ * callback to save cgroup tree updates, the task's scheduling
+ * state here is not coherent with its psi state:
+ *
+ * schedule() cgroup_move_task()
+ * rq_lock()
+ * deactivate_task()
+ * p->on_rq = 0
+ * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates
+ * pick_next_task()
+ * rq_unlock()
+ * rq_lock()
+ * psi_task_change() // old cgroup
+ * task->cgroups = to
+ * psi_task_change() // new cgroup
+ * rq_unlock()
+ * rq_lock()
+ * psi_sched_switch() // does deferred updates in new cgroup
+ *
+ * Don't rely on the scheduling state. Use psi_flags instead.
+ */
+ task_flags = task->psi_flags;
if (task_flags)
psi_task_change(task, task_flags, 0);
@@ -984,16 +1197,54 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
task_rq_unlock(rq, task, &rf);
}
+
+void psi_cgroup_restart(struct psi_group *group)
+{
+ int cpu;
+
+ /*
+ * After psi_group->enabled is cleared, we don't actually
+ * stop percpu task accounting in each psi_group_cpu; we
+ * only stop the test_states() loop, record_times() and the
+ * averaging worker. See psi_group_change() for details.
+ *
+ * When disabling cgroup PSI, this function has nothing to sync
+ * since cgroup pressure files are hidden and each percpu psi_group_cpu
+ * will see !psi_group->enabled and only do task accounting.
+ *
+ * When re-enabling cgroup PSI, this function uses psi_group_change()
+ * to get the correct state mask from the test_states() loop on tasks[]
+ * and to restart groupc->state_start from now. It passes
+ * .clear = .set = 0 since no task status really changed.
+ */
+ if (!group->enabled)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irq(rq, &rf);
+ psi_group_change(group, cpu, 0, 0, true);
+ rq_unlock_irq(rq, &rf);
+ }
+}
#endif /* CONFIG_CGROUPS */
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
+ bool only_full = false;
int full;
u64 now;
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (!irqtime_enabled() && res == PSI_IRQ)
+ return -EOPNOTSUPP;
+#endif
+
/* Update averages before reporting them */
mutex_lock(&group->avgs_lock);
now = sched_clock();
@@ -1002,18 +1253,25 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
- for (full = 0; full < 2 - (res == PSI_CPU); full++) {
- unsigned long avg[3];
- u64 total;
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ only_full = res == PSI_IRQ;
+#endif
+
+ for (full = 0; full < 2 - only_full; full++) {
+ unsigned long avg[3] = { 0, };
+ u64 total = 0;
int w;
- for (w = 0; w < 3; w++)
- avg[w] = group->avg[res * 2 + full][w];
- total = div_u64(group->total[PSI_AVGS][res * 2 + full],
- NSEC_PER_USEC);
+ /* CPU FULL is undefined at the system level */
+ if (!(group == &psi_system && res == PSI_CPU && full)) {
+ for (w = 0; w < 3; w++)
+ avg[w] = group->avg[res * 2 + full][w];
+ total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+ NSEC_PER_USEC);
+ }
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
- full ? "full" : "some",
+ full || only_full ? "full" : "some",
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1023,47 +1281,25 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
return 0;
}
-static int psi_io_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_IO);
-}
-
-static int psi_memory_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_MEM);
-}
-
-static int psi_cpu_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_CPU);
-}
-
-static int psi_io_open(struct inode *inode, struct file *file)
-{
- return single_open(file, psi_io_show, NULL);
-}
-
-static int psi_memory_open(struct inode *inode, struct file *file)
-{
- return single_open(file, psi_memory_show, NULL);
-}
-
-static int psi_cpu_open(struct inode *inode, struct file *file)
-{
- return single_open(file, psi_cpu_show, NULL);
-}
-
-struct psi_trigger *psi_trigger_create(struct psi_group *group,
- char *buf, size_t nbytes, enum psi_res res)
+struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
+ enum psi_res res, struct file *file,
+ struct kernfs_open_file *of)
{
struct psi_trigger *t;
enum psi_states state;
u32 threshold_us;
+ bool privileged;
u32 window_us;
if (static_branch_likely(&psi_disabled))
return ERR_PTR(-EOPNOTSUPP);
+ /*
+ * Checking the privilege here on file->f_cred implies that a privileged user
+ * could open the file and delegate the write to an unprivileged one.
+ */
+ privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE);
+
if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
state = PSI_IO_SOME + res * 2;
else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
@@ -1071,11 +1307,22 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
else
return ERR_PTR(-EINVAL);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+ return ERR_PTR(-EINVAL);
+#endif
+
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
- if (window_us < WINDOW_MIN_US ||
- window_us > WINDOW_MAX_US)
+ if (window_us == 0 || window_us > WINDOW_MAX_US)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Unprivileged users can only use windows that are a multiple of 2s, so
+ * that averages aggregation work is used and no RT threads are spawned.
+ */
+ if (!privileged && window_us % 2000000)
return ERR_PTR(-EINVAL);
/* Check threshold */
@@ -1090,123 +1337,137 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->state = state;
t->threshold = threshold_us * NSEC_PER_USEC;
t->win.size = window_us * NSEC_PER_USEC;
- window_reset(&t->win, 0, 0, 0);
+ window_reset(&t->win, sched_clock(),
+ group->total[PSI_POLL][t->state], 0);
t->event = 0;
t->last_event_time = 0;
- init_waitqueue_head(&t->event_wait);
- kref_init(&t->refcount);
-
- mutex_lock(&group->trigger_lock);
-
- if (!rcu_access_pointer(group->poll_kworker)) {
- struct sched_param param = {
- .sched_priority = 1,
- };
- struct kthread_worker *kworker;
-
- kworker = kthread_create_worker(0, "psimon");
- if (IS_ERR(kworker)) {
- kfree(t);
- mutex_unlock(&group->trigger_lock);
- return ERR_CAST(kworker);
+ t->of = of;
+ if (!of)
+ init_waitqueue_head(&t->event_wait);
+ t->pending_event = false;
+ t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
+
+ if (privileged) {
+ mutex_lock(&group->rtpoll_trigger_lock);
+
+ if (!rcu_access_pointer(group->rtpoll_task)) {
+ struct task_struct *task;
+
+ task = kthread_create(psi_rtpoll_worker, group, "psimon");
+ if (IS_ERR(task)) {
+ kfree(t);
+ mutex_unlock(&group->rtpoll_trigger_lock);
+ return ERR_CAST(task);
+ }
+ atomic_set(&group->rtpoll_wakeup, 0);
+ wake_up_process(task);
+ rcu_assign_pointer(group->rtpoll_task, task);
}
- sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
- kthread_init_delayed_work(&group->poll_work,
- psi_poll_work);
- rcu_assign_pointer(group->poll_kworker, kworker);
- }
- list_add(&t->node, &group->triggers);
- group->poll_min_period = min(group->poll_min_period,
- div_u64(t->win.size, UPDATES_PER_WINDOW));
- group->nr_triggers[t->state]++;
- group->poll_states |= (1 << t->state);
+ list_add(&t->node, &group->rtpoll_triggers);
+ group->rtpoll_min_period = min(group->rtpoll_min_period,
+ div_u64(t->win.size, UPDATES_PER_WINDOW));
+ group->rtpoll_nr_triggers[t->state]++;
+ group->rtpoll_states |= (1 << t->state);
- mutex_unlock(&group->trigger_lock);
+ mutex_unlock(&group->rtpoll_trigger_lock);
+ } else {
+ mutex_lock(&group->avgs_lock);
+ list_add(&t->node, &group->avg_triggers);
+ group->avg_nr_triggers[t->state]++;
+
+ mutex_unlock(&group->avgs_lock);
+ }
return t;
}
-static void psi_trigger_destroy(struct kref *ref)
+void psi_trigger_destroy(struct psi_trigger *t)
{
- struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
- struct psi_group *group = t->group;
- struct kthread_worker *kworker_to_destroy = NULL;
+ struct psi_group *group;
+ struct task_struct *task_to_destroy = NULL;
- if (static_branch_likely(&psi_disabled))
+ /*
+ * We do not check psi_disabled since it might have been disabled after
+ * the trigger got created.
+ */
+ if (!t)
return;
+ group = t->group;
/*
- * Wakeup waiters to stop polling. Can happen if cgroup is deleted
- * from under a polling process.
+ * Wakeup waiters to stop polling and clear the queue to prevent it from
+ * being accessed later. Can happen if cgroup is deleted from under a
+ * polling process.
*/
- wake_up_interruptible(&t->event_wait);
-
- mutex_lock(&group->trigger_lock);
-
- if (!list_empty(&t->node)) {
- struct psi_trigger *tmp;
- u64 period = ULLONG_MAX;
-
- list_del(&t->node);
- group->nr_triggers[t->state]--;
- if (!group->nr_triggers[t->state])
- group->poll_states &= ~(1 << t->state);
- /* reset min update period for the remaining triggers */
- list_for_each_entry(tmp, &group->triggers, node)
- period = min(period, div_u64(tmp->win.size,
- UPDATES_PER_WINDOW));
- group->poll_min_period = period;
- /* Destroy poll_kworker when the last trigger is destroyed */
- if (group->poll_states == 0) {
- group->polling_until = 0;
- kworker_to_destroy = rcu_dereference_protected(
- group->poll_kworker,
- lockdep_is_held(&group->trigger_lock));
- rcu_assign_pointer(group->poll_kworker, NULL);
+ if (t->of)
+ kernfs_notify(t->of->kn);
+ else
+ wake_up_interruptible(&t->event_wait);
+
+ if (t->aggregator == PSI_AVGS) {
+ mutex_lock(&group->avgs_lock);
+ if (!list_empty(&t->node)) {
+ list_del(&t->node);
+ group->avg_nr_triggers[t->state]--;
}
+ mutex_unlock(&group->avgs_lock);
+ } else {
+ mutex_lock(&group->rtpoll_trigger_lock);
+ if (!list_empty(&t->node)) {
+ struct psi_trigger *tmp;
+ u64 period = ULLONG_MAX;
+
+ list_del(&t->node);
+ group->rtpoll_nr_triggers[t->state]--;
+ if (!group->rtpoll_nr_triggers[t->state])
+ group->rtpoll_states &= ~(1 << t->state);
+ /*
+ * Reset min update period for the remaining triggers
+ * iff the destroying trigger had the min window size.
+ */
+ if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) {
+ list_for_each_entry(tmp, &group->rtpoll_triggers, node)
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ group->rtpoll_min_period = period;
+ }
+ /* Destroy rtpoll_task when the last trigger is destroyed */
+ if (group->rtpoll_states == 0) {
+ group->rtpoll_until = 0;
+ task_to_destroy = rcu_dereference_protected(
+ group->rtpoll_task,
+ lockdep_is_held(&group->rtpoll_trigger_lock));
+ rcu_assign_pointer(group->rtpoll_task, NULL);
+ del_timer(&group->rtpoll_timer);
+ }
+ }
+ mutex_unlock(&group->rtpoll_trigger_lock);
}
- mutex_unlock(&group->trigger_lock);
-
/*
- * Wait for both *trigger_ptr from psi_trigger_replace and
- * poll_kworker RCUs to complete their read-side critical sections
- * before destroying the trigger and optionally the poll_kworker
+ * Wait for psi_schedule_rtpoll_work RCU to complete its read-side
+ * critical section before destroying the trigger and optionally the
+ * rtpoll_task.
*/
synchronize_rcu();
/*
- * Destroy the kworker after releasing trigger_lock to prevent a
- * deadlock while waiting for psi_poll_work to acquire trigger_lock
+ * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent
+ * a deadlock while waiting for psi_rtpoll_work to acquire
+ * rtpoll_trigger_lock
*/
- if (kworker_to_destroy) {
+ if (task_to_destroy) {
/*
* After the RCU grace period has expired, the worker
- * can no longer be found through group->poll_kworker.
- * But it might have been already scheduled before
- * that - deschedule it cleanly before destroying it.
+ * can no longer be found through group->rtpoll_task.
*/
- kthread_cancel_delayed_work_sync(&group->poll_work);
- atomic_set(&group->poll_scheduled, 0);
-
- kthread_destroy_worker(kworker_to_destroy);
+ kthread_stop(task_to_destroy);
+ atomic_set(&group->rtpoll_scheduled, 0);
}
kfree(t);
}
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
-{
- struct psi_trigger *old = *trigger_ptr;
-
- if (static_branch_likely(&psi_disabled))
- return;
-
- rcu_assign_pointer(*trigger_ptr, new);
- if (old)
- kref_put(&old->refcount, psi_trigger_destroy);
-}
-
__poll_t psi_trigger_poll(void **trigger_ptr,
struct file *file, poll_table *wait)
{
@@ -1216,27 +1477,52 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
if (static_branch_likely(&psi_disabled))
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- rcu_read_lock();
-
- t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
- if (!t) {
- rcu_read_unlock();
+ t = smp_load_acquire(trigger_ptr);
+ if (!t)
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- }
- kref_get(&t->refcount);
- rcu_read_unlock();
-
- poll_wait(file, &t->event_wait, wait);
+ if (t->of)
+ kernfs_generic_poll(t->of, wait);
+ else
+ poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1)
ret |= EPOLLPRI;
- kref_put(&t->refcount, psi_trigger_destroy);
-
return ret;
}
+#ifdef CONFIG_PROC_FS
+static int psi_io_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IO);
+}
+
+static int psi_memory_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_MEM);
+}
+
+static int psi_cpu_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_CPU);
+}
+
+static int psi_io_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_io_show, NULL);
+}
+
+static int psi_memory_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_memory_show, NULL);
+}
+
+static int psi_cpu_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_cpu_show, NULL);
+}
+
static ssize_t psi_write(struct file *file, const char __user *user_buf,
size_t nbytes, enum psi_res res)
{
@@ -1257,14 +1543,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
buf[buf_size - 1] = '\0';
- new = psi_trigger_create(&psi_system, buf, nbytes, res);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
seq = file->private_data;
+
/* Take seq->lock to protect seq->private from concurrent writes */
mutex_lock(&seq->lock);
- psi_trigger_replace(&seq->private, new);
+
+ /* Allow only one trigger per file descriptor */
+ if (seq->private) {
+ mutex_unlock(&seq->lock);
+ return -EBUSY;
+ }
+
+ new = psi_trigger_create(&psi_system, buf, res, file, NULL);
+ if (IS_ERR(new)) {
+ mutex_unlock(&seq->lock);
+ return PTR_ERR(new);
+ }
+
+ smp_store_release(&seq->private, new);
mutex_unlock(&seq->lock);
return nbytes;
@@ -1299,7 +1595,7 @@ static int psi_fop_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
- psi_trigger_replace(&seq->private, NULL);
+ psi_trigger_destroy(seq->private);
return single_release(inode, file);
}
@@ -1330,14 +1626,46 @@ static const struct proc_ops psi_cpu_proc_ops = {
.proc_release = psi_fop_release,
};
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, psi_irq_show, NULL);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+ .proc_open = psi_irq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = psi_irq_write,
+ .proc_poll = psi_fop_poll,
+ .proc_release = psi_fop_release,
+};
+#endif
+
static int __init psi_proc_init(void)
{
if (psi_enable) {
proc_mkdir("pressure", NULL);
- proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
- proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
- proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+ proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
+ proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
+ proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
}
return 0;
}
module_init(psi_proc_init);
+
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f395ddb75f38..4b8e33c615b1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,18 +3,98 @@
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
* policies)
*/
-#include "sched.h"
-
-#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
-int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
+/*
+ * Period over which we measure RT task CPU usage, in microseconds.
+ * default: 1s
+ */
+int sysctl_sched_rt_period = 1000000;
+
+/*
+ * Part of the period during which RT tasks are allowed to run, in microseconds.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
+
+#ifdef CONFIG_SYSCTL
+static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
+static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+static const struct ctl_table sched_rt_sysctls[] = {
+ {
+ .procname = "sched_rt_period_us",
+ .data = &sysctl_sched_rt_period,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
+ },
+ {
+ .procname = "sched_rt_runtime_us",
+ .data = &sysctl_sched_rt_runtime,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_NEG_ONE,
+ .extra2 = (void *)&sysctl_sched_rt_period,
+ },
+ {
+ .procname = "sched_rr_timeslice_ms",
+ .data = &sysctl_sched_rr_timeslice,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_rr_handler,
+ },
+};
+
+static int __init sched_rt_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_rt_sysctls);
+ return 0;
+}
+late_initcall(sched_rt_sysctl_init);
+#endif
+
+void init_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array;
+ int i;
+
+ array = &rt_rq->active;
+ for (i = 0; i < MAX_RT_PRIO; i++) {
+ INIT_LIST_HEAD(array->queue + i);
+ __clear_bit(i, array->bitmap);
+ }
+ /* delimiter for bitsearch: */
+ __set_bit(MAX_RT_PRIO, array->bitmap);
-struct rt_bandwidth def_rt_bandwidth;
+#if defined CONFIG_SMP
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
+ rt_rq->highest_prio.next = MAX_RT_PRIO-1;
+ rt_rq->overloaded = 0;
+ plist_head_init(&rt_rq->pushable_tasks);
+#endif /* CONFIG_SMP */
+ /* We start in dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
+ rt_rq->rt_runtime = 0;
+ raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+#endif
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
@@ -52,11 +132,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
- if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- return;
-
raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1;
@@ -75,36 +152,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-void init_rt_rq(struct rt_rq *rt_rq)
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
- struct rt_prio_array *array;
- int i;
-
- array = &rt_rq->active;
- for (i = 0; i < MAX_RT_PRIO; i++) {
- INIT_LIST_HEAD(array->queue + i);
- __clear_bit(i, array->bitmap);
- }
- /* delimiter for bitsearch: */
- __set_bit(MAX_RT_PRIO, array->bitmap);
-
-#if defined CONFIG_SMP
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
- rt_rq->highest_prio.next = MAX_RT_PRIO;
- rt_rq->rt_nr_migratory = 0;
- rt_rq->overloaded = 0;
- plist_head_init(&rt_rq->pushable_tasks);
-#endif /* CONFIG_SMP */
- /* We start is dequeued state, because no RT tasks are queued */
- rt_rq->rt_queued = 0;
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ return;
- rt_rq->rt_time = 0;
- rt_rq->rt_throttled = 0;
- rt_rq->rt_runtime = 0;
- raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+ do_start_rt_bandwidth(rt_b);
}
-#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
hrtimer_cancel(&rt_b->rt_period_timer);
@@ -137,12 +192,15 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
return rt_rq->rq;
}
-void free_rt_sched_group(struct task_group *tg)
+void unregister_rt_sched_group(struct task_group *tg)
{
- int i;
-
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
+}
+
+void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
for_each_possible_cpu(i) {
if (tg->rt_rq)
@@ -161,7 +219,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
{
struct rq *rq = cpu_rq(cpu);
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
rt_rq->tg = tg;
@@ -195,8 +253,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!tg->rt_se)
goto err;
- init_rt_bandwidth(&tg->rt_bandwidth,
- ktime_to_ns(def_rt_bandwidth.rt_period), 0);
+ init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0);
for_each_possible_cpu(i) {
rt_rq = kzalloc_node(sizeof(struct rt_rq),
@@ -250,6 +307,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return &rq->rt;
}
+void unregister_rt_sched_group(struct task_group *tg) { }
+
void free_rt_sched_group(struct task_group *tg) { }
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
@@ -260,12 +319,10 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
#ifdef CONFIG_SMP
-static void pull_rt_task(struct rq *this_rq);
-
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- return rq->rt.highest_prio.curr > prev->prio;
+ return rq->online && rq->rt.highest_prio.curr > prev->prio;
}
static inline int rt_overloaded(struct rq *rq)
@@ -302,60 +359,13 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
-static void update_rt_migration(struct rt_rq *rt_rq)
-{
- if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
- if (!rt_rq->overloaded) {
- rt_set_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 1;
- }
- } else if (rt_rq->overloaded) {
- rt_clear_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 0;
- }
-}
-
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total++;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory++;
-
- update_rt_migration(rt_rq);
-}
-
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total--;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory--;
-
- update_rt_migration(rt_rq);
-}
-
static inline int has_pushable_tasks(struct rq *rq)
{
return !plist_head_empty(&rq->rt.pushable_tasks);
}
-static DEFINE_PER_CPU(struct callback_head, rt_push_head);
-static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
+static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
+static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);
static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *);
@@ -382,6 +392,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
/* Update the highest prio pushable task */
if (p->prio < rq->rt.highest_prio.next)
rq->rt.highest_prio.next = p->prio;
+
+ if (!rq->rt.overloaded) {
+ rt_set_overload(rq);
+ rq->rt.overloaded = 1;
+ }
}
static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -393,8 +408,14 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
p = plist_first_entry(&rq->rt.pushable_tasks,
struct task_struct, pushable_tasks);
rq->rt.highest_prio.next = p->prio;
- } else
- rq->rt.highest_prio.next = MAX_RT_PRIO;
+ } else {
+ rq->rt.highest_prio.next = MAX_RT_PRIO-1;
+
+ if (rq->rt.overloaded) {
+ rt_clear_overload(rq);
+ rq->rt.overloaded = 0;
+ }
+ }
}
#else
@@ -407,32 +428,13 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}
-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
-{
- return false;
-}
-
-static inline void pull_rt_task(struct rq *this_rq)
-{
-}
-
static inline void rt_queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
-static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
@@ -461,13 +463,13 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
unsigned int cpu_cap;
/* Only heterogeneous systems can benefit from this check */
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ if (!sched_asym_cpucap_active())
return true;
min_cap = uclamp_eff_value(p, UCLAMP_MIN);
max_cap = uclamp_eff_value(p, UCLAMP_MAX);
- cpu_cap = capacity_orig_of(cpu);
+ cpu_cap = arch_scale_cpu_capacity(cpu);
return cpu_cap >= min(min_cap, max_cap);
}
@@ -526,7 +528,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
- struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+ struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor;
struct rq *rq = rq_of_rt_rq(rt_rq);
struct sched_rt_entity *rt_se;
@@ -540,7 +542,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, 0);
- if (rt_rq->highest_prio.curr < curr->prio)
+ if (rt_rq->highest_prio.curr < donor->prio)
resched_curr(rq);
}
}
@@ -553,7 +555,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (!rt_se) {
- dequeue_top_rt_rq(rt_rq);
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
}
@@ -601,70 +603,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
return &rt_rq->tg->rt_bandwidth;
}
-#else /* !CONFIG_RT_GROUP_SCHED */
-
-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_runtime;
-}
-
-static inline u64 sched_rt_period(struct rt_rq *rt_rq)
-{
- return ktime_to_ns(def_rt_bandwidth.rt_period);
-}
-
-typedef struct rt_rq *rt_rq_iter_t;
-
-#define for_each_rt_rq(rt_rq, iter, rq) \
- for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
-#define for_each_sched_rt_entity(rt_se) \
- for (; rt_se; rt_se = NULL)
-
-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
-{
- return NULL;
-}
-
-static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
-{
- struct rq *rq = rq_of_rt_rq(rt_rq);
-
- if (!rt_rq->rt_nr_running)
- return;
-
- enqueue_top_rt_rq(rt_rq);
- resched_curr(rq);
-}
-
-static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
-{
- dequeue_top_rt_rq(rt_rq);
-}
-
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled;
-}
-
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-
-static inline
-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
-{
- return &cpu_rq(cpu)->rt;
-}
-
-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
-{
- return &def_rt_bandwidth;
-}
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-
bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -699,7 +637,7 @@ static void do_balance_runtime(struct rt_rq *rt_rq)
/*
* Either all rqs have inf runtime and there's nothing to steal
* or __disable_runtime() below sets a specific rq to inf to
- * indicate its been disabled and disalow stealing.
+ * indicate it's been disabled and disallow stealing.
*/
if (iter->rt_runtime == RUNTIME_INF)
goto next;
@@ -795,7 +733,7 @@ static void __disable_runtime(struct rq *rq)
* We cannot be left wanting - that would mean some runtime
* leaked out of the system.
*/
- BUG_ON(want);
+ WARN_ON_ONCE(want);
balanced:
/*
* Disable all the borrow logic by pretending we have inf
@@ -856,7 +794,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
const struct cpumask *span;
span = sched_rt_period_mask();
-#ifdef CONFIG_RT_GROUP_SCHED
+
/*
* FIXME: isolated CPUs should really leave the root task group,
* whether they are isolcpus or were isolated via cpusets, lest
@@ -868,11 +806,12 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
*/
if (rt_b == &root_task_group.rt_bandwidth)
span = cpu_online_mask;
-#endif
+
for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct rq_flags rf;
int skip;
/*
@@ -887,7 +826,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (skip)
continue;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
if (rt_rq->rt_time) {
@@ -904,7 +843,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
/*
* When we're idle and a woken (rt) task is
- * throttled check_preempt_curr() will set
+ * throttled wakeup_preempt() will set
* skip_update and the time between the wakeup
* and this unthrottle will get accounted as
* 'runtime'.
@@ -925,7 +864,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
@@ -934,18 +873,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
return idle;
}
-static inline int rt_se_prio(struct sched_rt_entity *rt_se)
-{
-#ifdef CONFIG_RT_GROUP_SCHED
- struct rt_rq *rt_rq = group_rt_rq(rt_se);
-
- if (rt_rq)
- return rt_rq->highest_prio.curr;
-#endif
-
- return rt_task_of(rt_se)->prio;
-}
-
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
@@ -989,52 +916,114 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
}
+#else /* !CONFIG_RT_GROUP_SCHED */
+
+typedef struct rt_rq *rt_rq_iter_t;
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
+#define for_each_sched_rt_entity(rt_se) \
+ for (; rt_se; rt_se = NULL)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+ return NULL;
+}
+
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ enqueue_top_rt_rq(rt_rq);
+ resched_curr(rq);
+}
+
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return false;
+}
+
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+ return cpu_online_mask;
+}
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+ return &cpu_rq(cpu)->rt;
+}
+
+#ifdef CONFIG_SMP
+static void __enable_runtime(struct rq *rq) { }
+static void __disable_runtime(struct rq *rq) { }
+#endif
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static inline int rt_se_prio(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq)
+ return rt_rq->highest_prio.curr;
+#endif
+
+ return rt_task_of(rt_se)->prio;
+}
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
*/
static void update_curr_rt(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
- struct sched_rt_entity *rt_se = &curr->rt;
- u64 delta_exec;
- u64 now;
+ struct task_struct *donor = rq->donor;
+ s64 delta_exec;
- if (curr->sched_class != &rt_sched_class)
+ if (donor->sched_class != &rt_sched_class)
return;
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ delta_exec = update_curr_common(rq);
+ if (unlikely(delta_exec <= 0))
return;
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
-
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = now;
- cgroup_account_cputime(curr, delta_exec);
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct sched_rt_entity *rt_se = &donor->rt;
if (!rt_bandwidth_enabled())
return;
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ int exceeded;
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
- if (sched_rt_runtime_exceeded(rt_rq))
+ exceeded = sched_rt_runtime_exceeded(rt_rq);
+ if (exceeded)
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (exceeded)
+ do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
+#endif
}
static void
-dequeue_top_rt_rq(struct rt_rq *rt_rq)
+dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -1045,7 +1034,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
BUG_ON(!rq->nr_running);
- sub_nr_running(rq, rt_rq->rt_nr_running);
+ sub_nr_running(rq, count);
rt_rq->rt_queued = 0;
}
@@ -1138,7 +1127,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
/*
* This may have been our highest task, and therefore
- * we may have some recomputation to do
+ * we may have some re-computation to do
*/
if (prio == prev_prio) {
struct rt_prio_array *array = &rt_rq->active;
@@ -1147,8 +1136,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
sched_find_first_bit(array->bitmap);
}
- } else
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ } else {
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
+ }
dec_rt_prio_smp(rt_rq, prio, prev_prio);
}
@@ -1186,7 +1176,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
- start_rt_bandwidth(&def_rt_bandwidth);
}
static inline
@@ -1229,7 +1218,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
- inc_rt_migration(rt_se, rt_rq);
inc_rt_group(rt_se, rt_rq);
}
@@ -1242,7 +1230,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
- dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}
@@ -1269,6 +1256,112 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr
rt_se->on_list = 0;
}
+static inline struct sched_statistics *
+__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* schedstats is not supported for rt group. */
+ if (!rt_entity_is_task(rt_se))
+ return NULL;
+#endif
+
+ return &rt_task_of(rt_se)->stats;
+}
+
+static inline void
+update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+ int flags)
+{
+ if (!schedstat_enabled())
+ return;
+
+ if (flags & ENQUEUE_WAKEUP)
+ update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
+}
+
+static inline void
+update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct sched_statistics *stats;
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ stats = __schedstats_from_rt_se(rt_se);
+ if (!stats)
+ return;
+
+ __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
+}
+
+static inline void
+update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+ int flags)
+{
+ struct task_struct *p = NULL;
+
+ if (!schedstat_enabled())
+ return;
+
+ if (rt_entity_is_task(rt_se))
+ p = rt_task_of(rt_se);
+
+ if ((flags & DEQUEUE_SLEEP) && p) {
+ unsigned int state;
+
+ state = READ_ONCE(p->__state);
+ if (state & TASK_INTERRUPTIBLE)
+ __schedstat_set(p->stats.sleep_start,
+ rq_clock(rq_of_rt_rq(rt_rq)));
+
+ if (state & TASK_UNINTERRUPTIBLE)
+ __schedstat_set(p->stats.block_start,
+ rq_clock(rq_of_rt_rq(rt_rq)));
+ }
+}
+
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
@@ -1324,24 +1417,29 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct sched_rt_entity *back = NULL;
+ unsigned int rt_nr_running;
for_each_sched_rt_entity(rt_se) {
rt_se->back = back;
back = rt_se;
}
- dequeue_top_rt_rq(rt_rq_of_se(back));
+ rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se, flags);
}
+
+ dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
+ update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);
+
dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, flags);
@@ -1352,6 +1450,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
+ update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);
+
dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) {
@@ -1374,13 +1474,16 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
+ check_schedstat_required();
+ update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);
+
enqueue_rt_entity(rt_se, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
@@ -1388,6 +1491,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dequeue_rt_entity(rt_se, flags);
dequeue_pushable_task(rq, p);
+
+ return true;
}
/*
@@ -1428,20 +1533,21 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task);
static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{
- struct task_struct *curr;
+ struct task_struct *curr, *donor;
struct rq *rq;
bool test;
/* For anything but wake ups, just return the task_cpu */
- if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ if (!(flags & (WF_TTWU | WF_FORK)))
goto out;
rq = cpu_rq(cpu);
rcu_read_lock();
curr = READ_ONCE(rq->curr); /* unlocked access */
+ donor = READ_ONCE(rq->donor);
/*
* If the current task on @p's runqueue is an RT task, then
@@ -1459,7 +1565,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
*
* For equal prio tasks, we just let the scheduler sort it out.
*
- * Otherwise, just let it ride on the affined RQ and the
+ * Otherwise, just let it ride on the affine RQ and the
* post-schedule router will push the preempted task away
*
* This test is optimistic, if we get it wrong the load-balancer
@@ -1470,8 +1576,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
* systems like big.LITTLE.
*/
test = curr &&
- unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
+ unlikely(rt_task(donor)) &&
+ (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio);
if (test || !rt_task_fits_capacity(p, cpu)) {
int target = find_lowest_rq(p);
@@ -1501,12 +1607,8 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- /*
- * Current can't be migrated, useless to reschedule,
- * let's hope p can move out.
- */
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ !cpupri_find(&rq->rd->cpupri, rq->donor, NULL))
return;
/*
@@ -1547,9 +1649,11 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
- if (p->prio < rq->curr->prio) {
+ struct task_struct *donor = rq->donor;
+
+ if (p->prio < donor->prio) {
resched_curr(rq);
return;
}
@@ -1567,14 +1671,19 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
* to move current somewhere else, making room for our non-migratable
* task.
*/
- if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
+ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr))
check_preempt_equal_prio(rq, p);
#endif
}
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = &rq->rt;
+
p->se.exec_start = rq_clock_task(rq);
+ if (on_rt_rq(&p->rt))
+ update_stats_wait_end_rt(rt_rq, rt_se);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
@@ -1587,14 +1696,13 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f
* utilization. We only care of the case where we start to schedule a
* rt task
*/
- if (rq->curr->sched_class != &rt_sched_class)
+ if (rq->donor->sched_class != &rt_sched_class)
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
rt_queue_push_tasks(rq);
}
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
- struct rt_rq *rt_rq)
+static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
{
struct rt_prio_array *array = &rt_rq->active;
struct sched_rt_entity *next = NULL;
@@ -1605,6 +1713,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
BUG_ON(idx >= MAX_RT_PRIO);
queue = array->queue + idx;
+ if (SCHED_WARN_ON(list_empty(queue)))
+ return NULL;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
return next;
@@ -1616,15 +1726,16 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
struct rt_rq *rt_rq = &rq->rt;
do {
- rt_se = pick_next_rt_entity(rq, rt_rq);
- BUG_ON(!rt_se);
+ rt_se = pick_next_rt_entity(rt_rq);
+ if (unlikely(!rt_se))
+ return NULL;
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
return rt_task_of(rt_se);
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *pick_task_rt(struct rq *rq)
{
struct task_struct *p;
@@ -1632,12 +1743,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
return NULL;
p = _pick_next_task_rt(rq);
- set_next_task_rt(rq, p, true);
+
return p;
}
-static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (on_rt_rq(&p->rt))
+ update_stats_wait_start_rt(rt_rq, rt_se);
+
update_curr_rt(rq);
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
@@ -1655,15 +1772,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
/* Only try algorithms three times */
#define RT_MAX_TRIES 3
-static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
-{
- if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, p->cpus_ptr))
- return 1;
-
- return 0;
-}
-
/*
* Return the highest pushable rq's task, which is suitable to be executed
* on the CPU, NULL otherwise
@@ -1677,7 +1785,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
return NULL;
plist_for_each_entry(p, head, pushable_tasks) {
- if (pick_rt_task(rq, p, cpu))
+ if (task_is_pushable(rq, p, cpu))
return p;
}
@@ -1705,7 +1813,7 @@ static int find_lowest_rq(struct task_struct *task)
* If we're on asym system ensure we consider the different capacities
* of the CPUs when searching for the lowest_mask.
*/
- if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ if (sched_asym_cpucap_active()) {
ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
task, lowest_mask,
@@ -1752,8 +1860,8 @@ static int find_lowest_rq(struct task_struct *task)
return this_cpu;
}
- best_cpu = cpumask_first_and(lowest_mask,
- sched_domain_span(sd));
+ best_cpu = cpumask_any_and_distribute(lowest_mask,
+ sched_domain_span(sd));
if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
return best_cpu;
@@ -1770,7 +1878,7 @@ static int find_lowest_rq(struct task_struct *task)
if (this_cpu != -1)
return this_cpu;
- cpu = cpumask_any(lowest_mask);
+ cpu = cpumask_any_distribute(lowest_mask);
if (cpu < nr_cpu_ids)
return cpu;
@@ -1809,11 +1917,15 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* the mean time, task could have
* migrated already or had its affinity changed.
* Also make sure that it wasn't scheduled on its rq.
+ * It is possible the task was scheduled, set
+ * "migrate_disabled" and then got preempted, so we must
+ * check the task migration disable flag here too.
*/
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
- task_running(rq, task) ||
+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
+ task_on_cpu(rq, task) ||
!rt_task(task) ||
+ is_migration_disabled(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, lowest_rq);
@@ -1846,6 +1958,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
+ BUG_ON(task_current_donor(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!task_on_rq_queued(p));
@@ -1859,7 +1972,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
* running task can migrate over to a CPU that is running a task
* of lesser priority.
*/
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
{
struct task_struct *next_task;
struct rq *lowest_rq;
@@ -1873,19 +1986,61 @@ static int push_rt_task(struct rq *rq)
return 0;
retry:
- if (WARN_ON(next_task == rq->curr))
- return 0;
-
/*
* It's possible that the next_task slipped in of
* higher priority than current. If that's the case
* just reschedule current.
*/
- if (unlikely(next_task->prio < rq->curr->prio)) {
+ if (unlikely(next_task->prio < rq->donor->prio)) {
resched_curr(rq);
return 0;
}
+ if (is_migration_disabled(next_task)) {
+ struct task_struct *push_task = NULL;
+ int cpu;
+
+ if (!pull || rq->push_busy)
+ return 0;
+
+ /*
+ * Invoking find_lowest_rq() on anything but an RT task doesn't
+ * make sense. Per the above priority check, curr has to
+ * be of higher priority than next_task, so no need to
+ * reschedule when bailing out.
+ *
+ * Note that the stoppers are masqueraded as SCHED_FIFO
+ * (cf. sched_set_stop_task()), so we can't rely on rt_task().
+ */
+ if (rq->donor->sched_class != &rt_sched_class)
+ return 0;
+
+ cpu = find_lowest_rq(rq->curr);
+ if (cpu == -1 || cpu == rq->cpu)
+ return 0;
+
+ /*
+ * We found a CPU with lower priority than @next_task, so
+ * @next_task should be running. However, we cannot migrate it
+ * to this other CPU; instead, attempt to push the currently
+ * running task on this CPU away.
+ */
+ push_task = get_push_task(rq);
+ if (push_task) {
+ preempt_disable();
+ raw_spin_rq_unlock(rq);
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ push_task, &rq->push_work);
+ preempt_enable();
+ raw_spin_rq_lock(rq);
+ }
+
+ return 0;
+ }
+
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
/* We might release rq lock */
get_task_struct(next_task);
@@ -1924,15 +2079,11 @@ retry:
goto retry;
}
- deactivate_task(rq, next_task, 0);
- set_task_cpu(next_task, lowest_rq->cpu);
- activate_task(lowest_rq, next_task, 0);
- ret = 1;
-
+ move_queued_task_locked(rq, lowest_rq, next_task);
resched_curr(lowest_rq);
+ ret = 1;
double_unlock_balance(rq, lowest_rq);
-
out:
put_task_struct(next_task);
@@ -1942,7 +2093,7 @@ out:
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
- while (push_rt_task(rq))
+ while (push_rt_task(rq, false))
;
}
@@ -1968,14 +2119,14 @@ static void push_rt_tasks(struct rq *rq)
* if its the only CPU with multiple RT tasks queued, and a large number
* of CPUs scheduling a lower priority task at the same time.
*
- * Each root domain has its own irq work function that can iterate over
+ * Each root domain has its own IRQ work function that can iterate over
* all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
- * tassk must be checked if there's one or many CPUs that are lowering
- * their priority, there's a single irq work iterator that will try to
+ * tasks must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single IRQ work iterator that will try to
* push off RT tasks that are waiting to run.
*
* When a CPU schedules a lower priority task, it will kick off the
- * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * IRQ work iterator that will jump to each CPU with overloaded RT tasks.
* As it only takes the first CPU that schedules a lower priority task
* to start the process, the rto_start variable is incremented and if
* the atomic result is one, then that CPU will try to take the rto_lock.
@@ -1983,7 +2134,7 @@ static void push_rt_tasks(struct rq *rq)
* CPUs scheduling lower priority tasks.
*
* All CPUs that are scheduling a lower priority task will increment the
- * rt_loop_next variable. This will make sure that the irq work iterator
+ * rt_loop_next variable. This will make sure that the IRQ work iterator
* checks all RT overloaded CPUs whenever a CPU schedules a new lower
* priority task, even if the iterator is in the middle of a scan. Incrementing
* the rt_loop_next will cause the iterator to perform another scan.
@@ -2063,7 +2214,7 @@ static void tell_cpu_to_push(struct rq *rq)
* The rto_cpu is updated under the lock, if it has a valid CPU
* then the IPI is still running and will continue due to the
* update to loop_next, and nothing needs to be done here.
- * Otherwise it is finishing up and an ipi needs to be sent.
+ * Otherwise it is finishing up and an IPI needs to be sent.
*/
if (rq->rd->rto_cpu < 0)
cpu = rto_next_cpu(rq->rd);
@@ -2094,9 +2245,10 @@ void rto_push_irq_work_func(struct irq_work *work)
* When it gets updated, a check is made if a push is possible.
*/
if (has_pushable_tasks(rq)) {
- raw_spin_lock(&rq->lock);
- push_rt_tasks(rq);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_lock(rq);
+ while (push_rt_task(rq, true))
+ ;
+ raw_spin_rq_unlock(rq);
}
raw_spin_lock(&rd->rto_lock);
@@ -2120,7 +2272,7 @@ static void pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, cpu;
bool resched = false;
- struct task_struct *p;
+ struct task_struct *p, *push_task;
struct rq *src_rq;
int rt_overload_count = rt_overloaded(this_rq);
@@ -2167,6 +2319,7 @@ static void pull_rt_task(struct rq *this_rq)
* double_lock_balance, and another CPU could
* alter this_rq
*/
+ push_task = NULL;
double_lock_balance(this_rq, src_rq);
/*
@@ -2186,19 +2339,20 @@ static void pull_rt_task(struct rq *this_rq)
/*
* There's a chance that p is higher in priority
* than what's currently running on its CPU.
- * This is just that p is wakeing up and hasn't
+ * This is just that p is waking up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
* current task on the run queue
*/
- if (p->prio < src_rq->curr->prio)
+ if (p->prio < src_rq->donor->prio)
goto skip;
- resched = true;
-
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
+ if (is_migration_disabled(p)) {
+ push_task = get_push_task(src_rq);
+ } else {
+ move_queued_task_locked(src_rq, this_rq, p);
+ resched = true;
+ }
/*
* We continue with the search, just in
* case there's an even higher prio task
@@ -2208,6 +2362,15 @@ static void pull_rt_task(struct rq *this_rq)
}
skip:
double_unlock_balance(this_rq, src_rq);
+
+ if (push_task) {
+ preempt_disable();
+ raw_spin_rq_unlock(this_rq);
+ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+ push_task, &src_rq->push_work);
+ preempt_enable();
+ raw_spin_rq_lock(this_rq);
+ }
}
if (resched)
@@ -2220,12 +2383,12 @@ skip:
*/
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
- bool need_to_push = !task_running(rq, p) &&
+ bool need_to_push = !task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
- (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (dl_task(rq->donor) || rt_task(rq->donor)) &&
(rq->curr->nr_cpus_allowed < 2 ||
- rq->curr->prio <= p->prio);
+ rq->donor->prio <= p->prio);
if (need_to_push)
push_rt_tasks(rq);
@@ -2291,18 +2454,25 @@ void __init init_sched_rt_class(void)
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
/*
- * If we are already running, then there's nothing
- * that needs to be done. But if we are not running
- * we may need to preempt the current running task.
- * If that current running task is also an RT task
+ * If we are running, update the avg_rt tracking, as the running time
+ * will from now on be accounted into the latter.
+ */
+ if (task_current(rq, p)) {
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ return;
+ }
+
+ /*
+ * If we are not running we may need to preempt the current
+ * running task. If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (task_on_rq_queued(p) && rq->curr != p) {
+ if (task_on_rq_queued(p)) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
- if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
+ if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
}
}
@@ -2317,7 +2487,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
- if (rq->curr == p) {
+ if (task_current_donor(rq, p)) {
#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
@@ -2343,7 +2513,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* greater than the current running task
* then reschedule.
*/
- if (p->prio < rq->curr->prio)
+ if (p->prio < rq->donor->prio)
resched_curr(rq);
}
}
@@ -2394,7 +2564,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
watchdog(rq, p);
/*
- * RR tasks need a special form of timeslice management.
+ * RR tasks need a special form of time-slice management.
* FIFO tasks have no timeslices.
*/
if (p->policy != SCHED_RR)
@@ -2429,15 +2599,30 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
-const struct sched_class rt_sched_class = {
- .next = &fair_sched_class,
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_rt(struct task_struct *p, int cpu)
+{
+ struct rt_rq *rt_rq;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq = task_group(p)->rt_rq[cpu];
+#else
+ rt_rq = &cpu_rq(cpu)->rt;
+#endif
+
+ return rt_rq_throttled(rt_rq);
+}
+#endif
+
+DEFINE_SCHED_CLASS(rt) = {
+
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
- .check_preempt_curr = check_preempt_curr_rt,
+ .wakeup_preempt = wakeup_preempt_rt,
- .pick_next_task = pick_next_task_rt,
+ .pick_task = pick_task_rt,
.put_prev_task = put_prev_task_rt,
.set_next_task = set_next_task_rt,
@@ -2449,6 +2634,7 @@ const struct sched_class rt_sched_class = {
.rq_offline = rq_offline_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
+ .find_lock_rq = find_lock_lowest_rq,
#endif
.task_tick = task_tick_rt,
@@ -2460,6 +2646,10 @@ const struct sched_class rt_sched_class = {
.update_curr = update_curr_rt,
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_rt,
+#endif
+
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
@@ -2664,6 +2854,7 @@ long sched_group_rt_period(struct task_group *tg)
return rt_period_us;
}
+#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
int ret = 0;
@@ -2674,10 +2865,11 @@ static int sched_rt_global_constraints(void)
return ret;
}
+#endif /* CONFIG_SYSCTL */
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
- /* Don't accept realtime tasks when there is no way for them to run */
+ /* Don't accept real-time tasks when there is no way for them to run */
if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
return 0;
@@ -2685,30 +2877,18 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
}
#else /* !CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
- unsigned long flags;
- int i;
-
- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = global_rt_runtime();
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
-
return 0;
}
+#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
{
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
((u64)sysctl_sched_rt_runtime *
@@ -2720,11 +2900,9 @@ static int sched_rt_global_validate(void)
static void sched_rt_do_global(void)
{
- def_rt_bandwidth.rt_runtime = global_rt_runtime();
- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
}
-int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int old_period, old_runtime;
@@ -2735,7 +2913,7 @@ int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
ret = sched_rt_global_validate();
@@ -2763,7 +2941,7 @@ undo:
return ret;
}
-int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
+static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
@@ -2773,17 +2951,21 @@ int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
ret = proc_dointvec(table, write, buffer, lenp, ppos);
/*
* Make sure that internally we keep jiffies.
- * Also, writing zero resets the timeslice to default:
+ * Also, writing zero resets the time-slice to default:
*/
if (!ret && write) {
sched_rr_timeslice =
sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
msecs_to_jiffies(sysctl_sched_rr_timeslice);
+
+ if (sysctl_sched_rr_timeslice <= 0)
+ sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
}
mutex_unlock(&mutex);
return ret;
}
+#endif /* CONFIG_SYSCTL */
#ifdef CONFIG_SCHED_DEBUG
void print_rt_stats(struct seq_file *m, int cpu)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 877fb08eb1b0..023b844159c9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,88 +2,101 @@
/*
* Scheduler internal types and methods:
*/
-#include <linux/sched.h>
+#ifndef _KERNEL_SCHED_SCHED_H
+#define _KERNEL_SCHED_SCHED_H
+#include <linux/sched/affinity.h>
#include <linux/sched/autogroup.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/cpufreq.h>
-#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/hotplug.h>
-#include <linux/sched/idle.h>
-#include <linux/sched/init.h>
-#include <linux/sched/isolation.h>
-#include <linux/sched/jobctl.h>
+#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
-#include <linux/sched/nohz.h>
-#include <linux/sched/numa_balancing.h>
-#include <linux/sched/prio.h>
-#include <linux/sched/rt.h>
+#include <linux/sched/rseq_api.h>
#include <linux/sched/signal.h>
#include <linux/sched/smt.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/task_flags.h>
#include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
#include <linux/sched/topology.h>
-#include <linux/sched/user.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/xacct.h>
-#include <uapi/linux/sched/types.h>
-
-#include <linux/binfmts.h>
-#include <linux/blkdev.h>
-#include <linux/compat.h>
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+#include <linux/capability.h>
+#include <linux/cgroup_api.h>
+#include <linux/cgroup.h>
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
-#include <linux/cpuidle.h>
-#include <linux/cpuset.h>
+#include <linux/cpumask_api.h>
#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/delayacct.h>
-#include <linux/energy_model.h>
-#include <linux/init_task.h>
-#include <linux/kprobes.h>
+#include <linux/file.h>
+#include <linux/fs_api.h>
+#include <linux/hrtimer_api.h>
+#include <linux/interrupt.h>
+#include <linux/irq_work.h>
+#include <linux/jiffies.h>
+#include <linux/kref_api.h>
#include <linux/kthread.h>
-#include <linux/membarrier.h>
-#include <linux/migrate.h>
-#include <linux/mmu_context.h>
-#include <linux/nmi.h>
+#include <linux/ktime_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/lockdep.h>
+#include <linux/minmax.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex_api.h>
+#include <linux/plist.h>
+#include <linux/poll.h>
#include <linux/proc_fs.h>
-#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/psi.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/security.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/seqlock.h>
+#include <linux/softirq.h>
+#include <linux/spinlock_api.h>
+#include <linux/static_key.h>
#include <linux/stop_machine.h>
-#include <linux/suspend.h>
-#include <linux/swait.h>
+#include <linux/syscalls_api.h>
#include <linux/syscalls.h>
-#include <linux/task_work.h>
-#include <linux/tsacct_kern.h>
+#include <linux/tick.h>
+#include <linux/topology.h>
+#include <linux/types.h>
+#include <linux/u64_stats_sync_api.h>
+#include <linux/uaccess.h>
+#include <linux/wait_api.h>
+#include <linux/wait_bit.h>
+#include <linux/workqueue_api.h>
+#include <linux/delayacct.h>
-#include <asm/tlb.h>
+#include <trace/events/power.h>
+#include <trace/events/sched.h>
+
+#include "../workqueue_internal.h"
+
+struct rq;
+struct cfs_rq;
+struct rt_rq;
+struct sched_group;
+struct cpuidle_state;
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
+# include <asm/paravirt_api_clock.h>
#endif
+#include <asm/barrier.h>
+
#include "cpupri.h"
#include "cpudeadline.h"
#ifdef CONFIG_SCHED_DEBUG
-# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
#else
-# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
+# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
#endif
-struct rq;
-struct cpuidle_state;
-
/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED 1
#define TASK_ON_RQ_MIGRATING 2
@@ -96,15 +109,35 @@ extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
+extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
+
+extern int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
+extern int sched_rr_timeslice;
+
+/*
+ * Asymmetric CPU capacity bits
+ */
+struct asym_cap_data {
+ struct list_head link;
+ struct rcu_head rcu;
+ unsigned long capacity;
+ unsigned long cpus[];
+};
+
+extern struct list_head asym_cap_list;
+
+#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
-#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+#define NS_TO_JIFFIES(time) ((unsigned long)(time) / (NSEC_PER_SEC/HZ))
/*
* Increase resolution of nice-level calculations for 64-bit architectures.
* The extra resolution improves shares distribution and load balancing of
- * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper task-group
* hierarchies, especially on larger systems. This is not a user-visible change
* and does not change the user-interface for setting shares/weights.
*
@@ -118,12 +151,13 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust);
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
-# define scale_load_down(w) \
-({ \
- unsigned long __w = (w); \
- if (__w) \
- __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
- __w; \
+# define scale_load_down(w) \
+({ \
+ unsigned long __w = (w); \
+ \
+ if (__w) \
+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
+ __w; \
})
#else
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
@@ -137,7 +171,7 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust);
* scale_load() and scale_load_down(w) to convert between them. The
* following must be true:
*
- * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
+ * scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD
*
*/
#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT)
@@ -158,9 +192,19 @@ static inline int idle_policy(int policy)
{
return policy == SCHED_IDLE;
}
+
+static inline int normal_policy(int policy)
+{
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (policy == SCHED_EXT)
+ return true;
+#endif
+ return policy == SCHED_NORMAL;
+}
+
static inline int fair_policy(int policy)
{
- return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+ return normal_policy(policy) || policy == SCHED_BATCH;
}
static inline int rt_policy(int policy)
@@ -172,6 +216,7 @@ static inline int dl_policy(int policy)
{
return policy == SCHED_DEADLINE;
}
+
static inline bool valid_policy(int policy)
{
return idle_policy(policy) || fair_policy(policy) ||
@@ -193,15 +238,41 @@ static inline int task_has_dl_policy(struct task_struct *p)
return dl_policy(p->policy);
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
static inline void update_avg(u64 *avg, u64 sample)
{
s64 diff = sample - *avg;
+
*avg += diff / 8;
}
/*
+ * Shifting a value by an exponent greater *or equal* to the size of said value
+ * is UB; cap at size-1.
+ */
+#define shr_bound(val, shift) \
+ (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1))
+
+/*
+ * cgroup weight knobs should use the common MIN, DFL and MAX values which are
+ * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it
+ * maps pretty well onto the shares value used by the scheduler and the round-trip
+ * conversions preserve the original value over the entire range.
+ */
+static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight)
+{
+ return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL);
+}
+
+static inline unsigned long sched_weight_to_cgroup(unsigned long weight)
+{
+ return clamp_t(unsigned long,
+ DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024),
+ CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX);
+}
+
+/*
* !! For sched_setattr_nocheck() (kernel) only !!
*
* This is actually gross. :(
@@ -215,7 +286,9 @@ static inline void update_avg(u64 *avg, u64 sample)
*/
#define SCHED_FLAG_SUGOV 0x10000000
-static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
+static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
@@ -227,8 +300,8 @@ static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
/*
* Tells if entity @a should preempt entity @b.
*/
-static inline bool
-dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
+static inline bool dl_entity_preempt(const struct sched_dl_entity *a,
+ const struct sched_dl_entity *b)
{
return dl_entity_is_special(a) ||
dl_time_before(a->deadline, b->deadline);
@@ -251,13 +324,17 @@ struct rt_bandwidth {
unsigned int rt_period_active;
};
-void __dl_clear_params(struct task_struct *p);
+static inline int dl_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
+}
/*
- * To keep the bandwidth of -deadline tasks and groups under control
+ * To keep the bandwidth of -deadline tasks under control
* we need some place where:
- * - store the maximum -deadline bandwidth of the system (the group);
- * - cache the fraction of that bandwidth that is currently allocated.
+ * - store the maximum -deadline bandwidth of each CPU;
+ * - cache the fraction of bandwidth that is currently allocated in
+ * each root domain;
*
* This is all done in the data structure below. It is similar to the
* one used for RT-throttling (rt_bandwidth), with the main difference
@@ -265,58 +342,17 @@ void __dl_clear_params(struct task_struct *p);
* do not decrease any runtime while the group "executes", neither we
* need a timer to replenish it.
*
- * With respect to SMP, the bandwidth is given on a per-CPU basis,
+ * With respect to SMP, bandwidth is given on a per root domain basis,
* meaning that:
- * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
- * - dl_total_bw array contains, in the i-eth element, the currently
- * allocated bandwidth on the i-eth CPU.
- * Moreover, groups consume bandwidth on each CPU, while tasks only
- * consume bandwidth on the CPU they're running on.
- * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
- * that will be shown the next time the proc or cgroup controls will
- * be red. It on its turn can be changed by writing on its own
- * control.
+ * - bw (< 100%) is the deadline bandwidth of each CPU;
+ * - total_bw is the currently allocated bandwidth in each root domain;
*/
-struct dl_bandwidth {
- raw_spinlock_t dl_runtime_lock;
- u64 dl_runtime;
- u64 dl_period;
-};
-
-static inline int dl_bandwidth_enabled(void)
-{
- return sysctl_sched_rt_runtime >= 0;
-}
-
struct dl_bw {
raw_spinlock_t lock;
u64 bw;
u64 total_bw;
};
-static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
-
-static inline
-void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
-{
- dl_b->total_bw -= tsk_bw;
- __dl_update(dl_b, (s32)tsk_bw / cpus);
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
-{
- dl_b->total_bw += tsk_bw;
- __dl_update(dl_b, -((s32)tsk_bw / cpus));
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
- return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
extern void init_dl_bw(struct dl_bw *dl_b);
extern int sched_dl_global_validate(void);
extern void sched_dl_do_global(void);
@@ -325,17 +361,49 @@ extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
-extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern bool dl_cpu_busy(unsigned int cpu);
+extern int dl_bw_deactivate(int cpu);
+extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec);
+/*
+ * SCHED_DEADLINE supports servers (nested scheduling) with the following
+ * interface:
+ *
+ * dl_se::rq -- runqueue we belong to.
+ *
+ * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
+ * server when it runs out of tasks to run.
+ *
+ * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
+ * returns NULL.
+ *
+ * dl_server_update() -- called from update_curr_common(), propagates runtime
+ * to the server.
+ *
+ * dl_server_start()
+ * dl_server_stop() -- start/stop the server when it has (no) tasks.
+ *
+ * dl_server_init() -- initializes the server.
+ */
+extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
+extern void dl_server_start(struct sched_dl_entity *dl_se);
+extern void dl_server_stop(struct sched_dl_entity *dl_se);
+extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+ dl_server_has_tasks_f has_tasks,
+ dl_server_pick_f pick_task);
-#ifdef CONFIG_CGROUP_SCHED
+extern void dl_server_update_idle_time(struct rq *rq,
+ struct task_struct *p);
+extern void fair_server_init(struct rq *rq);
+extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
+extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
+ u64 runtime, u64 period, bool init);
-#include <linux/cgroup.h>
-#include <linux/psi.h>
+static inline bool dl_server_active(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_server_active;
+}
-struct cfs_rq;
-struct rt_rq;
+#ifdef CONFIG_CGROUP_SCHED
extern struct list_head task_groups;
@@ -345,6 +413,8 @@ struct cfs_bandwidth {
ktime_t period;
u64 quota;
u64 runtime;
+ u64 burst;
+ u64 runtime_snap;
s64 hierarchical_quota;
u8 idle;
@@ -357,7 +427,9 @@ struct cfs_bandwidth {
/* Statistics: */
int nr_periods;
int nr_throttled;
+ int nr_burst;
u64 throttled_time;
+ u64 burst_time;
#endif
};
@@ -365,17 +437,21 @@ struct cfs_bandwidth {
struct task_group {
struct cgroup_subsys_state css;
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+ /* A positive value indicates that this is a SCHED_IDLE group. */
+ int idle;
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each CPU */
struct sched_entity **se;
/* runqueue "owned" by this group on each CPU */
struct cfs_rq **cfs_rq;
unsigned long shares;
-
#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
- * it in its own cacheline separated from the fields above which
+ * it in its own cache-line separated from the fields above which
* will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned;
@@ -389,6 +465,11 @@ struct task_group {
struct rt_bandwidth rt_bandwidth;
#endif
+#ifdef CONFIG_EXT_GROUP_SCHED
+ u32 scx_flags; /* SCX_TG_* */
+ u32 scx_weight;
+#endif
+
struct rcu_head rcu;
struct list_head list;
@@ -413,7 +494,7 @@ struct task_group {
};
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
/*
@@ -444,23 +525,38 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
return walk_tg_tree_from(&root_task_group, down, up, data);
}
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct task_group, css) : NULL;
+}
+
extern int tg_nop(struct task_group *tg, void *data);
+#ifdef CONFIG_FAIR_GROUP_SCHED
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
+#else
+static inline void free_fair_sched_group(struct task_group *tg) { }
+static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+static inline void online_fair_sched_group(struct task_group *tg) { }
+static inline void unregister_fair_sched_group(struct task_group *tg) { }
+#endif
+
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
-extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern bool cfs_task_bw_constrained(struct task_struct *p);
-extern void free_rt_sched_group(struct task_group *tg);
-extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent);
@@ -474,13 +570,15 @@ extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg,
struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
-extern void sched_offline_group(struct task_group *tg);
+extern void sched_release_group(struct task_group *tg);
-extern void sched_move_task(struct task_struct *tsk);
+extern void sched_move_task(struct task_struct *tsk, bool for_autogroup);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern int sched_group_set_idle(struct task_group *tg, long idle);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
@@ -488,25 +586,82 @@ extern void set_task_rq_fair(struct sched_entity *se,
static inline void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next) { }
#endif /* CONFIG_SMP */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
+static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
+static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
#endif /* CONFIG_FAIR_GROUP_SCHED */
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
+static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
+
#endif /* CONFIG_CGROUP_SCHED */
+extern void unregister_rt_sched_group(struct task_group *tg);
+extern void free_rt_sched_group(struct task_group *tg);
+extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
+
+/*
+ * u64_u32_load/u64_u32_store
+ *
+ * Use a copy of a u64 value to protect against data races. This is only
+ * applicable to 32-bit architectures.
+ */
+#ifdef CONFIG_64BIT
+# define u64_u32_load_copy(var, copy) var
+# define u64_u32_store_copy(var, copy, val) (var = val)
+#else
+# define u64_u32_load_copy(var, copy) \
+({ \
+ u64 __val, __val_copy; \
+ do { \
+ __val_copy = copy; \
+ /* \
+ * paired with u64_u32_store_copy(), ordering access \
+ * to var and copy. \
+ */ \
+ smp_rmb(); \
+ __val = var; \
+ } while (__val != __val_copy); \
+ __val; \
+})
+# define u64_u32_store_copy(var, copy, val) \
+do { \
+ typeof(val) __val = (val); \
+ var = __val; \
+ /* \
+ * paired with u64_u32_load_copy(), ordering access to var and \
+ * copy. \
+ */ \
+ smp_wmb(); \
+ copy = __val; \
+} while (0)
+#endif
+# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
+# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
+
+struct balance_callback {
+ struct balance_callback *next;
+ void (*func)(struct rq *rq);
+};
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned int nr_running;
- unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int idle_h_nr_running; /* SCHED_IDLE */
+ unsigned int nr_queued;
+ unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_idle; /* SCHED_IDLE */
+
+ s64 avg_vruntime;
+ u64 avg_load;
- u64 exec_clock;
u64 min_vruntime;
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
+#ifdef CONFIG_SCHED_CORE
+ unsigned int forceidle_seq;
+ u64 min_vruntime_fi;
#endif
struct rb_root_cached tasks_timeline;
@@ -517,12 +672,6 @@ struct cfs_rq {
*/
struct sched_entity *curr;
struct sched_entity *next;
- struct sched_entity *last;
- struct sched_entity *skip;
-
-#ifdef CONFIG_SCHED_DEBUG
- unsigned int nr_spread_over;
-#endif
#ifdef CONFIG_SMP
/*
@@ -530,7 +679,7 @@ struct cfs_rq {
*/
struct sched_avg avg;
#ifndef CONFIG_64BIT
- u64 load_last_update_time_copy;
+ u64 last_update_time_copy;
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
@@ -541,6 +690,7 @@ struct cfs_rq {
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ u64 last_update_tg_load_avg;
unsigned long tg_load_avg_contrib;
long propagate;
long prop_runnable_sum;
@@ -572,20 +722,71 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+ /* Locally cached copy of our task_group's idle value */
+ int idle;
+
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;
+ u64 throttled_pelt_idle;
+#ifndef CONFIG_64BIT
+ u64 throttled_pelt_idle_copy;
+#endif
u64 throttled_clock;
- u64 throttled_clock_task;
- u64 throttled_clock_task_time;
+ u64 throttled_clock_pelt;
+ u64 throttled_clock_pelt_time;
+ u64 throttled_clock_self;
+ u64 throttled_clock_self_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
+ struct list_head throttled_csd_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
+#ifdef CONFIG_SCHED_CLASS_EXT
+/* scx_rq->flags, protected by the rq lock */
+enum scx_rq_flags {
+ /*
+ * A hotplugged CPU starts scheduling before rq_online_scx(). Track
+ * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called
+ * only while the BPF scheduler considers the CPU to be online.
+ */
+ SCX_RQ_ONLINE = 1 << 0,
+ SCX_RQ_CAN_STOP_TICK = 1 << 1,
+ SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
+ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
+ SCX_RQ_BYPASSING = 1 << 4,
+ SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */
+
+ SCX_RQ_IN_WAKEUP = 1 << 16,
+ SCX_RQ_IN_BALANCE = 1 << 17,
+};
+
+struct scx_rq {
+ struct scx_dispatch_q local_dsq;
+ struct list_head runnable_list; /* runnable tasks on this rq */
+ struct list_head ddsp_deferred_locals; /* deferred ddsps from enq */
+ unsigned long ops_qseq;
+ u64 extra_enq_flags; /* see move_task_to_local_dsq() */
+ u32 nr_running;
+ u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */
+ bool cpu_released;
+ u32 flags;
+ u64 clock; /* current per-rq clock -- see scx_bpf_now() */
+ cpumask_var_t cpus_to_kick;
+ cpumask_var_t cpus_to_kick_if_idle;
+ cpumask_var_t cpus_to_preempt;
+ cpumask_var_t cpus_to_wait;
+ unsigned long pnt_seq;
+ struct balance_callback deferred_bal_cb;
+ struct irq_work deferred_irq_work;
+ struct irq_work kick_cpus_irq_work;
+};
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
static inline int rt_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
@@ -610,22 +811,20 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- unsigned long rt_nr_migratory;
- unsigned long rt_nr_total;
- int overloaded;
+ bool overloaded;
struct plist_head pushable_tasks;
#endif /* CONFIG_SMP */
int rt_queued;
+#ifdef CONFIG_RT_GROUP_SCHED
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
-#ifdef CONFIG_RT_GROUP_SCHED
- unsigned long rt_nr_boosted;
+ unsigned int rt_nr_boosted;
struct rq *rq;
struct task_group *tg;
@@ -642,7 +841,7 @@ struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
struct rb_root_cached root;
- unsigned long dl_nr_running;
+ unsigned int dl_nr_running;
#ifdef CONFIG_SMP
/*
@@ -656,8 +855,7 @@ struct dl_rq {
u64 next;
} earliest_dl;
- unsigned long dl_nr_migratory;
- int overloaded;
+ bool overloaded;
/*
* Tasks on this rq that can be pushed away. They are kept in
@@ -688,6 +886,12 @@ struct dl_rq {
u64 extra_bw;
/*
+ * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM
+ * tasks of this rq. Used in the calculation of reclaimable bandwidth (GRUB).
+ */
+ u64 max_bw;
+
+ /*
* Inverse of the fraction of CPU utilization that can be reclaimed
* by the GRUB algorithm.
*/
@@ -695,33 +899,42 @@ struct dl_rq {
};
#ifdef CONFIG_FAIR_GROUP_SCHED
+
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
static inline void se_update_runnable(struct sched_entity *se)
{
if (!entity_is_task(se))
- se->runnable_weight = se->my_q->h_nr_running;
+ se->runnable_weight = se->my_q->h_nr_runnable;
}
static inline long se_runnable(struct sched_entity *se)
{
+ if (se->sched_delayed)
+ return false;
+
if (entity_is_task(se))
return !!se->on_rq;
else
return se->runnable_weight;
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+
#define entity_is_task(se) 1
-static inline void se_update_runnable(struct sched_entity *se) {}
+static inline void se_update_runnable(struct sched_entity *se) { }
static inline long se_runnable(struct sched_entity *se)
{
+ if (se->sched_delayed)
+ return false;
+
return !!se->on_rq;
}
-#endif
+
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
/*
@@ -744,10 +957,6 @@ struct perf_domain {
struct rcu_head rcu;
};
-/* Scheduling group status flags */
-#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */
-#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
-
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -768,10 +977,10 @@ struct root_domain {
* - More than one runnable task
* - Running task is misfit
*/
- int overload;
+ bool overloaded;
- /* Indicate one or more cpus over-utilized (tipping point) */
- int overutilized;
+ /* Indicate one or more CPUs over-utilized (tipping point) */
+ bool overutilized;
/*
* The bit corresponding to a CPU gets set here if such CPU has more
@@ -782,6 +991,15 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
+ /*
+ * Indicate whether a root_domain's dl_bw has been checked or
+ * updated. It's a monotonically increasing value.
+ *
+ * Corner cases such as 'wrap around' would be dangerous, but given
+ * that u64 is 'big enough', that shouldn't be a concern.
+ */
+ u64 visit_gen;
+
#ifdef HAVE_RT_PUSH_IPI
/*
* For IPI pull requests, loop across the rto_mask.
@@ -802,8 +1020,6 @@ struct root_domain {
cpumask_var_t rto_mask;
struct cpupri cpupri;
- unsigned long max_cpu_capacity;
-
/*
* NULL-terminated list of performance domains intersecting with the
* CPUs of the rd. Protected by RCU.
@@ -817,6 +1033,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
extern void sched_get_rd(struct root_domain *rd);
extern void sched_put_rd(struct root_domain *rd);
+static inline int get_rd_overloaded(struct root_domain *rd)
+{
+ return READ_ONCE(rd->overloaded);
+}
+
+static inline void set_rd_overloaded(struct root_domain *rd, int status)
+{
+ if (get_rd_overloaded(rd) != status)
+ WRITE_ONCE(rd->overloaded, status);
+}
+
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
@@ -862,6 +1089,8 @@ struct uclamp_rq {
unsigned int value;
struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};
+
+DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
#endif /* CONFIG_UCLAMP_TASK */
/*
@@ -873,12 +1102,8 @@ struct uclamp_rq {
*/
struct rq {
/* runqueue lock: */
- raw_spinlock_t lock;
+ raw_spinlock_t __lock;
- /*
- * nr_running and cpu_load should be in the same cacheline because
- * remote CPUs use both these fields when doing load calculation.
- */
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -910,6 +1135,11 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ struct scx_rq scx;
+#endif
+
+ struct sched_dl_entity fair_server;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this CPU: */
@@ -923,9 +1153,13 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned long nr_uninterruptible;
+ unsigned int nr_uninterruptible;
- struct task_struct __rcu *curr;
+ union {
+ struct task_struct __rcu *donor; /* Scheduler context */
+ struct task_struct __rcu *curr; /* Execution context */
+ };
+ struct sched_dl_entity *dl_server;
struct task_struct *idle;
struct task_struct *stop;
unsigned long next_balance;
@@ -937,9 +1171,20 @@ struct rq {
u64 clock_task ____cacheline_aligned;
u64 clock_pelt;
unsigned long lost_idle_time;
+ u64 clock_pelt_idle;
+ u64 clock_idle;
+#ifndef CONFIG_64BIT
+ u64 clock_pelt_idle_copy;
+ u64 clock_idle_copy;
+#endif
atomic_t nr_iowait;
+#ifdef CONFIG_SCHED_DEBUG
+ u64 last_seen_need_resched_ns;
+ int ticks_without_resched;
+#endif
+
#ifdef CONFIG_MEMBARRIER
int membarrier_state;
#endif
@@ -949,9 +1194,8 @@ struct rq {
struct sched_domain __rcu *sd;
unsigned long cpu_capacity;
- unsigned long cpu_capacity_orig;
- struct callback_head *balance_callback;
+ struct balance_callback *balance_callback;
unsigned char nohz_idle_balance;
unsigned char idle_balance;
@@ -974,18 +1218,23 @@ struct rq {
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
- struct sched_avg avg_thermal;
+#ifdef CONFIG_SCHED_HW_PRESSURE
+ struct sched_avg avg_hw;
#endif
u64 idle_stamp;
u64 avg_idle;
/* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ struct rcuwait hotplug_wait;
+#endif
#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
@@ -1003,13 +1252,13 @@ struct rq {
call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
+ ktime_t hrtick_time;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
- /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_count;
@@ -1024,9 +1273,42 @@ struct rq {
#endif
#ifdef CONFIG_CPU_IDLE
- /* Must be inspected within a rcu lock section */
+ /* Must be inspected within an RCU lock section */
struct cpuidle_state *idle_state;
#endif
+
+#ifdef CONFIG_SMP
+ unsigned int nr_pinned;
+#endif
+ unsigned int push_busy;
+ struct cpu_stop_work push_work;
+
+#ifdef CONFIG_SCHED_CORE
+ /* per rq */
+ struct rq *core;
+ struct task_struct *core_pick;
+ struct sched_dl_entity *core_dl_server;
+ unsigned int core_enabled;
+ unsigned int core_sched_seq;
+ struct rb_root core_tree;
+
+ /* shared state -- careful with sched_core_cpu_deactivate() */
+ unsigned int core_task_seq;
+ unsigned int core_pick_seq;
+ unsigned long core_cookie;
+ unsigned int core_forceidle_count;
+ unsigned int core_forceidle_seq;
+ unsigned int core_forceidle_occupation;
+ u64 core_forceidle_start;
+#endif
+
+ /* Scratch cpumask to be temporarily used under rq_lock */
+ cpumask_var_t scratch_mask;
+
+#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+ call_single_data_t cfsb_csd;
+ struct list_head cfsb_csd_list;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1054,6 +1336,223 @@ static inline int cpu_of(struct rq *rq)
#endif
}
+#define MDF_PUSH 0x01
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+ return p->migration_disabled;
+#else
+ return false;
+#endif
+}
+
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+#define this_rq() this_cpu_ptr(&runqueues)
+#define task_rq(p) cpu_rq(task_cpu(p))
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#define raw_rq() raw_cpu_ptr(&runqueues)
+
+static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
+{
+ /* Do nothing */
+}
+
+#ifdef CONFIG_SCHED_CORE
+static inline struct cpumask *sched_group_span(struct sched_group *sg);
+
+DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled;
+}
+
+static inline bool sched_core_disabled(void)
+{
+ return !static_branch_unlikely(&__sched_core_enabled);
+}
+
+/*
+ * Be careful with this function; not for general use. The return value isn't
+ * stable unless you actually hold a relevant rq->__lock.
+ */
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ if (sched_core_enabled(rq))
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+{
+ if (rq->core_enabled)
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+extern bool
+cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi);
+
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
+/*
+ * Helpers to check if the CPU's core cookie matches the task's cookie
+ * when core scheduling is enabled.
+ * A special case is that the task's cookie always matches the CPU's core
+ * cookie if the CPU is in an idle core.
+ */
+static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ return rq->core->core_cookie == p->core_cookie;
+}
+
+static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ bool idle_core = true;
+ int cpu;
+
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) {
+ if (!available_idle_cpu(cpu)) {
+ idle_core = false;
+ break;
+ }
+ }
+
+ /*
+ * A CPU in an idle core is always the best choice for tasks with
+ * cookies.
+ */
+ return idle_core || rq->core->core_cookie == p->core_cookie;
+}
+
+static inline bool sched_group_cookie_match(struct rq *rq,
+ struct task_struct *p,
+ struct sched_group *group)
+{
+ int cpu;
+
+ /* Ignore cookie match if core scheduler is not enabled on the CPU. */
+ if (!sched_core_enabled(rq))
+ return true;
+
+ for_each_cpu_and(cpu, sched_group_span(group), p->cpus_ptr) {
+ if (sched_core_cookie_match(cpu_rq(cpu), p))
+ return true;
+ }
+ return false;
+}
+
+static inline bool sched_core_enqueued(struct task_struct *p)
+{
+ return !RB_EMPTY_NODE(&p->core_node);
+}
+
+extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
+
+extern void sched_core_get(void);
+extern void sched_core_put(void);
+
+#else /* !CONFIG_SCHED_CORE: */
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return false;
+}
+
+static inline bool sched_core_disabled(void)
+{
+ return true;
+}
+
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}
+
+static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}
+
+static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ return true;
+}
+
+static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p)
+{
+ return true;
+}
+
+static inline bool sched_group_cookie_match(struct rq *rq,
+ struct task_struct *p,
+ struct sched_group *group)
+{
+ return true;
+}
+
+#endif /* !CONFIG_SCHED_CORE */
+
+static inline void lockdep_assert_rq_held(struct rq *rq)
+{
+ lockdep_assert_held(__rq_lockp(rq));
+}
+
+extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass);
+extern bool raw_spin_rq_trylock(struct rq *rq);
+extern void raw_spin_rq_unlock(struct rq *rq);
+
+static inline void raw_spin_rq_lock(struct rq *rq)
+{
+ raw_spin_rq_lock_nested(rq, 0);
+}
+
+static inline void raw_spin_rq_lock_irq(struct rq *rq)
+{
+ local_irq_disable();
+ raw_spin_rq_lock(rq);
+}
+
+static inline void raw_spin_rq_unlock_irq(struct rq *rq)
+{
+ raw_spin_rq_unlock(rq);
+ local_irq_enable();
+}
+
+static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ raw_spin_rq_lock(rq);
+
+ return flags;
+}
+
+static inline void raw_spin_rq_unlock_irqrestore(struct rq *rq, unsigned long flags)
+{
+ raw_spin_rq_unlock(rq);
+ local_irq_restore(flags);
+}
+
+#define raw_spin_rq_lock_irqsave(rq, flags) \
+do { \
+ flags = _raw_spin_rq_lock_irqsave(rq); \
+} while (0)
#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
@@ -1068,21 +1567,58 @@ static inline void update_idle_core(struct rq *rq)
static inline void update_idle_core(struct rq *rq) { }
#endif
-DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#ifdef CONFIG_FAIR_GROUP_SCHED
-#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
-#define this_rq() this_cpu_ptr(&runqueues)
-#define task_rq(p) cpu_rq(task_cpu(p))
-#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-#define raw_rq() raw_cpu_ptr(&runqueues)
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+ SCHED_WARN_ON(!entity_is_task(se));
+ return container_of(se, struct task_struct, se);
+}
-extern void update_rq_clock(struct rq *rq);
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
-static inline u64 __rq_clock_broken(struct rq *rq)
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
{
- return READ_ONCE(rq->clock);
+ return se->cfs_rq;
}
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return grp->my_q;
+}
+
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+
+#define task_of(_se) container_of(_se, struct task_struct, se)
+
+static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p)
+{
+ return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)
+{
+ const struct task_struct *p = task_of(se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+ return NULL;
+}
+
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
+
+extern void update_rq_clock(struct rq *rq);
+
/*
* rq::clock_update_flags bits
*
@@ -1102,7 +1638,7 @@ static inline u64 __rq_clock_broken(struct rq *rq)
*
* if (rq-clock_update_flags >= RQCF_UPDATED)
*
- * to check if %RQCF_UPADTED is set. It'll never be shifted more than
+ * to check if %RQCF_UPDATED is set. It'll never be shifted more than
* one position though, because the next rq_unpin_lock() will shift it
* back.
*/
@@ -1121,7 +1657,7 @@ static inline void assert_clock_updated(struct rq *rq)
static inline u64 rq_clock(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock;
@@ -1129,46 +1665,50 @@ static inline u64 rq_clock(struct rq *rq)
static inline u64 rq_clock_task(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock_task;
}
-/**
- * By default the decay is the default pelt decay period.
- * The decay shift can change the decay period in
- * multiples of 32.
- * Decay shift Decay period(ms)
- * 0 32
- * 1 64
- * 2 128
- * 3 256
- * 4 512
- */
-extern int sched_thermal_decay_shift;
-
-static inline u64 rq_clock_thermal(struct rq *rq)
-{
- return rq_clock_task(rq) >> sched_thermal_decay_shift;
-}
-
static inline void rq_clock_skip_update(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rq->clock_update_flags |= RQCF_REQ_SKIP;
}
/*
* See rt task throttling, which is the only time a skip
- * request is cancelled.
+ * request is canceled.
*/
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
+/*
+ * During CPU offlining and rq-wide unthrottling, we can trigger
+ * an update_rq_clock() for several cfs and rt runqueues (typically
+ * when using list_for_each_entry_*).
+ * rq_clock_start_loop_update() can be called after updating the clock
+ * once and before iterating over the list to prevent multiple updates.
+ * After the iterative traversal, we need to call rq_clock_stop_loop_update()
+ * to clear RQCF_ACT_SKIP in rq->clock_update_flags.
+ */
+static inline void rq_clock_start_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+ rq->clock_update_flags |= RQCF_ACT_SKIP;
+}
+
+static inline void rq_clock_stop_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ rq->clock_update_flags &= ~RQCF_ACT_SKIP;
+}
+
struct rq_flags {
unsigned long flags;
struct pin_cookie cookie;
@@ -1182,13 +1722,60 @@ struct rq_flags {
#endif
};
+extern struct balance_callback balance_push_callback;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+extern const struct sched_class ext_sched_class;
+
+DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
+DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
+
+#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
+#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
+
+static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
+{
+ if (!scx_enabled())
+ return;
+ WRITE_ONCE(rq->scx.clock, clock);
+ smp_store_release(&rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID);
+}
+
+static inline void scx_rq_clock_invalidate(struct rq *rq)
+{
+ if (!scx_enabled())
+ return;
+ WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID);
+}
+
+#else /* !CONFIG_SCHED_CLASS_EXT */
+#define scx_enabled() false
+#define scx_switched_all() false
+
+static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {}
+static inline void scx_rq_clock_invalidate(struct rq *rq) {}
+#endif /* !CONFIG_SCHED_CLASS_EXT */
+
+/*
+ * Lockdep annotation that avoids accidental unlocks; it's like a
+ * sticky/continuous lockdep_assert_held().
+ *
+ * This avoids code that has access to 'struct rq *rq' (basically everything in
+ * the scheduler) from accidentally unlocking the rq if they do not also have a
+ * copy of the (on-stack) 'struct rq_flags rf'.
+ *
+ * Also see Documentation/locking/lockdep-design.rst.
+ */
static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rf->cookie = lockdep_pin_lock(__rq_lockp(rq));
#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
+# ifdef CONFIG_SMP
+ SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback);
+# endif
#endif
}
@@ -1198,13 +1785,13 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
if (rq->clock_update_flags > RQCF_ACT_SKIP)
rf->clock_update_flags = RQCF_UPDATED;
#endif
-
- lockdep_unpin_lock(&rq->lock, rf->cookie);
+ scx_rq_clock_invalidate(rq);
+ lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
}
static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
- lockdep_repin_lock(&rq->lock, rf->cookie);
+ lockdep_repin_lock(__rq_lockp(rq), rf->cookie);
#ifdef CONFIG_SCHED_DEBUG
/*
@@ -1214,9 +1801,11 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
#endif
}
+extern
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock);
+extern
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock);
@@ -1225,7 +1814,7 @@ static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
static inline void
@@ -1234,68 +1823,73 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(p->pi_lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
-static inline void
-rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
- __acquires(rq->lock)
-{
- raw_spin_lock_irqsave(&rq->lock, rf->flags);
- rq_pin_lock(rq, rf);
-}
+DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
+ _T->rq = task_rq_lock(_T->lock, &_T->rf),
+ task_rq_unlock(_T->rq, _T->lock, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
-static inline void
-rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_rq_lock_irqsave(rq, rf->flags);
rq_pin_lock(rq, rf);
}
-static inline void
-rq_lock(struct rq *rq, struct rq_flags *rf)
+static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock(&rq->lock);
+ raw_spin_rq_lock_irq(rq);
rq_pin_lock(rq, rf);
}
-static inline void
-rq_relock(struct rq *rq, struct rq_flags *rf)
+static inline void rq_lock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock(&rq->lock);
- rq_repin_lock(rq, rf);
+ raw_spin_rq_lock(rq);
+ rq_pin_lock(rq, rf);
}
-static inline void
-rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+ raw_spin_rq_unlock_irqrestore(rq, rf->flags);
}
-static inline void
-rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+static inline void rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_rq_unlock_irq(rq);
}
-static inline void
-rq_unlock(struct rq *rq, struct rq_flags *rf)
+static inline void rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_rq_unlock(rq);
}
-static inline struct rq *
-this_rq_lock_irq(struct rq_flags *rf)
+DEFINE_LOCK_GUARD_1(rq_lock, struct rq,
+ rq_lock(_T->lock, &_T->rf),
+ rq_unlock(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq,
+ rq_lock_irq(_T->lock, &_T->rf),
+ rq_unlock_irq(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq,
+ rq_lock_irqsave(_T->lock, &_T->rf),
+ rq_unlock_irqrestore(_T->lock, &_T->rf),
+ struct rq_flags rf)
+
+static inline struct rq *this_rq_lock_irq(struct rq_flags *rf)
__acquires(rq->lock)
{
struct rq *rq;
@@ -1303,33 +1897,43 @@ this_rq_lock_irq(struct rq_flags *rf)
local_irq_disable();
rq = this_rq();
rq_lock(rq, rf);
+
return rq;
}
#ifdef CONFIG_NUMA
+
enum numa_topology_type {
NUMA_DIRECT,
NUMA_GLUELESS_MESH,
NUMA_BACKPLANE,
};
+
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
-extern void sched_init_numa(void);
+extern void sched_init_numa(int offline_node);
+extern void sched_update_numa(int cpu, bool online);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
-#else
-static inline void sched_init_numa(void) { }
+
+#else /* !CONFIG_NUMA: */
+
+static inline void sched_init_numa(int offline_node) { }
+static inline void sched_update_numa(int cpu, bool online) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+
static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
return nr_cpu_ids;
}
-#endif
+
+#endif /* !CONFIG_NUMA */
#ifdef CONFIG_NUMA_BALANCING
+
/* The regions in numa_faults array from task_struct */
enum numa_faults_stats {
NUMA_MEM = 0,
@@ -1337,38 +1941,46 @@ enum numa_faults_stats {
NUMA_MEMBUF,
NUMA_CPUBUF
};
+
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
int cpu, int scpu);
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
-#else
+
+#else /* !CONFIG_NUMA_BALANCING: */
+
static inline void
init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
}
-#endif /* CONFIG_NUMA_BALANCING */
+
+#endif /* !CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SMP
static inline void
queue_balance_callback(struct rq *rq,
- struct callback_head *head,
+ struct balance_callback *head,
void (*func)(struct rq *rq))
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_rq_held(rq);
- if (unlikely(head->next))
+ /*
+ * Don't (re)queue an already queued item; nor queue anything when
+ * balance_push() is active, see the comment with
+ * balance_push_callback.
+ */
+ if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
return;
- head->func = (void (*)(struct callback_head *))func;
+ head->func = func;
head->next = rq->balance_callback;
rq->balance_callback = head;
}
#define rcu_dereference_check_sched_domain(p) \
- rcu_dereference_check((p), \
- lockdep_is_held(&sched_domains_mutex))
+ rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex))
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -1381,6 +1993,13 @@ queue_balance_callback(struct rq *rq,
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
+/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) |
+static const unsigned int SD_SHARED_CHILD_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
/**
* highest_flag_domain - Return highest sched_domain containing flag.
* @cpu: The CPU whose highest level of sched domain is to
@@ -1388,16 +2007,25 @@ queue_balance_callback(struct rq *rq,
* @flag: The flag to check for the highest sched_domain
* for the given CPU.
*
- * Returns the highest sched_domain of a CPU which contains the given flag.
+ * Returns the highest sched_domain of a CPU which contains @flag. If @flag has
+ * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
*/
static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
{
struct sched_domain *sd, *hsd = NULL;
for_each_domain(cpu, sd) {
- if (!(sd->flags & flag))
+ if (sd->flags & flag) {
+ hsd = sd;
+ continue;
+ }
+
+ /*
+ * Stop the search if @flag is known to be shared at lower
+ * levels. It will not be found further up.
+ */
+ if (flag & SD_SHARED_CHILD_MASK)
break;
- hsd = sd;
}
return hsd;
@@ -1418,11 +2046,19 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, sd_share_id);
DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
extern struct static_key_false sched_asym_cpucapacity;
+extern struct static_key_false sched_cluster_active;
+
+static __always_inline bool sched_asym_cpucap_active(void)
+{
+ return static_branch_unlikely(&sched_asym_cpucapacity);
+}
struct sched_group_capacity {
atomic_t ref;
@@ -1440,7 +2076,7 @@ struct sched_group_capacity {
int id;
#endif
- unsigned long cpumask[0]; /* Balance mask */
+ unsigned long cpumask[]; /* Balance mask */
};
struct sched_group {
@@ -1448,8 +2084,10 @@ struct sched_group {
atomic_t ref;
unsigned int group_weight;
+ unsigned int cores;
struct sched_group_capacity *sgc;
int asym_prefer_cpu; /* CPU of highest priority in group */
+ int flags;
/*
* The CPUs this group covers.
@@ -1474,41 +2112,26 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
return to_cpumask(sg->sgc->cpumask);
}
-/**
- * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
- * @group: The group whose first CPU is to be returned.
- */
-static inline unsigned int group_first_cpu(struct sched_group *group)
-{
- return cpumask_first(sched_group_span(group));
-}
-
extern int group_balance_cpu(struct sched_group *sg);
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-void register_sched_domain_sysctl(void);
-void dirty_sched_domain_sysctl(int cpu);
-void unregister_sched_domain_sysctl(void);
+#ifdef CONFIG_SCHED_DEBUG
+extern void update_sched_domain_debugfs(void);
+extern void dirty_sched_domain_sysctl(int cpu);
#else
-static inline void register_sched_domain_sysctl(void)
-{
-}
-static inline void dirty_sched_domain_sysctl(int cpu)
-{
-}
-static inline void unregister_sched_domain_sysctl(void)
-{
-}
+static inline void update_sched_domain_debugfs(void) { }
+static inline void dirty_sched_domain_sysctl(int cpu) { }
#endif
-extern void flush_smp_call_function_from_idle(void);
+extern int sched_update_scaling(void);
-#else /* !CONFIG_SMP: */
-static inline void flush_smp_call_function_from_idle(void) { }
-#endif
+static inline const struct cpumask *task_user_cpus(struct task_struct *p)
+{
+ if (!p->user_cpus_ptr)
+ return cpu_possible_mask; /* &init_task.cpus_mask */
+ return p->user_cpus_ptr;
+}
-#include "stats.h"
-#include "autogroup.h"
+#endif /* CONFIG_SMP */
#ifdef CONFIG_CGROUP_SCHED
@@ -1541,6 +2164,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
+ p->se.depth = tg->se[cpu] ? tg->se[cpu]->depth + 1 : 0;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -1549,15 +2173,16 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
}
-#else /* CONFIG_CGROUP_SCHED */
+#else /* !CONFIG_CGROUP_SCHED: */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+
static inline struct task_group *task_group(struct task_struct *p)
{
return NULL;
}
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* !CONFIG_CGROUP_SCHED */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
@@ -1569,11 +2194,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* per-task data have been completed by this moment.
*/
smp_wmb();
-#ifdef CONFIG_THREAD_INFO_IN_TASK
- WRITE_ONCE(p->cpu, cpu);
-#else
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
-#endif
p->wake_cpu = cpu;
#endif
}
@@ -1582,7 +2203,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
#ifdef CONFIG_SCHED_DEBUG
-# include <linux/static_key.h>
# define const_debug __read_mostly
#else
# define const_debug const
@@ -1598,7 +2218,7 @@ enum {
#undef SCHED_FEAT
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
+#ifdef CONFIG_SCHED_DEBUG
/*
* To support run-time toggling of sched features, all the translation units
@@ -1606,6 +2226,8 @@ enum {
*/
extern const_debug unsigned int sysctl_sched_features;
+#ifdef CONFIG_JUMP_LABEL
+
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
@@ -1618,7 +2240,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
-#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */
+#else /* !CONFIG_JUMP_LABEL: */
+
+#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+
+#endif /* !CONFIG_JUMP_LABEL */
+
+#else /* !SCHED_DEBUG: */
/*
* Each translation unit has its own copy of sysctl_sched_features to allow
@@ -1634,7 +2262,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */
+#endif /* !SCHED_DEBUG */
extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
@@ -1652,12 +2280,26 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
+/*
+ * Is p the current execution context?
+ */
static inline int task_current(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
}
-static inline int task_running(struct rq *rq, struct task_struct *p)
+/*
+ * Is p the current scheduling context?
+ *
+ * Note that it might be the current execution context at the same time if
+ * rq->curr == rq->donor == p.
+ */
+static inline int task_current_donor(struct rq *rq, struct task_struct *p)
+{
+ return rq->donor == p;
+}
+
+static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
return p->on_cpu;
@@ -1668,7 +2310,7 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
static inline int task_on_rq_queued(struct task_struct *p)
{
- return p->on_rq == TASK_ON_RQ_QUEUED;
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED;
}
static inline int task_on_rq_migrating(struct task_struct *p)
@@ -1676,13 +2318,21 @@ static inline int task_on_rq_migrating(struct task_struct *p)
return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
}
-/*
- * wake flags
- */
-#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
-#define WF_FORK 0x02 /* Child wakeup after fork */
-#define WF_MIGRATED 0x04 /* Internal use, task got migrated */
-#define WF_ON_CPU 0x08 /* Wakee is on_cpu */
+/* Wake flags. The first three directly map to some SD flag value */
+#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
+#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
+
+#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
+#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
+#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
+
+#ifdef CONFIG_SMP
+static_assert(WF_EXEC == SD_BALANCE_EXEC);
+static_assert(WF_FORK == SD_BALANCE_FORK);
+static_assert(WF_TTWU == SD_BALANCE_WAKE);
+#endif
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1712,16 +2362,24 @@ extern const u32 sched_prio_to_wmult[40];
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
* in the runqueue.
*
+ * NOCLOCK - skip the update_rq_clock() (avoids double updates)
+ *
+ * MIGRATING - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
+ * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called
*
*/
-#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */
#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_SPECIAL 0x10
+#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -1735,40 +2393,63 @@ extern const u32 sched_prio_to_wmult[40];
#else
#define ENQUEUE_MIGRATED 0x00
#endif
+#define ENQUEUE_INITIAL 0x80
+#define ENQUEUE_MIGRATING 0x100
+#define ENQUEUE_DELAYED 0x200
+#define ENQUEUE_RQ_SELECTED 0x400
#define RETRY_TASK ((void *)-1UL)
+struct affinity_context {
+ const struct cpumask *new_mask;
+ struct cpumask *user_mask;
+ unsigned int flags;
+};
+
+extern s64 update_curr_common(struct rq *rq);
+
struct sched_class {
- const struct sched_class *next;
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
- void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
- bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
+ bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
- void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
+ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
- struct task_struct *(*pick_next_task)(struct rq *rq);
+ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+ struct task_struct *(*pick_task)(struct rq *rq);
+ /*
+ * Optional! When implemented pick_next_task() should be equivalent to:
+ *
+ * next = pick_task();
+ * if (next) {
+ * put_prev_task(prev);
+ * set_next_task_first(next);
+ * }
+ */
+ struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev);
- void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
- int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
- int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+ int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task);
- void (*set_cpus_allowed)(struct task_struct *p,
- const struct cpumask *newmask);
+ void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
+
+ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
#endif
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
@@ -1777,11 +2458,14 @@ struct sched_class {
/*
* The switched_from() call is allowed to drop rq->lock, therefore we
- * cannot assume the switched_from/switched_to pair is serliazed by
+ * cannot assume the switched_from/switched_to pair is serialized by
* rq->lock. They are however serialized by p->pi_lock.
*/
+ void (*switching_to) (struct rq *this_rq, struct task_struct *task);
void (*switched_from)(struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task,
+ const struct load_weight *lw);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);
@@ -1790,37 +2474,69 @@ struct sched_class {
void (*update_curr)(struct rq *rq);
-#define TASK_SET_GROUP 0
-#define TASK_MOVE_GROUP 1
-
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_change_group)(struct task_struct *p, int type);
+ void (*task_change_group)(struct task_struct *p);
+#endif
+
+#ifdef CONFIG_SCHED_CORE
+ int (*task_is_throttled)(struct task_struct *p, int cpu);
#endif
};
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- WARN_ON_ONCE(rq->curr != prev);
- prev->sched_class->put_prev_task(rq, prev);
+ WARN_ON_ONCE(rq->donor != prev);
+ prev->sched_class->put_prev_task(rq, prev, NULL);
}
static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- WARN_ON_ONCE(rq->curr != next);
next->sched_class->set_next_task(rq, next, false);
}
-#ifdef CONFIG_SMP
-#define sched_class_highest (&stop_sched_class)
-#else
-#define sched_class_highest (&dl_sched_class)
-#endif
+static inline void
+__put_prev_set_next_dl_server(struct rq *rq,
+ struct task_struct *prev,
+ struct task_struct *next)
+{
+ prev->dl_server = NULL;
+ next->dl_server = rq->dl_server;
+ rq->dl_server = NULL;
+}
-#define for_class_range(class, _from, _to) \
- for (class = (_from); class != (_to); class = class->next)
+static inline void put_prev_set_next_task(struct rq *rq,
+ struct task_struct *prev,
+ struct task_struct *next)
+{
+ WARN_ON_ONCE(rq->curr != prev);
-#define for_each_class(class) \
- for_class_range(class, sched_class_highest, NULL)
+ __put_prev_set_next_dl_server(rq, prev, next);
+
+ if (next == prev)
+ return;
+
+ prev->sched_class->put_prev_task(rq, prev, next);
+ next->sched_class->set_next_task(rq, next, true);
+}
+
+/*
+ * Helper to define a sched_class instance; each one is placed in a separate
+ * section which is ordered by the linker script:
+ *
+ * include/asm-generic/vmlinux.lds.h
+ *
+ * *CAREFUL* they are laid out in *REVERSE* order!!!
+ *
+ * Also enforce alignment on the instance, not the type, to guarantee layout.
+ */
+#define DEFINE_SCHED_CLASS(name) \
+const struct sched_class name##_sched_class \
+ __aligned(__alignof__(struct sched_class)) \
+ __section("__" #name "_sched_class")
+
+/* Defined in include/asm-generic/vmlinux.lds.h */
+extern struct sched_class __sched_class_highest[];
+extern struct sched_class __sched_class_lowest[];
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
@@ -1828,6 +2544,36 @@ extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
+/*
+ * Iterate only active classes. SCX can take over all fair tasks or be
+ * completely disabled. If the former, skip fair. If the latter, skip SCX.
+ */
+static inline const struct sched_class *next_active_class(const struct sched_class *class)
+{
+ class++;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (scx_switched_all() && class == &fair_sched_class)
+ class++;
+ if (!scx_enabled() && class == &ext_sched_class)
+ class++;
+#endif
+ return class;
+}
+
+#define for_class_range(class, _from, _to) \
+ for (class = (_from); class < (_to); class++)
+
+#define for_each_class(class) \
+ for_class_range(class, __sched_class_highest, __sched_class_lowest)
+
+#define for_active_class_range(class, _from, _to) \
+ for (class = (_from); class != (_to); class = next_active_class(class))
+
+#define for_each_active_class(class) \
+ for_active_class_range(class, __sched_class_highest, __sched_class_lowest)
+
+#define sched_class_above(_a, _b) ((_a) < (_b))
+
static inline bool sched_stop_runnable(struct rq *rq)
{
return rq->stop && task_on_rq_queued(rq->stop);
@@ -1845,23 +2591,92 @@ static inline bool sched_rt_runnable(struct rq *rq)
static inline bool sched_fair_runnable(struct rq *rq)
{
- return rq->cfs.nr_running > 0;
+ return rq->cfs.nr_queued > 0;
}
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
-extern struct task_struct *pick_next_task_idle(struct rq *rq);
+extern struct task_struct *pick_task_idle(struct rq *rq);
+
+#define SCA_CHECK 0x01
+#define SCA_MIGRATE_DISABLE 0x02
+#define SCA_MIGRATE_ENABLE 0x04
+#define SCA_USER 0x08
#ifdef CONFIG_SMP
extern void update_group_capacity(struct sched_domain *sd, int cpu);
-extern void trigger_load_balance(struct rq *rq);
+extern void sched_balance_trigger(struct rq *rq);
-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx);
+extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
-#endif
+static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
+{
+ /* When not in the task's cpumask, no point in looking further. */
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
+
+ /* Can @cpu run a user thread? */
+ if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p))
+ return false;
+
+ return true;
+}
+
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ /*
+ * See do_set_cpus_allowed() above for the rcu_head usage.
+ */
+ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
+
+ return kmalloc_node(size, GFP_KERNEL, node);
+}
+
+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+ struct task_struct *p = rq->donor;
+
+ lockdep_assert_rq_held(rq);
+
+ if (rq->push_busy)
+ return NULL;
+
+ if (p->nr_cpus_allowed == 1)
+ return NULL;
+
+ if (p->migration_disabled)
+ return NULL;
+
+ rq->push_busy = true;
+ return get_task_struct(p);
+}
+
+extern int push_cpu_stop(void *arg);
+
+#else /* !CONFIG_SMP: */
+
+static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
+{
+ return true;
+}
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ struct affinity_context *ctx)
+{
+ return set_cpus_allowed_ptr(p, ctx->new_mask);
+}
+
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ return NULL;
+}
+
+#endif /* !CONFIG_SMP */
#ifdef CONFIG_CPU_IDLE
+
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
{
@@ -1874,7 +2689,9 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
return rq->idle_state;
}
-#else
+
+#else /* !CONFIG_CPU_IDLE: */
+
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
{
@@ -1884,9 +2701,11 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
return NULL;
}
-#endif
+
+#endif /* !CONFIG_CPU_IDLE */
extern void schedule_idle(void);
+asmlinkage void schedule_user(void);
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
@@ -1896,25 +2715,22 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
-extern void reweight_task(struct task_struct *p, int prio);
-
extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
extern void resched_cpu(int cpu);
-extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-extern struct dl_bandwidth def_dl_bandwidth;
-extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
-extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
-extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_entity(struct sched_dl_entity *dl_se);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
#define RATIO_SHIFT 8
#define MAX_BW_BITS (64 - BW_SHIFT)
#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
-unsigned long to_ratio(u64 period, u64 runtime);
+
+extern unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
extern void post_init_entity_util_avg(struct task_struct *p);
@@ -1930,12 +2746,7 @@ extern int __init sched_tick_offload_init(void);
*/
static inline void sched_update_tick_dependency(struct rq *rq)
{
- int cpu;
-
- if (!tick_nohz_full_enabled())
- return;
-
- cpu = cpu_of(rq);
+ int cpu = cpu_of(rq);
if (!tick_nohz_full_cpu(cpu))
return;
@@ -1945,22 +2756,23 @@ static inline void sched_update_tick_dependency(struct rq *rq)
else
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
-#else
+#else /* !CONFIG_NO_HZ_FULL: */
static inline int sched_tick_offload_init(void) { return 0; }
static inline void sched_update_tick_dependency(struct rq *rq) { }
-#endif
+#endif /* !CONFIG_NO_HZ_FULL */
static inline void add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
rq->nr_running = prev_nr + count;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, count);
+ }
#ifdef CONFIG_SMP
- if (prev_nr < 2 && rq->nr_running >= 2) {
- if (!READ_ONCE(rq->rd->overload))
- WRITE_ONCE(rq->rd->overload, 1);
- }
+ if (prev_nr < 2 && rq->nr_running >= 2)
+ set_rd_overloaded(rq->rd, 1);
#endif
sched_update_tick_dependency(rq);
@@ -1969,18 +2781,86 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
+ if (trace_sched_update_nr_running_tp_enabled()) {
+ call_trace_sched_update_nr_running(rq, -count);
+ }
+
/* Check if we still need preemption */
sched_update_tick_dependency(rq);
}
+static inline void __block_task(struct rq *rq, struct task_struct *p)
+{
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible++;
+
+ if (p->in_iowait) {
+ atomic_inc(&rq->nr_iowait);
+ delayacct_blkio_start();
+ }
+
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+
+ /*
+ * The moment this write goes through, ttwu() can swoop in and migrate
+ * this task, rendering our rq->__lock ineffective.
+ *
+ * __schedule() try_to_wake_up()
+ * LOCK rq->__lock LOCK p->pi_lock
+ * pick_next_task()
+ * pick_next_task_fair()
+ * pick_next_entity()
+ * dequeue_entities()
+ * __block_task()
+ * RELEASE p->on_rq = 0 if (p->on_rq && ...)
+ * break;
+ *
+ * ACQUIRE (after ctrl-dep)
+ *
+ * cpu = select_task_rq();
+ * set_task_cpu(p, cpu);
+ * ttwu_queue()
+ * ttwu_do_activate()
+ * LOCK rq->__lock
+ * activate_task()
+ * STORE p->on_rq = 1
+ * UNLOCK rq->__lock
+ *
+ * Callers must ensure they do not reference @p after this -- we no
+ * longer own it.
+ */
+ smp_store_release(&p->on_rq, 0);
+}
+
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
+
+#ifdef CONFIG_PREEMPT_RT
+# define SCHED_NR_MIGRATE_BREAK 8
+#else
+# define SCHED_NR_MIGRATE_BREAK 32
+#endif
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_base_slice;
+
+#ifdef CONFIG_SCHED_DEBUG
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
+
+extern unsigned int sysctl_sched_tunable_scaling;
+
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_hot_threshold;
+#endif
+
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -1990,32 +2870,61 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
*/
static inline int hrtick_enabled(struct rq *rq)
{
- if (!sched_feat(HRTICK))
- return 0;
if (!cpu_active(cpu_of(rq)))
return 0;
return hrtimer_is_hres_active(&rq->hrtick_timer);
}
-void hrtick_start(struct rq *rq, u64 delay);
+static inline int hrtick_enabled_fair(struct rq *rq)
+{
+ if (!sched_feat(HRTICK))
+ return 0;
+ return hrtick_enabled(rq);
+}
-#else
+static inline int hrtick_enabled_dl(struct rq *rq)
+{
+ if (!sched_feat(HRTICK_DL))
+ return 0;
+ return hrtick_enabled(rq);
+}
-static inline int hrtick_enabled(struct rq *rq)
+extern void hrtick_start(struct rq *rq, u64 delay);
+
+#else /* !CONFIG_SCHED_HRTICK: */
+
+static inline int hrtick_enabled_fair(struct rq *rq)
{
return 0;
}
-#endif /* CONFIG_SCHED_HRTICK */
+static inline int hrtick_enabled_dl(struct rq *rq)
+{
+ return 0;
+}
-#ifndef arch_scale_freq_tick
-static __always_inline
-void arch_scale_freq_tick(void)
+static inline int hrtick_enabled(struct rq *rq)
{
+ return 0;
}
+
+#endif /* !CONFIG_SCHED_HRTICK */
+
+#ifndef arch_scale_freq_tick
+static __always_inline void arch_scale_freq_tick(void) { }
#endif
#ifndef arch_scale_freq_capacity
+/**
+ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU.
+ * @cpu: the CPU in question.
+ *
+ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e.
+ *
+ * f_curr
+ * ------ * SCHED_CAPACITY_SCALE
+ * f_max
+ */
static __always_inline
unsigned long arch_scale_freq_capacity(int cpu)
{
@@ -2023,10 +2932,62 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
+ * acquire rq lock instead of rq_lock(). So at the end of these two functions
+ * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
+ * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
+ */
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
+{
+ rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
#ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPTION
+ rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
+}
+#else
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { }
+#endif
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
+#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
+__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
+static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \
+{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
+ _lock; return _t; }
+
+#ifdef CONFIG_SMP
+
+static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
+{
+#ifdef CONFIG_SCHED_CORE
+ /*
+ * In order to not have {0,2},{1,3} turn into an AB-BA,
+ * order by core-id first and cpu-id second.
+ *
+ * Notably:
+ *
+ * double_rq_lock(0,3); will take core-0, core-1 lock
+ * double_rq_lock(1,2); will take core-1, core-0 lock
+ *
+ * when only cpu-id is considered.
+ */
+ if (rq1->core->cpu < rq2->core->cpu)
+ return true;
+ if (rq1->core->cpu > rq2->core->cpu)
+ return false;
+
+ /*
+ * __sched_core_flip() relies on SMT having cpu-id lock order.
+ */
+#endif
+ return rq1->cpu < rq2->cpu;
+}
+
+extern void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
+#ifdef CONFIG_PREEMPTION
/*
* fair double_lock_balance: Safely acquires both rq->locks in a fair
@@ -2041,13 +3002,13 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_rq_unlock(this_rq);
double_rq_lock(this_rq, busiest);
return 1;
}
-#else
+#else /* !CONFIG_PREEMPTION: */
/*
* Unfair double_lock_balance: Optimizes throughput at the expense of
* latency by eliminating extra atomic operations when the locks are
@@ -2060,34 +3021,32 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- int ret = 0;
-
- if (unlikely(!raw_spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
- raw_spin_unlock(&this_rq->lock);
- raw_spin_lock(&busiest->lock);
- raw_spin_lock_nested(&this_rq->lock,
- SINGLE_DEPTH_NESTING);
- ret = 1;
- } else
- raw_spin_lock_nested(&busiest->lock,
- SINGLE_DEPTH_NESTING);
+ if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
+ likely(raw_spin_rq_trylock(busiest))) {
+ double_rq_clock_clear_update(this_rq, busiest);
+ return 0;
}
- return ret;
+
+ if (rq_order_less(this_rq, busiest)) {
+ raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+ double_rq_clock_clear_update(this_rq, busiest);
+ return 0;
+ }
+
+ raw_spin_rq_unlock(this_rq);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
}
-#endif /* CONFIG_PREEMPTION */
+#endif /* !CONFIG_PREEMPTION */
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
*/
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work well under rq->lock */
- raw_spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
+ lockdep_assert_irqs_disabled();
return _double_lock_balance(this_rq, busiest);
}
@@ -2095,8 +3054,9 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
- lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+ if (__rq_lockp(this_rq) != __rq_lockp(busiest))
+ raw_spin_rq_unlock(busiest);
+ lock_set_subclass(&__rq_lockp(this_rq)->dep_map, 0, _RET_IP_);
}
static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
@@ -2126,31 +3086,16 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
}
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
+static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2)
{
- BUG_ON(!irqs_disabled());
- if (rq1 == rq2) {
- raw_spin_lock(&rq1->lock);
- __acquire(rq2->lock); /* Fake it out ;) */
- } else {
- if (rq1 < rq2) {
- raw_spin_lock(&rq1->lock);
- raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
- } else {
- raw_spin_lock(&rq2->lock);
- raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
- }
- }
+ raw_spin_unlock(l1);
+ raw_spin_unlock(l2);
}
+DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t,
+ double_raw_lock(_T->lock, _T->lock2),
+ double_raw_unlock(_T->lock, _T->lock2))
+
/*
* double_rq_unlock - safely unlock two runqueues
*
@@ -2161,18 +3106,19 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq1->lock)
__releases(rq2->lock)
{
- raw_spin_unlock(&rq1->lock);
- if (rq1 != rq2)
- raw_spin_unlock(&rq2->lock);
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_unlock(rq2);
else
__release(rq2->lock);
+ raw_spin_rq_unlock(rq1);
}
extern void set_rq_online (struct rq *rq);
extern void set_rq_offline(struct rq *rq);
+
extern bool sched_smp_initialized;
-#else /* CONFIG_SMP */
+#else /* !CONFIG_SMP: */
/*
* double_rq_lock - safely lock two runqueues
@@ -2184,10 +3130,11 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
__acquires(rq1->lock)
__acquires(rq2->lock)
{
- BUG_ON(!irqs_disabled());
- BUG_ON(rq1 != rq2);
- raw_spin_lock(&rq1->lock);
+ WARN_ON_ONCE(!irqs_disabled());
+ WARN_ON_ONCE(rq1 != rq2);
+ raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
+ double_rq_clock_clear_update(rq1, rq2);
}
/*
@@ -2200,18 +3147,23 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq1->lock)
__releases(rq2->lock)
{
- BUG_ON(rq1 != rq2);
- raw_spin_unlock(&rq1->lock);
+ WARN_ON_ONCE(rq1 != rq2);
+ raw_spin_rq_unlock(rq1);
__release(rq2->lock);
}
-#endif
+#endif /* !CONFIG_SMP */
+DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
+ double_rq_lock(_T->lock, _T->lock2),
+ double_rq_unlock(_T->lock, _T->lock2))
+
+extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
#ifdef CONFIG_SCHED_DEBUG
-extern bool sched_debug_enabled;
+extern bool sched_debug_verbose;
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
@@ -2219,14 +3171,17 @@ extern void print_dl_stats(struct seq_file *m, int cpu);
extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
-#ifdef CONFIG_NUMA_BALANCING
-extern void
-show_numa_stats(struct task_struct *p, struct seq_file *m);
+
+extern void resched_latency_warn(int cpu, u64 latency);
+# ifdef CONFIG_NUMA_BALANCING
+extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
extern void
print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
- unsigned long tpf, unsigned long gsf, unsigned long gpf);
-#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
+ unsigned long tpf, unsigned long gsf, unsigned long gpf);
+# endif /* CONFIG_NUMA_BALANCING */
+#else /* !CONFIG_SCHED_DEBUG: */
+static inline void resched_latency_warn(int cpu, u64 latency) { }
+#endif /* !CONFIG_SCHED_DEBUG */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq);
@@ -2236,49 +3191,66 @@ extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
+
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
+#define NOHZ_NEWILB_KICK_BIT 2
+#define NOHZ_NEXT_KICK_BIT 3
+/* Run sched_balance_domains() */
#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
+/* Update blocked load */
#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
+/* Update blocked load when entering idle */
+#define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT)
+/* Update nohz.next_balance */
+#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)
-#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
-#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
extern void nohz_balance_exit_idle(struct rq *rq);
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline void nohz_balance_exit_idle(struct rq *rq) { }
+#endif /* !CONFIG_NO_HZ_COMMON */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void nohz_run_idle_balance(int cpu);
+#else
+static inline void nohz_run_idle_balance(int cpu) { }
#endif
+#include "stats.h"
-#ifdef CONFIG_SMP
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
- struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
- int i;
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
- for_each_cpu_and(i, rd->span, cpu_active_mask) {
- struct rq *rq = cpu_rq(i);
+extern void __sched_core_account_forceidle(struct rq *rq);
- rq->dl.extra_bw += bw;
- }
-}
-#else
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
+static inline void sched_core_account_forceidle(struct rq *rq)
{
- struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+ if (schedstat_enabled())
+ __sched_core_account_forceidle(rq);
+}
- dl->extra_bw += bw;
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+ if (sched_core_enabled(rq) && schedstat_enabled())
+ __sched_core_tick(rq);
}
-#endif
+#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
+
+static inline void sched_core_account_forceidle(struct rq *rq) { }
+
+static inline void sched_core_tick(struct rq *rq) { }
+
+#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
struct irqtime {
u64 total;
u64 tick_delta;
@@ -2287,10 +3259,16 @@ struct irqtime {
};
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+extern int sched_clock_irqtime;
+
+static inline int irqtime_enabled(void)
+{
+ return sched_clock_irqtime;
+}
/*
* Returns the irqtime minus the softirq time computed by ksoftirqd.
- * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime subtracted
* and never move forward.
*/
static inline u64 irq_time_read(int cpu)
@@ -2306,9 +3284,18 @@ static inline u64 irq_time_read(int cpu)
return total;
}
+
+#else
+
+static inline int irqtime_enabled(void)
+{
+ return 0;
+}
+
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
+
DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
/**
@@ -2342,119 +3329,189 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
if (data)
data->func(data, rq_clock(rq), flags);
}
+#else /* !CONFIG_CPU_FREQ: */
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { }
+#endif /* !CONFIG_CPU_FREQ */
+
+#ifdef arch_scale_freq_capacity
+# ifndef arch_scale_freq_invariant
+# define arch_scale_freq_invariant() true
+# endif
#else
-static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
-#endif /* CONFIG_CPU_FREQ */
+# define arch_scale_freq_invariant() false
+#endif
-#ifdef CONFIG_UCLAMP_TASK
-unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+#ifdef CONFIG_SMP
-static __always_inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max);
+
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max);
+
+
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the
+ * CPU original capacity and the runtime/deadline ratio of the task.
+ *
+ * The function will return true if the original capacity of @cpu is
+ * greater than or equal to task's deadline density right shifted by
+ * (BW_SHIFT - SCHED_CAPACITY_SHIFT) and false otherwise.
+ */
+static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
{
- unsigned long min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
- unsigned long max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+ unsigned long cap = arch_scale_cpu_capacity(cpu);
- if (p) {
- min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
- max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
- }
+ return cap >= p->dl.dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT);
+}
- /*
- * Since CPU's {min,max}_util clamps are MAX aggregated considering
- * RUNNABLE tasks with _different_ clamps, we can end up with an
- * inversion. Fix it now when the clamps are applied.
- */
- if (unlikely(min_util >= max_util))
- return min_util;
+static inline unsigned long cpu_bw_dl(struct rq *rq)
+{
+ return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
+}
- return clamp(util, min_util, max_util);
+static inline unsigned long cpu_util_dl(struct rq *rq)
+{
+ return READ_ONCE(rq->avg_dl.util_avg);
}
-#else /* CONFIG_UCLAMP_TASK */
-static inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
+
+
+extern unsigned long cpu_util_cfs(int cpu);
+extern unsigned long cpu_util_cfs_boost(int cpu);
+
+static inline unsigned long cpu_util_rt(struct rq *rq)
{
- return util;
+ return READ_ONCE(rq->avg_rt.util_avg);
}
-#endif /* CONFIG_UCLAMP_TASK */
-#ifdef arch_scale_freq_capacity
-# ifndef arch_scale_freq_invariant
-# define arch_scale_freq_invariant() true
-# endif
-#else
-# define arch_scale_freq_invariant() false
-#endif
+#else /* !CONFIG_SMP */
+static inline bool update_other_load_avgs(struct rq *rq) { return false; }
+#endif /* !CONFIG_SMP */
-#ifdef CONFIG_SMP
-static inline unsigned long capacity_orig_of(int cpu)
+#ifdef CONFIG_UCLAMP_TASK
+
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+
+static inline unsigned long uclamp_rq_get(struct rq *rq,
+ enum uclamp_id clamp_id)
{
- return cpu_rq(cpu)->cpu_capacity_orig;
+ return READ_ONCE(rq->uclamp[clamp_id].value);
}
-#endif
-/**
- * enum schedutil_type - CPU utilization type
- * @FREQUENCY_UTIL: Utilization used to select frequency
- * @ENERGY_UTIL: Utilization used during energy calculation
+static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int value)
+{
+ WRITE_ONCE(rq->uclamp[clamp_id].value, value);
+}
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
+{
+ return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
+}
+
+/* Is the rq being capped/throttled by uclamp_max? */
+static inline bool uclamp_rq_is_capped(struct rq *rq)
+{
+ unsigned long rq_util;
+ unsigned long max_util;
+
+ if (!static_branch_likely(&sched_uclamp_used))
+ return false;
+
+ rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
+ max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+
+ return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
+}
+
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
*
- * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
- * need to be aggregated differently depending on the usage made of them. This
- * enum is used within schedutil_freq_util() to differentiate the types of
- * utilization expected by the callers, and adjust the aggregation accordingly.
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
*/
-enum schedutil_type {
- FREQUENCY_UTIL,
- ENERGY_UTIL,
-};
+static inline bool uclamp_is_used(void)
+{
+ return static_branch_likely(&sched_uclamp_used);
+}
-#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+#define for_each_clamp_id(clamp_id) \
+ for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
-unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
- struct task_struct *p);
+extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
-static inline unsigned long cpu_bw_dl(struct rq *rq)
+
+static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
- return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+ return SCHED_CAPACITY_SCALE;
}
-static inline unsigned long cpu_util_dl(struct rq *rq)
+/* Integer rounded range for each bucket */
+#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
+
+static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
- return READ_ONCE(rq->avg_dl.util_avg);
+ return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
}
-static inline unsigned long cpu_util_cfs(struct rq *rq)
+static inline void
+uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined)
{
- unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
+ uc_se->value = value;
+ uc_se->bucket_id = uclamp_bucket_id(value);
+ uc_se->user_defined = user_defined;
+}
- if (sched_feat(UTIL_EST)) {
- util = max_t(unsigned long, util,
- READ_ONCE(rq->cfs.avg.util_est.enqueued));
- }
+#else /* !CONFIG_UCLAMP_TASK: */
- return util;
+static inline unsigned long
+uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
}
-static inline unsigned long cpu_util_rt(struct rq *rq)
+static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; }
+
+static inline bool uclamp_is_used(void)
{
- return READ_ONCE(rq->avg_rt.util_avg);
+ return false;
}
-#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
-static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type,
- struct task_struct *p)
+
+static inline unsigned long
+uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id)
{
- return 0;
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline void
+uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, unsigned int value)
+{
+}
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
+{
+ return false;
}
-#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+#endif /* !CONFIG_UCLAMP_TASK */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+
static inline unsigned long cpu_util_irq(struct rq *rq)
{
- return rq->avg_irq.util_avg;
+ return READ_ONCE(rq->avg_irq.util_avg);
}
static inline
@@ -2466,7 +3523,9 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
return util;
}
-#else
+
+#else /* !CONFIG_HAVE_SCHED_AVG_IRQ: */
+
static inline unsigned long cpu_util_irq(struct rq *rq)
{
return 0;
@@ -2477,7 +3536,10 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
{
return util;
}
-#endif
+
+#endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */
+
+extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr);
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
@@ -2490,14 +3552,18 @@ static inline bool sched_energy_enabled(void)
return static_branch_unlikely(&sched_energy_present);
}
+extern struct cpufreq_governor schedutil_gov;
+
#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
#define perf_domain_span(pd) NULL
+
static inline bool sched_energy_enabled(void) { return false; }
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
#ifdef CONFIG_MEMBARRIER
+
/*
* The scheduler provides memory barriers required by membarrier between:
* - prior user-space memory accesses and store to rq->membarrier_state,
@@ -2519,13 +3585,16 @@ static inline void membarrier_switch_mm(struct rq *rq,
WRITE_ONCE(rq->membarrier_state, membarrier_state);
}
-#else
+
+#else /* !CONFIG_MEMBARRIER: */
+
static inline void membarrier_switch_mm(struct rq *rq,
struct mm_struct *prev_mm,
struct mm_struct *next_mm)
{
}
-#endif
+
+#endif /* !CONFIG_MEMBARRIER */
#ifdef CONFIG_SMP
static inline bool is_per_cpu_kthread(struct task_struct *p)
@@ -2540,5 +3609,398 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
}
#endif
-void swake_up_all_locked(struct swait_queue_head *q);
-void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void swake_up_all_locked(struct swait_queue_head *q);
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+extern int preempt_dynamic_mode;
+extern int sched_dynamic_mode(const char *str);
+extern void sched_dynamic_update(int mode);
+#endif
+
+#ifdef CONFIG_SCHED_MM_CID
+
+#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
+#define MM_CID_SCAN_DELAY 100 /* 100ms */
+
+extern raw_spinlock_t cid_lock;
+extern int use_cid_lock;
+
+extern void sched_mm_cid_migrate_from(struct task_struct *t);
+extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
+extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
+extern void init_sched_mm_cid(struct task_struct *t);
+
+static inline void __mm_cid_put(struct mm_struct *mm, int cid)
+{
+ if (cid < 0)
+ return;
+ cpumask_clear_cpu(cid, mm_cidmask(mm));
+}
+
+/*
+ * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
+ * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
+ * be held to transition to other states.
+ *
+ * State transitions synchronized with cmpxchg or try_cmpxchg need to be
+ * consistent across CPUs, which prevents use of this_cpu_cmpxchg.
+ */
+static inline void mm_cid_put_lazy(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ int cid;
+
+ lockdep_assert_irqs_disabled();
+ cid = __this_cpu_read(pcpu_cid->cid);
+ if (!mm_cid_is_lazy_put(cid) ||
+ !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+ return;
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+}
+
+static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
+{
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ int cid, res;
+
+ lockdep_assert_irqs_disabled();
+ cid = __this_cpu_read(pcpu_cid->cid);
+ for (;;) {
+ if (mm_cid_is_unset(cid))
+ return MM_CID_UNSET;
+ /*
+ * Attempt transition from valid or lazy-put to unset.
+ */
+ res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
+ if (res == cid)
+ break;
+ cid = res;
+ }
+ return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm)
+{
+ int cid;
+
+ lockdep_assert_irqs_disabled();
+ cid = mm_cid_pcpu_unset(mm);
+ if (cid == MM_CID_UNSET)
+ return;
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+}
+
+static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
+{
+ struct cpumask *cidmask = mm_cidmask(mm);
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ int cid, max_nr_cid, allowed_max_nr_cid;
+
+ /*
+ * After shrinking the number of threads or reducing the number
+ * of allowed cpus, reduce the value of max_nr_cid so expansion
+ * of cid allocation will preserve cache locality if the number
+ * of threads or allowed cpus increases again.
+ */
+ max_nr_cid = atomic_read(&mm->max_nr_cid);
+ while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed),
+ atomic_read(&mm->mm_users))),
+ max_nr_cid > allowed_max_nr_cid) {
+ /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */
+ if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) {
+ max_nr_cid = allowed_max_nr_cid;
+ break;
+ }
+ }
+ /* Try to re-use recent cid. This improves cache locality. */
+ cid = __this_cpu_read(pcpu_cid->recent_cid);
+ if (!mm_cid_is_unset(cid) && cid < max_nr_cid &&
+ !cpumask_test_and_set_cpu(cid, cidmask))
+ return cid;
+ /*
+ * Expand cid allocation if the maximum number of concurrency
+ * IDs allocated (max_nr_cid) is below the number of cpus allowed
+ * and number of threads. Expanding cid allocation as much as
+ * possible improves cache locality.
+ */
+ cid = max_nr_cid;
+ while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
+ /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */
+ if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
+ continue;
+ if (!cpumask_test_and_set_cpu(cid, cidmask))
+ return cid;
+ }
+ /*
+ * Find the first available concurrency id.
+ * Retry finding first zero bit if the mask is temporarily
+ * filled. This only happens during concurrent remote-clear
+ * which owns a cid without holding a rq lock.
+ */
+ for (;;) {
+ cid = cpumask_first_zero(cidmask);
+ if (cid < READ_ONCE(mm->nr_cpus_allowed))
+ break;
+ cpu_relax();
+ }
+ if (cpumask_test_and_set_cpu(cid, cidmask))
+ return -1;
+
+ return cid;
+}
+
+/*
+ * Save a snapshot of the current runqueue time of this cpu
+ * with the per-cpu cid value, allowing to estimate how recently it was used.
+ */
+static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
+{
+ struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
+
+ lockdep_assert_rq_held(rq);
+ WRITE_ONCE(pcpu_cid->time, rq->clock);
+}
+
+static inline int __mm_cid_get(struct rq *rq, struct task_struct *t,
+ struct mm_struct *mm)
+{
+ int cid;
+
+ /*
+ * All allocations (even those using the cid_lock) are lock-free. If
+ * use_cid_lock is set, hold the cid_lock to perform cid allocation to
+ * guarantee forward progress.
+ */
+ if (!READ_ONCE(use_cid_lock)) {
+ cid = __mm_cid_try_get(t, mm);
+ if (cid >= 0)
+ goto end;
+ raw_spin_lock(&cid_lock);
+ } else {
+ raw_spin_lock(&cid_lock);
+ cid = __mm_cid_try_get(t, mm);
+ if (cid >= 0)
+ goto unlock;
+ }
+
+ /*
+ * cid concurrently allocated. Retry while forcing following
+ * allocations to use the cid_lock to ensure forward progress.
+ */
+ WRITE_ONCE(use_cid_lock, 1);
+ /*
+ * Set use_cid_lock before allocation. Only care about program order
+ * because this is only required for forward progress.
+ */
+ barrier();
+ /*
+ * Retry until it succeeds. It is guaranteed to eventually succeed once
+ * all newly arriving allocations observe the use_cid_lock flag set.
+ */
+ do {
+ cid = __mm_cid_try_get(t, mm);
+ cpu_relax();
+ } while (cid < 0);
+ /*
+ * Allocate before clearing use_cid_lock. Only care about
+ * program order because this is for forward progress.
+ */
+ barrier();
+ WRITE_ONCE(use_cid_lock, 0);
+unlock:
+ raw_spin_unlock(&cid_lock);
+end:
+ mm_cid_snapshot_time(rq, mm);
+
+ return cid;
+}
+
+static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
+ struct mm_struct *mm)
+{
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ struct cpumask *cpumask;
+ int cid;
+
+ lockdep_assert_rq_held(rq);
+ cpumask = mm_cidmask(mm);
+ cid = __this_cpu_read(pcpu_cid->cid);
+ if (mm_cid_is_valid(cid)) {
+ mm_cid_snapshot_time(rq, mm);
+ return cid;
+ }
+ if (mm_cid_is_lazy_put(cid)) {
+ if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ }
+ cid = __mm_cid_get(rq, t, mm);
+ __this_cpu_write(pcpu_cid->cid, cid);
+ __this_cpu_write(pcpu_cid->recent_cid, cid);
+
+ return cid;
+}
+
+static inline void switch_mm_cid(struct rq *rq,
+ struct task_struct *prev,
+ struct task_struct *next)
+{
+ /*
+ * Provide a memory barrier between rq->curr store and load of
+ * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
+ *
+ * Should be adapted if context_switch() is modified.
+ */
+ if (!next->mm) { // to kernel
+ /*
+ * user -> kernel transition does not guarantee a barrier, but
+ * we can use the fact that it performs an atomic operation in
+ * mmgrab().
+ */
+ if (prev->mm) // from user
+ smp_mb__after_mmgrab();
+ /*
+ * kernel -> kernel transition does not change rq->curr->mm
+ * state. It stays NULL.
+ */
+ } else { // to user
+ /*
+ * kernel -> user transition does not provide a barrier
+ * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
+ * Provide it here.
+ */
+ if (!prev->mm) { // from kernel
+ smp_mb();
+ } else { // from user
+ /*
+ * user->user transition relies on an implicit
+ * memory barrier in switch_mm() when
+ * current->mm changes. If the architecture
+ * switch_mm() does not have an implicit memory
+ * barrier, it is emitted here. If current->mm
+ * is unchanged, no barrier is needed.
+ */
+ smp_mb__after_switch_mm();
+ }
+ }
+ if (prev->mm_cid_active) {
+ mm_cid_snapshot_time(rq, prev->mm);
+ mm_cid_put_lazy(prev);
+ prev->mm_cid = -1;
+ }
+ if (next->mm_cid_active)
+ next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
+}
+
+#else /* !CONFIG_SCHED_MM_CID: */
+static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
+static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
+static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
+static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
+static inline void init_sched_mm_cid(struct task_struct *t) { }
+#endif /* !CONFIG_SCHED_MM_CID */
+
+extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+#ifdef CONFIG_SMP
+static inline
+void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
+{
+ lockdep_assert_rq_held(src_rq);
+ lockdep_assert_rq_held(dst_rq);
+
+ deactivate_task(src_rq, task, 0);
+ set_task_cpu(task, dst_rq->cpu);
+ activate_task(dst_rq, task, 0);
+}
+
+static inline
+bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (!task_on_cpu(rq, p) &&
+ cpumask_test_cpu(cpu, &p->cpus_mask))
+ return true;
+
+ return false;
+}
+#endif
+
+#ifdef CONFIG_RT_MUTEXES
+
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+ if (pi_task)
+ prio = min(prio, pi_task->prio);
+
+ return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ return __rt_effective_prio(pi_task, prio);
+}
+
+#else /* !CONFIG_RT_MUTEXES: */
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ return prio;
+}
+
+#endif /* !CONFIG_RT_MUTEXES */
+
+extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
+extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
+extern const struct sched_class *__setscheduler_class(int policy, int prio);
+extern void set_load_weight(struct task_struct *p, bool update_load);
+extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
+extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
+
+extern void check_class_changing(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class);
+extern void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio);
+
+#ifdef CONFIG_SMP
+extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
+#else
+
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
+{
+ return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
+{
+}
+
+#endif
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+/*
+ * Used by SCX in the enable/disable paths to move tasks between sched_classes
+ * and establish invariants.
+ */
+struct sched_enq_and_set_ctx {
+ struct task_struct *p;
+ int queue_flags;
+ bool queued;
+ bool running;
+};
+
+void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
+ struct sched_enq_and_set_ctx *ctx);
+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
+
+#endif /* CONFIG_SCHED_CLASS_EXT */
+
+#include "ext.h"
+
+#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 9620e323162c..21ac44428bb0 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -6,4 +6,10 @@
extern void sched_ttwu_pending(void *arg);
-extern void send_call_function_single_ipi(int cpu);
+extern bool call_function_single_prep_ipi(int cpu);
+
+#ifdef CONFIG_SMP
+extern void flush_smp_call_function_queue(void);
+#else
+static inline void flush_smp_call_function_queue(void) { }
+#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 750fb3c67eed..4346fd81c31f 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -2,7 +2,100 @@
/*
* /proc/schedstat implementation
*/
-#include "sched.h"
+/*
+ * Stamp the start of a wait period in @stats->wait_start from the rq clock.
+ *
+ * @p may be NULL; the back-dating below is only applied when it is set.
+ */
+void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 wait_start, prev_wait_start;
+
+ wait_start = rq_clock(rq);
+ prev_wait_start = schedstat_val(stats->wait_start);
+ /*
+ * A previous value smaller than the current clock is subtracted from
+ * the new stamp. __update_stats_wait_end() leaves the wait time
+ * accumulated so far in wait_start for a migrating task, so this
+ * back-dates the stamp and the pre-migration wait keeps counting.
+ */
+ if (p && likely(wait_start > prev_wait_start))
+ wait_start -= prev_wait_start;
+
+ __schedstat_set(stats->wait_start, wait_start);
+}
+/*
+ * Close a wait period: the elapsed time is the rq clock minus the stamp in
+ * @stats->wait_start. Unless @p is migrating, it is folded into
+ * wait_max/wait_count/wait_sum and the stamp is cleared.
+ *
+ * @p may be NULL, in which case the migration check and the tracepoint are
+ * skipped.
+ */
+void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 delta = rq_clock(rq) - schedstat_val(stats->wait_start);
+
+ if (p) {
+ if (task_on_rq_migrating(p)) {
+ /*
+ * Preserve migrating task's wait time so wait_start
+ * time stamp can be adjusted to accumulate wait time
+ * prior to migration.
+ */
+ __schedstat_set(stats->wait_start, delta);
+
+ return;
+ }
+
+ trace_sched_stat_wait(p, delta);
+ }
+
+ __schedstat_set(stats->wait_max,
+ max(schedstat_val(stats->wait_max), delta));
+ __schedstat_inc(stats->wait_count);
+ __schedstat_add(stats->wait_sum, delta);
+ __schedstat_set(stats->wait_start, 0);
+}
+/*
+ * Account the sleep and/or block period that has ended, based on the
+ * sleep_start/block_start stamps in @stats.
+ *
+ * For each non-zero stamp: the elapsed rq-clock time (clamped to 0 if it
+ * would be negative) updates the matching *_max, is added to
+ * sum_sleep_runtime (block time also to sum_block_runtime) and the stamp is
+ * cleared. When @p is non-NULL the latency accounting and tracepoints are
+ * fed too, including the iowait counters if p->in_iowait is set.
+ *
+ * NOTE(review): "delta >> 10" presumably scales nanoseconds to roughly
+ * microseconds for account_scheduler_latency() - confirm.
+ */
+void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats)
+{
+ u64 sleep_start, block_start;
+
+ sleep_start = schedstat_val(stats->sleep_start);
+ block_start = schedstat_val(stats->block_start);
+
+ if (sleep_start) {
+ u64 delta = rq_clock(rq) - sleep_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(stats->sleep_max)))
+ __schedstat_set(stats->sleep_max, delta);
+
+ __schedstat_set(stats->sleep_start, 0);
+ __schedstat_add(stats->sum_sleep_runtime, delta);
+
+ if (p) {
+ account_scheduler_latency(p, delta >> 10, 1);
+ trace_sched_stat_sleep(p, delta);
+ }
+ }
+
+ if (block_start) {
+ u64 delta = rq_clock(rq) - block_start;
+
+ if ((s64)delta < 0)
+ delta = 0;
+
+ if (unlikely(delta > schedstat_val(stats->block_max)))
+ __schedstat_set(stats->block_max, delta);
+
+ __schedstat_set(stats->block_start, 0);
+ __schedstat_add(stats->sum_sleep_runtime, delta);
+ __schedstat_add(stats->sum_block_runtime, delta);
+
+ if (p) {
+ if (p->in_iowait) {
+ __schedstat_add(stats->iowait_sum, delta);
+ __schedstat_inc(stats->iowait_count);
+ trace_sched_stat_iowait(p, delta);
+ }
+
+ trace_sched_stat_blocked(p, delta);
+
+ account_scheduler_latency(p, delta >> 10, 0);
+ }
+ }
+}
/*
* Current schedstat API version.
@@ -10,7 +103,7 @@
* Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 15
+#define SCHEDSTAT_VERSION 17
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -45,15 +138,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
- seq_printf(seq, "domain%d %*pb", dcount++,
+ seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name,
cpumask_pr_args(sched_domain_span(sd)));
- for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
- itype++) {
- seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
+ seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u",
sd->lb_count[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
- sd->lb_imbalance[itype],
+ sd->lb_imbalance_load[itype],
+ sd->lb_imbalance_util[itype],
+ sd->lb_imbalance_task[itype],
+ sd->lb_imbalance_misfit[itype],
sd->lb_gained[itype],
sd->lb_hot_gained[itype],
sd->lb_nobusyq[itype],
@@ -74,7 +169,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
}
/*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
* It returns 1 for the header position.
* This means 2 is cpu 0.
* In a hotplugged system some CPUs, including cpu 0, may be missing so we have
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 33d0daf83842..19cdbe96f93d 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -1,7 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_STATS_H
+#define _KERNEL_STATS_H
#ifdef CONFIG_SCHEDSTATS
+extern struct static_key_false sched_schedstats;
+
/*
* Expects runqueue lock to be held for atomicity of update
*/
@@ -25,7 +29,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
}
static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+rq_sched_info_dequeue(struct rq *rq, unsigned long long delta)
{
if (rq)
rq->rq_sched_info.run_delay += delta;
@@ -40,9 +44,33 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
#define schedstat_val(var) (var)
#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
+void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+
+void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
+ struct sched_statistics *stats);
+/*
+ * Print a one-time hint when schedstats are disabled but one of the
+ * sched_stat_* tracepoints tested below is enabled.
+ *
+ * NOTE(review): the message names stat_sleep, stat_iowait, stat_blocked and
+ * stat_runtime, but the condition also tests stat_wait.
+ */
+static inline void
+check_schedstat_required(void)
+{
+ if (schedstat_enabled())
+ return;
+
+ /* Nothing is switched on here: only warn (once) if a dependent tracepoint is active */
+ if (trace_sched_stat_wait_enabled() ||
+ trace_sched_stat_sleep_enabled() ||
+ trace_sched_stat_iowait_enabled() ||
+ trace_sched_stat_blocked_enabled() ||
+ trace_sched_stat_runtime_enabled())
+ printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n");
+}
+
#else /* !CONFIG_SCHEDSTATS: */
+
static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
-static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { }
static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
# define schedstat_enabled() 0
# define __schedstat_inc(var) do { } while (0)
@@ -53,59 +81,114 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
# define schedstat_set(var, val) do { } while (0)
# define schedstat_val(var) 0
# define schedstat_val_or_zero(var) 0
+
+# define __update_stats_wait_start(rq, p, stats) do { } while (0)
+# define __update_stats_wait_end(rq, p, stats) do { } while (0)
+# define __update_stats_enqueue_sleeper(rq, p, stats) do { } while (0)
+# define check_schedstat_required() do { } while (0)
+
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+struct sched_entity_stats { /* pairs a sched_entity with its statistics; used for non-task entities */
+ struct sched_entity se; /* __schedstats_from_se() maps a pointer to this member back to the wrapper via container_of() */
+ struct sched_statistics stats; /* what __schedstats_from_se() returns for a non-task entity */
+} __no_randomize_layout;
+#endif
+/*
+ * Return the sched_statistics that belongs to @se.
+ *
+ * With CONFIG_FAIR_GROUP_SCHED, an entity that is not a task is assumed to be
+ * embedded in a struct sched_entity_stats and that wrapper's ->stats member
+ * is returned; in every other case the statistics come from the task_struct
+ * obtained through task_of(@se).
+ */
+static inline struct sched_statistics *
+__schedstats_from_se(struct sched_entity *se)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (!entity_is_task(se))
+ return &container_of(se, struct sched_entity_stats, se)->stats;
+#endif
+ return &task_of(se)->stats;
+}
+
#ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+ bool sleep);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+#else
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
- * where a task's runnable state changes, and requeues, where a task
- * and its state are being moved between CPUs and runqueues.
+ * where a task's runnable state changes, and migrations, where a task
+ * and its runnable state are being moved between CPUs and runqueues.
+ *
+ * A notable case is a task whose dequeue is delayed. PSI considers
+ * those sleeping, but because they are still on the runqueue they can
+ * go through migration requeues. In this case, *sleeping* states need
+ * to be transferred.
*/
-static inline void psi_enqueue(struct task_struct *p, bool wakeup)
+static inline void psi_enqueue(struct task_struct *p, int flags)
{
- int clear = 0, set = TSK_RUNNING;
+ int clear = 0, set = 0;
if (static_branch_likely(&psi_disabled))
return;
- if (!wakeup || p->sched_psi_wake_requeue) {
+ /* Same runqueue, nothing changed for psi */
+ if (flags & ENQUEUE_RESTORE)
+ return;
+
+ /* psi_sched_switch() will handle the flags */
+ if (task_on_cpu(task_rq(p), p))
+ return;
+
+ if (p->se.sched_delayed) {
+ /* CPU migration of "sleeping" task */
+ SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
if (p->in_memstall)
set |= TSK_MEMSTALL;
- if (p->sched_psi_wake_requeue)
- p->sched_psi_wake_requeue = 0;
+ if (p->in_iowait)
+ set |= TSK_IOWAIT;
+ } else if (flags & ENQUEUE_MIGRATED) {
+ /* CPU migration of runnable task */
+ set = TSK_RUNNING;
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
} else {
+ /* Wakeup of new or sleeping task */
if (p->in_iowait)
clear |= TSK_IOWAIT;
+ set = TSK_RUNNING;
+ if (p->in_memstall)
+ set |= TSK_MEMSTALL_RUNNING;
}
psi_task_change(p, clear, set);
}
-static inline void psi_dequeue(struct task_struct *p, bool sleep)
+static inline void psi_dequeue(struct task_struct *p, int flags)
{
- int clear = TSK_RUNNING, set = 0;
-
if (static_branch_likely(&psi_disabled))
return;
- if (!sleep) {
- if (p->in_memstall)
- clear |= TSK_MEMSTALL;
- } else {
- /*
- * When a task sleeps, schedule() dequeues it before
- * switching to the next one. Merge the clearing of
- * TSK_RUNNING and TSK_ONCPU to save an unnecessary
- * psi_task_change() call in psi_sched_switch().
- */
- clear |= TSK_ONCPU;
+ /* Same runqueue, nothing changed for psi */
+ if (flags & DEQUEUE_SAVE)
+ return;
- if (p->in_iowait)
- set |= TSK_IOWAIT;
- }
+ /*
+ * A voluntary sleep is a dequeue followed by a task switch. To
+ * avoid walking all ancestors twice, psi_task_switch() handles
+ * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
+ * Do nothing here.
+ */
+ if (flags & DEQUEUE_SLEEP)
+ return;
- psi_task_change(p, clear, set);
+ /*
+ * When migrating a task to another CPU, clear all psi
+ * state. The enqueue callback above will work it out.
+ */
+ psi_task_change(p, p->psi_flags, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -117,19 +200,12 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
* deregister its sleep-persistent psi states from the old
* queue, and let psi_enqueue() know it has to requeue.
*/
- if (unlikely(p->in_iowait || p->in_memstall)) {
+ if (unlikely(p->psi_flags)) {
struct rq_flags rf;
struct rq *rq;
- int clear = 0;
-
- if (p->in_iowait)
- clear |= TSK_IOWAIT;
- if (p->in_memstall)
- clear |= TSK_MEMSTALL;
rq = __task_rq_lock(p, &rf);
- psi_task_change(p, clear, 0);
- p->sched_psi_wake_requeue = 1;
+ psi_task_change(p, p->psi_flags, 0);
__task_rq_unlock(rq, &rf);
}
}
@@ -144,65 +220,63 @@ static inline void psi_sched_switch(struct task_struct *prev,
psi_task_switch(prev, next, sleep);
}
-static inline void psi_task_tick(struct rq *rq)
-{
- if (static_branch_likely(&psi_disabled))
- return;
-
- if (unlikely(rq->curr->in_memstall))
- psi_memstall_tick(rq->curr, cpu_of(rq));
-}
#else /* CONFIG_PSI */
-static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
-static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
+/* !CONFIG_PSI no-op stubs; the signatures mirror the CONFIG_PSI variants, which take the enqueue/dequeue flags as an int. */
+static inline void psi_enqueue(struct task_struct *p, int flags) {}
+static inline void psi_dequeue(struct task_struct *p, int flags) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
-static inline void psi_task_tick(struct rq *rq) {}
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
-static inline void sched_info_reset_dequeued(struct task_struct *t)
-{
- t->sched_info.last_queued = 0;
-}
-
/*
* We are interested in knowing how long it was from the *first* time a
* task was queued to the time that it finally hit a CPU, we call this routine
* from dequeue_task() to account for possible rq->clock skew across CPUs. The
* delta taken on each CPU would annul the skew.
*/
-static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long delta = 0;
- if (sched_info_on()) {
- if (t->sched_info.last_queued)
- delta = now - t->sched_info.last_queued;
- }
- sched_info_reset_dequeued(t);
- t->sched_info.run_delay += delta;
+ if (!t->sched_info.last_queued)
+ return;
- rq_sched_info_dequeued(rq, delta);
+ delta = rq_clock(rq) - t->sched_info.last_queued;
+ t->sched_info.last_queued = 0;
+ t->sched_info.run_delay += delta;
+ if (delta > t->sched_info.max_run_delay)
+ t->sched_info.max_run_delay = delta;
+ if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
+ t->sched_info.min_run_delay = delta;
+ rq_sched_info_dequeue(rq, delta);
}
/*
* Called when a task finally hits the CPU. We can now calculate how
* long it was waiting to run. We also note when it began so that we
- * can keep stats on how long its timeslice is.
+ * can keep stats on how long its time-slice is.
*/
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
- unsigned long long now = rq_clock(rq), delta = 0;
+ unsigned long long now, delta = 0;
+
+ if (!t->sched_info.last_queued)
+ return;
- if (t->sched_info.last_queued)
- delta = now - t->sched_info.last_queued;
- sched_info_reset_dequeued(t);
+ now = rq_clock(rq);
+ delta = now - t->sched_info.last_queued;
+ t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
+ if (delta > t->sched_info.max_run_delay)
+ t->sched_info.max_run_delay = delta;
+ if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
+ t->sched_info.min_run_delay = delta;
rq_sched_info_arrive(rq, delta);
}
@@ -210,14 +284,12 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
/*
* This function is only called from enqueue_task(), but also only updates
* the timestamp if it is already not set. It's assumed that
- * sched_info_dequeued() will clear that stamp when appropriate.
+ * sched_info_dequeue() will clear that stamp when appropriate.
*/
-static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t)
{
- if (sched_info_on()) {
- if (!t->sched_info.last_queued)
- t->sched_info.last_queued = rq_clock(rq);
- }
+ if (!t->sched_info.last_queued)
+ t->sched_info.last_queued = rq_clock(rq);
}
/*
@@ -225,7 +297,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
* due, typically, to expiring its time slice (this may also be called when
* switching to the idle task). Now we can calculate how long we ran.
* Also, if the process is still in the TASK_RUNNING state, call
- * sched_info_queued() to mark that it has now again started waiting on
+ * sched_info_enqueue() to mark that it has now again started waiting on
* the runqueue.
*/
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
@@ -234,8 +306,8 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
rq_sched_info_depart(rq, delta);
- if (t->state == TASK_RUNNING)
- sched_info_queued(rq, t);
+ if (task_is_running(t))
+ sched_info_enqueue(rq, t);
}
/*
@@ -244,7 +316,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
-__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
/*
* prev now departs the CPU. It's not interesting to record
@@ -258,18 +330,10 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
sched_info_arrive(rq, next);
}
-static inline void
-sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
-{
- if (sched_info_on())
- __sched_info_switch(rq, prev, next);
-}
-
#else /* !CONFIG_SCHED_INFO: */
-# define sched_info_queued(rq, t) do { } while (0)
-# define sched_info_reset_dequeued(t) do { } while (0)
-# define sched_info_dequeued(rq, t) do { } while (0)
-# define sched_info_depart(rq, t) do { } while (0)
-# define sched_info_arrive(rq, next) do { } while (0)
+# define sched_info_enqueue(rq, t) do { } while (0)
+# define sched_info_dequeue(rq, t) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */
+
+#endif /* _KERNEL_STATS_H */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4c9e9975684f..058dd42e3d9b 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -7,11 +7,10 @@
*
* See kernel/stop_machine.c
*/
-#include "sched.h"
#ifdef CONFIG_SMP
static int
-select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}
@@ -24,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#endif /* CONFIG_SMP */
static void
-check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
{
/* we're never preempted */
}
@@ -34,12 +33,11 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
stop->se.exec_start = rq_clock_task(rq);
}
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *pick_task_stop(struct rq *rq)
{
if (!sched_stop_runnable(rq))
return NULL;
- set_next_task_stop(rq, rq->stop, true);
return rq->stop;
}
@@ -49,10 +47,11 @@ enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
add_nr_running(rq, 1);
}
-static void
+static bool
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
+ return true;
}
static void yield_task_stop(struct rq *rq)
@@ -60,23 +59,9 @@ static void yield_task_stop(struct rq *rq)
BUG(); /* the stop task should never yield, its pointless. */
}
-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
- struct task_struct *curr = rq->curr;
- u64 delta_exec;
-
- delta_exec = rq_clock_task(rq) - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
-
- schedstat_set(curr->se.statistics.exec_max,
- max(curr->se.statistics.exec_max, delta_exec));
-
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = rq_clock_task(rq);
- cgroup_account_cputime(curr, delta_exec);
+ update_curr_common(rq);
}
/*
@@ -102,12 +87,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
BUG(); /* how!?, what priority? */
}
-static unsigned int
-get_rr_interval_stop(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_stop(struct rq *rq)
{
}
@@ -115,16 +94,15 @@ static void update_curr_stop(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
-const struct sched_class stop_sched_class = {
- .next = &dl_sched_class,
+DEFINE_SCHED_CLASS(stop) = {
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
- .check_preempt_curr = check_preempt_curr_stop,
+ .wakeup_preempt = wakeup_preempt_stop,
- .pick_next_task = pick_next_task_stop,
+ .pick_task = pick_task_stop,
.put_prev_task = put_prev_task_stop,
.set_next_task = set_next_task_stop,
@@ -136,8 +114,6 @@ const struct sched_class stop_sched_class = {
.task_tick = task_tick_stop,
- .get_rr_interval = get_rr_interval_stop,
-
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index e1c655f928c7..72505cd3b60a 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -2,7 +2,6 @@
/*
* <linux/swait.h> (simple wait queues ) implementation:
*/
-#include "sched.h"
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key)
@@ -19,7 +18,7 @@ EXPORT_SYMBOL(__init_swait_queue_head);
* If for some reason it would return 0, that means the previously waiting
* task is already running, so it will observe condition true (or has already).
*/
-void swake_up_locked(struct swait_queue_head *q)
+void swake_up_locked(struct swait_queue_head *q, int wake_flags)
{
struct swait_queue *curr;
@@ -27,7 +26,7 @@ void swake_up_locked(struct swait_queue_head *q)
return;
curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
- wake_up_process(curr->task);
+ try_to_wake_up(curr->task, TASK_NORMAL, wake_flags);
list_del_init(&curr->task_list);
}
EXPORT_SYMBOL(swake_up_locked);
@@ -42,7 +41,7 @@ EXPORT_SYMBOL(swake_up_locked);
void swake_up_all_locked(struct swait_queue_head *q)
{
while (!list_empty(&q->task_list))
- swake_up_locked(q);
+ swake_up_locked(q, 0);
}
void swake_up_one(struct swait_queue_head *q)
@@ -50,7 +49,7 @@ void swake_up_one(struct swait_queue_head *q)
unsigned long flags;
raw_spin_lock_irqsave(&q->lock, flags);
- swake_up_locked(q);
+ swake_up_locked(q, 0);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(swake_up_one);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
new file mode 100644
index 000000000000..456d339be98f
--- /dev/null
+++ b/kernel/sched/syscalls.c
@@ -0,0 +1,1594 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kernel/sched/syscalls.c
+ *
+ * Core kernel scheduler syscalls related code
+ *
+ * Copyright (C) 1991-2002 Linus Torvalds
+ * Copyright (C) 1998-2024 Ingo Molnar, Red Hat
+ */
+#include <linux/sched.h>
+#include <linux/cpuset.h>
+#include <linux/sched/debug.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include "sched.h"
+#include "autogroup.h"
+/*
+ * Map a policy plus its parameters to a kernel priority:
+ *  - deadline policies: MAX_DL_PRIO - 1
+ *  - RT policies:       MAX_RT_PRIO - 1 - @rt_prio
+ *  - everything else:   NICE_TO_PRIO(@nice)
+ */
+static inline int __normal_prio(int policy, int rt_prio, int nice)
+{
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ *
+ * The value is derived by __normal_prio() from p->policy,
+ * p->rt_priority and the nice value encoded in p->static_prio;
+ * @p itself is not modified.
+ */
+static inline int normal_prio(struct task_struct *p)
+{
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ *
+ * Side effect: p->normal_prio is recomputed and stored on
+ * every call; p->prio is only read, the caller assigns the
+ * returned value.
+ */
+static int effective_prio(struct task_struct *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_or_dl_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+/*
+ * set_user_nice - change the nice value of @p.
+ *
+ * Returns without doing anything if @nice equals the current nice value or
+ * lies outside [MIN_NICE, MAX_NICE]. For deadline and RT policy tasks only
+ * p->static_prio is updated. Otherwise the task is dequeued/put if needed,
+ * static_prio, load weight and prio are updated, the task is put back, and
+ * the class's prio_changed() hook is called with the old prio.
+ *
+ * NOTE(review): there is no explicit unlock; the rq lock taken through
+ * CLASS(task_rq_lock, ...) is presumably dropped when rq_guard leaves scope,
+ * which is what makes the early return in the RT/DL branch safe - confirm.
+ */
+void set_user_nice(struct task_struct *p, long nice)
+{
+ bool queued, running;
+ struct rq *rq;
+ int old_prio;
+
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
+ return;
+ /*
+ * We have to be careful, if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ CLASS(task_rq_lock, rq_guard)(p);
+ rq = rq_guard.rq;
+
+ update_rq_clock(rq);
+
+ /*
+ * The RT priorities are set via sched_setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it won't have any effect on scheduling until the task is
+ * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
+ */
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ return;
+ }
+
+ queued = task_on_rq_queued(p);
+ running = task_current_donor(rq, p);
+ if (queued)
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+ if (running)
+ put_prev_task(rq, p);
+
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p, true);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+
+ if (queued)
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+ if (running)
+ set_next_task(rq, p);
+
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ p->sched_class->prio_changed(rq, p, old_prio);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
+ * @p: task
+ * @nice: nice value
+ *
+ * Return: true if @nice, converted to an rlimit-style value, does not
+ * exceed @p's RLIMIT_NICE; false otherwise.
+ */
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
+{
+ /* Convert nice value [19,-20] to rlimit style value [1,40]: */
+ int nice_rlim = nice_to_rlimit(nice);
+
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ *
+ * Return: non-zero if is_nice_reduction() accepts @nice for @p or, failing
+ * that, capable(CAP_SYS_NICE) holds; 0 otherwise. The capability is only
+ * tested when the rlimit check fails.
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ *
+ * @increment is clamped to [-NICE_WIDTH, NICE_WIDTH] and the resulting nice
+ * value to [MIN_NICE, MAX_NICE] before any check is made.
+ *
+ * Return: 0 on success, -EPERM if a negative increment is not permitted by
+ * can_nice(), or the non-zero result of security_task_setnice().
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+ long nice, retval;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
+ if (increment < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ retval = security_task_setnice(current, nice);
+ if (retval)
+ return retval;
+
+ set_user_nice(current, nice);
+ return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The priority value as seen by users in /proc.
+ *
+ * The value is the kernel priority shifted down by MAX_RT_PRIO
+ * (p->prio - MAX_RT_PRIO), which yields the mapping below:
+ *
+ * sched policy return value kernel prio user prio/nice
+ *
+ * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19]
+ * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99]
+ * deadline -101 -1 0
+ */
+int task_prio(const struct task_struct *p)
+{
+ return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * idle_cpu - is a given CPU idle currently?
+ * @cpu: the processor in question.
+ *
+ * A CPU counts as idle when its runqueue's current task is the idle task,
+ * rq->nr_running is zero and, on CONFIG_SMP, rq->ttwu_pending is not set.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr != rq->idle)
+ return 0;
+
+ if (rq->nr_running)
+ return 0;
+
+#ifdef CONFIG_SMP
+ if (rq->ttwu_pending)
+ return 0;
+#endif
+
+ return 1;
+}
+
+/**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Stricter than idle_cpu(): the CPU must also not be reported as preempted
+ * by vcpu_is_preempted().
+ *
+ * Return: 1 if idle_cpu(@cpu) holds and vcpu_is_preempted(@cpu) does not.
+ * 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+ if (!idle_cpu(cpu))
+ return 0;
+
+ if (vcpu_is_preempted(cpu))
+ return 0;
+
+ return 1;
+}
+
+/**
+ * idle_task - return the idle task for a given CPU.
+ * @cpu: the processor in question.
+ *
+ * Return: The idle task for the CPU @cpu.
+ *
+ * The pointer is read straight from the CPU's runqueue (rq->idle); no
+ * reference on the task is taken here.
+ */
+struct task_struct *idle_task(int cpu)
+{
+ return cpu_rq(cpu)->idle;
+}
+
+#ifdef CONFIG_SCHED_CORE
+int sched_core_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ /*
+ * With core scheduling enabled on this rq, running the idle task is
+ * enough to report the CPU as idle (return 1); the further conditions
+ * tested by idle_cpu() are not consulted. Otherwise defer to idle_cpu().
+ */
+ if (sched_core_enabled(rq) && rq->curr == rq->idle)
+ return 1;
+
+ return idle_cpu(cpu);
+}
+
+#endif
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question; 0 selects the calling task (current).
+ *
+ * Return: The task of @pid, if found. %NULL otherwise.
+ *
+ * No reference is taken on the returned task.
+ * NOTE(review): find_get_task() wraps this call in an RCU guard; presumably
+ * other callers need equivalent protection - confirm.
+ */
+static struct task_struct *find_process_by_pid(pid_t pid)
+{
+ return pid ? find_task_by_vpid(pid) : current;
+}
+/*
+ * Look up the task for @pid (0 meaning current) under an RCU guard and, if
+ * one is found, take a reference on it with get_task_struct().
+ *
+ * Return: the referenced task, or NULL if the lookup failed.
+ */
+static struct task_struct *find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+ guard(rcu)();
+
+ p = find_process_by_pid(pid);
+ if (likely(p))
+ get_task_struct(p);
+
+ return p;
+}
+/*
+ * Class wrapper around find_get_task(): constructed from a pid_t, holding a
+ * struct task_struct pointer, with a cleanup expression that calls
+ * put_task_struct() when the pointer is non-NULL.
+ * NOTE(review): the cleanup presumably runs when the variable leaves scope.
+ */
+DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+ find_get_task(pid), pid_t pid)
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+/*
+ * Apply the policy and parameters in @attr to @p.
+ *
+ * An attr->sched_policy of SETPARAM_POLICY keeps p->policy as it is. Besides
+ * the policy this updates the deadline or fair parameters, the timer slack,
+ * rt_priority, normal_prio and the load weight; p->prio is not written here.
+ */
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int policy = attr->sched_policy;
+
+ if (policy == SETPARAM_POLICY)
+ policy = p->policy;
+
+ p->policy = policy;
+
+ if (dl_policy(policy))
+ __setparam_dl(p, attr);
+ else if (fair_policy(policy))
+ __setparam_fair(p, attr);
+
+ /* rt-policy tasks do not have a timerslack */
+ if (rt_or_dl_task_policy(p)) {
+ p->timer_slack_ns = 0;
+ } else if (p->timer_slack_ns == 0) {
+ /* when switching back to non-rt policy, restore timerslack */
+ p->timer_slack_ns = p->default_timer_slack_ns;
+ }
+
+ /*
+ * __sched_setscheduler() ensures attr->sched_priority == 0 when
+ * !rt_policy. Always setting this ensures that things like
+ * getparam()/getattr() don't report silly values for !rt tasks.
+ */
+ p->rt_priority = attr->sched_priority;
+ p->normal_prio = normal_prio(p);
+ set_load_weight(p, true);
+}
+
+/*
+ * Check the target process has a UID that matches the current process's:
+ * returns true if the caller's euid equals either the euid or the uid in
+ * @p's credentials. @p's credentials are read under an RCU guard.
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ guard(rcu)();
+
+ pcred = __task_cred(p);
+ return (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
+}
+
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Validate the utilization clamp values requested in @attr against @p's
+ * current requests.
+ *
+ * Only a bound whose SCHED_FLAG_UTIL_CLAMP_{MIN,MAX} flag is set is taken
+ * from @attr; the other one comes from p->uclamp_req[]. The "+ 1" on both
+ * sides of the range checks means a value of -1 is not rejected
+ * (uclamp_reset() treats -1 as a reset request), and -1 is also exempt from
+ * the min > max check.
+ *
+ * NOTE(review): whether values below -1 are rejected by the "+ 1" form
+ * depends on the type of SCHED_CAPACITY_SCALE, which is not visible here -
+ * confirm.
+ *
+ * Return: 0 on success, -EINVAL for an out-of-range value or min > max.
+ * Side effect on success: enables the sched_uclamp_used static branch.
+ */
+static int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ util_min = attr->sched_util_min;
+
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ util_max = attr->sched_util_max;
+
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
+ return -EINVAL;
+
+ /*
+ * We have valid uclamp attributes; make sure uclamp is enabled.
+ *
+ * We need to do that here, because enabling static branches is a
+ * blocking operation which obviously cannot be done while holding
+ * scheduler locks.
+ */
+ static_branch_enable(&sched_uclamp_used);
+
+ return 0;
+}
+/*
+ * Decide whether the clamp request @uc_se for @clamp_id should be reset.
+ *
+ * Return: true when @attr carries no SCHED_FLAG_UTIL_CLAMP bits and the
+ * current request is not user-defined, or when the flag matching @clamp_id
+ * is set and the corresponding sched_util_{min,max} is -1; false otherwise.
+ */
+static bool uclamp_reset(const struct sched_attr *attr,
+ enum uclamp_id clamp_id,
+ struct uclamp_se *uc_se)
+{
+ /* Reset on sched class change for a non user-defined clamp value. */
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
+ !uc_se->user_defined)
+ return true;
+
+ /* Reset on sched_util_{min,max} == -1. */
+ if (clamp_id == UCLAMP_MIN &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min == -1) {
+ return true;
+ }
+
+ if (clamp_id == UCLAMP_MAX &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max == -1) {
+ return true;
+ }
+
+ return false;
+}
+/*
+ * Update @p's clamp requests from @attr.
+ *
+ * First, every clamp id for which uclamp_reset() returns true is set back to
+ * a default: sysctl_sched_uclamp_util_min_rt_default for UCLAMP_MIN of an RT
+ * task, uclamp_none() otherwise. Then, if @attr carries any
+ * SCHED_FLAG_UTIL_CLAMP bits, each flagged value other than -1 is stored.
+ *
+ * NOTE(review): the last argument of uclamp_se_set() (false for defaults,
+ * true for values from @attr) is presumably the user_defined marker that
+ * uclamp_reset() tests - confirm.
+ */
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ enum uclamp_id clamp_id;
+
+ for_each_clamp_id(clamp_id) {
+ struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
+ unsigned int value;
+
+ if (!uclamp_reset(attr, clamp_id, uc_se))
+ continue;
+
+ /*
+ * RT by default have a 100% boost value that could be modified
+ * at runtime.
+ */
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
+ value = sysctl_sched_uclamp_util_min_rt_default;
+ else
+ value = uclamp_none(clamp_id);
+
+ uclamp_se_set(uc_se, value, false);
+
+ }
+
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
+ return;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min != -1) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
+ attr->sched_util_min, true);
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max != -1) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
+ attr->sched_util_max, true);
+ }
+}
+
+#else /* !CONFIG_UCLAMP_TASK: */
+
+/*
+ * Stub for builds without CONFIG_UCLAMP_TASK: every request that reaches
+ * this function is refused with -EOPNOTSUPP.
+ */
+static inline int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+/* Stub for builds without CONFIG_UCLAMP_TASK: nothing to update. */
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr) { }
+#endif
+
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+					 const struct sched_attr *attr,
+					 int policy, int reset_on_fork)
+{
+	/* Lowering the nice value needs is_nice_reduction() to agree: */
+	if (fair_policy(policy) && attr->sched_nice < task_nice(p) &&
+	    !is_nice_reduction(p, attr->sched_nice))
+		goto req_priv;
+
+	if (rt_policy(policy)) {
+		unsigned long rtprio_limit = task_rlimit(p, RLIMIT_RTPRIO);
+		/* Can't set/change the rt policy without an rlimit: */
+		bool forbidden_switch = policy != p->policy && !rtprio_limit;
+		/* Can't go above both the current priority and the rlimit: */
+		bool forbidden_raise =
+			attr->sched_priority > p->rt_priority &&
+			attr->sched_priority > rtprio_limit;
+
+		if (forbidden_switch || forbidden_raise)
+			goto req_priv;
+	}
+
+	/*
+	 * SCHED_DEADLINE can't be set or changed at all for now (safest
+	 * behavior); in the future unprivileged DL tasks may be allowed to
+	 * increase their relative deadline or reduce their runtime (both
+	 * ways reducing utilization).
+	 */
+	if (dl_policy(policy))
+		goto req_priv;
+
+	/*
+	 * SCHED_IDLE is treated as nice 20: leaving it is only allowed if
+	 * RLIMIT_NICE would normally permit it.
+	 */
+	if (task_has_idle_policy(p) && !idle_policy(policy) &&
+	    !is_nice_reduction(p, task_nice(p)))
+		goto req_priv;
+
+	/* Can't change other user's priorities: */
+	if (!check_same_owner(p))
+		goto req_priv;
+
+	/* Normal users shall not reset the sched_reset_on_fork flag: */
+	if (p->sched_reset_on_fork && !reset_on_fork)
+		goto req_priv;
+
+	return 0;
+
+req_priv:
+	/* Single capability test, reached only when privilege is required. */
+	return capable(CAP_SYS_NICE) ? 0 : -EPERM;
+}
+
+/*
+ * __sched_setscheduler - common implementation behind the set-scheduler APIs
+ * @p:    the task to change.
+ * @attr: requested policy, priority/nice, flags and clamp values.
+ * @user: when true, run user_check_sched_setscheduler() and the security
+ *        hook, and reject requests carrying SCHED_FLAG_SUGOV.
+ * @pi:   when true, take priority boosting into account through
+ *        rt_effective_prio() and call rt_mutex_adjust_pi() afterwards.
+ *
+ * A negative attr->sched_policy keeps the task's current policy.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int __sched_setscheduler(struct task_struct *p,
+			 const struct sched_attr *attr,
+			 bool user, bool pi)
+{
+	int oldpolicy = -1, policy = attr->sched_policy;
+	int retval, oldprio, newprio, queued, running;
+	const struct sched_class *prev_class, *next_class;
+	struct balance_callback *head;
+	struct rq_flags rf;
+	int reset_on_fork;
+	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+	struct rq *rq;
+	bool cpuset_locked = false;
+
+	/* The pi code expects interrupts enabled */
+	BUG_ON(pi && in_interrupt());
+recheck:
+	/* Double check policy once rq lock held: */
+	if (policy < 0) {
+		reset_on_fork = p->sched_reset_on_fork;
+		policy = oldpolicy = p->policy;
+	} else {
+		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
+
+		if (!valid_policy(policy))
+			return -EINVAL;
+	}
+
+	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+		return -EINVAL;
+
+	/*
+	 * Valid priorities for SCHED_FIFO and SCHED_RR are
+	 * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
+	 * SCHED_BATCH and SCHED_IDLE is 0.
+	 */
+	if (attr->sched_priority > MAX_RT_PRIO-1)
+		return -EINVAL;
+	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+	    (rt_policy(policy) != (attr->sched_priority != 0)))
+		return -EINVAL;
+
+	if (user) {
+		retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+		if (retval)
+			return retval;
+
+		if (attr->sched_flags & SCHED_FLAG_SUGOV)
+			return -EINVAL;
+
+		retval = security_task_setscheduler(p);
+		if (retval)
+			return retval;
+	}
+
+	/* Update task specific "requested" clamps */
+	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
+		retval = uclamp_validate(p, attr);
+		if (retval)
+			return retval;
+	}
+
+	/*
+	 * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
+	 * information.
+	 */
+	if (dl_policy(policy) || dl_policy(p->policy)) {
+		cpuset_locked = true;
+		cpuset_lock();
+	}
+
+	/*
+	 * Make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 *
+	 * To be able to change p->policy safely, the appropriate
+	 * runqueue lock must be held.
+	 */
+	rq = task_rq_lock(p, &rf);
+	update_rq_clock(rq);
+
+	/*
+	 * Changing the policy of the stop threads is a very bad idea:
+	 */
+	if (p == rq->stop) {
+		retval = -EINVAL;
+		goto unlock;
+	}
+
+	retval = scx_check_setscheduler(p, policy);
+	if (retval)
+		goto unlock;
+
+	/*
+	 * If not changing anything there's no need to proceed further,
+	 * but store a possible modification of reset_on_fork.
+	 */
+	if (unlikely(policy == p->policy)) {
+		if (fair_policy(policy) &&
+		    (attr->sched_nice != task_nice(p) ||
+		     (attr->sched_runtime != p->se.slice)))
+			goto change;
+		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+			goto change;
+		if (dl_policy(policy) && dl_param_changed(p, attr))
+			goto change;
+		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
+			goto change;
+
+		p->sched_reset_on_fork = reset_on_fork;
+		retval = 0;
+		goto unlock;
+	}
+change:
+
+	if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
+		/*
+		 * Do not allow real-time tasks into groups that have no runtime
+		 * assigned.
+		 */
+		if (rt_bandwidth_enabled() && rt_policy(policy) &&
+				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+				!task_group_is_autogroup(task_group(p))) {
+			retval = -EPERM;
+			goto unlock;
+		}
+#endif
+#ifdef CONFIG_SMP
+		if (dl_bandwidth_enabled() && dl_policy(policy) &&
+				!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
+			cpumask_t *span = rq->rd->span;
+
+			/*
+			 * Don't allow tasks with an affinity mask smaller than
+			 * the entire root_domain to become SCHED_DEADLINE. We
+			 * will also fail if there's no bandwidth available.
+			 */
+			if (!cpumask_subset(span, p->cpus_ptr) ||
+			    rq->rd->dl_bw.bw == 0) {
+				retval = -EPERM;
+				goto unlock;
+			}
+		}
+#endif
+	}
+
+	/* Re-check policy now with rq lock held: */
+	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+		policy = oldpolicy = -1;
+		task_rq_unlock(rq, p, &rf);
+		if (cpuset_locked)
+			cpuset_unlock();
+		goto recheck;
+	}
+
+	/*
+	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
+	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+	 * is available.
+	 */
+	if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
+		retval = -EBUSY;
+		goto unlock;
+	}
+
+	p->sched_reset_on_fork = reset_on_fork;
+	oldprio = p->prio;
+
+	newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
+	if (pi) {
+		/*
+		 * Take priority boosted tasks into account. If the new
+		 * effective priority is unchanged, we just store the new
+		 * normal parameters and do not touch the scheduler class and
+		 * the runqueue. This will be done when the task deboost
+		 * itself.
+		 */
+		newprio = rt_effective_prio(p, newprio);
+		if (newprio == oldprio)
+			queue_flags &= ~DEQUEUE_MOVE;
+	}
+
+	prev_class = p->sched_class;
+	next_class = __setscheduler_class(policy, newprio);
+
+	if (prev_class != next_class && p->se.sched_delayed)
+		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+
+	queued = task_on_rq_queued(p);
+	running = task_current_donor(rq, p);
+	if (queued)
+		dequeue_task(rq, p, queue_flags);
+	if (running)
+		put_prev_task(rq, p);
+
+	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+		__setscheduler_params(p, attr);
+		p->sched_class = next_class;
+		p->prio = newprio;
+	}
+	__setscheduler_uclamp(p, attr);
+	check_class_changing(rq, p, prev_class);
+
+	if (queued) {
+		/*
+		 * We enqueue to tail when the priority of a task is
+		 * increased (user space view).
+		 */
+		if (oldprio < p->prio)
+			queue_flags |= ENQUEUE_HEAD;
+
+		enqueue_task(rq, p, queue_flags);
+	}
+	if (running)
+		set_next_task(rq, p);
+
+	check_class_changed(rq, p, prev_class, oldprio);
+
+	/* Avoid rq from going away on us: */
+	preempt_disable();
+	head = splice_balance_callbacks(rq);
+	task_rq_unlock(rq, p, &rf);
+
+	/*
+	 * Drop the cpuset lock on every successful path, not only when @pi
+	 * is set; otherwise a !pi caller dealing with a SCHED_DEADLINE policy
+	 * would return with the lock still held.
+	 */
+	if (cpuset_locked)
+		cpuset_unlock();
+
+	if (pi)
+		rt_mutex_adjust_pi(p);
+
+	/* Run balance callbacks after we've adjusted the PI chain: */
+	balance_callbacks(rq, head);
+	preempt_enable();
+
+	return 0;
+
+unlock:
+	task_rq_unlock(rq, p, &rf);
+	if (cpuset_locked)
+		cpuset_unlock();
+	return retval;
+}
+
+/*
+ * Translate a legacy (policy, sched_param) request into a sched_attr and hand
+ * it to __sched_setscheduler(); @check selects the permission checks.
+ */
+static int _sched_setscheduler(struct task_struct *p, int policy,
+			       const struct sched_param *param, bool check)
+{
+	/* The legacy SCHED_RESET_ON_FORK hack: the flag is OR-ed into @policy. */
+	const bool legacy_reset = (policy != SETPARAM_POLICY) &&
+				  (policy & SCHED_RESET_ON_FORK);
+	struct sched_attr attr = {
+		.sched_priority = param->sched_priority,
+		.sched_nice	= PRIO_TO_NICE(p->static_prio),
+	};
+
+	if (legacy_reset) {
+		policy &= ~SCHED_RESET_ON_FORK;
+		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+	}
+	attr.sched_policy = policy;
+
+	/* Carry over a custom slice so it is not seen as a change. */
+	if (p->se.custom_slice)
+		attr.sched_runtime = p->se.slice;
+
+	return __sched_setscheduler(p, &attr, check, true);
+}
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Use sched_set_fifo(), read its comment.
+ *
+ * Return: 0 on success. An error code otherwise.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ /* check == true: the request is subject to the permission checks. */
+ return _sched_setscheduler(p, policy, param, true);
+}
+
+/*
+ * sched_setattr - apply the extended attributes in @attr to @p.
+ *
+ * Calls __sched_setscheduler() with user == true and pi == true.
+ *
+ * Return: the value returned by __sched_setscheduler().
+ */
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, true, true);
+}
+
+/*
+ * sched_setattr_nocheck - like sched_setattr(), but calls
+ * __sched_setscheduler() with user == false (pi stays true).
+ *
+ * Return: the value returned by __sched_setscheduler().
+ */
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, false, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ /* check == false: skip the permission checks. */
+ return _sched_setscheduler(p, policy, param, false);
+}
+
+/*
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
+ * incapable of resource management, which is the one thing an OS really should
+ * be doing.
+ *
+ * This is of course the reason it is limited to privileged users only.
+ *
+ * Worse still; it is fundamentally impossible to compose static priority
+ * workloads. You cannot take two correctly working static prio workloads
+ * and smash them together and still expect them to work.
+ *
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
+ *
+ * MAX_RT_PRIO / 2
+ *
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
+ * know enough information to make a sensible choice.
+ */
+void sched_set_fifo(struct task_struct *p)
+{
+	/* Kernel-created FIFO tasks sit in the middle of the RT range. */
+	struct sched_param mid_prio = { .sched_priority = MAX_RT_PRIO / 2 };
+	int err = sched_setscheduler_nocheck(p, SCHED_FIFO, &mid_prio);
+
+	WARN_ON_ONCE(err != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo);
+
+/*
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
+ */
+void sched_set_fifo_low(struct task_struct *p)
+{
+	/* Priority 1 is the lowest value this file accepts for SCHED_FIFO. */
+	struct sched_param low_prio = { .sched_priority = 1 };
+	int err = sched_setscheduler_nocheck(p, SCHED_FIFO, &low_prio);
+
+	WARN_ON_ONCE(err != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+
+/*
+ * Switch @p to SCHED_NORMAL at nice level @nice, bypassing the permission
+ * checks; a failure only triggers a one-time warning.
+ */
+void sched_set_normal(struct task_struct *p, int nice)
+{
+	struct sched_attr normal_attr = {
+		.sched_policy = SCHED_NORMAL,
+		.sched_nice = nice,
+	};
+	int err = sched_setattr_nocheck(p, &normal_attr);
+
+	WARN_ON_ONCE(err != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_normal);
+
+/*
+ * Shared body of the sched_setscheduler() and sched_setparam() syscalls:
+ * copy the user's sched_param, look the task up by @pid and apply the change.
+ */
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	struct sched_param kparam;
+
+	if (pid < 0)
+		return -EINVAL;
+	if (!param)
+		return -EINVAL;
+
+	if (copy_from_user(&kparam, param, sizeof(kparam)))
+		return -EFAULT;
+
+	CLASS(find_get_task, p)(pid);
+	if (!p)
+		return -ESRCH;
+
+	return sched_setscheduler(p, policy, &kparam);
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
+{
+	u32 usize;
+	int err;
+
+	/* Start from an all-zero structure so a short copy leaves zeroes: */
+	memset(attr, 0, sizeof(*attr));
+
+	err = get_user(usize, &uattr->size);
+	if (err)
+		return err;
+
+	/* ABI compatibility quirk: a size of 0 means the first version. */
+	if (usize == 0)
+		usize = SCHED_ATTR_SIZE_VER0;
+	if (usize > PAGE_SIZE || usize < SCHED_ATTR_SIZE_VER0)
+		goto err_size;
+
+	err = copy_struct_from_user(attr, sizeof(*attr), uattr, usize);
+	if (err == -E2BIG)
+		goto err_size;
+	if (err)
+		return err;
+
+	/* Clamp flags need a structure of at least SCHED_ATTR_SIZE_VER1. */
+	if (usize < SCHED_ATTR_SIZE_VER1 &&
+	    (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))
+		return -EINVAL;
+
+	/*
+	 * XXX: Do we want to be lenient like existing syscalls; or do we want
+	 * to be strict and return an error on out-of-bounds values?
+	 */
+	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
+	return 0;
+
+err_size:
+	/* Report the size this kernel expects back to the caller. */
+	put_user(sizeof(*attr), &uattr->size);
+	return -E2BIG;
+}
+
+/*
+ * Fill @attr with the current parameters of @p that belong to its policy:
+ * deadline parameters, RT priority, or nice level plus slice.
+ */
+static void get_params(struct task_struct *p, struct sched_attr *attr)
+{
+	if (task_has_dl_policy(p)) {
+		__getparam_dl(p, attr);
+		return;
+	}
+
+	if (task_has_rt_policy(p)) {
+		attr->sched_priority = p->rt_priority;
+		return;
+	}
+
+	attr->sched_nice = task_nice(p);
+	attr->sched_runtime = p->se.slice;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
+{
+	/* Negative policies are refused before the shared helper runs. */
+	return policy < 0 ? -EINVAL : do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+ /*
+ * NOTE(review): SETPARAM_POLICY is presumably the negative sentinel
+ * that makes __sched_setscheduler() keep the task's current policy;
+ * confirm against its definition.
+ */
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+			       unsigned int, flags)
+{
+	struct sched_attr kattr;
+	int err;
+
+	/* @flags is reserved: any set bit is an error. */
+	if (flags || pid < 0 || !uattr)
+		return -EINVAL;
+
+	err = sched_copy_attr(uattr, &kattr);
+	if (err)
+		return err;
+
+	if ((int)kattr.sched_policy < 0)
+		return -EINVAL;
+
+	/* Keeping the policy is expressed through SETPARAM_POLICY. */
+	if (kattr.sched_flags & SCHED_FLAG_KEEP_POLICY)
+		kattr.sched_policy = SETPARAM_POLICY;
+
+	CLASS(find_get_task, p)(pid);
+	if (!p)
+		return -ESRCH;
+
+	/* Keeping the parameters: preload them from the task itself. */
+	if (kattr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+		get_params(p, &kattr);
+
+	return sched_setattr(p, &kattr);
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+	struct task_struct *p;
+	int retval;
+
+	if (pid < 0)
+		return -EINVAL;
+
+	/* The task is looked up and read with the RCU guard held. */
+	guard(rcu)();
+	p = find_process_by_pid(pid);
+	if (!p)
+		return -ESRCH;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		return retval;
+
+	/* Report the policy, with the reset-on-fork bit OR-ed in if set. */
+	retval = p->policy;
+	if (p->sched_reset_on_fork)
+		retval |= SCHED_RESET_ON_FORK;
+
+	return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+ /* Tasks without an RT policy report a priority of 0. */
+ struct sched_param lp = { .sched_priority = 0 };
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+
+ /* The task lookup and the reads from it happen inside the RCU guard. */
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ }
+
+ /*
+ * copy_to_user() might sleep, so it is done only after the RCU guard
+ * scope above has been left.
+ */
+ return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @usize: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, usize, unsigned int, flags)
+{
+ /* Zero-initialized so fields that are not filled below read as 0. */
+ struct sched_attr kattr = { };
+ struct task_struct *p;
+ int retval;
+
+ /* @usize must lie within [SCHED_ATTR_SIZE_VER0, PAGE_SIZE]; @flags must be 0. */
+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags)
+ return -EINVAL;
+
+ /* Everything read from the task is gathered inside the RCU guard. */
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ kattr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ get_params(p, &kattr);
+ /* Only flags within SCHED_FLAG_ALL are reported to the caller. */
+ kattr.sched_flags &= SCHED_FLAG_ALL;
+
+#ifdef CONFIG_UCLAMP_TASK
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+#endif
+ }
+
+ /* The reported size is the smaller of the user and kernel sizes. */
+ kattr.size = min(usize, sizeof(kattr));
+ return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Check whether @mask is an acceptable affinity for the deadline task @p.
+ *
+ * Return: 0 when the change is allowed or not relevant, -EBUSY when @mask
+ * does not cover the span of the task's root domain.
+ */
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
+{
+	/*
+	 * Affinity changes are of no concern when the task isn't a deadline
+	 * task or admission control is disabled.
+	 */
+	if (!task_has_dl_policy(p))
+		return 0;
+	if (!dl_bandwidth_enabled())
+		return 0;
+
+	/*
+	 * The special/sugov task isn't part of regular bandwidth/admission
+	 * control so let userspace change affinities.
+	 */
+	if (dl_entity_is_special(&p->dl))
+		return 0;
+
+	/*
+	 * Bandwidth control happens on a root_domain basis, so with the
+	 * admission test enabled only -deadline tasks allowed to run on all
+	 * the CPUs of their root_domain are admitted.
+	 */
+	guard(rcu)();
+	return cpumask_subset(task_rq(p)->rd->span, mask) ? 0 : -EBUSY;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * __sched_setaffinity - apply the affinity request in @ctx to @p.
+ *
+ * The requested mask is intersected with the CPUs the task's cpuset allows,
+ * checked against the deadline rules, and installed with
+ * __set_cpus_allowed_ptr(). The cpuset is then read again to detect a
+ * concurrent update.
+ *
+ * @ctx is modified: ctx->new_mask is pointed at a temporary mask and
+ * SCA_CHECK is OR-ed into ctx->flags. The temporary mask is freed before
+ * returning, so ctx->new_mask must not be used by the caller afterwards.
+ *
+ * Return: 0 on success, -ENOMEM if a temporary mask cannot be allocated,
+ * the error from dl_task_check_affinity() or __set_cpus_allowed_ptr(), or
+ * -EINVAL when the cpuset changed underneath us.
+ */
+int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
+{
+ int retval;
+ cpumask_var_t cpus_allowed, new_mask;
+
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+
+ /* Restrict the request to what the cpuset currently allows. */
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+ ctx->new_mask = new_mask;
+ ctx->flags |= SCA_CHECK;
+
+ retval = dl_task_check_affinity(p, new_mask);
+ if (retval)
+ goto out_free_new_mask;
+
+ retval = __set_cpus_allowed_ptr(p, ctx);
+ if (retval)
+ goto out_free_new_mask;
+
+ /* Re-read the cpuset: it may have changed since the first read. */
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset update.
+ * Just reset the cpumask to the cpuset's cpus_allowed.
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+
+ /*
+ * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
+ * will restore the previous user_cpus_ptr value.
+ *
+ * In the unlikely event a previous user_cpus_ptr exists,
+ * we need to further restrict the mask to what is allowed
+ * by that old user_cpus_ptr.
+ */
+ if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
+ bool empty = !cpumask_and(new_mask, new_mask,
+ ctx->user_mask);
+
+ /* An empty intersection falls back to the cpuset mask. */
+ if (empty)
+ cpumask_copy(new_mask, cpus_allowed);
+ }
+ /* The result of this second call is not checked; -EINVAL is reported. */
+ __set_cpus_allowed_ptr(p, ctx);
+ retval = -EINVAL;
+ }
+
+out_free_new_mask:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+ return retval;
+}
+
+/*
+ * sched_setaffinity - set the CPU affinity of the task identified by @pid.
+ * @pid: pid used to look the task up.
+ * @in_mask: requested CPU mask.
+ *
+ * Return: 0 on success; -ESRCH if no task is found, -EINVAL if the task has
+ * PF_NO_SETAFFINITY set, -EPERM if the caller is neither the same owner nor
+ * holds CAP_SYS_NICE in the task's user namespace, -ENOMEM on allocation
+ * failure, or the error from the security hook or __sched_setaffinity().
+ */
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ struct affinity_context ac;
+ struct cpumask *user_mask;
+ int retval;
+
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
+
+ if (p->flags & PF_NO_SETAFFINITY)
+ return -EINVAL;
+
+ if (!check_same_owner(p)) {
+ /* The RCU guard covers the __task_cred() access in this scope. */
+ guard(rcu)();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+ return -EPERM;
+ }
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+
+ /*
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
+ * alloc_user_cpus_ptr() returns NULL.
+ */
+ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ if (user_mask) {
+ cpumask_copy(user_mask, in_mask);
+ } else if (IS_ENABLED(CONFIG_SMP)) {
+ return -ENOMEM;
+ }
+
+ ac = (struct affinity_context){
+ .new_mask = in_mask,
+ .user_mask = user_mask,
+ .flags = SCA_USER,
+ };
+
+ retval = __sched_setaffinity(p, &ac);
+ /*
+ * Free whatever ac.user_mask points at after the call.
+ * NOTE(review): presumably the callee may swap this pointer for the
+ * task's previous user mask; confirm in __set_cpus_allowed_ptr().
+ */
+ kfree(ac.user_mask);
+
+ return retval;
+}
+
+/*
+ * Copy a user-supplied CPU bitmap of @len bytes into @new_mask. A longer
+ * bitmap is truncated to cpumask_size(); for a shorter one the mask is
+ * cleared first so the bytes that are not copied stay zero.
+ *
+ * Return: 0 on success, -EFAULT if the user memory cannot be read.
+ */
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+			     struct cpumask *new_mask)
+{
+	if (len > cpumask_size())
+		len = cpumask_size();
+	else if (len < cpumask_size())
+		cpumask_clear(new_mask);
+
+	if (copy_from_user(new_mask, user_mask_ptr, len))
+		return -EFAULT;
+
+	return 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new CPU mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	cpumask_var_t kmask;
+	int err;
+
+	if (!alloc_cpumask_var(&kmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	/* Fetch the user's bitmap; apply it only if the copy worked. */
+	err = get_user_cpu_mask(user_mask_ptr, len, kmask);
+	if (err)
+		goto out_free;
+
+	err = sched_setaffinity(pid, kmask);
+
+out_free:
+	free_cpumask_var(kmask);
+	return err;
+}
+
+/*
+ * sched_getaffinity - read the CPU affinity of the task identified by @pid.
+ * @pid: pid used to look the task up.
+ * @mask: output; receives the task's cpus_mask ANDed with cpu_active_mask.
+ *
+ * Return: 0 on success, -ESRCH if no task is found, or the error returned
+ * by security_task_getscheduler().
+ */
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
+{
+ struct task_struct *p;
+ int retval;
+
+ /* The RCU guard stays in effect until the function returns. */
+ guard(rcu)();
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ /* p->cpus_mask is read with p->pi_lock held. */
+ guard(raw_spinlock_irqsave)(&p->pi_lock);
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
+
+ return 0;
+}
+
+/**
+ * sys_sched_getaffinity - get the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current CPU mask
+ *
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	cpumask_var_t kmask;
+	unsigned int retlen;
+	int ret;
+
+	/*
+	 * The user buffer must have a bit for every possible CPU id and be
+	 * a whole number of unsigned longs.
+	 */
+	if ((len * BITS_PER_BYTE) < nr_cpu_ids ||
+	    (len & (sizeof(unsigned long)-1)))
+		return -EINVAL;
+
+	if (!zalloc_cpumask_var(&kmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = sched_getaffinity(pid, kmask);
+	if (ret)
+		goto out_free;
+
+	/* Never copy more than the kernel's own mask size. */
+	retlen = min(len, cpumask_size());
+	if (copy_to_user(user_mask_ptr, cpumask_bits(kmask), retlen))
+		ret = -EFAULT;
+	else
+		ret = retlen;
+
+out_free:
+	free_cpumask_var(kmask);
+	return ret;
+}
+
+/*
+ * Common body of sys_sched_yield() and yield(): run the current scheduling
+ * class's yield_task() hook on this CPU's runqueue, then call schedule().
+ */
+static void do_sched_yield(void)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = this_rq_lock_irq(&rf);
+
+ schedstat_inc(rq->yld_count);
+ current->sched_class->yield_task(rq);
+
+ /*
+ * Preemption is disabled before the rq lock is dropped and re-enabled
+ * through the _no_resched variant.
+ * NOTE(review): presumably so that the only reschedule is the explicit
+ * schedule() below; confirm.
+ */
+ preempt_disable();
+ rq_unlock_irq(rq, &rf);
+ sched_preempt_enable_no_resched();
+
+ schedule();
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+ /* All the work is in the helper shared with yield(); always returns 0. */
+ do_sched_yield();
+ return 0;
+}
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, it's already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
+ */
+void __sched yield(void)
+{
+ /* Put the task state back to TASK_RUNNING before yielding. */
+ set_current_state(TASK_RUNNING);
+ do_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Return:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
+ */
+int __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *curr = current;
+ struct rq *rq, *p_rq;
+ int yielded = 0;
+
+ /* Everything up to the final schedule() runs with p->pi_lock held. */
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1)
+ return -ESRCH;
+
+ guard(double_rq_lock)(rq, p_rq);
+ /* @p moved to another rq while the locks were taken: retry. */
+ if (task_rq(p) != p_rq)
+ goto again;
+
+ /* The current class must implement yield_to_task() ... */
+ if (!curr->sched_class->yield_to_task)
+ return 0;
+
+ /* ... and both tasks must belong to the same class. */
+ if (curr->sched_class != p->sched_class)
+ return 0;
+
+ /* Nothing to do if @p is already on a CPU or is not runnable. */
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
+ return 0;
+
+ yielded = curr->sched_class->yield_to_task(rq, p);
+ if (yielded) {
+ schedstat_inc(rq->yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity
+ * takes care of fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+ }
+ }
+
+ /* Reschedule only after the guarded scope above has been left. */
+ if (yielded)
+ schedule();
+
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		/* The RT policies go up to MAX_RT_PRIO-1. */
+		return MAX_RT_PRIO-1;
+	case SCHED_DEADLINE:
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_IDLE:
+	case SCHED_EXT:
+		/* These policies do not use an RT priority. */
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		/* The RT policies start at priority 1. */
+		return 1;
+	case SCHED_DEADLINE:
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_IDLE:
+	case SCHED_EXT:
+		/* These policies do not use an RT priority. */
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Look up the task identified by @pid and store its time slice in @t.
+ *
+ * The slice comes from the task's scheduling class get_rr_interval() hook
+ * and is converted from jiffies; it stays 0 when the class has no such hook.
+ *
+ * Return: 0 on success, -EINVAL for a negative @pid, -ESRCH if no task is
+ * found, or the error returned by security_task_getscheduler().
+ */
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
+{
+ unsigned int time_slice = 0;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ scoped_guard (rcu) {
+ struct task_struct *p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ /* The hook is called with the task's rq lock held. */
+ scoped_guard (task_rq_lock, p) {
+ struct rq *rq = scope.rq;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
+ }
+ }
+
+ /* The conversion happens after both guards have been released. */
+ jiffies_to_timespec64(time_slice, t);
+ return 0;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default time-slice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the time-slice value.
+ *
+ * this syscall writes the default time-slice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the time-slice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+		struct __kernel_timespec __user *, interval)
+{
+	struct timespec64 slice;
+	int err;
+
+	err = sched_rr_get_interval(pid, &slice);
+	if (err)
+		return err;
+
+	/* Hand the value to user space in the 64-bit timespec layout. */
+	return put_timespec64(&slice, interval);
+}
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
+		struct old_timespec32 __user *, interval)
+{
+	struct timespec64 slice;
+	int err;
+
+	err = sched_rr_get_interval(pid, &slice);
+	if (err)
+		return err;
+
+	/* Same as the native syscall, but with the old 32-bit layout. */
+	return put_old_timespec32(&slice, interval);
+}
+#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ba81187bb7af..c49aea8c1025 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,7 +2,8 @@
/*
* Scheduler topology setup/handling methods
*/
-#include "sched.h"
+
+#include <linux/bsearch.h>
DEFINE_MUTEX(sched_domains_mutex);
@@ -14,21 +15,29 @@ static cpumask_var_t sched_domains_tmpmask2;
static int __init sched_debug_setup(char *str)
{
- sched_debug_enabled = true;
+ sched_debug_verbose = true;
return 0;
}
-early_param("sched_debug", sched_debug_setup);
+early_param("sched_verbose", sched_debug_setup);
static inline bool sched_debug(void)
{
- return sched_debug_enabled;
+ return sched_debug_verbose;
}
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
+ unsigned long flags = sd->flags;
+ unsigned int idx;
cpumask_clear(groupmask);
@@ -43,6 +52,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
}
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ unsigned int flag = BIT(idx);
+ unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+ if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+ !(sd->child->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+ sd_flag_debug[idx].name);
+
+ if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+ !(sd->parent->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+ sd_flag_debug[idx].name);
+ }
+
printk(KERN_DEBUG "%*s groups:", level + 1, "");
do {
if (!group) {
@@ -51,7 +75,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!cpumask_weight(sched_group_span(group))) {
+ if (cpumask_empty(sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
break;
@@ -108,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
int level = 0;
- if (!sched_debug_enabled)
+ if (!sched_debug_verbose)
return;
if (!sd) {
@@ -129,7 +153,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
}
#else /* !CONFIG_SCHED_DEBUG */
-# define sched_debug_enabled 0
+# define sched_debug_verbose 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
@@ -137,22 +161,22 @@ static inline bool sched_debug(void)
}
#endif /* CONFIG_SCHED_DEBUG */
+/*
+ * Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag.
+ *
+ * Each SD_FLAG() invocation pulled in from the included header expands to
+ * "(flag value or 0) |", so the initializer becomes an OR-chain which the
+ * trailing "0;" terminates.
+ */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
static int sd_degenerate(struct sched_domain *sd)
{
if (cpumask_weight(sched_domain_span(sd)) == 1)
return 1;
/* Following flags need at least 2 groups */
- if (sd->flags & (SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
- if (sd->groups != sd->groups->next)
- return 0;
- }
+ if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+ (sd->groups != sd->groups->next))
+ return 0;
/* Following flags don't use groups */
if (sd->flags & (SD_WAKE_AFFINE))
@@ -173,18 +197,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 0;
/* Flags needing groups don't count if only 1 group in parent */
- if (parent->groups == parent->groups->next) {
- pflags &= ~(SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
- if (nr_node_ids == 1)
- pflags &= ~SD_SERIALIZE;
- }
+ if (parent->groups == parent->groups->next)
+ pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
if (~cflags & pflags)
return 0;
@@ -193,12 +208,84 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-unsigned int sysctl_sched_energy_aware = 1;
-DEFINE_MUTEX(sched_energy_mutex);
-bool sched_energy_update;
+static unsigned int sysctl_sched_energy_aware = 1;
+static DEFINE_MUTEX(sched_energy_mutex);
+static bool sched_energy_update;
+
+/*
+ * sched_is_eas_possible - check whether EAS can be used on @cpu_mask.
+ * @cpu_mask: CPUs to check.
+ *
+ * Returns true only if all of the following hold:
+ *  - at least one CPU of @cpu_mask has a sd_asym_cpucapacity domain;
+ *  - SMT is not active;
+ *  - arch_scale_freq_invariant() reports support;
+ *  - every CPU of @cpu_mask has a cpufreq policy driven by schedutil.
+ *
+ * The reason for a failed check is logged only when sched_debug() is true.
+ */
+static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
+{
+	bool any_asym_capacity = false;
+	struct cpufreq_policy *policy;
+	struct cpufreq_governor *gov;
+	int i;
+
+	/* EAS is enabled for asymmetric CPU capacity topologies. */
+	for_each_cpu(i, cpu_mask) {
+		if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) {
+			any_asym_capacity = true;
+			break;
+		}
+	}
+	if (!any_asym_capacity) {
+		if (sched_debug()) {
+			pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n",
+				cpumask_pr_args(cpu_mask));
+		}
+		return false;
+	}
+
+	/* EAS definitely does *not* handle SMT */
+	if (sched_smt_active()) {
+		if (sched_debug()) {
+			pr_info("rd %*pbl: Checking EAS, SMT is not supported\n",
+				cpumask_pr_args(cpu_mask));
+		}
+		return false;
+	}
+
+	if (!arch_scale_freq_invariant()) {
+		if (sched_debug()) {
+			/* Terminate the message like the other ones in this function. */
+			pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported\n",
+				cpumask_pr_args(cpu_mask));
+		}
+		return false;
+	}
+
+	/* Do not attempt EAS if schedutil is not being used. */
+	for_each_cpu(i, cpu_mask) {
+		policy = cpufreq_cpu_get(i);
+		if (!policy) {
+			if (sched_debug()) {
+				pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d\n",
+					cpumask_pr_args(cpu_mask), i);
+			}
+			return false;
+		}
+		/* Only the governor pointer is needed; drop the policy reference right away. */
+		gov = policy->governor;
+		cpufreq_cpu_put(policy);
+		if (gov != &schedutil_gov) {
+			if (sched_debug()) {
+				pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
+					cpumask_pr_args(cpu_mask));
+			}
+			return false;
+		}
+	}
+
+	return true;
+}
+/*
+ * Rebuild the sched domains with sched_energy_update set for the duration
+ * of the rebuild. sched_energy_mutex is held across the whole sequence.
+ */
+void rebuild_sched_domains_energy(void)
+{
+ mutex_lock(&sched_energy_mutex);
+ sched_energy_update = true;
+ rebuild_sched_domains();
+ sched_energy_update = false;
+ mutex_unlock(&sched_energy_mutex);
+}
#ifdef CONFIG_PROC_SYSCTL
-int sched_energy_aware_handler(struct ctl_table *table, int write,
+static int sched_energy_aware_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, state;
@@ -206,20 +293,44 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!sched_is_eas_possible(cpu_active_mask)) {
+ if (write) {
+ return -EOPNOTSUPP;
+ } else {
+ *lenp = 0;
+ return 0;
+ }
+ }
+
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
state = static_branch_unlikely(&sched_energy_present);
- if (state != sysctl_sched_energy_aware) {
- mutex_lock(&sched_energy_mutex);
- sched_energy_update = 1;
- rebuild_sched_domains();
- sched_energy_update = 0;
- mutex_unlock(&sched_energy_mutex);
- }
+ if (state != sysctl_sched_energy_aware)
+ rebuild_sched_domains_energy();
}
return ret;
}
+/*
+ * sysctl table for "sched_energy_aware": an unsigned int backed by
+ * sysctl_sched_energy_aware, mode 0644, handled by
+ * sched_energy_aware_handler() with SYSCTL_ZERO/SYSCTL_ONE as
+ * extra1/extra2.
+ */
+static const struct ctl_table sched_energy_aware_sysctls[] = {
+ {
+ .procname = "sched_energy_aware",
+ .data = &sysctl_sched_energy_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_energy_aware_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+/*
+ * Register sched_energy_aware_sysctls under "kernel". Hooked up as a
+ * late initcall below; always returns 0.
+ */
+static int __init sched_energy_aware_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_energy_aware_sysctls);
+ return 0;
+}
+
+late_initcall(sched_energy_aware_sysctl_init);
#endif
static void free_pd(struct perf_domain *pd)
@@ -272,10 +383,10 @@ static void perf_domain_debug(const struct cpumask *cpu_map,
printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
while (pd) {
- printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+ printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
cpumask_first(perf_domain_span(pd)),
cpumask_pr_args(perf_domain_span(pd)),
- em_pd_nr_cap_states(pd->em_pd));
+ em_pd_nr_perf_states(pd->em_pd));
pd = pd->next;
}
@@ -308,94 +419,33 @@ static void sched_energy_set(bool has_eas)
* 1. an Energy Model (EM) is available;
* 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
* 3. no SMT is detected.
- * 4. the EM complexity is low enough to keep scheduling overheads low;
- * 5. schedutil is driving the frequency of all CPUs of the rd;
- *
- * The complexity of the Energy Model is defined as:
- *
- * C = nr_pd * (nr_cpus + nr_cs)
- *
- * with parameters defined as:
- * - nr_pd: the number of performance domains
- * - nr_cpus: the number of CPUs
- * - nr_cs: the sum of the number of capacity states of all performance
- * domains (for example, on a system with 2 performance domains,
- * with 10 capacity states each, nr_cs = 2 * 10 = 20).
- *
- * It is generally not a good idea to use such a model in the wake-up path on
- * very complex platforms because of the associated scheduling overheads. The
- * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 capacity states each, for example.
+ * 4. schedutil is driving the frequency of all CPUs of the rd;
+ * 5. frequency invariance support is present;
*/
-#define EM_MAX_COMPLEXITY 2048
-
-extern struct cpufreq_governor schedutil_gov;
static bool build_perf_domains(const struct cpumask *cpu_map)
{
- int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+ int i;
struct perf_domain *pd = NULL, *tmp;
int cpu = cpumask_first(cpu_map);
struct root_domain *rd = cpu_rq(cpu)->rd;
- struct cpufreq_policy *policy;
- struct cpufreq_governor *gov;
if (!sysctl_sched_energy_aware)
goto free;
- /* EAS is enabled for asymmetric CPU capacity topologies. */
- if (!per_cpu(sd_asym_cpucapacity, cpu)) {
- if (sched_debug()) {
- pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
- cpumask_pr_args(cpu_map));
- }
+ if (!sched_is_eas_possible(cpu_map))
goto free;
- }
-
- /* EAS definitely does *not* handle SMT */
- if (sched_smt_active()) {
- pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
- cpumask_pr_args(cpu_map));
- goto free;
- }
for_each_cpu(i, cpu_map) {
/* Skip already covered CPUs. */
if (find_pd(pd, i))
continue;
- /* Do not attempt EAS if schedutil is not being used. */
- policy = cpufreq_cpu_get(i);
- if (!policy)
- goto free;
- gov = policy->governor;
- cpufreq_cpu_put(policy);
- if (gov != &schedutil_gov) {
- if (rd->pd)
- pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
- cpumask_pr_args(cpu_map));
- goto free;
- }
-
/* Create the new pd and add it to the local list. */
tmp = pd_init(i);
if (!tmp)
goto free;
tmp->next = pd;
pd = tmp;
-
- /*
- * Count performance domains and capacity states for the
- * complexity check.
- */
- nr_pd++;
- nr_cs += em_pd_nr_cap_states(pd->em_pd);
- }
-
- /* Bail out if the Energy Model complexity is too high. */
- if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
- WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
- cpumask_pr_args(cpu_map));
- goto free;
}
perf_domain_debug(cpu_map, pd);
@@ -438,9 +488,9 @@ static void free_rootdomain(struct rcu_head *rcu)
void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
struct root_domain *old_rd = NULL;
- unsigned long flags;
+ struct rq_flags rf;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
old_rd = rq->rd;
@@ -451,7 +501,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpumask_clear_cpu(rq->cpu, old_rd->span);
/*
- * If we dont want to free the old_rd yet then
+ * If we don't want to free the old_rd yet then
* set old_rd to NULL to skip the freeing later
* in this function:
*/
@@ -466,7 +516,15 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ /*
+ * Because the rq is not a task, dl_add_task_root_domain() did not
+ * move the fair server bw to the rd if it already started.
+ * Add it now.
+ */
+ if (rq->fair_server.dl_server)
+ __dl_server_attach_root(&rq->fair_server, rq);
+
+ rq_unlock_irqrestore(rq, &rf);
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
@@ -499,9 +557,10 @@ static int init_rootdomain(struct root_domain *rd)
#ifdef HAVE_RT_PUSH_IPI
rd->rto_cpu = -1;
raw_spin_lock_init(&rd->rto_lock);
- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+ rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
#endif
+ rd->visit_gen = 0;
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
@@ -530,7 +589,7 @@ out:
*/
struct root_domain def_root_domain;
-void init_defrootdomain(void)
+void __init init_defrootdomain(void)
{
init_rootdomain(&def_root_domain);
@@ -605,22 +664,25 @@ static void destroy_sched_domains(struct sched_domain *sd)
}
/*
- * Keep a special pointer to the highest sched_domain that has
- * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
- * allows us to avoid some pointer chasing select_idle_sibling().
+ * Keep a special pointer to the highest sched_domain that has SD_SHARE_LLC set
+ * (Last Level Cache Domain), for this allows us to avoid some pointer chasing
+ * in select_idle_sibling().
*
- * Also keep a unique ID per domain (we use the first CPU number in
- * the cpumask of the domain), this allows us to quickly tell if
- * two CPUs are in the same cache domain, see cpus_share_cache().
+ * Also keep a unique ID per domain (we use the first CPU number in the cpumask
+ * of the domain), this allows us to quickly tell if two CPUs are in the same
+ * cache domain, see cpus_share_cache().
*/
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_share_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
static void update_top_cache_domain(int cpu)
{
@@ -629,7 +691,7 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
int size = 1;
- sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ sd = highest_flag_domain(cpu, SD_SHARE_LLC);
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
@@ -641,13 +703,24 @@ static void update_top_cache_domain(int cpu)
per_cpu(sd_llc_id, cpu) = id;
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+ sd = lowest_flag_domain(cpu, SD_CLUSTER);
+ if (sd)
+ id = cpumask_first(sched_domain_span(sd));
+
+ /*
+ * This assignment should be placed after the sd_llc_id one, as
+ * we want this id to equal the cluster id on cluster machines
+ * but the LLC id on non-cluster machines.
+ */
+ per_cpu(sd_share_id, cpu) = id;
+
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
- sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
+ sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}
@@ -669,8 +742,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
- if (parent->parent)
+
+ if (parent->parent) {
parent->parent->child = tmp;
+ parent->parent->groups->flags = tmp->flags;
+ }
+
/*
* Transfer SD_PREFER_SIBLING down in case of a
* degenerate parent; the spans match for this
@@ -687,8 +764,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp = sd;
sd = sd->parent;
destroy_sched_domain(tmp);
- if (sd)
+ if (sd) {
+ struct sched_group *sg = sd->groups;
+
+ /*
+ * sched groups hold the flags of the child sched
+ * domain for convenience. Clear such flags since
+ * the child is being destroyed.
+ */
+ do {
+ sg->flags = 0;
+ } while (sg != sd->groups);
+
sd->child = NULL;
+ }
}
sched_domain_debug(sd, cpu);
@@ -884,10 +973,12 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
return NULL;
sg_span = sched_group_span(sg);
- if (sd->child)
+ if (sd->child) {
cpumask_copy(sg_span, sched_domain_span(sd->child));
- else
+ sg->flags = sd->child->flags;
+ } else {
cpumask_copy(sg_span, sched_domain_span(sd));
+ }
atomic_inc(&sg->ref);
return sg;
@@ -902,7 +993,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
int cpu;
build_balance_mask(sd, sg, mask);
- cpu = cpumask_first_and(sched_group_span(sg), mask);
+ cpu = cpumask_first(mask);
sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
if (atomic_inc_return(&sg->sgc->ref) == 1)
@@ -921,6 +1012,31 @@ static void init_overlap_sched_group(struct sched_domain *sd,
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}
+/*
+ * Walk down from @sibling to the descendant domain that should be used to
+ * build a group for @sd, and return it.
+ */
+static struct sched_domain *
+find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
+{
+	struct sched_domain *child;
+
+	/*
+	 * The proper descendant is the one whose child no longer spans
+	 * outside of sd: keep descending while the child still does.
+	 */
+	for (child = sibling->child; child; child = sibling->child) {
+		if (cpumask_subset(sched_domain_span(child),
+				   sched_domain_span(sd)))
+			break;
+		sibling = child;
+	}
+
+	/*
+	 * The sgc is referenced across different topology levels, so also
+	 * step over sched_domains whose child has the very same span: they
+	 * don't contribute to scheduling because they will be degenerated
+	 * in cpu_attach_domain.
+	 */
+	for (child = sibling->child; child; child = sibling->child) {
+		if (!cpumask_equal(sched_domain_span(child),
+				   sched_domain_span(sibling)))
+			break;
+		sibling = child;
+	}
+
+	return sibling;
+}
+
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
@@ -954,6 +1070,41 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
+ /*
+ * Usually we build sched_group by sibling's child sched_domain.
+ * But for machines whose NUMA diameter is 3 or above, we move
+ * to build sched_group by sibling's proper descendant's child
+ * domain because sibling's child sched_domain will span out of
+ * the sched_domain being built as below.
+ *
+ * Smallest diameter=3 topology is:
+ *
+ * node 0 1 2 3
+ * 0: 10 20 30 40
+ * 1: 20 10 20 30
+ * 2: 30 20 10 20
+ * 3: 40 30 20 10
+ *
+ * 0 --- 1 --- 2 --- 3
+ *
+ * NUMA-3 0-3 N/A N/A 0-3
+ * groups: {0-2},{1-3} {1-3},{0-2}
+ *
+ * NUMA-2 0-2 0-3 0-3 1-3
+ * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2}
+ *
+ * NUMA-1 0-1 0-2 1-3 2-3
+ * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2}
+ *
+ * NUMA-0 0 1 2 3
+ *
+ * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
+ * group span isn't a subset of the domain span.
+ */
+ if (sibling->child &&
+ !cpumask_subset(sched_domain_span(sibling->child), span))
+ sibling = find_descended_sibling(sd, sibling);
+
sg = build_group_from_child_sched_domain(sibling, cpu);
if (!sg)
goto fail;
@@ -961,7 +1112,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
sg_span = sched_group_span(sg);
cpumask_or(covered, covered, sg_span);
- init_overlap_sched_group(sd, sg);
+ init_overlap_sched_group(sibling, sg);
if (!first)
first = sg;
@@ -989,7 +1140,7 @@ fail:
*
* - Simultaneous multithreading (SMT)
* - Multi-Core Cache (MC)
- * - Package (DIE)
+ * - Package (PKG)
*
* Where the last one more or less denotes everything up to a NUMA node.
*
@@ -1011,13 +1162,13 @@ fail:
*
* CPU 0 1 2 3 4 5 6 7
*
- * DIE [ ]
+ * PKG [ ]
* MC [ ] [ ]
* SMT [ ] [ ] [ ] [ ]
*
* - or -
*
- * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * PKG 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
* MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
* SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
*
@@ -1033,7 +1184,7 @@ fail:
* uniquely identify each group (for a given domain):
*
* - The first is the balance_cpu (see should_we_balance() and the
- * load-balance blub in fair.c); for each group we only want 1 CPU to
+ * load-balance blurb in fair.c); for each group we only want 1 CPU to
* continue balancing at a higher domain.
*
* - The second is the sched_group_capacity; we want all identical groups
@@ -1077,6 +1228,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
if (child) {
cpumask_copy(sched_group_span(sg), sched_domain_span(child));
cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+ sg->flags = child->flags;
} else {
cpumask_set_cpu(cpu, sched_group_span(sg));
cpumask_set_cpu(cpu, group_balance_mask(sg));
@@ -1145,14 +1297,24 @@ build_sched_groups(struct sched_domain *sd, int cpu)
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
+ struct cpumask *mask = sched_domains_tmpmask2;
WARN_ON(!sg);
do {
- int cpu, max_cpu = -1;
+ int cpu, cores = 0, max_cpu = -1;
sg->group_weight = cpumask_weight(sched_group_span(sg));
+ cpumask_copy(mask, sched_group_span(sg));
+ for_each_cpu(cpu, mask) {
+ cores++;
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(mask, mask, cpu_smt_mask(cpu));
+#endif
+ }
+ sg->cores = cores;
+
if (!(sd->flags & SD_ASYM_PACKING))
goto next;
@@ -1175,6 +1337,123 @@ next:
}
/*
+ * Set of available CPUs grouped by their corresponding capacities
+ * Each list entry contains a CPU mask reflecting CPUs that share the same
+ * capacity.
+ * The lifespan of data is unlimited.
+ */
+LIST_HEAD(asym_cap_list);
+
+/*
+ * Classify the CPU capacity asymmetry seen by a sched domain.
+ *
+ * @sd_span: CPUs spanned by the domain.
+ * @cpu_map: CPUs the domains are being built for.
+ *
+ * Returns the sd_flags describing the asymmetry scope: 0 when fewer than two
+ * distinct capacities intersect @sd_span, SD_ASYM_CPUCAPACITY when at least
+ * two do but some capacity present in @cpu_map is not covered by @sd_span,
+ * and SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL otherwise.
+ */
+static inline int
+asym_cpu_capacity_classify(const struct cpumask *sd_span,
+			   const struct cpumask *cpu_map)
+{
+	struct asym_cap_data *cap;
+	int nr_seen = 0, nr_missed = 0;
+
+	/*
+	 * Walk the per-capacity CPU masks: a capacity is "seen" when the
+	 * domain span touches it, and "missed" when only cpu_map does.
+	 * Capacities touching neither (e.g. only offline CPUs) are skipped.
+	 */
+	list_for_each_entry(cap, &asym_cap_list, link) {
+		const struct cpumask *cap_span = cpu_capacity_span(cap);
+
+		if (cpumask_intersects(sd_span, cap_span))
+			nr_seen++;
+		else if (cpumask_intersects(cpu_map, cap_span))
+			nr_missed++;
+	}
+
+	WARN_ON_ONCE(!nr_seen && !list_empty(&asym_cap_list));
+
+	/* No asymmetry detected */
+	if (nr_seen < 2)
+		return 0;
+
+	/* Partial asymmetry if some available capacity was missed, full otherwise */
+	return nr_missed ? SD_ASYM_CPUCAPACITY
+			 : (SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL);
+}
+
+/* RCU callback: free the asym_cap_data that embeds @head as its ->rcu member. */
+static void free_asym_cap_entry(struct rcu_head *head)
+{
+	kfree(container_of(head, struct asym_cap_data, rcu));
+}
+/*
+ * Record @cpu in the asym_cap_list entry matching its
+ * arch_scale_cpu_capacity() value, allocating and inserting a new entry
+ * if none matches. On allocation failure a one-time warning is emitted
+ * and @cpu is not recorded.
+ */
+static inline void asym_cpu_capacity_update_data(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+ struct asym_cap_data *insert_entry = NULL;
+ struct asym_cap_data *entry;
+
+ /*
+ * Search if the capacity already exists. If not, track the entry
+ * after which we should insert to keep the list ordered descending:
+ * the predecessor of the first entry with a smaller capacity.
+ */
+ list_for_each_entry(entry, &asym_cap_list, link) {
+ if (capacity == entry->capacity)
+ goto done;
+ else if (!insert_entry && capacity > entry->capacity)
+ insert_entry = list_prev_entry(entry, link);
+ }
+
+ entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
+ if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
+ return;
+ entry->capacity = capacity;
+
+ /* If NULL then the new capacity is the smallest, add last. */
+ if (!insert_entry)
+ list_add_tail_rcu(&entry->link, &asym_cap_list);
+ else
+ list_add_rcu(&entry->link, &insert_entry->link);
+done:
+ __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); /* entry: the match found above, or the one just added */
+}
+
+/*
+ * Build up / refresh the list of CPUs grouped by their capacities.
+ *
+ * An update requires an explicit request to rebuild the sched domains,
+ * with state indicating CPU topology changes.
+ */
+static void asym_cpu_capacity_scan(void)
+{
+	struct asym_cap_data *cap, *tmp;
+	int cpu;
+
+	/* Start from empty masks; existing entries stay on the list for now. */
+	list_for_each_entry(cap, &asym_cap_list, link) {
+		cpumask_clear(cpu_capacity_span(cap));
+	}
+
+	/* Re-populate from the possible CPUs that are HK_TYPE_DOMAIN housekeeping CPUs. */
+	for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
+		asym_cpu_capacity_update_data(cpu);
+	}
+
+	/* Drop every capacity that no CPU maps to any more. */
+	list_for_each_entry_safe(cap, tmp, &asym_cap_list, link) {
+		if (!cpumask_empty(cpu_capacity_span(cap)))
+			continue;
+
+		list_del_rcu(&cap->link);
+		call_rcu(&cap->rcu, free_asym_cap_entry);
+	}
+
+	if (!list_is_singular(&asym_cap_list))
+		return;
+
+	/*
+	 * Only one capacity value has been detected i.e. this system is
+	 * symmetric. No need to keep this data around.
+	 */
+	cap = list_first_entry(&asym_cap_list, typeof(*cap), link);
+	list_del_rcu(&cap->link);
+	call_rcu(&cap->rcu, free_asym_cap_entry);
+}
+
+/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
@@ -1203,7 +1482,7 @@ static void set_domain_attribute(struct sched_domain *sd,
} else
request = attr->relax_domain_level;
- if (sd->level > request) {
+ if (sd->level >= request) {
/* Turn off idle balance on this domain: */
sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
}
@@ -1219,13 +1498,13 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
case sa_rootdomain:
if (!atomic_read(&d->rd->refcount))
free_rootdomain(&d->rd->rcu);
- /* Fall through */
+ fallthrough;
case sa_sd:
free_percpu(d->sd);
- /* Fall through */
+ fallthrough;
case sa_sd_storage:
__sdt_free(cpu_map);
- /* Fall through */
+ fallthrough;
case sa_none:
break;
}
@@ -1279,7 +1558,6 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
-int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
#endif
/*
@@ -1287,12 +1565,12 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
*
* These flags are purely descriptive of the topology and do not prescribe
* behaviour. Behaviour is artificial and mapped in the below sd_init()
- * function:
+ * function. For details, see include/linux/sched/sd_flags.h.
*
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_SHARE_CPUCAPACITY
+ * SD_SHARE_LLC
+ * SD_CLUSTER
+ * SD_NUMA
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
@@ -1301,19 +1579,20 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
*/
#define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUCAPACITY | \
- SD_SHARE_PKG_RESOURCES | \
+ SD_CLUSTER | \
+ SD_SHARE_LLC | \
SD_NUMA | \
- SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
+ SD_ASYM_PACKING)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map,
- struct sched_domain *child, int dflags, int cpu)
+ struct sched_domain *child, int cpu)
{
struct sd_data *sdd = &tl->data;
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
+ struct cpumask *sd_span;
#ifdef CONFIG_NUMA
/*
@@ -1328,16 +1607,13 @@ sd_init(struct sched_domain_topology_level *tl,
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
-
- /* Apply detected topology flags */
- sd_flags |= dflags;
+ sd_flags &= TOPOLOGY_SD_FLAGS;
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
- .busy_factor = 32,
- .imbalance_pct = 125,
+ .busy_factor = 16,
+ .imbalance_pct = 117,
.cache_nice_tries = 0,
@@ -1347,7 +1623,7 @@ sd_init(struct sched_domain_topology_level *tl,
| 0*SD_BALANCE_WAKE
| 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUCAPACITY
- | 0*SD_SHARE_PKG_RESOURCES
+ | 0*SD_SHARE_LLC
| 0*SD_SERIALIZE
| 1*SD_PREFER_SIBLING
| 0*SD_NUMA
@@ -1357,20 +1633,24 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
.max_newidle_lb_cost = 0,
- .next_decay_max_lb_cost = jiffies,
+ .last_decay_max_lb_cost = jiffies,
.child = child,
-#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
-#endif
};
- cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
- sd_id = cpumask_first(sched_domain_span(sd));
+ sd_span = sched_domain_span(sd);
+ cpumask_and(sd_span, cpu_map, tl->mask(cpu));
+ sd_id = cpumask_first(sd_span);
+
+ sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
+
+ WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
+ (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
+ "CPU capacity asymmetry not supported on SMT\n");
/*
* Convert topological properties into behaviour.
*/
-
/* Don't attempt to spread across CPUs of different capacities. */
if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
sd->child->flags &= ~SD_PREFER_SIBLING;
@@ -1378,7 +1658,7 @@ sd_init(struct sched_domain_topology_level *tl,
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->imbalance_pct = 110;
- } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ } else if (sd->flags & SD_SHARE_LLC) {
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
@@ -1403,7 +1683,7 @@ sd_init(struct sched_domain_topology_level *tl,
* For all levels sharing cache; connect a sched_domain_shared
* instance.
*/
- if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ if (sd->flags & SD_SHARE_LLC) {
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
atomic_inc(&sd->shared->ref);
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
@@ -1421,25 +1701,32 @@ static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
+
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { cpu_cpu_mask, SD_INIT_NAME(PKG) },
{ NULL, },
};
static struct sched_domain_topology_level *sched_domain_topology =
default_topology;
+static struct sched_domain_topology_level *sched_domain_topology_saved;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
-void set_sched_topology(struct sched_domain_topology_level *tl)
+void __init set_sched_topology(struct sched_domain_topology_level *tl)
{
if (WARN_ON_ONCE(sched_smp_initialized))
return;
sched_domain_topology = tl;
+ sched_domain_topology_saved = NULL;
}
#ifdef CONFIG_NUMA
@@ -1463,8 +1750,12 @@ static void sched_numa_warn(const char *str)
for (i = 0; i < nr_node_ids; i++) {
printk(KERN_WARNING " ");
- for (j = 0; j < nr_node_ids; j++)
- printk(KERN_CONT "%02d ", node_distance(i,j));
+ for (j = 0; j < nr_node_ids; j++) {
+ if (!node_state(i, N_CPU) || !node_state(j, N_CPU))
+ printk(KERN_CONT "(%02d) ", node_distance(i,j));
+ else
+ printk(KERN_CONT " %02d ", node_distance(i,j));
+ }
printk(KERN_CONT "\n");
}
printk(KERN_WARNING "\n");
@@ -1472,19 +1763,34 @@ static void sched_numa_warn(const char *str)
bool find_numa_distance(int distance)
{
- int i;
+ bool found = false;
+ int i, *distances;
if (distance == node_distance(0, 0))
return true;
+ rcu_read_lock();
+ distances = rcu_dereference(sched_domains_numa_distance);
+ if (!distances)
+ goto unlock;
for (i = 0; i < sched_domains_numa_levels; i++) {
- if (sched_domains_numa_distance[i] == distance)
- return true;
+ if (distances[i] == distance) {
+ found = true;
+ break;
+ }
}
+unlock:
+ rcu_read_unlock();
- return false;
+ return found;
}
+/*
+ * Iterate @n over all nodes in state N_CPU, skipping node @nbut.
+ * The trailing "else" makes the statement following the macro the loop
+ * body. Arguments are parenthesized in the comparison so that an
+ * expression passed as @nbut cannot change its meaning.
+ */
+#define for_each_cpu_node_but(n, nbut)		\
+	for_each_node_state(n, N_CPU)		\
+		if ((n) == (nbut))		\
+			continue;		\
+		else
+
/*
* A system can have three types of NUMA topology:
* NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
@@ -1504,7 +1810,7 @@ bool find_numa_distance(int distance)
* there is an intermediary node C, which is < N hops away from both
* nodes A and B, the system is a glueless mesh.
*/
-static void init_numa_topology_type(void)
+static void init_numa_topology_type(int offline_node)
{
int a, b, c, n;
@@ -1515,14 +1821,14 @@ static void init_numa_topology_type(void)
return;
}
- for_each_online_node(a) {
- for_each_online_node(b) {
+ for_each_cpu_node_but(a, offline_node) {
+ for_each_cpu_node_but(b, offline_node) {
/* Find two nodes furthest removed from each other. */
if (node_distance(a, b) < n)
continue;
/* Is there an intermediary node between a and b? */
- for_each_online_node(c) {
+ for_each_cpu_node_but(c, offline_node) {
if (node_distance(a, c) < n &&
node_distance(b, c) < n) {
sched_numa_topology_type =
@@ -1535,68 +1841,67 @@ static void init_numa_topology_type(void)
return;
}
}
+
+ pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n");
+ sched_numa_topology_type = NUMA_DIRECT;
}
-void sched_init_numa(void)
-{
- int next_distance, curr_distance = node_distance(0, 0);
- struct sched_domain_topology_level *tl;
- int level = 0;
- int i, j, k;
- sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
- if (!sched_domains_numa_distance)
- return;
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
- /* Includes NUMA identity node at level 0. */
- sched_domains_numa_distance[level++] = curr_distance;
- sched_domains_numa_levels = level;
+void sched_init_numa(int offline_node)
+{
+ struct sched_domain_topology_level *tl;
+ unsigned long *distance_map;
+ int nr_levels = 0;
+ int i, j;
+ int *distances;
+ struct cpumask ***masks;
/*
- * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * O(nr_nodes^2) de-duplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
- *
- * Assumes node_distance(0,j) includes all distances in
- * node_distance(i,j) in order to avoid cubic time.
*/
- next_distance = curr_distance;
- for (i = 0; i < nr_node_ids; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- for (k = 0; k < nr_node_ids; k++) {
- int distance = node_distance(i, k);
-
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
+ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+ if (!distance_map)
+ return;
- /*
- * While not a strong assumption it would be nice to know
- * about cases where if node A is connected to B, B is not
- * equally connected to A.
- */
- if (sched_debug() && node_distance(k, i) != distance)
- sched_numa_warn("Node-distance not symmetric");
+ bitmap_zero(distance_map, NR_DISTANCE_VALUES);
+ for_each_cpu_node_but(i, offline_node) {
+ for_each_cpu_node_but(j, offline_node) {
+ int distance = node_distance(i, j);
- if (sched_debug() && i && !find_numa_distance(distance))
- sched_numa_warn("Node-0 not representative");
+ if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+ sched_numa_warn("Invalid distance value range");
+ bitmap_free(distance_map);
+ return;
}
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
- sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
+
+ bitmap_set(distance_map, distance, 1);
}
+ }
+ /*
+ * We can now figure out how many unique distance values there are and
+ * allocate memory accordingly.
+ */
+ nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
- /*
- * In case of sched_debug() we verify the above assumption.
- */
- if (!sched_debug())
- break;
+ distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+ if (!distances) {
+ bitmap_free(distance_map);
+ return;
+ }
+
+ for (i = 0, j = 0; i < nr_levels; i++, j++) {
+ j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+ distances[i] = j;
}
+ rcu_assign_pointer(sched_domains_numa_distance, distances);
+
+ bitmap_free(distance_map);
/*
- * 'level' contains the number of unique distances
+ * 'nr_levels' contains the number of unique distances
*
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
@@ -1605,36 +1910,40 @@ void sched_init_numa(void)
/*
* Here, we should temporarily reset sched_domains_numa_levels to 0.
* If it fails to allocate memory for array sched_domains_numa_masks[][],
- * the array will contain less then 'level' members. This could be
+ * the array will contain fewer than 'nr_levels' members. This could be
* dangerous when we use it to iterate array sched_domains_numa_masks[][]
* in other functions.
*
- * We reset it to 'level' at the end of this function.
+ * We reset it to 'nr_levels' at the end of this function.
*/
sched_domains_numa_levels = 0;
- sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
- if (!sched_domains_numa_masks)
+ masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
+ if (!masks)
return;
/*
* Now for each level, construct a mask per node which contains all
* CPUs of nodes that are that many hops away from us.
*/
- for (i = 0; i < level; i++) {
- sched_domains_numa_masks[i] =
- kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
- if (!sched_domains_numa_masks[i])
+ for (i = 0; i < nr_levels; i++) {
+ masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!masks[i])
return;
- for (j = 0; j < nr_node_ids; j++) {
+ for_each_cpu_node_but(j, offline_node) {
struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ int k;
+
if (!mask)
return;
- sched_domains_numa_masks[i][j] = mask;
+ masks[i][j] = mask;
+
+ for_each_cpu_node_but(k, offline_node) {
+ if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+ sched_numa_warn("Node-distance not symmetric");
- for_each_node(k) {
if (node_distance(j, k) > sched_domains_numa_distance[i])
continue;
@@ -1642,11 +1951,12 @@ void sched_init_numa(void)
}
}
}
+ rcu_assign_pointer(sched_domains_numa_masks, masks);
/* Compute default topology size */
for (i = 0; sched_domain_topology[i].mask; i++);
- tl = kzalloc((i + level + 1) *
+ tl = kzalloc((i + nr_levels + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
@@ -1669,7 +1979,7 @@ void sched_init_numa(void)
/*
* .. and append 'j' levels of NUMA goodness.
*/
- for (j = 1; j < level; i++, j++) {
+ for (j = 1; j < nr_levels; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
@@ -1679,12 +1989,67 @@ void sched_init_numa(void)
};
}
+ sched_domain_topology_saved = sched_domain_topology;
sched_domain_topology = tl;
- sched_domains_numa_levels = level;
- sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+ sched_domains_numa_levels = nr_levels;
+ WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
+
+ init_numa_topology_type(offline_node);
+}
+
+
+static void sched_reset_numa(void)
+{
+ int nr_levels, *distances;
+ struct cpumask ***masks;
+
+ nr_levels = sched_domains_numa_levels;
+ sched_domains_numa_levels = 0;
+ sched_max_numa_distance = 0;
+ sched_numa_topology_type = NUMA_DIRECT;
+ distances = sched_domains_numa_distance;
+ rcu_assign_pointer(sched_domains_numa_distance, NULL);
+ masks = sched_domains_numa_masks;
+ rcu_assign_pointer(sched_domains_numa_masks, NULL);
+ if (distances || masks) {
+ int i, j;
+
+ synchronize_rcu();
+ kfree(distances);
+ for (i = 0; i < nr_levels && masks; i++) {
+ if (!masks[i])
+ continue;
+ for_each_node(j)
+ kfree(masks[i][j]);
+ kfree(masks[i]);
+ }
+ kfree(masks);
+ }
+ if (sched_domain_topology_saved) {
+ kfree(sched_domain_topology);
+ sched_domain_topology = sched_domain_topology_saved;
+ sched_domain_topology_saved = NULL;
+ }
+}
+
+/*
+ * Call with hotplug lock held
+ */
+void sched_update_numa(int cpu, bool online)
+{
+ int node;
+
+ node = cpu_to_node(cpu);
+ /*
+ * Scheduler NUMA topology is updated when the first CPU of a
+ * node is onlined or the last CPU of a node is offlined.
+ */
+ if (cpumask_weight(cpumask_of_node(node)) != 1)
+ return;
- init_numa_topology_type();
+ sched_reset_numa();
+ sched_init_numa(online ? NUMA_NO_NODE : node);
}
void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1694,6 +2059,10 @@ void sched_domains_numa_masks_set(unsigned int cpu)
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
+ if (!node_state(j, N_CPU))
+ continue;
+
+ /* Set ourselves in the remote node's masks */
if (node_distance(j, node) <= sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
@@ -1705,8 +2074,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
int i, j;
for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++)
- cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ for (j = 0; j < nr_node_ids; j++) {
+ if (sched_domains_numa_masks[i][j])
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
}
}
@@ -1720,15 +2091,129 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
*/
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
- int i, j = cpu_to_node(cpu);
+ int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
+ struct cpumask ***masks;
+ rcu_read_lock();
+ masks = rcu_dereference(sched_domains_numa_masks);
+ if (!masks)
+ goto unlock;
for (i = 0; i < sched_domains_numa_levels; i++) {
- cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
- if (cpu < nr_cpu_ids)
- return cpu;
+ if (!masks[i][j])
+ break;
+ cpu = cpumask_any_and(cpus, masks[i][j]);
+ if (cpu < nr_cpu_ids) {
+ found = cpu;
+ break;
+ }
+ }
+unlock:
+ rcu_read_unlock();
+
+ return found;
+}
+
+struct __cmp_key {
+ const struct cpumask *cpus;
+ struct cpumask ***masks;
+ int node;
+ int cpu;
+ int w;
+};
+
+static int hop_cmp(const void *a, const void *b)
+{
+ struct cpumask **prev_hop, **cur_hop = *(struct cpumask ***)b;
+ struct __cmp_key *k = (struct __cmp_key *)a;
+
+ if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
+ return 1;
+
+ if (b == k->masks) {
+ k->w = 0;
+ return 0;
}
- return nr_cpu_ids;
+
+ prev_hop = *((struct cpumask ***)b - 1);
+ k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]);
+ if (k->w <= k->cpu)
+ return 0;
+
+ return -1;
+}
+
+/**
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU
+ * from @cpus to @cpu, taking into account distance
+ * from a given @node.
+ * @cpus: cpumask to find a cpu from
+ * @cpu: CPU to start searching
+ * @node: NUMA node to order CPUs by distance
+ *
+ * Return: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+ struct __cmp_key k = { .cpus = cpus, .cpu = cpu };
+ struct cpumask ***hop_masks;
+ int hop, ret = nr_cpu_ids;
+
+ if (node == NUMA_NO_NODE)
+ return cpumask_nth_and(cpu, cpus, cpu_online_mask);
+
+ rcu_read_lock();
+
+ /* CPU-less node entries are uninitialized in sched_domains_numa_masks */
+ node = numa_nearest_node(node, N_CPU);
+ k.node = node;
+
+ k.masks = rcu_dereference(sched_domains_numa_masks);
+ if (!k.masks)
+ goto unlock;
+
+ hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ hop = hop_masks - k.masks;
+
+ ret = hop ?
+ cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
+ cpumask_nth_and(cpu, cpus, k.masks[0][node]);
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
+
+/**
+ * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
+ * @node
+ * @node: The node to count hops from.
+ * @hops: Include CPUs up to that many hops away. 0 means local node.
+ *
+ * Return: On success, a pointer to a cpumask of CPUs at most @hops away from
+ * @node, an error value otherwise.
+ *
+ * Requires rcu_read_lock() to be held. Returned cpumask is only valid within that
+ * read-side section, copy it if required beyond that.
+ *
+ * Note that not all hops are equal in distance; see sched_init_numa() for how
+ * distances and masks are handled.
+ * Also note that this is a reflection of sched_domains_numa_masks, which may change
+ * during the lifetime of the system (offline nodes are taken out of the masks).
+ */
+const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+ struct cpumask ***masks;
+
+ if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
+ return ERR_PTR(-EINVAL);
+
+ masks = rcu_dereference(sched_domains_numa_masks);
+ if (!masks)
+ return ERR_PTR(-EBUSY);
+
+ return masks[hops][node];
}
+EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
#endif /* CONFIG_NUMA */
@@ -1839,9 +2324,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *child, int dflags, int cpu)
+ struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
if (child) {
sd->level = child->level + 1;
@@ -1851,10 +2336,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
-#endif
/* Fixup, ensure @sd has at least @child CPUs. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
@@ -1874,7 +2357,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
static bool topology_span_sane(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, int cpu)
{
- int i;
+ int i = cpu + 1;
/* NUMA levels are allowed to overlap */
if (tl->flags & SDTL_OVERLAP)
@@ -1886,9 +2369,7 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl,
* breaking the sched_group lists - i.e. a later get_group() pass
* breaks the linking done for an earlier span.
*/
- for_each_cpu(i, cpu_map) {
- if (i == cpu)
- continue;
+ for_each_cpu_from(i, cpu_map) {
/*
* We should 'and' all those masks with 'cpu_map' to exactly
* match the topology we're about to build, but that can only
@@ -1904,65 +2385,6 @@ static bool topology_span_sane(struct sched_domain_topology_level *tl,
}
/*
- * Find the sched_domain_topology_level where all CPU capacities are visible
- * for all CPUs.
- */
-static struct sched_domain_topology_level
-*asym_cpu_capacity_level(const struct cpumask *cpu_map)
-{
- int i, j, asym_level = 0;
- bool asym = false;
- struct sched_domain_topology_level *tl, *asym_tl = NULL;
- unsigned long cap;
-
- /* Is there any asymmetry? */
- cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
-
- for_each_cpu(i, cpu_map) {
- if (arch_scale_cpu_capacity(i) != cap) {
- asym = true;
- break;
- }
- }
-
- if (!asym)
- return NULL;
-
- /*
- * Examine topology from all CPU's point of views to detect the lowest
- * sched_domain_topology_level where a highest capacity CPU is visible
- * to everyone.
- */
- for_each_cpu(i, cpu_map) {
- unsigned long max_capacity = arch_scale_cpu_capacity(i);
- int tl_id = 0;
-
- for_each_sd_topology(tl) {
- if (tl_id < asym_level)
- goto next_level;
-
- for_each_cpu_and(j, tl->mask(i), cpu_map) {
- unsigned long capacity;
-
- capacity = arch_scale_cpu_capacity(j);
-
- if (capacity <= max_capacity)
- continue;
-
- max_capacity = capacity;
- asym_level = tl_id;
- asym_tl = tl;
- }
-next_level:
- tl_id++;
- }
- }
-
- return asym_tl;
-}
-
-
-/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
@@ -1974,8 +2396,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM;
- struct sched_domain_topology_level *tl_asym;
bool has_asym = false;
+ bool has_cluster = false;
if (WARN_ON(cpumask_empty(cpu_map)))
goto error;
@@ -1984,25 +2406,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (alloc_state != sa_rootdomain)
goto error;
- tl_asym = asym_cpu_capacity_level(cpu_map);
-
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
sd = NULL;
for_each_sd_topology(tl) {
- int dflags = 0;
-
- if (tl == tl_asym) {
- dflags |= SD_ASYM_CPUCAPACITY;
- has_asym = true;
- }
if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
goto error;
- sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
+ sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+
+ has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
@@ -2027,6 +2443,64 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
+ /*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+ for_each_cpu(i, cpu_map) {
+ unsigned int imb = 0;
+ unsigned int imb_span = 1;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ struct sched_domain *child = sd->child;
+
+ if (!(sd->flags & SD_SHARE_LLC) && child &&
+ (child->flags & SD_SHARE_LLC)) {
+ struct sched_domain __rcu *top_p;
+ unsigned int nr_llcs;
+
+ /*
+ * For a single LLC per node, allow an
+ * imbalance up to 12.5% of the node. This is an
+ * arbitrary cutoff based on two factors -- SMT and
+ * memory channels. For SMT-2, the intent is to
+ * avoid premature sharing of HT resources but
+ * SMT-4 or SMT-8 *may* benefit from a different
+ * cutoff. For memory channels, this is a very
+ * rough estimate of how many channels may be
+ * active and is based on recent CPUs with
+ * many cores.
+ *
+ * For multiple LLCs, allow an imbalance
+ * until multiple tasks would share an LLC
+ * on one node while LLCs on another node
+ * remain idle. This assumes that there are
+ * enough logical CPUs per LLC to avoid SMT
+ * factors and that there is a correlation
+ * between LLCs and memory channels.
+ */
+ nr_llcs = sd->span_weight / child->span_weight;
+ if (nr_llcs == 1)
+ imb = sd->span_weight >> 3;
+ else
+ imb = nr_llcs;
+ imb = max(1U, imb);
+ sd->imb_numa_nr = imb;
+
+ /* Set span based on the first NUMA domain. */
+ top_p = sd->parent;
+ while (top_p && !(top_p->flags & SD_NUMA)) {
+ top_p = top_p->parent;
+ }
+ imb_span = top_p ? top_p->span_weight : sd->span_weight;
+ } else {
+ int factor = max(1U, (sd->span_weight / imb_span));
+
+ sd->imb_numa_nr = imb * factor;
+ }
+ }
+ }
+
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
@@ -2044,21 +2518,21 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
- /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
-
cpu_attach_domain(sd, d.rd, i);
+
+ if (lowest_flag_domain(i, SD_CLUSTER))
+ has_cluster = true;
}
rcu_read_unlock();
if (has_asym)
static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
- if (rq && sched_debug_enabled) {
- pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
- cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
- }
+ if (has_cluster)
+ static_branch_inc_cpuslocked(&sched_cluster_active);
+
+ if (rq && sched_debug_verbose)
+ pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));
ret = 0;
error:
@@ -2073,7 +2547,7 @@ static cpumask_var_t *doms_cur;
/* Number of sched domains in 'doms_cur': */
static int ndoms_cur;
-/* Attribues of custom domains in 'doms_cur' */
+/* Attributes of custom domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
/*
@@ -2122,7 +2596,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* Set up scheduler domains and groups. For now this just excludes isolated
* CPUs, but could be used to exclude other special cases in the future.
*/
-int sched_init_domains(const struct cpumask *cpu_map)
+int __init sched_init_domains(const struct cpumask *cpu_map)
{
int err;
@@ -2131,13 +2605,13 @@ int sched_init_domains(const struct cpumask *cpu_map)
zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
arch_update_cpu_topology();
+ asym_cpu_capacity_scan();
ndoms_cur = 1;
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms;
- cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN));
err = build_sched_domains(doms_cur[0], NULL);
- register_sched_domain_sysctl();
return err;
}
@@ -2154,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
+ if (static_branch_unlikely(&sched_cluster_active))
+ static_branch_dec_cpuslocked(&sched_cluster_active);
+
rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
@@ -2212,11 +2689,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
lockdep_assert_held(&sched_domains_mutex);
- /* Always unregister in case we don't destroy any domains: */
- unregister_sched_domain_sysctl();
-
/* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology();
+ /* Trigger rebuilding CPU capacity asymmetry data */
+ if (new_topology)
+ asym_cpu_capacity_scan();
if (!doms_new) {
WARN_ON_ONCE(dattr_new);
@@ -2225,7 +2702,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
if (doms_new) {
n = 1;
cpumask_and(doms_new[0], cpu_active_mask,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
}
} else {
n = ndoms_new;
@@ -2240,9 +2717,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
/*
* This domain won't be destroyed and as such
- * its dl_bw->total_bw needs to be cleared. It
- * will be recomputed in function
- * update_tasks_root_domain().
+ * its dl_bw->total_bw needs to be cleared.
+ * The tasks' contribution will then be recomputed
+ * in function dl_update_tasks_root_domain(), and the
+ * dl_servers' contribution in function
+ * dl_restore_server_root_domain().
*/
rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
dl_clear_root_domain(rd);
@@ -2260,7 +2739,7 @@ match1:
n = 0;
doms_new = &fallback_doms;
cpumask_and(doms_new[0], cpu_active_mask,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
}
/* Build new domains: */
@@ -2277,7 +2756,7 @@ match2:
}
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
- /* Build perf. domains: */
+ /* Build perf domains: */
for (i = 0; i < ndoms_new; i++) {
for (j = 0; j < n && !sched_energy_update; j++) {
if (cpumask_equal(doms_new[i], doms_cur[j]) &&
@@ -2286,7 +2765,7 @@ match2:
goto match3;
}
}
- /* No match - add perf. domains for a new rd */
+ /* No match - add perf domains for a new rd */
has_eas |= build_perf_domains(doms_new[i]);
match3:
;
@@ -2303,7 +2782,7 @@ match3:
dattr_cur = dattr_new;
ndoms_cur = ndoms_new;
- register_sched_domain_sysctl();
+ update_sched_domain_debugfs();
}
/*
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index ba059fbfc53a..51e38f5f4701 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -4,7 +4,6 @@
*
* (C) 2004 Nadia Yvette Chambers, Oracle
*/
-#include "sched.h"
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
@@ -37,6 +36,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
+void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+ unsigned long flags;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -48,37 +58,26 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
EXPORT_SYMBOL(remove_wait_queue);
/*
- * Scan threshold to break wait queue walk.
- * This allows a waker to take a break from holding the
- * wait queue lock during the wait queue walk.
- */
-#define WAITQUEUE_WALK_BREAK_CNT 64
-
-/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
+ * number) then we wake that number of exclusive tasks, and potentially all
+ * the non-exclusive tasks. Normally, exclusive tasks will be at the end of
+ * the list and any non-exclusive tasks will be woken first. A priority task
+ * may be at the head of the list, and can consume the event without any other
+ * tasks being woken.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, int wake_flags, void *key,
- wait_queue_entry_t *bookmark)
+ int nr_exclusive, int wake_flags, void *key)
{
wait_queue_entry_t *curr, *next;
- int cnt = 0;
lockdep_assert_held(&wq_head->lock);
- if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
- curr = list_next_entry(bookmark, entry);
-
- list_del(&bookmark->entry);
- bookmark->flags = 0;
- } else
- curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+ curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
if (&curr->entry == &wq_head->head)
return nr_exclusive;
@@ -87,43 +86,28 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
unsigned flags = curr->flags;
int ret;
- if (flags & WQ_FLAG_BOOKMARK)
- continue;
-
ret = curr->func(curr, mode, wake_flags, key);
if (ret < 0)
break;
if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
-
- if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
- (&next->entry != &wq_head->head)) {
- bookmark->flags = WQ_FLAG_BOOKMARK;
- list_add_tail(&bookmark->entry, &next->entry);
- break;
- }
}
return nr_exclusive;
}
-static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
unsigned long flags;
- wait_queue_entry_t bookmark;
+ int remaining;
- bookmark.flags = 0;
- bookmark.private = NULL;
- bookmark.func = NULL;
- INIT_LIST_HEAD(&bookmark.entry);
+ spin_lock_irqsave(&wq_head->lock, flags);
+ remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags,
+ key);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
- do {
- spin_lock_irqsave(&wq_head->lock, flags);
- nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
- wake_flags, key, &bookmark);
- spin_unlock_irqrestore(&wq_head->lock, flags);
- } while (bookmark.flags & WQ_FLAG_BOOKMARK);
+ return nr_exclusive - remaining;
}
/**
@@ -133,38 +117,37 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
*
- * If this function wakes up a task, it executes a full memory barrier before
- * accessing the task state.
+ * If this function wakes up a task, it executes a full memory barrier
+ * before accessing the task state. Returns the number of exclusive
+ * tasks that were awakened.
*/
-void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, void *key)
+int __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, void *key)
{
- __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
+ return __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
+void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key)
+{
+ __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key);
+}
+
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
{
- __wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
+ __wake_up_common(wq_head, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, 0, key, NULL);
+ __wake_up_common(wq_head, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
- unsigned int mode, void *key, wait_queue_entry_t *bookmark)
-{
- __wake_up_common(wq_head, mode, 1, 0, key, bookmark);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
-
/**
* __wake_up_sync_key - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
@@ -210,7 +193,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
unsigned int mode, void *key)
{
- __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+ __wake_up_common(wq_head, mode, 1, WF_SYNC, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
@@ -223,6 +206,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+void __wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+ __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE));
+ /* POLLFREE must have cleared the queue. */
+ WARN_ON_ONCE(waitqueue_active(wq_head));
+}
+
/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
@@ -249,17 +239,22 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent
}
EXPORT_SYMBOL(prepare_to_wait);
-void
+/* Returns true if we are the first waiter in the queue, false otherwise. */
+bool
prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
{
unsigned long flags;
+ bool was_empty = false;
wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&wq_head->lock, flags);
- if (list_empty(&wq_entry->entry))
+ if (list_empty(&wq_entry->entry)) {
+ was_empty = list_empty(&wq_head->head);
__add_wait_queue_entry_tail(wq_head, wq_entry);
+ }
set_current_state(state);
spin_unlock_irqrestore(&wq_head->lock, flags);
+ return was_empty;
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
@@ -389,17 +384,12 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
int ret = default_wake_function(wq_entry, mode, sync, key);
if (ret)
- list_del_init(&wq_entry->entry);
+ list_del_init_careful(&wq_entry->entry);
return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
-static inline bool is_kthread_should_stop(void)
-{
- return (current->flags & PF_KTHREAD) && kthread_should_stop();
-}
-
/*
* DEFINE_WAIT_FUNC(wait, woken_wake_func);
*
@@ -429,7 +419,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
* or woken_wake_function() sees our store to current->state.
*/
set_current_state(mode); /* A */
- if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park())
timeout = schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 02ce292b9bc0..b410b61cec95 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,15 +1,15 @@
// SPDX-License-Identifier: GPL-2.0-only
+
/*
* The implementation of the wait_bit*() and related waiting APIs:
*/
-#include "sched.h"
#define WAIT_TABLE_BITS 8
#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
+wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit)
{
const int shift = BITS_PER_LONG == 32 ? 5 : 6;
unsigned long val = (unsigned long)word << shift | bit;
@@ -33,7 +33,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
EXPORT_SYMBOL(wake_bit_function);
/*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * To allow interruptible waiting and asynchronous (i.e. non-blocking)
* waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
* permitted return codes. Nonzero return codes halt waiting and return.
*/
@@ -47,7 +47,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
ret = (*action)(&wbq_entry->key, mode);
- } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+ } while (test_bit_acquire(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
finish_wait(wq_head, &wbq_entry->wq_entry);
@@ -55,7 +55,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
}
EXPORT_SYMBOL(__wait_on_bit);
-int __sched out_of_line_wait_on_bit(void *word, int bit,
+int __sched out_of_line_wait_on_bit(unsigned long *word, int bit,
wait_bit_action_f *action, unsigned mode)
{
struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -66,7 +66,7 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
EXPORT_SYMBOL(out_of_line_wait_on_bit);
int __sched out_of_line_wait_on_bit_timeout(
- void *word, int bit, wait_bit_action_f *action,
+ unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode, unsigned long timeout)
{
struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -108,7 +108,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry
}
EXPORT_SYMBOL(__wait_on_bit_lock);
-int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit,
wait_bit_action_f *action, unsigned mode)
{
struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -118,7 +118,7 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
}
EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
-void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
+void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit)
{
struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
@@ -128,23 +128,31 @@ void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
EXPORT_SYMBOL(__wake_up_bit);
/**
- * wake_up_bit - wake up a waiter on a bit
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
+ * wake_up_bit - wake up waiters on a bit
+ * @word: the address containing the bit being waited on
+ * @bit: the bit at that address being waited on
+ *
+ * Wake up any process waiting in wait_on_bit() or similar for the
+ * given bit to be cleared.
+ *
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool. Only those tasks on that queue which have requested
+ * wake_up on this specific address and bit will be woken, and only if the
+ * bit is clear.
*
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that wakes up waiters
- * on a bit. For instance, if one were to have waiters on a bitflag,
- * one would call wake_up_bit() after clearing the bit.
+ * In order for this to function properly there must be a full memory
+ * barrier after the bit is cleared and before this function is called.
+ * If the bit was cleared atomically, such as by clear_bit(), then
+ * smp_mb__after_atomic() can be used, otherwise smp_mb() is needed.
+ * If the bit was cleared with a fully-ordered operation, no further
+ * barrier is required.
*
- * In order for this to function properly, as it uses waitqueue_active()
- * internally, some kind of memory barrier must be done prior to calling
- * this. Typically, this will be smp_mb__after_atomic(), but in some
- * cases where bitflags are manipulated non-atomically under a lock, one
- * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
- * because spin_unlock() does not guarantee a memory barrier.
+ * Normally the bit should be cleared by an operation with RELEASE
+ * semantics so that any changes to memory made before the bit is
+ * cleared are guaranteed to be visible after the matching wait_on_bit()
+ * completes.
*/
-void wake_up_bit(void *word, int bit)
+void wake_up_bit(unsigned long *word, int bit)
{
__wake_up_bit(bit_waitqueue(word, bit), word, bit);
}
@@ -188,6 +196,36 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int
}
EXPORT_SYMBOL(init_wait_var_entry);
+/**
+ * wake_up_var - wake up waiters on a variable (kernel address)
+ * @var: the address of the variable being waited on
+ *
+ * Wake up any process waiting in wait_var_event() or similar for the
+ * given variable to change. wait_var_event() can be waiting for an
+ * arbitrary condition to be true and associates that condition with an
+ * address. Calling wake_up_var() suggests that the condition has been
+ * made true, but does not strictly require the condition to use the
+ * address given.
+ *
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool. Only those tasks on that queue which have requested
+ * wake_up on this specific address will be woken.
+ *
+ * In order for this to function properly there must be a full memory
+ * barrier after the variable is updated (or more accurately, after the
+ * condition waited on has been made to be true) and before this function
+ * is called. If the variable was updated atomically, such as by
+ * atomic_dec(), then smp_mb__after_atomic() can be used. If the
+ * variable was updated by a fully ordered operation such as
+ * atomic_dec_and_test() then no extra barrier is required. Otherwise
+ * smp_mb() is needed.
+ *
+ * Normally the variable should be updated (the condition should be made
+ * to be true) by an operation with RELEASE semantics such as
+ * smp_store_release() so that any changes to memory made before the
+ * variable was updated are guaranteed to be visible after the matching
+ * wait_var_event() completes.
+ */
void wake_up_var(void *var)
{
__wake_up_bit(__var_waitqueue(var), var, -1);
@@ -228,20 +266,6 @@ __sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
}
EXPORT_SYMBOL_GPL(bit_wait_timeout);
-__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
-{
- unsigned long now = READ_ONCE(jiffies);
-
- if (time_after_eq(now, word->timeout))
- return -EAGAIN;
- io_schedule_timeout(word->timeout - now);
- if (signal_pending_state(mode, current))
- return -EINTR;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
-
void __init wait_bit_init(void)
{
int i;